<?php
/**
 * Converts HTML markup into a plain-text representation.
 *
 * Line endings are normalised first, the markup is parsed with DOMDocument,
 * the resulting tree is rendered by iterate_over_node(), and finally spaces
 * and tabs surrounding each newline are stripped before the whole result is
 * trimmed.
 *
 * @param string $html HTML markup to convert.
 * @return string Plain-text rendering; an empty string for empty or
 *                whitespace-only input.
 */
function convert_html_to_text($html) {
    // DOMDocument::loadHTML() rejects an empty source (warning on PHP 7,
    // ValueError on PHP 8), so there is nothing to parse here.
    if (trim((string) $html) === '') {
        return '';
    }

    $html = fix_newlines($html);

    $doc = new DOMDocument();

    // Real-world HTML routinely triggers libxml parse diagnostics; keep them
    // out of PHP's warning channel while parsing, then restore the caller's
    // setting.
    $previousSetting = libxml_use_internal_errors(true);
    try {
        // A failed parse leaves $doc without children, which renders as "".
        $doc->loadHTML($html);
    } finally {
        if (!$previousSetting) {
            // Only discard the buffered errors when the caller had not opted
            // in to collecting them.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previousSetting);
    }

    $output = iterate_over_node($doc);

    // Remove spaces and tabs on either side of every newline.
    $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);

    return trim($output);
}
/**
 * Normalises line endings to a single line feed.
 *
 * @param string $text Text that may contain CRLF or lone CR line endings.
 * @return string The text with every "\r\n" and remaining "\r" replaced by "\n".
 */
function fix_newlines($text) {
    // str_replace() applies the search entries in order, so CRLF pairs are
    // collapsed before any lone carriage return is converted.
    return str_replace(array("\r\n", "\r"), "\n", $text);
}
/**
 * Finds the lower-cased tag name of the closest following element sibling.
 *
 * Non-element siblings (text, comments, ...) are skipped.
 *
 * @param DOMNode $node Node whose following siblings are inspected.
 * @return string|null Lower-cased node name of the next DOMElement sibling,
 *                     or null when no such sibling exists.
 */
function next_child_name($node) {
    for ($sibling = $node->nextSibling; $sibling !== null; $sibling = $sibling->nextSibling) {
        if ($sibling instanceof DOMElement) {
            return strtolower($sibling->nodeName);
        }
    }

    return null;
}
/**
 * Finds the lower-cased tag name of the closest preceding element sibling.
 *
 * Non-element siblings (text, comments, ...) are skipped.
 *
 * @param DOMNode $node Node whose preceding siblings are inspected.
 * @return string|null Lower-cased node name of the previous DOMElement
 *                     sibling, or null when no such sibling exists.
 */
function prev_child_name($node) {
    for ($sibling = $node->previousSibling; $sibling !== null; $sibling = $sibling->previousSibling) {
        if ($sibling instanceof DOMElement) {
            return strtolower($sibling->nodeName);
        }
    }

    return null;
}
/**
 * Recursively renders a DOM node and its descendants as plain text.
 *
 * Rendering rules:
 *  - text nodes: every run of whitespace is collapsed to a single space;
 *  - doctype, <style>, <head>, <title>, <meta>, <script>: omitted;
 *  - <hr>: rendered as "------\n";
 *  - <h1>..<h6>: surrounded by newlines;
 *  - <p>, <div>: start on a new line; <p> and <br> end with a newline unless
 *    the next element sibling is a <div>; <div> ends with a newline only when
 *    a next element sibling exists and is not a <div>;
 *  - <a>: the href is appended in parentheses unless it is identical to the
 *    link text; an anchor without href but with a name is wrapped in
 *    brackets; a newline is appended when the next element sibling is a
 *    heading.
 *
 * @param DOMNode $node Node to render.
 * @return string Plain-text rendering of the node.
 */
function iterate_over_node($node) {
    if ($node instanceof DOMText) {
        return preg_replace("/\\s+/im", " ", $node->wholeText);
    }
    if ($node instanceof DOMDocumentType) {
        return "";
    }

    $name = strtolower($node->nodeName);

    // Text emitted before the node's children.
    switch ($name) {
        case "hr":
            return "------\n";

        case "style":
        case "head":
        case "title":
        case "meta":
        case "script":
            // Non-visible content is dropped together with its children.
            return "";

        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
        case "p":
        case "div":
            $output = "\n";
            break;

        default:
            $output = "";
            break;
    }

    // Nodes that cannot have children (comments, processing instructions)
    // may expose childNodes as null, so guard before iterating.
    $children = $node->childNodes;
    if ($children !== null) {
        foreach ($children as $child) {
            $output .= iterate_over_node($child);
        }
    }

    // Text emitted after the node's children. The next sibling's name is
    // looked up only in the branches that actually depend on it.
    switch ($name) {
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
            $output .= "\n";
            break;

        case "p":
        case "br":
            if (next_child_name($node) !== "div") {
                $output .= "\n";
            }
            break;

        case "div":
            $nextName = next_child_name($node);
            if ($nextName !== null && $nextName !== "div") {
                $output .= "\n";
            }
            break;

        case "a":
            $href = (string) $node->getAttribute("href");
            if ($href === "") {
                // Named anchor without a target.
                if ((string) $node->getAttribute("name") !== "") {
                    $output = "[$output]";
                }
            } elseif ($href !== $output) {
                // Strict comparison: numeric-looking strings such as "1" and
                // "1.0" must not be treated as the same link text.
                $output = "$output ($href)";
            }

            $headings = array("h1", "h2", "h3", "h4", "h5", "h6");
            if (in_array(next_child_name($node), $headings, true)) {
                $output .= "\n";
            }
            break;

        default:
            break;
    }

    return $output;
}