/**
 * Converts an HTML string to plain text by concatenating the values of the
 * leaf nodes found beneath the document's <body> element.
 *
 * <script> elements and HTML comments are removed before the text is
 * collected. Every collected leaf value is followed by a single space.
 */
class HTML2Text {
    /**
     * Text accumulated by traversalDOM() during the current getText() call.
     */
    private $htmlTextString = "";

    /**
     * Walks a node list depth-first and appends the value of every leaf node
     * (a node without children) to the internal text buffer.
     *
     * @param mixed $DOMNodeList An iterable of DOM nodes (or of nested
     *                           iterables); non-iterable input is ignored.
     * @return void
     */
    private function traversalDOM($DOMNodeList): void
    {
        // Anything that cannot be iterated is not a node list: nothing to do.
        if (!is_iterable($DOMNodeList)) {
            return;
        }

        foreach ($DOMNodeList as $DOMNode) {
            // The item is itself a list: recurse into it.
            if (is_iterable($DOMNode)) {
                $this->traversalDOM($DOMNode);
                continue;
            }

            // The item is a node with children: descend into its child list.
            if ($DOMNode->hasChildNodes()) {
                $this->traversalDOM($DOMNode->childNodes);
                continue;
            }

            // Leaf node: keep its value unless it is null or empty.
            $value = $DOMNode->nodeValue;
            if ($value !== null && $value !== '') {
                $this->htmlTextString .= $value . " ";
            }
        }
    }

    /**
     * Extracts the plain text contained in the <body> of an HTML string.
     *
     * The internal buffer is cleared on every call, so the same instance can
     * be reused for several documents without their texts being mixed.
     *
     * @param string|null $htmlString HTML markup to convert.
     * @return string Leaf node values in document order, each followed by a
     *                single space; an empty string for null or empty input.
     */
    public function getText($htmlString) {
        // Start from an empty buffer; otherwise text from an earlier call
        // would be prepended to this call's result.
        $this->htmlTextString = "";

        // DOMDocument::loadHTML() rejects an empty source, so short-circuit.
        if ($htmlString === null || $htmlString === '') {
            return $this->htmlTextString;
        }

        $dom = new DOMDocument();

        // Collect libxml parse diagnostics internally instead of muting every
        // error with "@", then restore the caller's error-handling mode.
        // NOTE(review): libxml may mis-decode non-ASCII input that carries no
        // charset declaration - confirm callers supply one.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            $dom->loadHTML($htmlString);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        $xpath = new DOMXPath($dom);

        /* Remove <script> elements from the document. */
        // getElementsByTagName() returns a live list, so take a snapshot
        // before removing anything from the tree.
        // See: https://stackoverflow.com/questions/7130867/remove-script-tag-from-html-content
        $elementsToRemove = iterator_to_array($dom->getElementsByTagName('script'));
        foreach ($elementsToRemove as $element) {
            if ($element->parentNode !== null) {
                $element->parentNode->removeChild($element);
            }
        }

        /* Remove HTML comments from the document. */
        // See: https://stackoverflow.com/questions/6305643/remove-comments-from-html-source-code
        foreach ($xpath->query('//comment()') as $comment) {
            if ($comment->parentNode !== null) {
                $comment->parentNode->removeChild($comment);
            }
        }

        /* Convert the remaining body content to text. */
        $nodes = $xpath->query("//body");
        $this->traversalDOM($nodes);

        return $this->htmlTextString;
    }
}
一键复制
编辑
Web IDE
原始数据
按行查看
历史