PHP 本身自带了 DOMXPath, 可以方便地提取网页中的元素内容。为了方便操作, 还需要启用 tidy 扩展: 在 Windows 系统下, 去掉 php.ini 中 extension=php_tidy.dll 这一行的注释即可。
<?php
// Download a web page, normalise it into well-formed XHTML with the tidy
// extension, load it into a DOMDocument and print the tag name of every
// element whose id attribute equals "intro".

$content = file_get_contents('http://php.net');
if ($content === false) {
    // file_get_contents() signals failure by returning false; stop here
    // instead of feeding a boolean into tidy.
    throw new RuntimeException('Unable to download http://php.net');
}

// Real-world HTML is rarely well-formed XML, so let tidy rewrite it first:
//  - output-xhtml:     emit XHTML (closed tags, quoted attributes) so that
//                      DOMDocument::loadXML() can parse the result;
//  - numeric-entities: write named entities such as &nbsp; as numeric
//                      references, which an XML parser accepts without
//                      having to load the XHTML DTD.
// The third argument makes tidy read and write the markup as UTF-8.
$tidy = new tidy();
$content = $tidy->repairString(
    $content,
    [
        'output-xhtml' => true,
        'numeric-entities' => true,
    ],
    'utf8'
);
if ($content === false || $content === '') {
    throw new RuntimeException('tidy could not repair the downloaded markup');
}

$document = new DOMDocument();
$document->strictErrorChecking = false;

// Silence libxml diagnostics and forbid network access while parsing.
$loaded = $document->loadXML(
    $content,
    LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING | LIBXML_NOCDATA
);
if ($loaded === false) {
    throw new RuntimeException('The repaired markup could not be parsed as XML');
}

$xpath = new DOMXPath($document);

// Select every element, whatever its tag name, carrying the wanted id.
$elements = $xpath->query("//*[@id='intro']");
if ($elements === false) {
    // DOMXPath::query() returns false (not null) when the expression
    // cannot be evaluated.
    throw new RuntimeException('The XPath query could not be evaluated');
}

foreach ($elements as $element) {
    echo "[" . $element->nodeName . "]";
}
?>