我之前一直使用 simplehtmldom(http://sourceforge.net/projects/simplehtmldom/)这个工具来处理,它可以像 jQuery 那样操作文档,比较方便。如果不使用这个第三方工具,也可以用下面的方式简单处理。
工具
cURL
DOMDocument
DOMXPath
演示
抓取
简单演示不包含模拟登录。
/**
 * Fetch a URL with cURL and return the response body.
 *
 * Redirects are followed, response headers are left out of the result and
 * the whole transfer is capped at 5 seconds. The caller's own User-Agent is
 * forwarded when one is present in $_SERVER; otherwise a fixed desktop
 * Chrome string is sent.
 *
 * @param string       $url     URL to request.
 * @param string|false $referer Value for the Referer header; a falsy value
 *                              sends no Referer.
 *
 * @return string|false The response body, or false when the cURL handle
 *                      could not be created or the transfer failed.
 */
function curl_get($url,$referer=false)
{
    $curl = curl_init();
    if ($curl === false) {
        // Without a handle every curl_setopt() below would fail; report the
        // failure the same way a failed transfer is reported.
        return false;
    }

    // URL schemes are case-insensitive, so "HTTPS://host" must receive the
    // same TLS settings as "https://host".
    if (stripos($url, "https://") === 0) {
        $curl_version = curl_version();
        $ssl_version = isset($curl_version['ssl_version']) ? $curl_version['ssl_version'] : "";

        // NOTE(review): peer and host verification are switched off, so the
        // server's certificate is never checked. Kept as-is because callers
        // may rely on it for self-signed hosts; do not use this for
        // anything sensitive.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);

        // Only applied when libcurl reports an OpenSSL backend.
        if (strpos($ssl_version, "OpenSSL") !== false) {
            curl_setopt($curl, CURLOPT_SSL_CIPHER_LIST, 'TLSv1');
        }
    }

    curl_setopt($curl, CURLOPT_URL, $url);
    if ($referer) {
        curl_setopt($curl, CURLOPT_REFERER, $referer);
    }

    curl_setopt($curl, CURLOPT_HEADER, 0);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($curl, CURLOPT_TIMEOUT, 5);

    if (isset($_SERVER['HTTP_USER_AGENT'])) {
        curl_setopt($curl, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);
    } else {
        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36');
    }

    $data = curl_exec($curl);
    curl_close($curl);

    return $data;
}
处理
demo 1
// Extract the inner HTML of the first <div> whose class contains
// "show-content" from the markup in $content, collecting it in $article.
$doc = new DOMDocument();

// An empty payload (for example a failed fetch) cannot be parsed; bail out
// before any charset hint is prepended to it.
$content = (string) $content;
if ($content === '') {
    return false;
}

// Silence the warnings loadHTML() raises on malformed markup. The previous
// level is remembered so it is restored on both the success and the failure
// path, rather than being forced to E_ALL.
$previousLevel = error_reporting(E_ERROR);

// Fix garbled (mis-decoded) text: without a charset declaration the HTML
// parser does not treat the input as UTF-8.
// NOTE(review): assumes $content is UTF-8 encoded - confirm against the source pages.
$content = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $content;

$loaded = $doc->loadHTML($content);
error_reporting($previousLevel);
if ($loaded === false) {
    return false;
}

$finder = new DomXPath($doc);
$elements = $finder->query("//div[contains(@class, 'show-content')]");
$article = '';
$element = $elements->item(0);

// item(0) is null when no <div> matched; $article then stays empty.
if ($element !== null) {
    foreach ($element->childNodes as $child) {
        // Serialise each child separately so the wrapping <div> is not included.
        $article .= $element->ownerDocument->saveHTML($child);
    }
}
demo 2
// Create a DOMDocument to hold the parsed HTML.
$dom = new DOMDocument();
// Load the markup from a string; "@" mutes the warnings the parser emits.
@$dom->loadHTML($html);
// Normalize the document.
$dom->normalize();
// Wrap the document in a DOMXPath so it can be queried.
$xpath = new DOMXPath($dom);
// Collect the href attributes found under every <a> inside <body>.
$hrefs = $xpath->evaluate("/html/body//a//@href");
// Print each attribute value followed by a line break.
foreach ($hrefs as $href) {
    echo $href->nodeValue, "
";
}
XPath 表达式