eechen
2015/08/31 09:54
回复 @木兰宿莽 :
用 PHP 抓网页一点都不难好不好，DOM 操作无非就是像 jQuery 那样写好选择器。
比如，使用 PHP 的 DOM 操作库 simple_html_dom.php，可以快速获取 PHP 官网文章的标题、内容和时间：
<?php
require dirname(__FILE__).'/simple_html_dom.php';

// Download and parse the PHP.net front page, then collect the time, title and
// text content of every <article class="newsentry"> element into $news.
$html = file_get_html('http://php.net/');

// A failed fetch/parse does not yield a DOM object; stop here with a clear
// message instead of hitting a fatal "call to a member function find()" error.
if (!$html) {
    exit("Failed to fetch or parse http://php.net/\n");
}

// Output key => CSS selector, evaluated relative to each article element.
$selectors = array(
    'time'    => 'time',
    'title'   => 'h2.newstitle',
    'content' => 'div.newscontent',
);

$news = array();
foreach ($html->find('article.newsentry') as $article) {
    // Start from an empty item so nothing carries over from the previous article.
    $item = array();
    foreach ($selectors as $field => $selector) {
        // find($selector, 0) asks for the first match only; it may be missing,
        // in which case the field is stored as an empty string.
        $node = $article->find($selector, 0);
        $item[$field] = $node ? trim($node->plaintext) : '';
    }
    $news[] = $item;
}
print_r($news);
又比如，利用 PHP 的 pthreads 扩展进行多线程高效抓取：
/**
 * Worker that downloads the content of a single URL inside run().
 *
 * The Thread base class is not defined in this file; per the surrounding
 * text it is expected to come from the pthreads extension.
 */
class WebRequest extends Thread {
    /** @var string URL to download; assigned by the constructor. */
    public $url;

    /**
     * @var array|null One-element array holding the response body, or null
     *                 when run() has not stored a result (not executed yet,
     *                 or the download failed).
     */
    public $data;

    /**
     * @param string $url Address handed unchanged to file_get_contents().
     */
    public function __construct($url){
        $this->url = $url;
    }

    /**
     * Downloads $this->url and, on success, stores the body in $this->data.
     */
    public function run() {
        $response = file_get_contents($this->url);
        // file_get_contents() reports failure with boolean false. Compare
        // against false explicitly so a successful but empty (or "0") body is
        // still recorded rather than being mistaken for a failed request.
        if ($response !== false) {
            $this->data = array($response);
        }
    }
}
// Create a worker for the target URL.
$worker = new WebRequest("http://pthreads.org");

// Launch the worker, then wait for it to finish before reading its result.
$worker->start();
$worker->join();

// Dump whatever run() stored in the public $data property.
var_dump($worker->data);