关闭

PHP多进程抓取百度搜索结果

标签: PHP多进程抓取百度搜索结果
140人阅读 评论(0) 收藏 举报
分类:
<?php
/**
 * Multi-process spider for the natural (organic) results of Baidu mobile
 * search result pages: title, abstract, image, link and source.
 *
 * execute() forks `$worker_process` child processes. Each child scrapes its
 * own contiguous slice of result pages, passes what it collected to the
 * optional `$dataHandler` callback and exits with status 100. The parent
 * then waits for every child it forked.
 *
 * Requires the pcntl, posix and curl extensions.
 *
 * @since 2016-04-15
 */
class NaturalResultSpider {

	/** @var string|null Search query to scrape results for. */
	private $_strQuery = null;

	/** @var int Number of worker processes to fork. */
	public $worker_process = 4;

	/** @var array PIDs of the forked workers, keyed by PID. */
	private $_arrPids = array();

	/** @var int Number of result pages to scrape. */
	private $_intPageNum;

	/** @var array Never written by this class; kept for backward compatibility. */
	public $arrAllResult = array();

	/** @var callable|null Hook invoked in every worker with that worker's result array. */
	public $dataHandler = null;

	/** @var int|null PID of the process that called execute(). */
	private $masterPid = null;

	/** @var int Maximum number of fetch attempts per page. */
	private $retry_times = 1;

	/** @var string Regex that cuts one result block (a "div") out of the page HTML. */
	private $strReg = '/<div\sclass="result\sc-result\sc-clk-recommend"(.*)?>(.*)?(<img\ssrc="(.*)?">)?(.*)?(<p\sclass="c-line-clamp3\sc-color">(.*)?)+<\/div>/Uis';

	/**
	 * Field extractors applied to every result block.
	 * Each entry: 'name' => output key, 'reg' => regex, 'location' => capture group index.
	 *
	 * @var array
	 */
	private static $_arrPattern = array(
		array('name'=>'nature_result', 'reg'=>'/data-log=\"(.*?)\"/', 'location'=>1),
		array('name'=>'title', 'reg'=>'/<h3(.*?)>(.*?)<\/h3>/', 'location'=>2),
		array('name'=>'abstract', 'reg'=>'/<p class=\"c-line-clamp3 c-color\">(.*?)<\/p>/', 'location'=>1),
		array('name'=>'source_url', 'reg'=>'/<div class=\"c-showurl c-line-clamp1\"><span>(.*?)<\/span>/', 'location'=>1),
		array('name'=>'url', 'reg'=>'/<div class=\"c-container\"><a(.*?)class=\"c-blocka\" href=\"(.*?)\">/', 'location'=>2),
		array('name'=>'img', 'reg'=>'/<div class=\"c-img c-img-s\"><img data-imagedelaysrc=\"(.*?)\"/', 'location'=>1),
	);

	/**
	 * @param string $strQuery   Search query.
	 * @param int    $intPageNum Number of result pages to scrape.
	 */
	public function __construct($strQuery, $intPageNum=76) {
		$this->_strQuery = $strQuery;
		$this->_intPageNum = $intPageNum;
	}

	/**
	 * Run the whole job: remember the master PID, fork the workers and wait for them.
	 */
	public function execute() {
		$this->setMasterPid();
		$this->forkWorker();
		$this->monitorWorker();
	}

	/**
	 * Record the PID of the current process as the master.
	 */
	private function setMasterPid() {
		$this->masterPid = posix_getpid();
	}

	/**
	 * Set the number of worker processes.
	 *
	 * @param int $intWorkerProcess Must be greater than zero.
	 * @return false|null False when the value is rejected, null otherwise.
	 */
	public function setWorkerProcess($intWorkerProcess) {
		if ($intWorkerProcess <= 0) {
			return false;
		}
		$this->worker_process = $intWorkerProcess;
	}

	/**
	 * Set the maximum number of fetch attempts per page.
	 *
	 * @param int $intTimes Must be greater than zero.
	 * @return false|null False when the value is rejected, null otherwise.
	 */
	public function setRetryTimes($intTimes) {
		if ($intTimes <= 0) {
			return false;
		}
		$this->retry_times = $intTimes;
	}

	/**
	 * Replace the regex used to cut result blocks out of the page HTML.
	 *
	 * @param string $strReg Non-empty PCRE pattern.
	 * @return false|null False when the value is rejected, null otherwise.
	 */
	public function setRegPattern($strReg) {
		if (empty($strReg)) {
			return false;
		}
		$this->strReg = $strReg;
	}

	/**
	 * Register an additional field extractor.
	 *
	 * The pattern list is static, so the extractor applies to every instance.
	 *
	 * @param array $arrPattern array('name' => ..., 'reg' => ..., 'location' => ...).
	 * @return false|null False when the value is rejected, null otherwise.
	 */
	public function setPattern($arrPattern) {
		if (!is_array($arrPattern) || empty($arrPattern)) {
			return false;
		}
		// getNaturalResult() reads all three keys; reject incomplete entries up front.
		if (!isset($arrPattern['name'], $arrPattern['reg'], $arrPattern['location'])) {
			return false;
		}
		self::$_arrPattern[] = $arrPattern;
	}

	/**
	 * Wait (in the master process only) for every forked worker and drop the
	 * ones that exited with status 100 from the PID table.
	 */
	private function monitorWorker() {
		if ($this->masterPid !== posix_getpid()) {
			return;
		}

		foreach ($this->_arrPids as $intPid) {
			if (pcntl_waitpid($intPid, $status, WUNTRACED) <= 0) {
				continue;
			}
			// WUNTRACED also reports stopped children; the exit status is only
			// meaningful when the child actually exited.
			if (pcntl_wifexited($status) && pcntl_wexitstatus($status) === 100) {
				unset($this->_arrPids[$intPid]);
			}
		}
	}

	/**
	 * Main entry point of the crawl: fork the workers.
	 *
	 * The parent records each child PID. Each child scrapes its slice, hands
	 * the result to $dataHandler (when set) and exits with status 100.
	 */
	public function forkWorker() {

		for ($i=0; $i<$this->worker_process; ++$i) {

			$pid = pcntl_fork();

			if ($pid === -1) {
				exit;
			} elseif ($pid > 0) {
				$this->_arrPids[$pid] = $pid;
			} else {
				$arrResult = $this->run($i);
				if ($this->dataHandler) {
					call_user_func($this->dataHandler, $arrResult);
				}
				exit(100);
			}
		}
	}

	/**
	 * Scrape the slice of pages assigned to one worker.
	 *
	 * Pages are split into contiguous slices of ceil(pages / workers) pages.
	 *
	 * @param int $intWorkerId Zero-based worker index.
	 * @return array Parsed results keyed by page index; empty when nothing was scraped.
	 */
	private function run($intWorkerId) {

		$arrResult = array();

		$intPage = (int) ceil($this->_intPageNum / $this->worker_process);

		$intBegin = $intWorkerId * $intPage;

		$intEnd = min(($intWorkerId + 1) * $intPage, $this->_intPageNum);

		for ($i=$intBegin; $i<$intEnd; ++$i) {

			$strUrl = 'm.baidu.com/s?word=' . urlencode($this->_strQuery);
			if ($i > 0) {
				$strUrl .= '&pn=' . ($i * 10);
			}

			// Try again on failure, up to retry_times attempts per page.
			for ($intAttempt=0; $intAttempt<$this->retry_times; ++$intAttempt) {
				$strHtml = $this->curl($strUrl);
				$arrMatches = $this->getHtmlContent($strHtml);
				$arrNaturalResult = $this->getNaturalResult($arrMatches);
				if (!empty($arrNaturalResult)) {
					$arrResult[$i] = $arrNaturalResult;
					break;
				}
			}
		}
		return $arrResult;
	}

	/**
	 * Fetch a URL.
	 *
	 * @param string $url
	 * @return string|false Response body, or false on a transfer error so
	 *                      that the caller's retry loop can try again.
	 */
	private function curl($url) {

		$ch = curl_init();

		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_TIMEOUT, 10);
		curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);

		$result = curl_exec($ch);

		if ($result === false || curl_errno($ch)) {
			return false;
		}

		return $result;
	}

	/**
	 * Cut all result blocks out of a page.
	 *
	 * @param string|false $strHtml Page HTML.
	 * @return array|false List of matched HTML fragments (possibly empty), or
	 *                     false when the input is empty or the regex fails.
	 */
	public function getHtmlContent($strHtml) {

		if (empty($strHtml)) {
			return false;
		}

		if (preg_match_all($this->strReg, $strHtml, $arrMatches) === false) {
			return false;
		}

		return isset($arrMatches[0]) ? $arrMatches[0] : array();
	}

	/**
	 * Extract the configured fields from every result block.
	 *
	 * The 'nature_result' field is JSON-decoded after its single quotes are
	 * replaced by double quotes; every other field has its HTML tags stripped.
	 * Fields whose pattern does not match are omitted.
	 *
	 * @param array|false $arrMatches Result blocks as returned by getHtmlContent().
	 * @return array|false Extracted fields keyed by block index and field name,
	 *                     or false when the input is empty or not an array.
	 */
	public function getNaturalResult($arrMatches) {

		if (empty($arrMatches) || !is_array($arrMatches)) {
			return false;
		}

		$arrNaturalResult = array();

		foreach ($arrMatches as $key=>$div) {

			foreach (self::$_arrPattern as $arrPattern) {

				$strName = $arrPattern['name'];
				$intLocation = $arrPattern['location'];

				preg_match_all($arrPattern['reg'], $div, $arrFound);

				if (!isset($arrFound[$intLocation][0])) {
					continue;
				}

				// A plain local is used on purpose: storing the value under a
				// variable named after the field would let a user-defined field
				// name overwrite this method's own variables.
				$mixValue = $arrFound[$intLocation][0];

				if ($strName === 'nature_result') {
					$mixValue = json_decode(str_replace('\'', '"', $mixValue), true);
				} else {
					$mixValue = strip_tags($mixValue);
				}

				$arrNaturalResult[$key][$strName] = $mixValue;
			}
		}

		return $arrNaturalResult;
	}
}

调用方法:

$obj = new NaturalResultSpider($strQuery, $pageNo);

指定要抓取的搜索词(query)和要抓取的页数,最多 76 页

$obj->setWorkerProcess(4);

指定4个进程进行抓取

$obj->setRetryTimes(3);

设置每一页的最大尝试次数(抓取失败时会重新抓取,直到达到该次数)

$obj->dataHandler = 'printRes';

指定回调函数进行数据处理:每个 worker 进程抓取完成后,会以本进程的抓取结果数组为参数调用它(这里的 printRes 需要自行定义)

$obj->execute();

以上设置好之后开始运行

0
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:4614次
    • 积分:194
    • 等级:
    • 排名:千里之外
    • 原创:15篇
    • 转载:4篇
    • 译文:0篇
    • 评论:1条
    文章分类
    最新评论