** 不使用PhantomJs多线程采集,左转tp5使用curl特性进行定时多线程爬虫(或任务),使用redis队列
重要扩展
pcntl(推荐使用linux系统)
redis扩展
并且需要配置好 MySQL、Redis 服务器
以及扩展程序
# phantomjs 需要使用其二进制可执行文件。
# 注意:如果使用了文件,文件读写会产生阻塞,多进程实际上会退化为单进程
composer require jaeger/querylist
composer require jaeger/querylist-phantomjs
# 可以不用querylist-curl-multi
composer require jaeger/querylist-curl-multi
composer require topthink/think-queue
主要程序
<?php
namespace app\index\controller;
use QL\Ext\CurlMulti;
use QL\Ext\PhantomJs;
use QL\QueryList;
use think\Db;
use think\Queue;
/**
 * Multi-process crawler: forks workers with pcntl, scrapes one page per id
 * and pushes every new row onto the "PaChongShuJu" queue.
 *
 * Run from the CLI:
 *   /vhs/php/php71/bin/php think do2
 * Run the queue listener:
 *   /vhs/php/php71/bin/php think queue:work --queue PaChongShuJu --daemon
 *   /vhs/php/php71/bin/php think queue:listen --queue PaChongShuJu
 */
class Pcntl
{
    /**
     * Fork the worker processes, hand each one a slice of the page-id range
     * and wait until all of them have finished.
     *
     * @return void
     */
    public function do()
    {
        $end = 15399999;
        $bengin = 15000949;
        $max = $end - $bengin;
        $workers = 200; // number of worker processes

        // Slice width, rounded up so that the slices together cover the whole range.
        $step = (int) ceil($max / $workers);

        $pids = [];
        for ($i = 0; $i < $workers; $i++) {
            $pids[$i] = pcntl_fork();
            switch ($pids[$i]) {
                case -1:
                    echo "fork error : {$i} \r\n";
                    exit;
                case 0:
                    // Child process: crawl this worker's slice, then terminate so
                    // the child never continues the parent's fork loop.
                    $param = [
                        'lastid' => $step * $i,
                        'maxid' => $step * ($i + 1),
                        // Page ids are offset from the START of the range. The
                        // exclusive upper bound is clamped because the rounded-up
                        // slice width can overshoot the last page id.
                        'url_begen' => $bengin + $step * $i,
                        'url_end' => min($bengin + $step * ($i + 1), $end + 1),
                    ];
                    $this->executeWorker($param);
                    exit;
                default:
                    // Parent process: keep forking.
                    break;
            }
        }

        // Reap every child that was started.
        foreach ($pids as $pid) {
            if ($pid) {
                pcntl_waitpid($pid, $status);
            }
        }
    }

    /**
     * Crawl every page id in the half-open range [url_begen, url_end).
     *
     * @param array $param Slice description; reads 'url_begen' (inclusive) and 'url_end' (exclusive).
     * @return void
     */
    public function executeWorker($param)
    {
        for ($page = $param['url_begen']; $page < $param['url_end']; $page++) {
            $url = 'http://www.whnews.cn/newss/node_' . $page . '.html';
            $this->startnojs($url, $page);
        }
    }

    /**
     * Fetch one page, extract the fields with regular expressions and queue
     * the row when the company is not stored in `pachong2` yet.
     *
     * @param string $url Page URL to fetch.
     * @param int    $i   Page id, stored in the row as 'page'.
     * @return void
     */
    public function startnojs($url, $i)
    {
        $ql = QueryList::getInstance();
        $ql->use(CurlMulti::class);
        // PhantomJs emulates a browser so that page JavaScript runs
        // (some pages do not need it).
        $ql->use(PhantomJs::class, '/home/lxx/phantomjs-2.1.1-linux-x86_64/bin/phantomjs');
        $html = $ql->browser($url)->getHtml();

        preg_match("/alt=\"(.*?)\" onload=/", $html, $m);
        preg_match("/f_rb\">(.*?)<\/span>/", $html, $m2);
        preg_match("/人:<\/label> <span>(.*?)<\/span><\/li>/", $html, $m3);
        preg_match("/址:<\/label> <span>(.*?)<a/", $html, $m4);

        // Without a company name there is nothing worth queueing.
        if (!isset($m[1])) {
            return;
        }

        $data = [
            'compay' => $m[1],
            'mobile' => $m2[1] ?? '',
            'name' => $m3[1] ?? '',
            'address' => $m4[1] ?? '',
            'page' => $i,
        ];

        $check = Db::table('pachong2')->where('compay', $m[1])->count();
        if ($check < 1) {
            $this->push($data);
        }
    }

    /**
     * Push one scraped row onto the "PaChongShuJu" queue as a JSON string.
     *
     * @param array $data Row to enqueue.
     * @return void
     */
    public function push($data = [])
    {
        $jobData = json_encode($data);
        $jobHandlerClassName = 'app\index\controller\Job';
        $jobQueueName = "PaChongShuJu";

        $isPushed = Queue::push($jobHandlerClassName, $jobData, $jobQueueName);
        if ($isPushed) {
            echo "ok";
        } else {
            dump($isPushed);
        }
    }
}
队列文件
自行配置Command文件和控制器
<?php
namespace app\index\controller;
use think\Db;
use think\queue\Job as QueueJob;
/**
 * Queue consumer that stores rows produced by the crawler in `pachong2`.
 */
class Job
{
    /**
     * Handle one queued job.
     *
     * @param QueueJob $job  The queue job being processed.
     * @param mixed    $data Payload as pushed by the producer (a JSON string or an array).
     * @return void
     */
    public function fire(QueueJob $job, $data)
    {
        // Give up on a job that keeps failing. The check runs BEFORE the work
        // so a job past three attempts is dropped rather than retried again.
        if ($job->attempts() > 3) {
            $job->delete();
            return;
        }

        $this->add_db($data);

        // Delete the job once it succeeded, otherwise the queue keeps
        // re-running it until the retry limit is hit and failed() is called.
        $job->delete();
        // The job could also be re-published with a delay:
        // $job->release($delay);
    }

    /**
     * Called once the job has reached the maximum number of retries.
     *
     * @param mixed $data Payload of the failed job.
     * @return void
     */
    public function failed($data)
    {
        // Intentionally empty.
    }

    /**
     * Insert the row into `pachong2` unless a row with the same `compay` exists.
     *
     * @param array|string $data Row as an array, a JSON string, or a JSON string
     *                           that was JSON-encoded a second time.
     * @return void
     */
    public function add_db($data = [])
    {
        $data = $this->decodePayload($data);

        // An undecodable payload or one without a company cannot be
        // de-duplicated, so it is not inserted.
        if (isset($data['compay'])) {
            $count = Db::table('pachong2')->where('compay', $data['compay'])->count();
            if ($count == 0) {
                Db::table('pachong2')->insert($data);
            }
        }
        dump($data);
    }

    /**
     * Turn a payload into an associative array; returns [] when it cannot be decoded.
     *
     * @param mixed $payload Array, object, or (possibly double-encoded) JSON string.
     * @return array
     */
    private function decodePayload($payload)
    {
        // At most two passes: the producer JSON-encodes the row, and earlier
        // callers of add_db() encoded that string a second time.
        for ($pass = 0; $pass < 2 && is_string($payload); $pass++) {
            $payload = json_decode($payload, true);
        }
        if (is_object($payload)) {
            $payload = (array) $payload;
        }
        return is_array($payload) ? $payload : [];
    }
}
其他
单进程PhantomJs采集
<?php
namespace app\index\controller;
use QL\Ext\CurlMulti;
use QL\Ext\PhantomJs;
use QL\QueryList;
use think\Db;
/**
 * Single-process PhantomJs crawler plus a spreadsheet export action.
 */
class Index
{
    /**
     * Print the PHP configuration.
     *
     * NOTE(review): phpinfo() exposes server details — confirm this action is
     * not reachable in production.
     *
     * @return void
     */
    public function index()
    {
        phpinfo();
    }

    /**
     * Stream `tp_logo_content` as a tab-separated download that Excel opens.
     *
     * @return void
     */
    public function more_openexcel()
    {
        header("Content-type:application/vnd.ms-excel");
        // The disposition type is spelled "attachment".
        header("Content-Disposition:attachment;filename=Haoyunyun_" . date("Ymd") . ".xls");

        // Header row.
        $ReportArr = [['ID', '名称', '地址', '电话', 'p']];
        // Read the table in batches of 100 rows.
        Db::table('tp_logo_content')->chunk(100, function ($datas) use (&$ReportArr) {
            foreach ($datas as $data) {
                $ReportArr[] = [$data['id'], $data['title'], $data['keyword'], $data['content'], $data['thumbsup']];
            }
        });

        $ReportContent = '';
        foreach ($ReportArr as $row) {
            foreach ($row as $cell) {
                // Double embedded quotes so a quote inside a value cannot end the cell early.
                $ReportContent .= '"' . str_replace('"', '""', (string) $cell) . '"' . "\t";
            }
            $ReportContent .= "\n";
        }

        // die() ends the request so nothing else is appended to the download.
        die($ReportContent);
    }

    /**
     * Crawl every page id from 15000949 to 15399999 (inclusive) in one process.
     * Example page: http://www.whnews.cn/newss/node_15000000.html
     *
     * @return void
     */
    public function do()
    {
        for ($i = 15000949; $i <= 15399999; $i++) {
            $url = 'http://www.whnews.cn/newss/node_' . $i . '.html';
            $this->startnojs($url, $i);
        }
    }

    /**
     * Fetch one page, extract the fields with regular expressions and insert
     * the row into `pachong` when the company is not stored yet.
     *
     * Run from the CLI:
     *   /vhs/php/php71/bin/php think do
     *
     * @param string $url Page URL to fetch.
     * @param int    $i   Page id, stored in the row as 'page'.
     * @return void
     */
    public function startnojs($url, $i)
    {
        $ql = QueryList::getInstance();
        $ql->use(CurlMulti::class);
        $ql->use(PhantomJs::class, '/home/lxx/phantomjs-2.1.1-linux-x86_64/bin/phantomjs');
        $html = $ql->browser($url)->getHtml();

        preg_match("/alt=\"(.*?)\" onload=/", $html, $m);
        preg_match("/f_rb\">(.*?)<\/span>/", $html, $m2);
        preg_match("/人:<\/label> <span>(.*?)<\/span><\/li>/", $html, $m3);
        preg_match("/址:<\/label> <span>(.*?)<a/", $html, $m4);

        // Without a company name there is nothing worth storing.
        if (!isset($m[1])) {
            return;
        }

        $data = [
            'compay' => $m[1],
            'mobile' => $m2[1] ?? '',
            'name' => $m3[1] ?? '',
            'address' => $m4[1] ?? '',
            'page' => $i,
        ];

        // The row goes straight to the database here, not through the redis queue.
        $check = Db::table('pachong')->where('compay', $m[1])->count();
        if ($check < 1) {
            Db::table('pachong')->insert($data);
            dump($data);
        }
    }
}