小说爬虫php,PHP 爬虫爬取社区文章内容

namespace App\Console\Commands;

use Goutte\Client as GoutteClient;

use GuzzleHttp\Client as GuzzleClient;

use GuzzleHttp\Pool;

use Illuminate\Console\Command;

use Illuminate\Support\Facades\Storage;

/**
 * Artisan command that searches laravel-china.org for each given keyword,
 * follows the result links, and stores each article's text on the local
 * storage disk under a file named after the article title.
 *
 * Usage: php artisan command:spider {concurrency} {keyWords*}
 */
class Spider extends Command
{
    // {concurrency}: concurrency limit handed to the Guzzle pool.
    // {keyWords*}:   one or more search keywords.
    protected $signature = 'command:spider {concurrency} {keyWords*}';

    protected $description = 'php spider';

    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Run one search per keyword through a Guzzle pool and save every
     * result whose anchor has no `title` attribute.
     */
    public function handle()
    {
        // CLI arguments arrive as strings; the pool needs a positive integer.
        $concurrency = max(1, (int) $this->argument('concurrency'));
        $keyWords = $this->argument('keyWords');

        $guzzleClent = new GuzzleClient();
        $client = new GoutteClient();
        $client->setClient($guzzleClent);

        // Lazily yields one callable per keyword; the pool invokes each in turn.
        // NOTE(review): Goutte's request() appears to be synchronous and returns
        // a crawler rather than a promise, so the pool likely runs these searches
        // one after another regardless of the concurrency value -- confirm, and
        // consider GuzzleClient::getAsync() if real parallelism is required.
        $requests = function () use ($client, $keyWords) {
            foreach ($keyWords as $key) {
                // http_build_query() percent-encodes the keyword so spaces,
                // '&', '#' and non-ASCII text survive as a single q= value.
                $url = 'https://laravel-china.org/search?' . http_build_query(['q' => $key]);

                yield function () use ($client, $url) {
                    return $client->request('GET', $url);
                };
            }
        };

        $pool = new Pool($guzzleClent, $requests(), [
            'concurrency' => $concurrency,
            'fulfilled' => function ($response, $index) use ($client) {
                // $response is the crawler for one search-result page.
                $response->filter('h2 > a')->each(function ($node) use ($client) {
                    // Only anchors without a title attribute are processed,
                    // matching the original filter.
                    if ((string) $node->attr('title') !== '') {
                        return;
                    }

                    $link = (string) $node->attr('href');
                    if ($link === '') {
                        return; // nothing to follow
                    }

                    $this->saveArticle($client, $node->text(), $link);
                });
            },
            'rejected' => function ($reason, $index) {
                $this->error("Error is ".$reason);
            },
        ]);

        // Start crawling and block until every search has been handled.
        $promise = $pool->promise();
        $promise->wait();
    }

    /**
     * Fetch a single article page and write its text to the local disk.
     *
     * Failures are reported on the console instead of being thrown, because an
     * exception escaping the pool's fulfilled callback would be swallowed by
     * the promise chain and silently abort the remaining results.
     *
     * @param GoutteClient $client Shared scraping client.
     * @param string       $title  Article title taken from the search result.
     * @param string       $link   Article URL taken from the search result.
     */
    private function saveArticle(GoutteClient $client, $title, $link)
    {
        try {
            $body = $client->request('GET', $link)->filter('#emojify');

            // text() throws on an empty node list, so check first.
            if ($body->count() === 0) {
                $this->error("No article body found at ".$link);
                return;
            }

            Storage::disk('local')->put(
                $this->fileNameFor($title, $link),
                $body->first()->text()
            );
        } catch (\Exception $e) {
            $this->error("Failed to save ".$link.": ".$e->getMessage());
        }
    }

    /**
     * Turn a scraped title into a safe, single-segment file name.
     *
     * Path separators, characters reserved on common filesystems and control
     * characters are replaced with "_" so the title can neither create
     * sub-directories nor escape the storage root.
     *
     * @param string $title Raw article title.
     * @param string $link  Article URL, used to derive a fallback name.
     *
     * @return string
     */
    private function fileNameFor($title, $link)
    {
        // All replaced bytes are ASCII, so a byte-wise pattern is safe for UTF-8 titles.
        $name = preg_replace('#[\\\\/:*?"<>|\x00-\x1F]+#', '_', (string) $title);

        // Strip surrounding whitespace and dots so "." / ".." cannot survive.
        $name = trim((string) $name, " .\t\n\r\0\x0B");

        return $name !== '' ? $name : 'article-'.md5($link);
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值