c .php,c.php · yelstr/crawl - Gitee.com

/**

* Created by IntelliJ IDEA.

* User: yelstr

* Date: 2019/2/23

* Time: 11:48

*/

require_once __DIR__ . '/vendor/autoload.php';

// Make date() and friends use the Asia/Shanghai time zone (used for log timestamps).
ini_set('date.timezone', 'Asia/Shanghai');

// Category slug (first path segment of a movie link) => numeric category id.
// get_data_from_html() reads this map through `global $type_map` to fill
// the `second_cid` field; slugs not listed here are mapped to -1 there.
global $type_map;

$type_map = [
    'dianying'      => 1,
    'dongzuopian'   => 2,
    'kehuanpian'    => 3,
    'xijupian'      => 4,
    'kongbupian'    => 5,
    'aiqingpian'    => 6,
    'zhanzhengpian' => 7,
    'juqingpian'    => 8,
];

/**
 * Fetch a URL with an HTTP GET request through cURL and return the body.
 *
 * The connect timeout is fixed at 3 seconds; $timeout bounds the whole
 * transfer. A desktop Chrome user agent string is sent with the request.
 *
 * @param string $url     URL to request.
 * @param int    $timeout Maximum duration of the whole transfer, in seconds.
 *
 * @return string|false Response body on success; false when the cURL handle
 *                      cannot be created or the transfer fails.
 */
function http_get($url, $timeout = 30)
{
    $user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36";

    $ch = curl_init($url);
    if ($ch === false) {
        // Without a handle every curl_setopt() call below would fail; report
        // the problem the same way a failed transfer is reported.
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_CONNECTTIMEOUT => 3,
        // NOTE(review): TLS verification is switched off. VERIFYPEER=false
        // skips certificate-chain validation and VERIFYHOST=0 skips the check
        // that the certificate matches the host name. Re-enable both if this
        // is ever pointed at an https:// endpoint that must be trusted.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_CUSTOMREQUEST  => 'GET',
        CURLOPT_USERAGENT      => $user_agent,
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
    ]);

    $res = curl_exec($ch);
    curl_close($ch);

    return $res;
}

/**
 * Extract movie records from the HTML of a listing page.
 *
 * Every element matching `.movie-item` becomes one record. The link's href
 * supplies the category slug (second '/'-separated segment) and the video id
 * (the part after the first '_' of the href with surrounding slashes trimmed);
 * the image supplies the poster URL and the title; the optional `.hdtag`
 * element supplies the memo text.
 *
 * Items that lack a link, an image, or an id in the href are skipped instead
 * of aborting the whole page.
 *
 * @param string|false $res Raw page HTML, as returned by http_get().
 *
 * @return array[] List of records with the keys video_id, name, memo,
 *                 main_img, detail_url, first_cid, second_cid and price.
 */
function get_data_from_html($res)
{
    global $type_map;

    // A failed fetch yields false (or an empty body): nothing to parse.
    if (!is_string($res) || $res === '') {
        return [];
    }

    $dom = new \PHPHtmlParser\Dom();
    $dom->load($res);

    $data = [];

    foreach ($dom->find('.movie-item') as $content) {
        $a = $content->find('a')[0];
        $img = $content->find('img')[0];
        if (!$a || !$img) {
            // Malformed item: without the link or the image there is no record to build.
            continue;
        }

        $href = (string) $a->getAttribute('href');

        // Category slug is the segment right after the leading slash.
        $path_parts = explode('/', $href);
        $c_type = $path_parts[1] ?? '';

        // The video id follows the first underscore of the trimmed path.
        $id_parts = explode('_', trim($href, '/'));
        if (!isset($id_parts[1]) || $id_parts[1] === '') {
            continue;
        }
        $movie_id = $id_parts[1];

        // Look the optional tag up once and reuse the node.
        $hdtag = $content->find('.hdtag')[0];
        $movie_desc = $hdtag ? $hdtag->innerHtml : '';

        $data[] = [
            'video_id'   => $movie_id,
            'name'       => $img->getAttribute('title'),
            'memo'       => $movie_desc,
            'main_img'   => $img->getAttribute('src'),
            'detail_url' => 'http://www.bdfuli.com' . $href,
            'first_cid'  => 1,
            // Unknown slugs are recorded with category id -1.
            'second_cid' => $type_map[$c_type] ?? -1,
            'price'      => 0,
        ];
    }

    return $data;
}

/**
 * Insert one movie record into the `ymr_video` table unless a row with the
 * same video_id already exists.
 *
 * All values are sent through prepared statements: they come from scraped
 * HTML and must never be spliced into the SQL text. The connection is opened
 * per call and always closed before returning.
 *
 * @param array $data Record with the keys video_id, price, main_img, name,
 *                    detail_url, first_cid and second_cid.
 *
 * @return int|null Affected-row count of the INSERT (-1 when the INSERT
 *                  failed); null when the connection or the existence check
 *                  failed, or when the row already exists.
 */
function insert_data($data)
{
    // NOTE(review): credentials are hard-coded; consider moving them to
    // configuration outside the source tree.
    $user = 'root';
    $pwd = '123456';
    $db = 'ymr';

    $mysql = new mysqli('127.0.0.1', $user, $pwd, $db, 3306);
    if ($mysql->connect_errno) {
        return;
    }

    try {
        $video_id = (string) $data['video_id'];

        // Existence check: skip records that were stored by an earlier run.
        $check = $mysql->prepare('select 1 from ymr_video where video_id = ? limit 1');
        if ($check === false) {
            return;
        }
        $check->bind_param('s', $video_id);
        if (!$check->execute()) {
            $check->close();
            return;
        }
        $check->store_result();
        $exists = $check->num_rows > 0;
        $check->close();

        if ($exists) {
            return;
        }

        $insert = $mysql->prepare(
            'insert into ymr_video (video_id,price,main_img,`name`,detail_url,first_cid,second_cid)
             values (?,?,?,?,?,?,?)'
        );
        if ($insert === false) {
            // Mirrors mysqli's own "query failed" value for affected_rows.
            return -1;
        }

        // bind_param() takes references, so the values need to live in variables.
        // Everything is bound as a string, matching the quoted literals the
        // statement used before; MySQL converts them to the column types.
        $price = (string) $data['price'];
        $main_img = (string) $data['main_img'];
        $name = (string) $data['name'];
        $detail_url = (string) $data['detail_url'];
        $first_cid = (string) $data['first_cid'];
        $second_cid = (string) $data['second_cid'];

        $insert->bind_param(
            'sssssss',
            $video_id,
            $price,
            $main_img,
            $name,
            $detail_url,
            $first_cid,
            $second_cid
        );
        $insert->execute();
        $affected = $insert->affected_rows;
        $insert->close();

        return $affected;
    } finally {
        // Runs on every exit path above, including exceptions thrown by mysqli.
        $mysql->close();
    }
}

//$base_url = 'http://www.bdfuli.com/dianying/index_';

//

//$end = 1684;

//try {

// for($i=$end;$i>=1;$i--) {

//

// $url = sprintf("%s%d",$base_url,$i);

//

// $res = http_get($url);

//

// $data = get_data_from_html($res);

//

// foreach ($data as $one) {

// insert_data($one);

// }

// }

//} catch (\Throwable $e) {

// $str = sprintf("[%s] error_msg:%s\n",date('Y-m-d H:i:s'),$e->getMessage());

// file_put_contents(__DIR__ . '/d_log.log',$str,FILE_APPEND);

//}

/**
 * Crawls a range of listing pages in a child process and stores the movies
 * found on them.
 *
 * Constructing the object starts the crawl immediately: pages are visited
 * from $end down to $start, each page is fetched with http_get(), parsed with
 * get_data_from_html() and every record is handed to insert_data().
 * Failures on a page are appended to $log_file and the crawl moves on.
 */
class MovieCrawl
{
    /** @var string Listing URL prefix; the page number is appended to it. */
    public $base_url = 'http://www.bdfuli.com/dianying/index_';

    /** @var int Last (highest) page number to crawl; overwritten by the constructor. */
    public $end = 1684;

    /** @var int First (lowest) page number to crawl; overwritten by the constructor. */
    public $start = 1;

    /** @var string File that per-page error messages are appended to. */
    public $log_file = __DIR__ . '/runtime/d_log.log';

    /** @var int Process id of the process that created this object. */
    public $pid;

    /**
     * Store the page range and start crawling right away.
     *
     * Any \Exception raised while starting the crawl terminates the script
     * with an "ALL ERROR: ..." message.
     *
     * @param int $start First page number (inclusive).
     * @param int $end   Last page number (inclusive).
     */
    public function __construct($start, $end)
    {
        $this->end = $end;
        $this->start = $start;

        try {
            $this->pid = posix_getpid();
            $this->run();
        } catch (\Exception $e) {
            die('ALL ERROR: ' . $e->getMessage());
        }
    }

    /**
     * Launch the child process that walks the pages from $end down to $start.
     *
     * The parent does not wait for the child; it returns as soon as the
     * child has been started.
     *
     * @throws \RuntimeException When the child process cannot be started.
     */
    public function run()
    {
        $process = new swoole_process(function () {
            for ($i = $this->end; $i >= $this->start; $i--) {
                try {
                    $url = sprintf("%s%d", $this->base_url, $i);

                    $res = http_get($url);
                    if ($res === false || $res === '') {
                        // Nothing to parse; record the miss and go on with the next page.
                        $this->write_log(sprintf('fetch failed: %s', $url));
                        continue;
                    }

                    foreach (get_data_from_html($res) as $one) {
                        insert_data($one);
                    }
                } catch (\Throwable $e) {
                    // One broken page must not stop the rest of the crawl.
                    $this->write_log($e->getMessage());
                }
            }
        });

        if ($process->start() === false) {
            throw new \RuntimeException('failed to start crawl process');
        }
    }

    /**
     * Append one timestamped line to the log file, creating its directory
     * first when it does not exist yet.
     *
     * @param string $message Text to record.
     */
    private function write_log($message)
    {
        $dir = dirname($this->log_file);
        if (!is_dir($dir)) {
            mkdir($dir, 0775, true);
        }

        $str = sprintf("[%s] error_msg:%s\n", date('Y-m-d H:i:s'), $message);
        file_put_contents($this->log_file, $str, FILE_APPEND);
    }
}

// Command-line entry point: php c.php <start_page> <end_page>
// Both arguments must be positive integers with start_page <= end_page.
$page_filter = ['options' => ['min_range' => 1]];

$start_page = $argc >= 3 ? filter_var($argv[1], FILTER_VALIDATE_INT, $page_filter) : false;
$end_page = $argc >= 3 ? filter_var($argv[2], FILTER_VALIDATE_INT, $page_filter) : false;

if ($start_page === false || $end_page === false || $start_page > $end_page) {
    // Report on stderr and exit non-zero so calling scripts can detect the failure.
    fwrite(STDERR, 'input param error!' . PHP_EOL);
    exit(1);
}

// Pull the raw movie data for exactly the requested page range.
// (Previously the range was shifted to always begin at page 1.)
new MovieCrawl($start_page, $end_page);

一键复制

编辑

Web IDE

原始数据

按行查看

历史

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值