/**
* Created by IntelliJ IDEA.
* User: yelstr
* Date: 2019/2/23
* Time: 11:48
*/
require_once __DIR__ . '/vendor/autoload.php';
// Run every date() call in this script (e.g. the log timestamps) in the Asia/Shanghai timezone.
ini_set('date.timezone','Asia/Shanghai');
// Lookup table used by get_data_from_html(): the first path segment of a movie
// link is the key, and the mapped number is stored as the record's second_cid.
// Segments missing from this table are stored as -1 by that function.
// The `global` statement keeps the table in the global scope even if this file
// is included from inside a function.
// NOTE(review): the keys look like pinyin category slugs (movie, action, sci-fi,
// comedy, horror, romance, war, drama) — confirm against the site and the category table.
global $type_map;
$type_map = [
'dianying' => 1,
'dongzuopian' => 2,
'kehuanpian' => 3,
'xijupian' => 4,
'kongbupian' => 5,
'aiqingpian' => 6,
'zhanzhengpian' => 7,
'juqingpian' => 8,
];
/**
 * Download a URL with cURL using a GET request and a desktop-browser User-Agent.
 *
 * The connection phase is limited to 3 seconds. TLS peer and host verification
 * are both switched off, so HTTPS responses are NOT authenticated.
 *
 * @param string $url     Address to request.
 * @param int    $timeout Upper bound for the whole transfer, in seconds.
 *
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function http_get($url,$timeout=30) {
    $handle = curl_init($url);

    curl_setopt_array($handle, [
        CURLOPT_CONNECTTIMEOUT => 3,
        // Certificate chain is not validated.
        CURLOPT_SSL_VERIFYPEER => false,
        // Certificate host name is not matched against the requested host.
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_CUSTOMREQUEST  => 'GET',
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36",
        // Hand the body back as a string instead of echoing it.
        CURLOPT_RETURNTRANSFER => true,
    ]);

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
/**
 * Extract movie records from the HTML of a listing page.
 *
 * Each element matching `.movie-item` produces one record. The item's first
 * link supplies the detail URL, the category segment and the video id; its
 * first image supplies the poster URL and the title; an optional `.hdtag`
 * element supplies the memo text.
 *
 * Items that lack a link, an image, or an id part after the first underscore
 * of the link are skipped, so one malformed item no longer aborts the page.
 *
 * @param string $res Raw HTML of a listing page.
 *
 * @return array[] List of records with the keys video_id, name, memo,
 *                 main_img, detail_url, first_cid, second_cid and price.
 */
function get_data_from_html($res)
{
    global $type_map;

    $dom = new \PHPHtmlParser\Dom();
    $dom->load($res);

    $data = [];
    foreach ($dom->find('.movie-item') as $content) {
        $a = $content->find('a')[0];
        $img = $content->find('img')[0];
        if (!$a || !$img) {
            // Without a link or an image there is nothing usable to store.
            continue;
        }

        $href = (string) $a->getAttribute('href');

        // The first path segment of the link is the category slug; fall back
        // to '' (mapped to -1 below) when the link has no such segment.
        $c_type = explode('/', $href)[1] ?? '';

        // The video id is whatever sits between the first and the second
        // underscore of the slash-trimmed link.
        $movie_id = explode('_', trim($href, '/'))[1] ?? '';
        if ($movie_id === '') {
            // insert_data() de-duplicates on video_id, so a record without one is useless.
            continue;
        }

        $hd_tag = $content->find('.hdtag')[0];
        $movie_desc = $hd_tag ? $hd_tag->innerHtml : '';

        $data[] = [
            'video_id' => $movie_id,
            'name' => $img->getAttribute('title'),
            'memo' => $movie_desc,
            'main_img' => $img->getAttribute('src'),
            'detail_url' => 'http://www.bdfuli.com' . $href,
            'first_cid' => 1,
            // Unknown categories are stored as -1.
            'second_cid' => $type_map[$c_type] ?? -1,
            'price' => 0,
        ];
    }

    return $data;
}
/**
 * Store one scraped movie in `ymr_video` unless a row with the same video_id
 * already exists.
 *
 * Both statements are prepared with bound parameters because the values come
 * from scraped HTML: a title containing a quote would otherwise break (or
 * inject into) the SQL text. The connection is opened per call and is always
 * closed again before returning.
 *
 * @param array $data Record with the keys video_id, price, main_img, name,
 *                    detail_url, first_cid and second_cid.
 *
 * @return int|null Affected-row count reported for the INSERT (-1 when the
 *                  INSERT failed); null when the connection or the lookup
 *                  failed, or when the video is already stored.
 */
function insert_data($data)
{
    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    $user = 'root';
    $pwd = '123456';
    $db = 'ymr';

    $mysql = new mysqli('127.0.0.1', $user, $pwd, $db, 3306);
    if ($mysql->connect_errno) {
        // Only reachable when mysqli is not in exception mode.
        return;
    }

    try {
        // bind_param() binds by reference, so every value needs its own variable.
        // Everything is sent as a string, exactly as the quoted "%s" values were before.
        $video_id = (string) $data['video_id'];
        $price = (string) $data['price'];
        $main_img = (string) $data['main_img'];
        $name = (string) $data['name'];
        $detail_url = (string) $data['detail_url'];
        $first_cid = (string) $data['first_cid'];
        $second_cid = (string) $data['second_cid'];

        $lookup = $mysql->prepare('select 1 from ymr_video where video_id = ? limit 1');
        if ($lookup === false) {
            return;
        }
        $lookup->bind_param('s', $video_id);
        // store_result() buffers the rows so num_rows is available without mysqlnd.
        $lookup_ok = $lookup->execute() && $lookup->store_result();
        $already_stored = $lookup_ok && $lookup->num_rows > 0;
        $lookup->close();
        if (!$lookup_ok || $already_stored) {
            return;
        }

        $insert = $mysql->prepare(
            'insert into ymr_video (video_id,price,main_img,`name`,detail_url,first_cid,second_cid)
            values (?,?,?,?,?,?,?)'
        );
        if ($insert === false) {
            // Same value mysqli reports for a failed statement.
            return -1;
        }
        $insert->bind_param(
            'sssssss',
            $video_id,
            $price,
            $main_img,
            $name,
            $detail_url,
            $first_cid,
            $second_cid
        );
        $insert->execute();
        $affected = $insert->affected_rows;
        $insert->close();

        return $affected;
    } finally {
        // Runs on every exit path, including exceptions thrown in exception mode.
        $mysql->close();
    }
}
//$base_url = 'http://www.bdfuli.com/dianying/index_';
//
//$end = 1684;
//try {
// for($i=$end;$i>=1;$i--) {
//
// $url = sprintf("%s%d",$base_url,$i);
//
// $res = http_get($url);
//
// $data = get_data_from_html($res);
//
// foreach ($data as $one) {
// insert_data($one);
// }
// }
//} catch (\Throwable $e) {
// $str = sprintf("[%s] error_msg:%s\n",date('Y-m-d H:i:s'),$e->getMessage());
// file_put_contents(__DIR__ . '/d_log.log',$str,FILE_APPEND);
//}
/**
 * Crawls numbered listing pages in a child process created through
 * swoole_process and stores every movie found on them.
 *
 * Constructing the object starts the crawl right away. Pages are visited from
 * $end down to $start; a failure on one page is written to the log file and
 * the crawl continues with the next page.
 *
 * NOTE(review): the parent never waits for the child process, so the script
 * presumably returns while the crawl is still running — confirm this is intended.
 */
class MovieCrawl
{
    /** @var string URL prefix; the page number is appended to it. */
    public $base_url = 'http://www.bdfuli.com/dianying/index_';

    /** @var int Highest page number to visit (visited first). */
    public $end = 1684;

    /** @var int Lowest page number to visit (visited last). */
    public $start = 1;

    /** @var string File that receives one line per failed page. */
    public $log_file = __DIR__ . '/runtime/d_log.log';

    /** @var int Process id of the process that created this object. */
    public $pid;

    /**
     * Remember the page range and start crawling immediately.
     *
     * @param int $start Lowest page number to visit.
     * @param int $end   Highest page number to visit.
     */
    public function __construct($start,$end)
    {
        $this->end = $end;
        $this->start = $start;
        try {
            $this->pid = posix_getpid();
            $this->run();
        }catch (\Exception $e){
            die('ALL ERROR: '.$e->getMessage());
        }
    }

    /**
     * Start the child process that walks the pages from $end down to $start.
     *
     * @return void
     */
    public function run()
    {
        $process = new swoole_process(function(){
            for($i=$this->end;$i>=$this->start;$i--) {
                try {
                    $url = sprintf("%s%d",$this->base_url,$i);
                    $res = http_get($url);
                    if ($res === false || $res === '') {
                        // Report the failed download with its URL instead of
                        // handing a non-document to the HTML parser.
                        throw new \RuntimeException('empty response from ' . $url);
                    }
                    $data = get_data_from_html($res);
                    foreach ($data as $one) {
                        insert_data($one);
                    }
                } catch (\Throwable $e) {
                    $this->write_log($e->getMessage());
                }
            }
        });
        $process->start();
    }

    /**
     * Append one timestamped error line to the log file, creating the log
     * directory first when it does not exist yet.
     *
     * @param string $message Error text to record.
     *
     * @return void
     */
    private function write_log($message)
    {
        $dir = dirname($this->log_file);
        if (!is_dir($dir)) {
            // Without the directory file_put_contents() fails and the error is lost.
            mkdir($dir, 0777, true);
        }
        $str = sprintf("[%s] error_msg:%s\n",date('Y-m-d H:i:s'),$message);
        file_put_contents($this->log_file,$str,FILE_APPEND);
    }
}
// CLI entry point. Usage: php <this script> <start_page> <end_page>
if ($argc < 3) {
    exit('input param error!');
}
$start_page = (int) $argv[1];
$end_page = (int) $argv[2];
// Reject non-numeric arguments (cast to 0) and inverted ranges.
if ($start_page < 1 || $end_page < $start_page) {
    exit('input param error!');
}
// Fetch the raw movie data for the requested page range.
// The range used to be collapsed to 1..(end - start + 1), which ignored
// start_page whenever it was not 1.
new MovieCrawl($start_page, $end_page);
// (Stray web-page UI labels — copy, edit, Web IDE, raw, blame, history — were pasted here; they are not part of the script.)