php采集豆瓣评分,记一次爬取豆瓣电影详情(PHP)

帮朋友抓取豆瓣电影详情页里的介绍信息,并封装成一个接口供他调用。废话不多说,直接上代码。

----.png

简单的爬取分为两个文件

fetch.php

require "./getfunction.php";

// Title of the movie to look up. Only suggestions whose title is exactly this
// string are accepted below.
$name = "复仇者联盟3:无限战争";

// The title contains non-ASCII characters, so it has to be URL-encoded before
// it is placed in the query string.
$url = "https://movie.douban.com/j/subject_suggest?q=".urlencode($name);

$curl = curl_init(); // start a cURL session

curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_HEADER, 0);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

// NOTE(review): both TLS checks are switched off (no certificate validation,
// no host-name validation). Kept as in the original script; enable them if
// the host has a working CA bundle.
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);

$tmpInfo = curl_exec($curl); // response body as a string, or false on failure

// Only a JSON array of suggestions is usable. A failed request or any other
// payload is treated as "no suggestions", so the script answers with the
// regular 10001 response instead of emitting warnings.
$tmpInfo = is_string($tmpInfo) ? json_decode($tmpInfo) : null;
if (!is_array($tmpInfo)) {
    $tmpInfo = [];
}

// var_dump($tmpInfo);die;

// Keep the suggestions whose title matches exactly and that carry the detail
// page URL needed for the second request.
$arrat_res = [];
foreach ($tmpInfo as $v) {
    if (is_object($v) && isset($v->title, $v->url) && $name === $v->title) {
        $arrat_res[] = $v;
    }
}

if (empty($arrat_res)) {
    $data = [
        "code"=>10001,
        "msg"=>"暂无片源信息"
    ];

    echo json_encode($data);die;
}

// Detail-page URL taken from the first matching suggestion.
$url2 = $arrat_res[0]->url;

// Second request on the same handle, configured with the same options as the
// first one (TLS verification is still disabled here).
curl_setopt_array($curl, [
    CURLOPT_URL => $url2,
    CURLOPT_HEADER => 0,
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_SSL_VERIFYPEER => false,
    CURLOPT_SSL_VERIFYHOST => false,
]);

$tmpInfo2 = curl_exec($curl); // HTML of the detail page, or false on failure

// On a failed (or empty) response, print the cURL diagnostics and stop.
if (!$tmpInfo2) {
    $errorNumber = curl_errno($curl);
    echo "
cURL error number:" . $errorNumber;

    $errorText = curl_error($curl);
    echo "
cURL error:" . $errorText;

    exit;
}

// Parse the downloaded page into a DOM tree. Markup problems are collected by
// libxml internally and then discarded, rather than being hidden with the @
// operator; the previous libxml setting is restored afterwards.
$dom = new DOMDocument();

$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($tmpInfo2);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// Normalize the document.
$dom->normalize();

// XPath helper used for every query below.
$xpath = new DOMXPath($dom);

// Directors: the text of every link under #info/span[1]/span[2], joined with
// commas (empty string when there are none).
$directors = $xpath->evaluate("//*[@id='info']/span[1]/span[2]/a/text()");

$directorNames = [];
foreach ($directors as $directorNode) {
    $directorNames[] = $directorNode->nodeValue;
}

$directors_res = implode(",", $directorNames);

// Each field below falls back to an empty string when its node is missing, so
// the variables always hold strings (never a DOMNodeList) when they are later
// concatenated and JSON-encoded.

// Title
$nameNodes = $xpath->evaluate("//*[@id='content']/h1/span[1]/text()");
$name = $nameNodes->length > 0 ? $nameNodes->item(0)->nodeValue : "";

// Year
$yearNodes = $xpath->evaluate("//*[@id='content']/h1/span[2]/text()");
$years = $yearNodes->length > 0 ? $yearNodes->item(0)->nodeValue : "";

// Poster; the image element is at //*[@id="mainpic"]/a/img
$imgNodes = $xpath->evaluate("//*[@id='mainpic']/a/img/@src");
$img = $imgNodes->length > 0 ? $imgNodes->item(0)->nodeValue : "";

// var_dump($img);die;

// Release status, read from //*[@id="interest_sectl"]/div/div[2]/div/div[2]:
//   1 = the node's text is "尚未上映" (not yet released)
//   2 = anything else, including a missing node
// Defaulting to 2 keeps $is_on an integer even when the node is absent, so the
// later `$is_on == 1` check never compares against a DOMNodeList.
$is_on = 2;

$statusNodes = $xpath->evaluate("//*[@id='interest_sectl']/div/div[2]/div/div[2]");

if ($statusNodes->length > 0 && trim($statusNodes->item(0)->nodeValue) === "尚未上映") {
    $is_on = 1;
}

// var_dump($is_on);die;

// Screenwriters: the text of every link under #info/span[2]/span[2], joined
// with commas (empty string when there are none).
$screenwriters = $xpath->evaluate("//*[@id='info']/span[2]/span[2]/a/text()");

$screenwriterNames = [];
foreach ($screenwriters as $screenwriterNode) {
    $screenwriterNames[] = $screenwriterNode->nodeValue;
}

$screenwriters_res = implode(",", $screenwriterNames);

// Cast: the string value of each node matched at #info/span[3]/span[2],
// joined with commas. For reference, a single actor link sits at
// //*[@id="info"]/span[3]/span[2]/span[1]/a
$actors = $xpath->query("//*[@id='info']/span[3]/span[2]");

$actorTexts = [];
foreach ($actors as $actorNode) {
    $actorTexts[] = $actorNode->nodeValue;
}

$actors_res = implode(",", $actorTexts);

// $types = $xpath->query("//*[@id='info']/span[30]");

// var_dump($types->item(0)->nodeValue);die;

// Genres: collect the #info spans starting at position 5 until the
// "制片国家/地区:" (country/region) label. $num is the span position that
// getRes reports back for the label it found.
$getfunction = new getFunction();

$sear_res = $getfunction->getRes(5,"制片国家/地区:",$xpath);

$num = $sear_res["num"];
$types_res = $sear_res["res"];

// Bare text nodes directly under #info. Only those containing at least one
// character in the range U+4E00..U+9FA5 are kept, in document order; they are
// later interpreted by position (country, language, ...).
$attr = [];

$langs = $xpath->evaluate("//*[@id='info']/text()");

foreach ($langs as $textNode) {
    $text = $textNode->nodeValue;

    if (preg_match('/[\x{4e00}-\x{9fa5}]/u', $text) > 0) {
        $attr[] = $text;
    }
}

// var_dump($attr);die;

// if (count($attr) == 3) {

//   // code...

// }

// getRes() reports "num" as an empty string when it finds no label. The
// position is cast to int before doing arithmetic on it: "" + 4 raises a
// TypeError on PHP 8, while (int)"" + 4 yields the same start index (4) that
// PHP 7 produced with a warning.
if ($is_on == 1) {
    // Not yet released: there is no runtime, so collect the spans up to the
    // "又名:" (also known as) label as the release-date text.
    $show_res = "";

    $sear2_res = $getfunction->getRes((int)$num + 4,"又名:",$xpath);
    $time_res = $sear2_res["res"];
    $num = $sear2_res["num"];
}else{
    // Release date(s): spans up to the "片长:" (runtime) label.
    $sear2_res = $getfunction->getRes((int)$num + 4,"片长:",$xpath);
    $time_res = $sear2_res["res"];
    $num = $sear2_res["num"];

    // Runtime: spans after that label, up to the "又名:" label.
    $sear3_res = $getfunction->getRes((int)$num + 1,"又名:",$xpath);
    $show_res = $sear3_res["res"];
    $num = $sear3_res["num"];
}

// Map the collected #info text nodes onto fields by position. Entries that
// are missing become empty strings instead of triggering undefined-offset
// warnings.
$country = isset($attr[0]) ? $attr[0] : "";
$languages = isset($attr[1]) ? $attr[1] : "";

if (count($attr) == 4) {
    // With four entries, the third one is extra runtime text that is appended
    // to the runtime, and the alias moves to the fourth position.
    $show_res = $show_res.$attr[2];
    $byname = $attr[3];
}else{
    $byname = isset($attr[2]) ? $attr[2] : "";
}

// Links directly under #info. When a second link exists, the first is taken
// as the official website and the second as the IMDb link; with only one
// link, it is taken as the IMDb link and the website is left empty.
// Both values default to "" so the JSON output never contains a DOMNodeList.
$firstLinkNodes = $xpath->evaluate("//*[@id='info']/a[1]/@href");
$firstLink = $firstLinkNodes->length > 0 ? $firstLinkNodes->item(0)->nodeValue : "";

$secondLinkNodes = $xpath->evaluate("//*[@id='info']/a[2]/@href");

if ($secondLinkNodes->length > 0) {
    $urlim = $secondLinkNodes->item(0)->nodeValue;
    $urls = $firstLink;
}else{
    $urls = "";
    $urlim = $firstLink;
}

// Assemble the scraped fields in the order in which they appear in the output.
$final_res = [];
$final_res["all_name"] = $name.$years;
$final_res["name"] = $name;
$final_res["year"] = $years;
$final_res["img"] = $img;
$final_res["directors"] = $directors_res;
$final_res["screenwriters"] = $screenwriters_res;
$final_res["actors"] = $actors_res;
$final_res["types"] = $types_res;
$final_res["web_url"] = $urls;
$final_res["country"] = $country;
$final_res["languages"] = $languages;
$final_res["ontime"] = $time_res;
$final_res["showtime"] = $show_res;
$final_res["byname"] = $byname;
$final_res["imbd"] = $urlim;

// Success envelope: code 0, a message, and the data payload.
$return = [];
$return["code"] = 0;
$return["msg"] = "抓取成功";
$return["data"] = $final_res;

echo json_encode($return);

getfunction.php

/**
 * Helper for reading the <span> children of the element with id "info".
 */
class getFunction
{
    /**
     * Walks span positions $start .. 29 under the element with id "info" and
     * joins their text with commas until a marker label is reached.
     *
     * A span counts as a marker when its text equals $key or "官方网站:"
     * (official website). Once a marker has been seen, nothing more is
     * collected, but the scan continues and every later marker overwrites
     * the reported position.
     *
     * @param int|string $start Span position (as used in the XPath predicate) to start at.
     * @param string     $key   Label text that ends the collection.
     * @param DOMXPath   $xpath XPath object bound to the parsed page.
     *
     * @return array "res" => collected text, "num" => position of the last
     *               marker seen, or "" when no marker was found.
     */
    public static function getRes($start,$key,$xpath)
    {
        $collected = "";
        $markerPosition = "";

        // $key = "官方网站:";

        for ($position = $start; $position < 30; $position++) {
            $nodes = $xpath->query("//*[@id='info']/span[" . $position . "]");

            // No span at this position.
            if (empty($nodes->length)) {
                continue;
            }

            $text = $nodes->item(0)->nodeValue;

            // Marker labels only update the reported position.
            if ($text == $key || $text == "官方网站:") {
                $markerPosition = $position;
                continue;
            }

            // Anything after the first marker is ignored.
            if (!empty($markerPosition)) {
                continue;
            }

            // The span at the start position opens the result; later ones
            // are appended with a comma.
            $collected = ($position != $start) ? $collected . "," . $text : $text;
        }

        return ["res" => $collected, "num" => $markerPosition];
    }
}

效果图

QQ--20180611110823.png

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值