Tp5+QueryList简单爬取,图片本地下载
直接上代码
/**
 * Scrape one listing page of chinapp.com brands, download each brand's images
 * and bulk-insert the brands that are not stored yet.
 *
 * Flow: take the page number from the `id` request parameter, collect the
 * detail links from the listing page, scrape every detail page once, skip
 * brands whose title already exists, download the images, then insert all
 * remaining rows in a single call. Progress messages are echoed directly.
 *
 * @param Request $request current request; its `id` parameter selects the listing page
 * @return string|null '数据已存在' when every scraped brand is already stored,
 *                     '图片下载失败' when a company image cannot be saved,
 *                     null otherwise
 */
public function index(Request $request)
{
    $page = $request->param('id');

    // Listing page to scrape.
    $url = "https://www.chinapp.com/pinpai/zhuangxiujiancai-0-0-$page";

    // Rules for the listing page: one entry per brand card.
    $rule_one = [
        'href'      => ['.company_intro_shop>a', 'href'], // detail page address
        'introduce' => ['.company_intro', 'text'],        // company introduction
    ];

    // Rules for the detail page.
    $rule_two = [
        'title'       => ['.pjct_xm_intro>a', 'text'],          // brand title
        'company'     => ['.company', 'text'],                  // owning company
        'product'     => ['.zypro>span', 'text'],               // main products
        'company_img' => ['.pjct_xm_imgs>img', 'src'],          // company image
        'details'     => ['.brd_jx>.brd_jx_intro>p', 'text'],   // detailed description
        'details_img' => ['.brd_jx_imgs>img', 'src'],           // description image
    ];

    $listing = QueryList::get($url)->rules($rule_one)->range('.pplb_item')->queryData();

    // Detail links on the listing page are site-relative.
    $base_url = "https://www.chinapp.com";
    $fact_selector = '.newAdd_pjct_xm_intro>li';

    // Initialised up front so an empty listing cannot reach count()/insertAll() undefined.
    $warehousing = [];
    $duplicates = 0;

    foreach ($listing as $value)
    {
        $true_url = $base_url . $value['href'];

        // Fetch the detail page once and reuse the document for both the
        // rule-based fields and the positional <li> facts.
        $ql = QueryList::get($true_url);
        // NOTE(review): the field access below assumes queryData() yields a flat
        // field => value map here - confirm against the installed QueryList version.
        $arr = $ql->rules($rule_two)->queryData();

        $arr['representative'] = $ql->find($fact_selector)->eq(0)->text(); // legal representative
        $arr['tel']            = $ql->find($fact_selector)->eq(1)->text(); // telephone
        $arr['address']        = $ql->find($fact_selector)->eq(2)->text(); // brand origin
        $arr['bct']            = $ql->find($fact_selector)->eq(3)->text(); // brand founding date

        // De-duplicate on the title of the row being processed, before it is queued.
        // NOTE(review): Db::table() carries no table name in this source - supply the real one.
        if (!empty(Db::table()->where('title', $arr['title'])->select()->toArray()))
        {
            $duplicates++;
            continue;
        }

        // The substr() offsets are byte counts that cut a fixed label off the
        // front of each scraped text (presumably UTF-8 "label:" prefixes - verify
        // against the live markup).
        $warehousing[] = [
            'uid'              => 1,
            'introduce'        => $value['introduce'],                 // company introduction
            'title'            => $arr['title'],                       // brand title
            'company'          => substr($arr['company'], 15),         // owning company
            'product'          => $arr['product'],                     // main products
            'company_img'      => strrchr($arr['company_img'], '/'),   // local name: last URL segment
            'fake_company_img' => $arr['company_img'],                 // original remote address
            'details'          => $arr['details'],                     // detailed description
            'details_img'      => strrchr($arr['details_img'], '/'),   // local name: last URL segment
            'fake_details_img' => $arr['details_img'],                 // original remote address
            'representative'   => substr($arr['representative'], 18),  // legal representative
            'tel'              => substr($arr['tel'], 18),             // telephone
            'address'          => substr($arr['address'], 18),         // brand origin
            'bct'              => substr($arr['bct'], 21),             // brand founding date
            'create_time'      => date('Y-m-d H:i:s')
        ];
    }

    // Everything found on this page is already stored.
    if (empty($warehousing) && $duplicates > 0)
    {
        return '数据已存在';
    }

    $con = count($warehousing);
    echo "一共".$con."条数据";
    echo "<br />";

    // Nothing was scraped at all: there is nothing to download or insert.
    if ($con === 0)
    {
        return;
    }

    foreach ($warehousing as $row)
    {
        // Every company image must be saved, not just the last one in the batch.
        $saved = $this->getImage($row['fake_company_img'], 'public/zhuangxiu', $row['company_img']);
        if ($saved['error'] != 0)
        {
            return '图片下载失败';
        }
        // The description image stays best-effort: its result is not checked.
        $this->getImage($row['fake_details_img'], 'public/zhuangxiu', $row['details_img']);
    }

    $res = Db::table()
        ->insertAll($warehousing);
    echo "入库成功".$res."条数据";
    echo "<br />";
}
/**
 * Download a remote image and save it into a local directory.
 *
 * Error codes in the returned array:
 *   0 success
 *   1 empty URL
 *   3 unsupported extension (only checked when no file name is supplied)
 *   5 target directory could not be created
 *   6 download failed or returned no data
 *   7 local file could not be written
 *
 * @param string $url      address of the file to fetch
 * @param string $save_dir target directory; the current directory when blank
 * @param string $filename name to save under; when blank, "<unix time><ext>" is
 *                         built from the URL's extension (two nameless downloads
 *                         in the same second therefore share one name)
 * @param int    $type     truthy = fetch with cURL, falsy = fetch through PHP's stream wrappers
 * @return array ['file_name' => string, 'save_path' => string, 'error' => int]
 */
function getImage($url,$save_dir='',$filename='',$type=0)
{
    $failure = function ($code) {
        return ['file_name' => '', 'save_path' => '', 'error' => $code];
    };

    if (trim((string) $url) === '')
    {
        return $failure(1);
    }
    if (trim((string) $save_dir) === '')
    {
        $save_dir = './';
    }
    if (trim((string) $filename) === '')
    {
        // Derive the name from the URL; accept the known image extensions in any letter case.
        $ext = strrchr($url, '.');
        if (!in_array(strtolower((string) $ext), ['.gif', '.jpg', '.png', '.jpeg'], true))
        {
            return $failure(3);
        }
        $filename = time() . $ext;
    }

    // Ensure exactly one separator between directory and file name.
    if (substr($save_dir, -1) !== '/')
    {
        $save_dir .= '/';
    }

    // Create the target directory when it does not exist yet.
    if (!is_dir($save_dir) && !mkdir($save_dir, 0777, true))
    {
        return $failure(5);
    }

    // Fetch the remote bytes with the requested transport.
    if ($type)
    {
        $ch = curl_init();
        $timeout = 800;
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        $img = curl_exec($ch);
        curl_close($ch);
    }
    else
    {
        // Warning suppressed on purpose: failure is reported through the error code.
        $img = @file_get_contents($url);
    }
    if ($img === false || $img === '')
    {
        return $failure(6);
    }

    // Overwrite rather than append, so fetching the same name twice cannot
    // glue two images into one corrupt file.
    $save_path = $save_dir . $filename;
    if (@file_put_contents($save_path, $img) === false)
    {
        return $failure(7);
    }

    return ['file_name' => $filename, 'save_path' => $save_path, 'error' => 0];
}