之前《php杂(一)》里的例子过于简单,现在写一个模板类来爬取整个 webapp。
一般来说,爬取 webapp 可以先获取它的菜单 API,再获取列表 API,最后获取内容 API。
例子
<?php
/**
 * Template crawler for the nfapp.southcn.com mobile web app.
 *
 * The crawl runs in three stages: read the menu links from the home page,
 * read the article links from a list page, then fetch one article and pull
 * its text, publish time and source block out of the markup. All parsing is
 * done with regular expressions.
 */
class GetWeb
{
    /**
     * Fields extracted by the most recent getdetail() call
     * (keys: cp_text, pubtime, source).
     *
     * @var array
     */
    private $item = [];

    /**
     * Fetch a URL with an HTTP GET request through cURL.
     *
     * Redirects are followed and the transfer is limited to 20 seconds.
     *
     * @param string $value URL to request.
     *
     * @return string|false Response body, or false when the cURL handle
     *                      cannot be created or the transfer fails.
     */
    public function get_url($value = '')
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt_array($ch, [
            CURLOPT_URL            => $value,
            CURLOPT_HTTPGET        => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => 20,
            CURLOPT_AUTOREFERER    => true,
            CURLOPT_FOLLOWLOCATION => true,
            // NOTE(review): certificate and host verification are switched off,
            // as in the original implementation. This allows man-in-the-middle
            // tampering with the fetched pages; enable both once the runtime
            // has a usable CA bundle.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 0,
        ]);

        $body = curl_exec($ch);
        // Always release the handle, whether or not the transfer succeeded.
        curl_close($ch);

        return $body;
    }

    /**
     * Extract the menu (category) links from the home page markup.
     *
     * @param string $value Home page HTML.
     *
     * @return array List of https://m.nfapp.southcn.com/... URLs found inside
     *               the "swiper-wrapper" container; empty when the container
     *               or the links are missing.
     */
    public function getcat($value = '')
    {
        $html = (string) $value;

        // Without the "s" modifier "." stops at a newline, so the container's
        // opening and closing tags must sit on one line to match.
        if (preg_match('/<div class="swiper-wrapper">(.*)<\/div>/', $html, $menu) !== 1) {
            return [];
        }

        // Capture each URL up to (not including) its closing double quote.
        if (!preg_match_all('/(https:\/\/m\.nfapp\.southcn\.com\/[^"]*)"/', $menu[0], $links)) {
            return [];
        }

        return $links[1];
    }

    /**
     * Extract the article page links from a list page.
     *
     * @param string $value List page HTML.
     *
     * @return array URLs taken from data-href attributes that point at
     *               https://static.nfapp.southcn.com/content/....html.
     */
    public function getlist($value = '')
    {
        $pattern = '/data-href="(https:\/\/static\.nfapp\.southcn\.com\/content\/.*?\.html)/';

        if (!preg_match_all($pattern, (string) $value, $links)) {
            return [];
        }

        return $links[1];
    }

    /**
     * Extract the article ids from a list page.
     *
     * @param string $value List page HTML.
     *
     * @return array Values of every data-article="..." attribute, in order.
     */
    public function getarticleId($value = '')
    {
        if (!preg_match_all('/data-article="(.*?)"/', (string) $value, $ids)) {
            return [];
        }

        return $ids[1];
    }

    /**
     * Parse an article page and remember its fields in $this->item.
     *
     * Stored keys:
     *  - cp_text: concatenated plain text of all <p> elements;
     *  - pubtime: Unix timestamp parsed from the first data-time attribute,
     *             or false when it is missing or unparseable;
     *  - source:  raw preg_match_all() result for the "p-information" div.
     *
     * @param string $value Article HTML.
     *
     * @return string The input with all line breaks removed.
     */
    public function getdetail($value = '')
    {
        $html = (string) $value;

        // Paragraph bodies only: skip the tag's attributes, do not confuse
        // <p> with <pre>/<param>, and let a paragraph span several lines.
        preg_match_all('/<p\b[^>]*>(.*?)<\/p>/s', $html, $paragraphs);

        // Publish time comes from the first data-time="..." attribute.
        $hasTime = preg_match('/data-time="(.*?)"/', $html, $time) === 1;

        // Strip every kind of line break (not just the host's PHP_EOL) so the
        // result does not depend on the OS the crawler runs on.
        $flat = str_replace(["\r\n", "\r", "\n"], '', $html);

        // Source/metadata block, matched on the flattened markup.
        preg_match_all('/<div class="p-information" (.*)<\/div>/', $flat, $source);

        $this->item['cp_text'] = $this->delhtml(isset($paragraphs[1]) ? $paragraphs[1] : []);
        $this->item['pubtime'] = $hasTime ? strtotime($time[1]) : false;
        $this->item['source'] = $source;

        return $flat;
    }

    /**
     * Return the fields collected by the last getdetail() call.
     *
     * @return array Empty until getdetail() has been called.
     */
    public function getitem()
    {
        return $this->item;
    }

    /**
     * Reduce HTML to plain text: tags removed, ASCII spaces removed, and
     * surrounding whitespace trimmed.
     *
     * Uses strip_tags() in place of the deprecated FILTER_SANITIZE_STRING, so
     * quote characters stay literal instead of becoming &#34; / &#39;.
     *
     * @param string|array $value One HTML fragment or a list of fragments.
     *
     * @return string Cleaned text; fragments of a list are cleaned one by one
     *                and concatenated without a separator.
     */
    public function delhtml($value = '')
    {
        if (!is_array($value)) {
            return $this->cleantext($value);
        }

        $text = '';
        foreach ($value as $fragment) {
            $text .= $this->cleantext($fragment);
        }

        return $text;
    }

    /**
     * Clean a single fragment for delhtml().
     *
     * @param mixed $fragment Fragment to clean; non-scalar input yields ''.
     *
     * @return string
     */
    private function cleantext($fragment)
    {
        if (!is_scalar($fragment)) {
            return '';
        }

        return trim(str_replace(' ', '', strip_tags((string) $fragment)));
    }

    /**
     * Demo crawl: fetch the home page menu, the first list page and one
     * hard-coded article, then dump the flattened article response.
     *
     * @return void
     */
    public function run()
    {
        // Stage 1: home page -> menu links (result not consumed yet).
        $home = $this->get_url('https://m.nfapp.southcn.com/index/');
        $cat = $this->getcat($home);

        // Stage 2: first list page -> article links (result not consumed yet).
        $listPage = $this->get_url('https://m.nfapp.southcn.com/list/index?lastFileId=0&page=1');
        $list = $this->getlist($listPage);

        // Stage 3: a single, hard-coded article from the content API.
        $article = $this->get_url('http://api.nfapp.southcn.com/nanfang_if/getArticleContent?articleId=2595382');
        $detail = $this->getdetail($article);

        var_dump($detail);
    }
}
// Script entry point: run the demo crawl once, then stop the script.
$crawler = new GetWeb();
$crawler->run();
exit();