概括
爬虫已经不是什么新话题了,今天我还是想就此整理一些自己的思路。
思路
1. 爬虫程序一定要适合各种网页,并且支持自定义网页打开的流程
2. 必须支持HTTPS,HTTP,支持代理防止封IP
3. 一定要利用计算机不厌其烦地做重复事情的特点,给它注入灵活的思维
4. 支持自定义保存数据
思维导图
下面直接贴代码
配置文件
// Crawler configuration. Top-level key = local save category (1 = sports here).
// BUGFIX: the original had `'world/)` with a missing closing quote, which was
// a parse error that broke the entire file.
$config = array(
    1 => array(
        // Site key.
        'news.163.com' => array(
            'host' => 'https://sports.163.com/', // base URL to crawl
            'char' => 'utf-8',                   // page charset
            'urls' => array(
                // sub-type => relative path(s). Sub-type 1 groups NBA / CBA /
                // international football under local category 1.
                1 => array('nba/', 'cba/', 'world/'),
                // Champions League — map local categories however you like.
                2 => 'gjb/',
            ),
            // Core part: each entry records one "click" step through the site,
            // driven by the URLs collected in the previous step.
            'listdom' => array(
                array(
                    'list' => array(
                        'tag' => 'div',
                        'attrs' => array(
                            'class' => 'topnews'
                        ),
                        // Title comes from the text of <a> tags.
                        'title' => 'a',
                        // Whether to grab images: false | true.
                        'image' => false,
                        // Strip unwanted nodes so junk text is not captured.
                        'deltag' => array(
                            'tag' => array(
                                'div',
                                'span'
                            ),
                            // Per-tag attribute filters, e.g. div with class="aaa".
                            'attrs' => array(
                                array(
                                    'class' => 'aaa'
                                ),
                                array()
                            )
                        ),
                        // Selector for the next page's links;
                        // use 'linkDefault' => false when there is no next page.
                        'linkDefault' => array(
                            'tag' => 'a',
                            'attrs' => 'href'
                        )
                    )
                ),
                // Step 2: each list entry is one simulated click into an article
                // page, extracted with its own config.
                array(
                    'list' => array(
                        'tag' => 'div',
                        'attrs' => array(
                            'class' => 'box news_text'
                        ),
                        // Content filter: either a document-style tag/attrs array,
                        // or a plain regex string (in which case 'content' is a
                        // string, not an array).
                        'content' => array(
                            'tag' => 'div',
                            'attrs' => array(
                                'class' => 'content'
                            )
                        ),
                        'linkDefault' => false
                    )
                )
            )
        )
    )
);
爬虫类
/**
 * Config-driven web crawler.
 *
 * Walks list pages described by the 'listdom' configuration, follows the
 * extracted links recursively (one listdom entry per "click" depth), pulls
 * out titles/images/content via DOMDocument, and streams each article to a
 * caller-supplied file handle. Crawled URLs are deduplicated on disk with
 * marker files via createDir()/createFile() — project helpers defined
 * elsewhere in this project, not visible in this file.
 */
class graspContent
{
    private $index = 0;      // index of the link currently being processed on the list page
    private $list = array(); // NOTE(review): never used in this class — candidate for removal
    private $dir = '';       // dedup marker directory for the current list URL
    private $char = '';      // source page charset taken from the site config

    /**
     * Crawl one configured domain and append the extracted articles to $fp.
     *
     * @param array    $config full crawler config (see the $config sample)
     * @param array    $argv   CLI-style args: [1]=url sub-type, [2]=start page,
     *                         [3]=page count, [4]=domain key
     * @param int      $type   save category; also selects the flattening format
     *                         (1 = concatenated, 2 = newline-separated,
     *                         3 = image-URL prefixed)
     * @param resource $fp     open file handle; each record is written as
     *                         type \x01 stype \x01 title \x01 content \x00.
     *                         The handle is closed before returning.
     */
    public function graspContent_test($config, $argv, $type, &$fp)
    {
        $max = 5;
        $start = 1;
        // NOTE(review): $stype stays undefined when $argv[1] is missing but is
        // read unconditionally further down — confirm callers always pass it.
        if (isset($argv[1])) {
            $stype = $argv[1];
        }
        if (isset($argv[2])) {
            $start = $argv[2];
        }
        if (isset($argv[3])) {
            $max = $argv[3];
        }
        $domain = '';
        if (isset($argv[4])) {
            $domain = $argv[4];
        }
        if (!isset($config[$type][$domain])) {
            exit('domain no exists');
        }
        $urlparam = $config[$type][$domain]['urls'];
        $error = '';
        $host = $config[$type][$domain]['host'];
        $urlArray = array();
        $listConfigDom = $config[$type][$domain]['listdom'];
        // Build the list-page URLs, highest page number first ($max down to
        // $start); sprintf() substitutes the page number when the configured
        // path contains a placeholder.
        $max = $max + $start;
        for ($i = $max; $i >=$start ; $i--) {
            if (isset($urlparam[$stype])) {
                if (is_array($urlparam[$stype])) {
                    for ($m = 0; $m < count($urlparam[$stype]); $m++) {
                        $urlArray[] = sprintf($host . $urlparam[$stype][$m], $i);
                    }
                } else {
                    $urlArray[] = sprintf($host . $urlparam[$stype], $i);
                }
            }
        }
        $dom = new DOMDocument;
        $linkHref = array();
        for ($i = 0; $i < count($urlArray); $i++) {
            // Deduplicate by URL: createDir() returns false when this list URL
            // was already crawled (its marker directory exists).
            $dir = createDir(md5($urlArray[$i].'isdir'));
            if($dir === false) {
                continue;
            }
            $this -> dir = $dir;
            $this -> char = $config[$type][$domain]['char'];
            $str = getFarHtml($urlArray[$i], $config[$type][$domain]['char']);
            echo 'list -' . $urlArray[$i] . "\n";
            $contentArray = array();
            $n = 0;
            // Recursive walk fills $contentArray with 'content', 'title',
            // 'image' and 'url' entries.
            $this ->listDom($host, $str, $n, $dom, $contentArray, $listConfigDom);
            foreach($contentArray['content'] as $n => $v) {
                $content = $contentArray['content'][$n];
                $title = $contentArray['title'];
                $image = isset($contentArray['image']) ?$contentArray['image'] : array() ;
                $url = $contentArray['url'];
                $is_Cf = true;
                // Skip near-empty articles (string form shorter than ~100 bytes).
                if(empty($content) || (is_string($content) && strlen($content) <=100)) {
                    $is_Cf = false;
                    continue;
                }
                // Flatten the nested content arrays (up to 3 levels deep) into
                // a single string; the join format depends on $type (see above).
                $_content = '';
                if(is_array($content)) {
                    for ($mm = 0; $mm < count($content); $mm++) {
                        if(is_array($content[$mm])) {
                            for($nn = 0;$nn<count($content[$mm]);$nn++) {
                                if(is_array($content[$mm][$nn])) {
                                    $listArray = $content[$mm][$nn];
                                    for($kk=0;$kk< count($listArray);$kk++) {
                                        if($type == 2) {
                                            $_content .= $listArray[$kk]."\n";
                                        }
                                        if($type == 1) {
                                            $_content .=$listArray[$kk];
                                        }
                                        if($type == 3) {
                                            $_content .= $image[$n].','.$listArray[$kk];
                                        }
                                    }
                                } else {
                                    if($type == 2) {
                                        $_content .= $content[$mm][$nn]."\n";
                                    }
                                    if($type == 1) {
                                        $_content .= $content[$mm][$nn];
                                    }
                                    if($type == 3) {
                                        $_content .= $image[$n].','.$content[$mm][$nn];
                                    }
                                }
                            }
                        } else {
                            if($type == 2) {
                                $_content .= $content[$mm]."\n";
                            }
                            if($type == 1) {
                                $_content .= $content[$mm];
                            }
                            if($type == 3) {
                                $_content .= $image[$n].','. $content[$mm];
                            }
                        }
                    }
                }else {
                    if ($type == 3) {
                        $_content = $image[$n] . ',' . $content;
                    } else if ($type == 2) {
                        $_content = $content;
                    } else if ($type == 1) {
                        $_content = $content;
                    }
                }
                $_content = trim($_content);
                // Second size gate after flattening/trimming.
                if(empty($_content) || (is_string($_content) && strlen($_content) <=20)) {
                    $is_Cf = false;
                    continue;
                }
                // Record layout: \x01-separated fields, \x00 record terminator.
                $__str = $type.chr(0x1).$stype.chr(0x1).$title[$n].chr(0x1).$_content.chr(0x0);
                //file_put_contents('aaa.txt',$__str,FILE_APPEND);
                //echo $__str;
                fwrite($fp,$__str);
                if($is_Cf ==true) {
                    // Mark the article URL as successfully captured.
                    $urlfile = md5($url[$n].'isfile');
                    createFile($dir,$urlfile);
                }
            }
            // file_put_contents('aaaaaaa.txt', print_r($contentArray, true));
        }
        fclose($fp);
    }

    /**
     * Recursively process one page at click-depth $n.
     *
     * Uses $listConfigDom[$n] to locate the list container, strip unwanted
     * tags, extract title/image/content, and collect the links to follow at
     * depth $n+1. Results accumulate in $contentArray (by reference).
     *
     * NOTE(review): the optional parameter $n is declared before the required
     * $dom parameter — deprecated since PHP 8.0 (and an error in 8.3+ for
     * typed params); every visible call site passes all arguments, so the
     * default could simply be dropped.
     *
     * @param string      $host          base URL used to absolutise relative links
     * @param string      $surceStr      raw HTML of the current page
     * @param int         $n             recursion depth / listdom index
     * @param DOMDocument $dom           shared parser instance (reloaded per page)
     * @param array       $contentArray  result accumulator (by reference)
     * @param array       $listConfigDom per-depth extraction config (by reference)
     */
    private function listDom($host, $surceStr, $n = 0, DOMDocument $dom, &$contentArray, &$listConfigDom)
    {
        $linkHref = array();
        echo "listDom $n\n";
        $contentArray['content'][$this -> index] = '';
        @$dom->loadHtml($surceStr);
        echo 'load sourceStr OK!';
        $listdom = array();
        if (isset($listConfigDom[$n])) {
            $listdom = $listConfigDom[$n];
        }
        $listDiv = $surceStr;
        if (empty($listdom)) {
            echo 'listdom empty!';
            // return true;
            $listDiv = $surceStr;
        } else {
            $list = $listdom['list'];
            $tag = isset($list['tag']) ? $list['tag'] : null;
            $attrs = isset($list['attrs']) ? $list['attrs'] : null;
            if (!empty($tag) && !empty($attrs)) {
                // Narrow the document down to the configured list container.
                $domElement = $this -> getContentDom($tag, $attrs, $dom);
                if($domElement instanceof DOMNode) {
                    $listDiv = $dom->saveHtml($domElement);
                }
            }
        }
        if(!empty($listDiv)) {
            // Re-declare the charset so the loadHTML() below parses UTF-8 correctly.
            $listDiv = '<meta http-equiv="content-type" content="text/html;charset=utf-8">' . $listDiv;
            $surceStr = $listDiv;
        }
        @$dom->loadHTML($listDiv);
        // Delete configured junk tags so they don't pollute the extracted text.
        if(isset($list['deltag'])) {
            $_tags = $list['deltag']['tag'];
            $_attrs = $list['deltag']['attrs'];
            for ($_n = 0; $_n < count($_tags); $_n++) {
                echo "del tag={$_tags[$_n]} attrs=".print_r($_attrs[$_n],true)."\n";
                // Repeatedly remove the first match until none remain.
                $node = $this->getContentDom($_tags[$_n], $_attrs[$_n], $dom);
                while($node instanceof DOMElement ) {
                    $parentNode = $node->parentNode;
                    $node->parentNode->removeChild($node);
                    echo "del OK!\n";
                    $node = $this->getContentDom($_tags[$_n], $_attrs[$_n], $dom);
                }
                /* if ($node instanceof DOMElement) {
                }*/
            }
        }
        $title = '';
        if (isset($list['title'])) {
            $title = $list['title'];
        }
        $image = false;
        if (isset($list['image'])) {
            $image = $list['image'];
        }
        $content = '';
        if (isset($list['content'])) {
            $_cc = array();
            // Multi-selector form: parallel 'tag'/'attrs' arrays, one selector each.
            // NOTE(review): count($list['content']) counts the two keys
            // ('tag'/'attrs'), not the number of selectors — probably meant
            // count($list['content']['tag']).
            if(isset($list['content']['tag']) && is_array($list['content']['tag'])) {
                for($_c=0;$_c<count($list['content']);$_c++) {
                    $content = array('tag' => $list['content']['tag'][$_c],'attrs'=> $list['content']['attrs'][$_c]);
                    if (!empty($content)) {
                        // NOTE(review): $content is always an array at this point,
                        // so this string (regex) branch is dead code in this loop.
                        if (is_string($content)) {
                            $urls = explode(':', $content);
                            $content = $urls[0];
                            if (preg_match_all($content, $surceStr, $a)) {
                                if (isset($a[1])) {
                                    if (isset($urls[1])) {
                                        for ($u = 0; $u < count($a[1]); $u++) {
                                            $a[1][$u] = $urls[1]($a[1][$u]);
                                        }
                                    }
                                    // NOTE(review): $cc is undefined — almost
                                    // certainly meant $_cc.
                                    $cc [] = $a[1][0];
                                }
                            } else {
                                $contentArray['content'][ $this -> index ] = '';
                            }
                        } else {
                            $_cc[] = $this -> getContentToArray($content, $dom);
                        }
                    }
                }
            }else{
                // Single selector: either a "regex:callback" string or a
                // tag/attrs array.
                if(is_string($list['content'])) {
                    $urls = explode(':', $list['content']);
                    $content = $urls[0];
                    if (preg_match_all($content, $surceStr, $a)) {
                        if (isset($a[1])) {
                            if (isset($urls[1])) {
                                // Optional post-processing callback named after
                                // the colon is applied to every capture.
                                for ($u = 0; $u < count($a[1]); $u++) {
                                    $a[1][$u] = $urls[1]($a[1][$u]);
                                }
                            }
                            $_cc [] = $a[1][0];
                        }
                    } else {
                        $_cc[] = '';
                    }
                }else if(is_array($list['content'])) {
                    $_cc[] = $this -> getContentToArray($list['content'], $dom);
                }
            }
            $contentArray['content'][ $this -> index ] = $_cc;
        }
        // Resolve the next-link selector; `true` falls back to plain <a href>.
        $linkDefault = false;
        if (isset($list['linkDefault'])) {
            if(!is_array($list['linkDefault'])) {
                if($list['linkDefault'] == true) {
                    $linkDefault = array(
                        'tag' => 'a',
                        'attrs' => 'href',
                    );
                }else{
                    $linkDefault = false;
                }
            }else{
                $linkDefault = $list['linkDefault'];
            }
        }
        if (!empty($title)) {
            $contentArray['title'] = $this ->getContentToArray($title, $dom);
        }
        if ($image == true) {
            $contentArray['image'] = $this ->getContentToArray(array('tag' => 'img', 'attrs' => 'src'), $dom);
        }elseif(is_array($image)) {
            // NOTE(review): unreachable — a non-empty array is loosely == true
            // and is consumed by the branch above.
            $contentArray['image'] = $this ->getContentToArray($image, $dom);
        }
        if (!empty($linkDefault)) {
            $linkHref = $this -> getContentToArray($linkDefault, $dom);
        } else {
            $linkHref = array();
        }
        // Follow every extracted link one level deeper.
        if (!empty($linkHref)) {
            for ($i = 0; $i < count($linkHref); $i++) {
                $url = $linkHref[$i];
                if (!preg_match('/^http/i', $url)) {
                    $url = $host . $url;
                }
                echo "get url " . $url . " n=$n \n";
                if($n == 0) {
                    // Top-level article links are deduplicated with on-disk
                    // marker files under $this->dir.
                    $urlfile = md5($url.'isfile');
                    $this -> index = $i;
                    $contentArray['url'][] = $url;
                    if(is_file($this->dir.$urlfile)) {
                        echo 'url is exists';
                        continue;
                    }
                    createFile($this -> dir,$urlfile);
                }
                $str = getFarHtml($url, $this->char);
                // $str = @file_get_contents();
                if (empty($str)) {
                    echo "link href is null url is $url\n";
                    $str = '';
                }
                $this ->listDom($host, $str, $n+1, $dom, $contentArray, $listConfigDom);
            }
        }
    }

    /**
     * Collect matching values from the loaded DOM.
     *
     * $contentIndex is either "tag" / "tag:index" (returns the text of the
     * matching element(s)) or array('tag' => ..., 'attrs' => ...) where attrs
     * is an attribute *name* (returns that attribute's value) or a
     * name => value map (returns the node text of elements whose attribute
     * matches).
     *
     * NOTE(review): with a "tag:index" selector, $n is only incremented at the
     * bottom of the loop, which the `continue` above skips — so any index > 0
     * can never match. Also $content is not reset between iterations, so a
     * non-matching element can re-append the previous match.
     *
     * @param array|string $contentIndex selector (see above)
     * @param DOMDocument  $dom          document to search (by reference)
     * @return array matched strings, in document order
     */
    private function getContentToArray($contentIndex, &$dom)
    {
        if (is_array($contentIndex)) {
            $tag = $contentIndex['tag'];
            $attrs = $contentIndex['attrs'];
        } else {
            $tag = $contentIndex;
            $attrs = '';
        }
        // Split optional ":index" suffix off the tag name.
        $tt = explode(':', $tag);
        $tag = $tt[0];
        $index = false;
        if (isset($tt[1])) {
            $index = $tt[1];
        }
        $books = $dom->getElementsByTagName($tag);
        $n = 0;
        $str = '';
        print_r($contentIndex);echo "\n";
        $contentArr = array();
        foreach ($books as $book) {
            if ($index !== false && $index != $n) {
                continue;
            }
            if (!empty($attrs)) {
                if (is_array($attrs)) {
                    foreach ($attrs as $k => $v) {
                        $cname = $book->getAttribute($k);
                        if ($cname == $v) {
                            $content = $book->nodeValue;
                        }
                    }
                } else {
                    $s = $book->getAttribute($attrs);
                    if (!empty($s)) {
                        $content = $s;
                    }
                }
            } else {
                $content = $book->nodeValue;;
            }
            if (!empty($content)) {
                $contentArr[] = $content;
            }
            if($index !== false) {
                $n++;
            }
        }
        return $contentArr;
    }

    /**
     * Find the first element matching a tag (+ optional attribute map).
     *
     * $tag may be "tag" or "tag:index". With an attrs map, an element matches
     * when one of the listed attribute values is present; with no attrs the
     * first element of that tag wins.
     *
     * NOTE(review): same index-counter flaw as getContentToArray() — the
     * `continue` skips the trailing $n++, so "tag:index" with index > 0 can
     * never match.
     *
     * @param string       $tag   tag name, optionally suffixed ":index"
     * @param array|string $attrs attribute name => value map (or empty)
     * @param DOMDocument  $dom   document to search
     * @return DOMNode|null first match, or null when nothing matches
     */
    private function getContentDom($tag, $attrs, $dom)
    {
        $tt = explode(':', $tag);
        $tag = $tt[0];
        $index = false;
        if (isset($tt[1])) {
            $index = $tt[1];
        }
        $books = $dom->getElementsByTagName($tag);
        $n = 0;
        $str = '';
        foreach ($books as $book) {
            if ($index !== false && $index != $n) {
                continue;
            }
            if (!empty($attrs)) {
                if (is_array($attrs)) {
                    foreach ($attrs as $k => $v) {
                        $cname = $book->getAttribute($k);
                        if ($cname == $v) {
                            return $book; // found it
                            $str = $dom->saveHTML($book);
                            break; // found it
                        }
                    }
                }
            } else {
                //echo aa
                return $book; // found it
                //$str = $dom->saveHTML($book);
                //break;
            }
            $n++;
        }
        return null;
    }
}
网络CURL
这里网络访问做了简单的处理,没有考虑IP可能会被限制访问,以及登录限制等,这个大家可以去自己扩展
/**
 * Fetch a remote page and normalise it to UTF-8.
 *
 * @param string $url      absolute URL to fetch
 * @param string $sourchar charset of the source page (e.g. 'gbk', 'utf-8')
 * @return string page body in UTF-8, or '' when the fetch fails
 */
function getFarHtml($url, $sourchar)
{
    // file_get_contents() returns false on failure; callers only ever test
    // empty($str), so normalise a failure to '' instead of handing false
    // (which iconvTo() would choke on) further down the pipeline.
    $str = file_get_contents($url);
    if ($str === false) {
        return '';
    }
    $char = $sourchar;
    $tarchar = 'utf-8';
    if ($char != $tarchar) {
        // iconvTo() is a project helper (defined elsewhere) wrapping iconv().
        $str = iconvTo($char, $tarchar, $str);
    }
    return $str;
}
CURL方法
这个方法可以做到登录、传入Cookie,并且支持HTTPS、HTTP代理,关于这块以后再做详细的文档
/**
 * Perform an HTTP(S) request via cURL with gzip, proxy and cookie support.
 *
 * Supports JSON or form-encoded POST bodies, GET query building, HTTPS with
 * certificate verification against a local cacert.pem, and an optional
 * HTTP proxy ('ip'/'port' keys). Response headers are split on "\n" and
 * returned through $reHeader; the body is returned as a string.
 *
 * NOTE(review): CURLOPT_SSLVERSION is forced to 4 (CURL_SSLVERSION_TLSv1_0);
 * many modern servers reject TLS 1.0 — confirm this is intentional.
 * NOTE(review): $header at the content-type check below is only defined when
 * $isheader is true; safe today because $isheader is hard-coded true above.
 *
 * @param string $url       target URL
 * @param array  $headers   extra request headers as name => value
 * @param array  $data      POST fields / GET params (re-encoded when $inchar != utf-8)
 * @param string $method    "POST" for a POST body, anything else builds a GET query
 * @param array  &$reHeader output: response header lines (by reference)
 * @param bool   $isjson    POST the data as a JSON document instead of a form
 * @param array  $proxy     optional proxy: array('ip' => ..., 'port' => ...)
 * @param string $inchar    charset the remote side expects, default 'utf-8'
 * @return string|false response body, or false on cURL failure
 */
function curlGetFar($url,$headers = array(),$data=array(),$method="POST",&$reHeader,$isjson = false,$proxy=array(),$inchar='utf-8') {
    $timeout = 30;
    $isheader = true;
    $ch = curl_init();
    $getStr = '';
    // Re-encode outgoing values when the remote site is not UTF-8.
    $inchar = strtolower($inchar);
    if($inchar !='utf-8') {
        foreach($data as $k => $v) {
            $data[$k] = iconv('utf-8', $inchar,$v);
        }
    }
    // Baseline browser-like headers; caller-supplied $headers are appended below.
    $h = array(
        'Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding: gzip, deflate',
        'Connection: keep-alive',
        'X-Requested-With: XMLHttpRequest',
        // 'User-Agent:Mozilla/5.0 (Linux; Android 5.1.1; DUK-AL20 Build/LMY48Z) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    );
    if($method == 'POST') {
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "POST");
        if($isjson == true) {
            // JSON body with explicit Content-Length.
            $jsonStr = json_encode($data);
            $h [] = 'Content-Type: application/json';
            $h []= 'Content-Length: ' . strlen($jsonStr);
            $data = $jsonStr;
        }else{
            // Classic form encoding.
            $h[] = 'Content-type: application/x-www-form-urlencoded; charset=UTF-8';
            $data = http_build_query($data);
        }
        curl_setopt($ch, CURLOPT_POSTFIELDS,$data);
    }else{
        // Non-POST: fold $data into the query string.
        // NOTE(review): values are not urlencode()d (see commented line).
        foreach($data as $k => $v) {
            // $v = urlencode($v);
            $getStr .= "$k=$v&";
        }
        if(!empty($getStr)) {
            if(strpos($url,'?') === false) {
                $url .='?'.$getStr;
            }else{
                $url .='&'.$getStr;
            }
        }
    }
    $SSL = substr($url, 0, 8) == "https://" ? true : false;
    //$url = urlencode($url);
    if($SSL) {
        // Full peer + hostname verification against a bundled CA file.
        @curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        @curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        @curl_setopt($ch, CURLOPT_SSL_OPTIONS, CURLSSLOPT_ALLOW_BEAST);
        curl_setopt($ch, CURLOPT_SSLVERSION, 4);
        curl_setopt($ch, CURLOPT_CAINFO, getcwd() . "/cacert.pem");
    }
    // Let curl transparently decompress gzip responses.
    curl_setopt($ch, CURLOPT_ENCODING,'gzip');
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout-2);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    if(isset($proxy['ip']) && isset($proxy['port'])) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy['ip']);
        curl_setopt($ch, CURLOPT_PROXYPORT, $proxy['port']);
    }
    if($isheader) {
        curl_setopt($ch, CURLOPT_HEADER, true);
    }
    foreach($headers as $k => $v) {
        $h[] = $k.": ".$v;
    }
    curl_setopt($ch, CURLOPT_HTTPHEADER, $h);
    $ret = curl_exec($ch);
    // print_r($ret);
    //print_r(curl_error($ch));
    if($isheader) {
        // Split headers off the combined response; expose them via $reHeader.
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $header = substr($ret, 0, $headerSize);
        $reHeader = explode("\n",$header);
        //$url302 = is302($reHeader);
        /* if(!empty($url302)) {
        // echo $url302;
        setSession($reHeader);
        $headers['Cookie'] = getCookieString();
        $headers['Referer'] = $url;
        return curlGetFar($url302,$headers,array(),'get',$reHeader,$inchar);
        } */
        $ret = substr($ret,$headerSize,strlen($ret));
        //is302($header);
        //echo $ret."------";
    }
    //Content-Type: text/html
    if(preg_match('/content-type:\s*text/i',$header)) {
        if($inchar != 'utf-8') {
            // $ret = iconv($inchar, 'UTF-8', $ret);
        }
    }
    curl_close($ch);
    return $ret;
}
/**
 * Extract the redirect target from a 302 response header.
 *
 * Intended behaviour: parse the status line; on a 302, return the value of
 * the Location header; otherwise return false.
 *
 * NOTE(review): the unconditional `return false;` on the first line disables
 * the whole function — everything below it is unreachable. Presumably left
 * in during debugging (the caller's is302()/redirect handling in curlGetFar()
 * is also commented out); confirm intent before re-enabling or deleting.
 *
 * @param string $header raw response headers
 * @return string|false redirect Location, or false (currently always false)
 */
function checkHttpStatus($header) {
    return false;
    //HTTP/1.1 302 Object moved
    if(preg_match('/http\/1.1\s*([\d]+)\s/i',$header,$a)) {
        if($a[1] != 200) {
            if($a[1] == 302) {
                //Location: user.asp
                if(preg_match('/Location:\s*([^\s]+)/i',$header,$_a)) {
                    if(isset($_a[1])) {
                        echo $_a[1];
                        return $_a[1];
                    }
                }
            }
        }
    }
    return false;
}
/**
 * Extract the gzip stream embedded in a raw buffer.
 *
 * Locates the gzip magic bytes (ID1 = 31/0x1F, ID2 = 139/0x8B) and returns
 * everything from the first occurrence to the end of the buffer; returns ''
 * when no gzip header is present.
 *
 * BUGFIX: the original byte-by-byte loop read $str[$i + 1] past the end of
 * the string whenever the final byte happened to be 0x1F (undefined-offset
 * warning). Using strpos() on the two-byte magic avoids that and is O(n)
 * in C instead of an interpreted per-byte loop — same result.
 *
 * @param string $str raw (possibly header-prefixed) response bytes
 * @return string gzip data starting at the magic bytes, or ''
 */
function getGzipStr($str) {
    $pos = strpos($str, "\x1f\x8b");
    if ($pos === false) {
        return '';
    }
    return substr($str, $pos);
}
/**
 * Whether an HTTP header block declares a gzip-encoded body.
 *
 * @param string $header raw response headers
 * @return bool true when a "Content-Encoding: gzip" line is present
 */
function checkHttpGzip($header){
    // e.g. "Content-Encoding: gzip" (case-insensitive, optional spacing)
    return preg_match('/content-encoding\s*:\s*gzip/i', $header) === 1;
}
调用
需要建立一个脚本文件,调用这个类
// Driver script: wires $config and the CLI arguments into the crawler.
// Usage (CLI): php script.php <stype> <start> <max> <domain>
$type = 0;
$stype = 0;
$type = 2; // actual save category (overrides the placeholder above)
// Output sink for successfully grabbed articles (\x00-terminated records).
$fp = fopen(dirname(__FILE__).'/content','a');
if ($fp === false) {
    // BUGFIX: the original passed a possibly-false handle straight into the
    // crawler, which would then fail on every fwrite()/fclose().
    exit('cannot open output file');
}
$grasp = new graspContent();
// Works from the command line, or from the web if $argv is populated.
$grasp -> graspContent_test($config,$argv,$type,$fp);