用PHP编写爬虫的思路,抛砖引玉

概括

爬虫这个话题已经不是什么新的话题,今天我还是要就这个做一些自己的思路整理

思路

1. 爬虫程序一定要适合各种网页,并且支持自定义网页打开的流程
2. 必须支持HTTPS,HTTP,支持代理防止封IP
3. 一定要利用计算机不厌其烦地做重复事情的特点,并给它注入灵活的思维
4. 支持自定义保存数据

思维导图

在这里插入图片描述

下面直接贴代码

配置文件

// Crawl configuration, indexed as $config[<category>][<site key>].
// The crawler class reads 'host', 'char', 'urls' and 'listdom' from the selected site entry.
$config = array(
    1 => array( // 1 = save category (sports in this example)
        'news.163.com' => array( // site key used to select this entry
            'host' => 'https://sports.163.com/', // base address to crawl
            'char' => 'utf-8', // encoding of the fetched pages
            'urls' => array(
                // NBA, CBA, world football. The array key (1) is the local
                // sub-category the fetched items are saved under.
                1 => array('nba/', 'cba/', 'world/'),
                2 => 'gjb/', // Champions League; the sub-category numbering is defined locally
            ),
            // Key part: each entry describes one step of the click-through path.
            // Step 0 is applied to the pages built from 'urls'; every following
            // step is applied to the pages reached through the previous step's links.
            'listdom' => array(
                array(
                    'list' => array(
                        'tag' => 'div',
                        'attrs' => array(
                            'class' => 'topnews',
                        ),
                        'title' => 'a', // title: take the text of the <a> tags
                        'image' => false, // whether to collect images: false | true
                        'deltag' => array( // elements removed first so unwanted data is not captured
                            'tag' => array(
                                'div',
                                'span',
                            ),
                            // Attribute filter per tag, matched by position with 'tag' above,
                            // e.g. tag "div" with class="aaa".
                            'attrs' => array(
                                array(
                                    'class' => 'aaa',
                                ),
                                array(),
                            ),
                        ),
                        // Link to the next page; use 'linkDefault' => false when there is none.
                        'linkDefault' => array(
                            'tag' => 'a',
                            'attrs' => 'href',
                        ),
                    ),
                ),
                // Step 2: every 'list' entry corresponds to one user click that opens a page;
                // its settings extract the content of that page.
                array(
                    'list' => array(
                        'tag' => 'div',
                        'attrs' => array(
                            'class' => 'box news_text',
                        ),
                        // Content filter: either a DOM-style spec (array) or a single regular
                        // expression, in which case 'content' is a string instead of an array.
                        'content' => array(
                            'tag' => 'div',
                            'attrs' => array(
                                'class' => 'content',
                            ),
                        ),
                        'linkDefault' => false,
                    ),
                ),
            ),
        ),
    ),
);

爬虫类

/**
 * Config-driven crawler.
 *
 * Starting from a set of list URLs, it follows the click-through steps described
 * by the site's 'listdom' configuration, extracts titles, images and content via
 * DOM lookups or a regular expression, and appends one record per article to an
 * already opened file handle.
 *
 * Relies on helpers that are not defined in this class: getFarHtml(), createDir()
 * and createFile(). NOTE(review): createDir() is treated as returning false for a
 * list URL that must be skipped, and createFile() as writing a "seen" marker file;
 * confirm against their definitions.
 */
class graspContent
{
    /** Index of the list-page link currently being followed; keys $contentArray['content']. */
    private $index = 0;

    /** Not read or written by any method of this class. */
    private $list = array();

    /** Marker directory returned by createDir() for the list URL being processed. */
    private $dir = '';

    /** Source encoding of the current site, handed to getFarHtml(). */
    private $char = '';

    /**
     * Crawl one site and append the extracted records to $fp.
     *
     * Each record is written as: type \x01 stype \x01 title \x01 content \x00.
     * The handle is closed before the method returns. The script is terminated
     * with exit() when $config has no entry for the requested type/domain.
     *
     * @param array    $config crawl configuration, $config[type][domain] = site settings
     * @param array    $argv   [1] sub-category key into 'urls' (default 0), [2] first page
     *                         number (default 1), [3] number of extra pages (default 5),
     *                         [4] site key
     * @param mixed    $type   category key; also selects how content pieces are joined
     *                         (1: concatenated, 2: one per line, 3: prefixed with the image URL)
     * @param resource $fp     writable file handle
     */
    public function graspContent_test($config, $argv, $type, &$fp)
    {
        // $stype used to be undefined when argv[1] was missing.
        $stype  = isset($argv[1]) ? $argv[1] : 0;
        $start  = isset($argv[2]) ? (int) $argv[2] : 1;
        $max    = isset($argv[3]) ? (int) $argv[3] : 5;
        $domain = isset($argv[4]) ? $argv[4] : '';

        if (!isset($config[$type][$domain])) {
            exit('domain no exists');
        }

        $site = $config[$type][$domain];
        $host = $site['host'];
        $listConfigDom = $site['listdom'];
        $urlArray = $this->buildListUrls($host, $site['urls'], $stype, $start, $start + $max);

        $dom = new DOMDocument;
        foreach ($urlArray as $listUrl) {
            // De-duplicate by list URL.
            $dir = createDir(md5($listUrl . 'isdir'));
            if ($dir === false) {
                continue;
            }
            $this->dir = $dir;
            $this->char = $site['char'];
            // Start every list page with a clean link index so a stale value from the
            // previous page cannot create a spurious content slot.
            $this->index = 0;

            $str = getFarHtml($listUrl, $site['char']);
            echo 'list -' . $listUrl . "\n";

            $contentArray = array();
            $this->listDom($host, $str, 0, $dom, $contentArray, $listConfigDom);

            $contents = isset($contentArray['content']) ? $contentArray['content'] : array();
            $titles   = isset($contentArray['title']) ? $contentArray['title'] : array();
            $images   = isset($contentArray['image']) ? $contentArray['image'] : array();
            $urls     = isset($contentArray['url']) ? $contentArray['url'] : array();

            foreach ($contents as $n => $content) {
                if (empty($content) || (is_string($content) && strlen($content) <= 100)) {
                    continue;
                }

                $imageUrl = isset($images[$n]) ? $images[$n] : '';
                $_content = trim($this->flattenContent($content, $type, $imageUrl));
                // Very short bodies are treated as failed extractions.
                if (strlen($_content) <= 20) {
                    continue;
                }

                $title = isset($titles[$n]) ? $titles[$n] : '';
                $record = $type . chr(0x1) . $stype . chr(0x1) . $title . chr(0x1) . $_content . chr(0x0);
                fwrite($fp, $record);

                if (isset($urls[$n])) {
                    createFile($dir, md5($urls[$n] . 'isfile'));
                }
            }
        }
        fclose($fp);
    }

    /**
     * Build the list-page URLs for one sub-category.
     *
     * Every configured path is appended to $host and passed through sprintf() with
     * the page number, so a path may contain a placeholder such as %d. Pages are
     * generated from $last down to $first.
     *
     * @return array list of URLs; empty when $urlparam has no entry for $stype
     */
    private function buildListUrls($host, $urlparam, $stype, $first, $last)
    {
        $urls = array();
        if (!isset($urlparam[$stype])) {
            return $urls;
        }
        $paths = (array) $urlparam[$stype];
        for ($page = $last; $page >= $first; $page--) {
            foreach ($paths as $path) {
                $urls[] = sprintf($host . $path, $page);
            }
        }
        return $urls;
    }

    /**
     * Join extracted content pieces (arbitrarily nested arrays of strings) into one string.
     *
     * @param mixed  $content  string or nested array of strings
     * @param mixed  $type     1: concatenate, 2: one piece per line, 3: "<image>,<piece>"
     * @param string $imageUrl image URL used as prefix for type 3
     * @return string joined text; empty for any other $type
     */
    private function flattenContent($content, $type, $imageUrl)
    {
        if (is_array($content)) {
            $joined = '';
            foreach ($content as $piece) {
                $joined .= $this->flattenContent($piece, $type, $imageUrl);
            }
            return $joined;
        }
        if ($type == 1) {
            return (string) $content;
        }
        if ($type == 2) {
            return $content . "\n";
        }
        if ($type == 3) {
            return $imageUrl . ',' . $content;
        }
        return '';
    }

    /**
     * Process one page of the click-through path and recurse into its links.
     *
     * Step $n of $listConfigDom decides which container element is kept, which
     * elements are removed, what is extracted (title / image / content) and which
     * links lead to step $n + 1. Results are accumulated in $contentArray under the
     * keys 'content', 'title', 'image' and 'url'.
     *
     * @param string      $host          base URL prepended to relative links
     * @param string      $surceStr      HTML of the page (expected to be UTF-8 already)
     * @param int         $n             step index into $listConfigDom
     * @param DOMDocument $dom           shared parser instance, reloaded for every page
     * @param array       $contentArray  accumulator, modified in place
     * @param array       $listConfigDom the site's 'listdom' configuration
     */
    private function listDom($host, $surceStr, $n, DOMDocument $dom, &$contentArray, &$listConfigDom)
    {
        echo "listDom $n\n";
        $contentArray['content'][$this->index] = '';

        // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8; on older
        // versions it fails and leaves the previous page loaded), so stop here instead of
        // extracting from stale markup.
        if (!is_string($surceStr) || trim($surceStr) === '') {
            echo "source is empty, skipped\n";
            return;
        }
        @$dom->loadHTML($surceStr);
        echo 'load sourceStr OK!';

        $list = array();
        $listDiv = $surceStr;
        if (empty($listConfigDom[$n])) {
            echo 'listdom empty!';
        } else {
            $list = isset($listConfigDom[$n]['list']) ? $listConfigDom[$n]['list'] : array();
            $tag = isset($list['tag']) ? $list['tag'] : null;
            $attrs = isset($list['attrs']) ? $list['attrs'] : null;
            if (!empty($tag) && !empty($attrs)) {
                // Narrow the page down to the configured container; keep the whole
                // page when the container is not found.
                $container = $this->getContentDom($tag, $attrs, $dom);
                if ($container instanceof DOMNode) {
                    $listDiv = $dom->saveHTML($container);
                }
            }
        }
        if (empty($listDiv)) {
            return;
        }
        // The fragment has lost its <head>, so declare the charset explicitly for the parser.
        $listDiv = '<meta http-equiv="content-type" content="text/html;charset=utf-8">' . $listDiv;
        $surceStr = $listDiv;
        @$dom->loadHTML($listDiv);

        if (isset($list['deltag'])) {
            $delTags = isset($list['deltag']['tag']) ? (array) $list['deltag']['tag'] : array();
            $delAttrs = isset($list['deltag']['attrs']) ? (array) $list['deltag']['attrs'] : array();
            foreach ($delTags as $pos => $delTag) {
                $filter = isset($delAttrs[$pos]) ? $delAttrs[$pos] : array();
                echo "del tag={$delTag} attrs=" . print_r($filter, true) . "\n";
                // Remove every matching element, one lookup at a time.
                $node = $this->getContentDom($delTag, $filter, $dom);
                while ($node instanceof DOMElement) {
                    $node->parentNode->removeChild($node);
                    echo "del OK!\n";
                    $node = $this->getContentDom($delTag, $filter, $dom);
                }
            }
        }

        if (isset($list['content'])) {
            $spec = $list['content'];
            $_cc = array();
            if (is_string($spec)) {
                // Regex form: "<pattern>" or "<pattern>:<callback>".
                $matched = $this->matchPattern($spec, $surceStr);
                if ($matched !== null) {
                    $_cc[] = $matched;
                }
            } elseif (is_array($spec)) {
                if (isset($spec['tag']) && is_array($spec['tag'])) {
                    // Several tags, each with the attribute filter at the same position.
                    foreach ($spec['tag'] as $pos => $oneTag) {
                        $oneAttrs = '';
                        if (isset($spec['attrs']) && is_array($spec['attrs']) && isset($spec['attrs'][$pos])) {
                            $oneAttrs = $spec['attrs'][$pos];
                        }
                        $_cc[] = $this->getContentToArray(array('tag' => $oneTag, 'attrs' => $oneAttrs), $dom);
                    }
                } else {
                    $_cc[] = $this->getContentToArray($spec, $dom);
                }
            }
            $contentArray['content'][$this->index] = $_cc;
        }

        if (!empty($list['title'])) {
            $contentArray['title'] = $this->getContentToArray($list['title'], $dom);
        }

        $image = isset($list['image']) ? $list['image'] : false;
        if (is_array($image)) {
            // Checked before the boolean case: a non-empty array also compares equal to true.
            if (!empty($image)) {
                $contentArray['image'] = $this->getContentToArray($image, $dom);
            }
        } elseif ($image == true) {
            $contentArray['image'] = $this->getContentToArray(array('tag' => 'img', 'attrs' => 'src'), $dom);
        }

        $linkDefault = false;
        if (isset($list['linkDefault'])) {
            if (is_array($list['linkDefault'])) {
                $linkDefault = $list['linkDefault'];
            } elseif ($list['linkDefault'] == true) {
                $linkDefault = array('tag' => 'a', 'attrs' => 'href');
            }
        }
        if (empty($linkDefault)) {
            return;
        }

        $linkHref = $this->getContentToArray($linkDefault, $dom);
        foreach ($linkHref as $i => $url) {
            if (strncmp($url, '//', 2) === 0) {
                // Protocol-relative link: reuse the scheme of the configured host.
                $scheme = parse_url($host, PHP_URL_SCHEME);
                $url = (empty($scheme) ? 'http' : $scheme) . ':' . $url;
            } elseif (!preg_match('/^http/i', $url)) {
                $url = $host . $url;
            }
            echo "get url " . $url . " n=$n \n";

            if ($n == 0) {
                // Links found on the list page define the record slots.
                $this->index = $i;
                $contentArray['url'][$i] = $url;
                $urlfile = md5($url . 'isfile');
                if (is_file($this->dir . $urlfile)) {
                    echo 'url is exists';
                    continue;
                }
                createFile($this->dir, $urlfile);
            }

            $str = getFarHtml($url, $this->char);
            if (empty($str)) {
                echo "link href is null url is $url\n";
                $str = '';
            }
            $this->listDom($host, $str, $n + 1, $dom, $contentArray, $listConfigDom);
        }
    }

    /**
     * Apply a regex content spec to the page source.
     *
     * The spec is "<pattern>" or "<pattern>:<callback>". The text after the last
     * colon is treated as a callback only when it names a callable, so patterns
     * that contain a colon themselves keep working.
     *
     * @return string|null first capture group of the first match (passed through the
     *                     callback when given), '' when nothing matched, null when the
     *                     pattern has no capture group
     */
    private function matchPattern($spec, $source)
    {
        $pattern = $spec;
        $callback = null;
        $cut = strrpos($spec, ':');
        if ($cut !== false) {
            $suffix = substr($spec, $cut + 1);
            if ($suffix !== '' && $suffix !== false && is_callable($suffix)) {
                $callback = $suffix;
                $pattern = substr($spec, 0, $cut);
            }
        }

        if (!preg_match_all($pattern, $source, $matches)) {
            return '';
        }
        if (!isset($matches[1])) {
            return null;
        }
        $first = $matches[1][0];
        return $callback === null ? $first : call_user_func($callback, $first);
    }

    /**
     * Split a "tag" or "tag:N" selector into the tag name and the optional position.
     *
     * @return array array(tag name, int position or false when no position was given)
     */
    private function splitTagIndex($tag)
    {
        $parts = explode(':', (string) $tag);
        $index = (isset($parts[1]) && is_numeric($parts[1])) ? (int) $parts[1] : false;
        return array($parts[0], $index);
    }

    /**
     * Collect values from all elements selected by a spec.
     *
     * @param mixed       $contentIndex "tag" / "tag:N" string, or array('tag' => ..., 'attrs' => ...).
     *                                  With 'attrs' as an array (name => value) the element text is
     *                                  taken when any listed attribute equals its value; with 'attrs'
     *                                  as a string the value of that attribute is taken; without
     *                                  'attrs' the element text is taken.
     * @param DOMDocument $dom          document to search
     * @return array list of non-empty values in document order
     */
    private function getContentToArray($contentIndex, $dom)
    {
        if (is_array($contentIndex)) {
            $tag = isset($contentIndex['tag']) ? $contentIndex['tag'] : '';
            $attrs = isset($contentIndex['attrs']) ? $contentIndex['attrs'] : '';
        } else {
            $tag = $contentIndex;
            $attrs = '';
        }
        list($tag, $index) = $this->splitTagIndex($tag);

        print_r($contentIndex);echo "\n";

        $contentArr = array();
        $position = -1;
        foreach ($dom->getElementsByTagName($tag) as $element) {
            // Count every element so "tag:N" can address positions other than 0.
            $position++;
            if ($index !== false && $index !== $position) {
                continue;
            }

            // Reset per element; otherwise a non-matching element would repeat the
            // value of the previous match.
            $content = '';
            if (empty($attrs)) {
                $content = $element->nodeValue;
            } elseif (is_array($attrs)) {
                foreach ($attrs as $name => $expected) {
                    if ($element->getAttribute($name) === (string) $expected) {
                        $content = $element->nodeValue;
                        break;
                    }
                }
            } else {
                $content = $element->getAttribute($attrs);
            }

            if (!empty($content)) {
                $contentArr[] = $content;
            }
        }
        return $contentArr;
    }

    /**
     * Find the first element selected by tag (optionally "tag:N") and attribute filter.
     *
     * @param string      $tag   "tag" or "tag:N"
     * @param mixed       $attrs array(name => value): any equal attribute selects the element;
     *                           empty: the first candidate is returned; a non-empty non-array
     *                           value never matches
     * @param DOMDocument $dom   document to search
     * @return DOMElement|null
     */
    private function getContentDom($tag, $attrs, $dom)
    {
        list($tag, $index) = $this->splitTagIndex($tag);

        $position = -1;
        foreach ($dom->getElementsByTagName($tag) as $element) {
            $position++;
            if ($index !== false && $index !== $position) {
                continue;
            }
            if (empty($attrs)) {
                return $element;
            }
            if (is_array($attrs)) {
                foreach ($attrs as $name => $expected) {
                    if ($element->getAttribute($name) === (string) $expected) {
                        return $element;
                    }
                }
            }
        }

        return null;
    }
}

网络CURL

这里网络访问做了简单的处理,没有考虑IP可能会被限制访问,以及登录限制等,这个大家可以去自己扩展

/**
 * Fetch a page and return its body converted to UTF-8.
 *
 * The conversion is delegated to iconvTo(), which is defined outside this snippet,
 * and is skipped when the source encoding already is UTF-8 (compared
 * case-insensitively, so 'UTF-8' and 'utf-8' are treated alike).
 *
 * @param string $url      address (or local path) to read
 * @param string $sourchar encoding of the fetched document
 * @return string the document, or '' when it could not be read
 */
function getFarHtml($url, $sourchar)
{
    // Failed fetches are expected while crawling; report them to the caller as ''
    // instead of handing false to the converter.
    $str = @file_get_contents($url);
    if ($str === false) {
        return '';
    }

    $tarchar = 'utf-8';
    if (strcasecmp((string) $sourchar, $tarchar) !== 0) {
        $str = iconvTo($sourchar, $tarchar, $str);
    }
    return $str;
}

CURL方法

这个方法可以做到登录、传入Cookie,并且支持HTTPS、HTTP代理,关于这块以后做个详细的文档

/**
 * Send an HTTP(S) request with cURL and return the response body.
 *
 * Redirects are not followed. The raw response header lines are handed back
 * through $reHeader.
 *
 * @param string     $url      request URL
 * @param array      $headers  extra request headers as name => value
 * @param array      $data     request parameters (form/JSON body for POST, query string otherwise)
 * @param string     $method   'POST' (case-insensitive) sends a body; anything else sends a GET
 * @param array|null $reHeader receives the response header block split on "\n"
 *                             (empty array when the transfer failed)
 * @param bool       $isjson   send $data as a JSON body instead of a urlencoded form
 * @param array      $proxy    optional array('ip' => ..., 'port' => ...)
 * @param string     $inchar   encoding expected by the server; $data values are converted
 *                             from UTF-8 to it when it is not UTF-8
 * @return string response body, '' when the transfer failed
 */
function curlGetFar($url, $headers = array(), $data = array(), $method = "POST", &$reHeader = null, $isjson = false, $proxy = array(), $inchar = 'utf-8')
{
    $timeout = 30;
    $inchar = strtolower($inchar);
    $method = strtoupper($method);

    if ($inchar != 'utf-8') {
        foreach ($data as $k => $v) {
            $data[$k] = iconv('utf-8', $inchar, $v);
        }
    }

    $h = array(
        'Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding: gzip, deflate',
        'Connection: keep-alive',
        'X-Requested-With: XMLHttpRequest',
    );

    $ch = curl_init();

    if ($method == 'POST') {
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "POST");
        if ($isjson == true) {
            $jsonStr = json_encode($data);
            $h[] = 'Content-Type: application/json';
            $h[] = 'Content-Length: ' . strlen($jsonStr);
            $data = $jsonStr;
        } else {
            $h[] = 'Content-type: application/x-www-form-urlencoded; charset=UTF-8';
            $data = http_build_query($data);
        }
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    } else {
        // Values are appended as given (not urlencoded), since they may already
        // have been converted to the server's encoding above.
        $getStr = '';
        foreach ($data as $k => $v) {
            $getStr .= "$k=$v&";
        }
        if (!empty($getStr)) {
            $url .= (strpos($url, '?') === false ? '?' : '&') . $getStr;
        }
    }

    if (strncasecmp($url, 'https://', 8) === 0) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        if (defined('CURLOPT_SSL_OPTIONS') && defined('CURLSSLOPT_ALLOW_BEAST')) {
            curl_setopt($ch, CURLOPT_SSL_OPTIONS, CURLSSLOPT_ALLOW_BEAST);
        }
        // NOTE(review): 4 is the value of CURL_SSLVERSION_TLSv1_0; consider dropping this
        // option so libcurl negotiates the best protocol version on its own.
        curl_setopt($ch, CURLOPT_SSLVERSION, 4);

        // Use the bundled CA file only when it is actually there; otherwise keep
        // cURL's default certificate store instead of failing every HTTPS request.
        $caFile = getcwd() . "/cacert.pem";
        if (is_file($caFile)) {
            curl_setopt($ch, CURLOPT_CAINFO, $caFile);
        }
    }

    curl_setopt($ch, CURLOPT_ENCODING, 'gzip');
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout - 2);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Headers are returned in front of the body and split off below.
    curl_setopt($ch, CURLOPT_HEADER, true);

    if (isset($proxy['ip']) && isset($proxy['port'])) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy['ip']);
        curl_setopt($ch, CURLOPT_PROXYPORT, $proxy['port']);
    }

    foreach ($headers as $k => $v) {
        $h[] = $k . ": " . $v;
    }
    curl_setopt($ch, CURLOPT_HTTPHEADER, $h);

    $ret = curl_exec($ch);
    if ($ret === false) {
        // Transfer failed: there is neither a header block nor a body to split.
        $reHeader = array();
        curl_close($ch);
        return '';
    }

    $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    $reHeader = explode("\n", (string) substr($ret, 0, $headerSize));
    $body = (string) substr($ret, $headerSize);

    curl_close($ch);
    return $body;
}

/**
 * Return the redirect target announced by a response header block.
 *
 * Redirect detection is opt-in: with the default arguments the function keeps
 * returning false for every input, exactly as before.
 *
 * @param string $header         raw response headers, e.g. "HTTP/1.1 302 Object moved\r\nLocation: user.asp"
 * @param bool   $detectRedirect set to true to actually inspect $header
 * @return string|false value of the Location header for a 301/302/303/307/308 response,
 *                      false otherwise
 */
function checkHttpStatus($header, $detectRedirect = false) {
    if (!$detectRedirect) {
        return false;
    }
    // Status line of the first response in the block; any HTTP version is accepted.
    if (!preg_match('/^HTTP\/\d+(?:\.\d+)?\s+(\d{3})\b/im', $header, $status)) {
        return false;
    }
    if (!in_array($status[1], array('301', '302', '303', '307', '308'), true)) {
        return false;
    }
    if (preg_match('/^Location:\s*(\S+)/im', $header, $location)) {
        return $location[1];
    }
    return false;
}

/**
 * Return the part of $str that starts at the first gzip signature.
 *
 * A gzip stream begins with the two bytes ID1 = 0x1F, ID2 = 0x8B. Everything
 * before the first occurrence of that pair is dropped; the data is not
 * decompressed here.
 *
 * @param string $str raw data that may contain a gzip stream
 * @return string data from the signature onwards, '' when no signature is present
 */
function getGzipStr($str) {
    // strpos() also avoids reading one byte past the end when the input ends in 0x1F.
    $start = strpos((string) $str, "\x1f\x8b");
    if ($start === false) {
        return '';
    }
    return substr($str, $start);
}

/**
 * Tell whether a response header block declares "Content-Encoding: gzip".
 *
 * @param string $header raw response headers
 * @return bool true when the header is present (matched case-insensitively)
 */
function checkHttpGzip($header){
    $declaresGzip = preg_match('/content-encoding\s*:\s*gzip/i', $header);
    return $declaresGzip ? true : false;
}

调用

需要建立一个脚本文件,调用这个类

// Entry script: crawl one site and append the results to the "content" file
// next to this script.
$stype = 0;

// Category key looked up in $config. NOTE(review): $config[$type] must contain the
// site key passed as the 4th command-line argument, otherwise the crawler exits
// with "domain no exists"; confirm this value against the configuration in use.
$type = 2;

// $argv only exists on the command line; fall back to an empty list so the
// script can also be started from a web page.
$args = isset($argv) ? $argv : array();

// File that collects the successfully fetched content.
$fp = fopen(dirname(__FILE__).'/content','a');
if ($fp === false) {
    exit('can not open content file');
}

$grasp = new graspContent();
$grasp -> graspContent_test($config,$args,$type,$fp);
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值