多进程爬站

爬取和解析分离,先将数据爬取到文件系统,方便留存,再解析保留有效数据。

  1. 爬取:登录时的验证码图片通过百度 OCR 接口识别。
    发送请求的 curl 封装类:
<?php
/**
 * Thin cURL wrapper that sends GET/POST requests carrying a shared cookie.
 *
 * Both request methods return an array with two keys:
 *   - 'code': the HTTP status reported by curl_getinfo() (0 when no response was received)
 *   - 'data': the response body as returned by curl_exec() (false on transport failure)
 */
class PhpCurl
{
    /** @var string Value sent through CURLOPT_COOKIE on every request. */
    public $cookie = '';

    /**
     * Send a POST request.
     *
     * @param string $url          Target URL.
     * @param array  $params       Fields to send; form-encoded by default, JSON-encoded when $isJson is true.
     * @param int    $isShowHeader Passed to CURLOPT_HEADER; non-zero prepends the response headers to 'data'.
     * @param bool   $isJson       Send $params as a JSON document instead of a form body.
     * @param array  $header       Extra request header lines ("Name: value").
     * @return array ['code' => int, 'data' => string|false]
     */
    public function post_curl($url, $params = [], $isShowHeader = 0, $isJson = false, $header = [])
    {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, $isShowHeader);
        curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36');
        // NOTE(review): the connect timeout is larger than the overall timeout set on the
        // next line, so the 10 second limit is presumably the one that takes effect.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLINFO_HEADER_OUT, true);
        curl_setopt($ch, CURLOPT_POST, true);

        if ($isJson) {
            curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($params));
            // A string body is labelled as form data by curl unless told otherwise,
            // so declare the JSON content type when the caller has not set one.
            if (!$this->hasHeader($header, 'Content-Type')) {
                $header = (array) $header;
                $header[] = 'Content-Type: application/json';
            }
        } else {
            curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($params));
        }

        // Request headers.
        if (!empty($header)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
        }

        return $this->runRequest($ch);
    }

    /**
     * Send a GET request.
     *
     * Peer and host certificate verification are switched off for this request.
     *
     * @param string $url Target URL.
     * @return array ['code' => int, 'data' => string|false]
     */
    public function get_curl($url)
    {
        $curl = curl_init();

        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, 0);
        curl_setopt($curl, CURLOPT_TIMEOUT, 10);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($curl, CURLINFO_HEADER_OUT, true);

        return $this->runRequest($curl);
    }

    /**
     * Attach the current cookie, execute the prepared handle and release it.
     *
     * @param resource|object $handle A configured cURL handle.
     * @return array ['code' => int, 'data' => string|false]
     */
    private function runRequest($handle)
    {
        curl_setopt($handle, CURLOPT_COOKIE, $this->cookie);
        $body = curl_exec($handle);
        $info = curl_getinfo($handle);
        curl_close($handle);

        return [
            'code' => $info['http_code'],
            'data' => $body,
        ];
    }

    /**
     * Tell whether a list of "Name: value" header lines already contains $name (case-insensitive).
     *
     * @param mixed  $headers Header lines as passed by the caller.
     * @param string $name    Header name without the colon.
     * @return bool
     */
    private function hasHeader($headers, $name)
    {
        $prefix = strtolower($name) . ':';
        foreach ((array) $headers as $line) {
            if (is_string($line) && strpos(strtolower(ltrim($line)), $prefix) === 0) {
                return true;
            }
        }
        return false;
    }
}

站点爬取类继承自上面的 curl 封装类,并维护一个共享的有效 cookie 池:数据库中存储有效的 JSESSIONID cookie,发送爬取请求之前,先获取一条未被占用的 cookie,并将其标记为使用中。

<?php
include_once realpath(__DIR__.'/../PhpCurl.php'); 
include_once realpath(__DIR__.'/../VerifyCode.php');
/**
 * Login logic for the operations-monitoring system.
 *
 * Obtains a session cookie, solves the login captcha through VerifyCode and keeps the
 * resulting cookie in a database table so several crawler processes can share a pool.
 *
 * NOTE(review): the $db object is assumed to expose selectOne(table, where),
 * insert(table, row), update(table, where, row) and delete(table, where); its class is
 * not part of this file.
 */
class PCYW extends PhpCurl
{
    /** @var mixed Database helper injected through the constructor. */
    public $db = null;
    /** @var int Number of login attempts made so far by this instance (never reset). */
    public $count = 0;
    /** @var int login() gives up once $count exceeds this value. */
    public $maxCount = 30;
    // const COOKIE_FILE = '../cookie/cookie';
    const TABLE_COOKIE = 'T_CURL_COOKIE';
    const SITE_FLAG = 'XXXXX';

    public function __construct($db)
    {
        $this->db = $db;
    }

    /**
     * Request the login page and store the first Set-Cookie pair in $this->cookie.
     *
     * When the request fails or the response carries no Set-Cookie header the cookie
     * becomes an empty string.
     *
     * @param string $url Login page URL.
     * @return array The raw post_curl() result.
     */
    private function start($url)
    {
        $re = $this->post_curl($url, [], 1);

        // The body may be false on transport failure; treat that as "no headers".
        $raw = is_string($re['data']) ? $re['data'] : '';
        $header = explode("\r\n\r\n", $raw, 2)[0];

        $cookie = '';
        if (preg_match("/set\-cookie:([^\r\n]*)/i", $header, $matches)) {
            // Keep only the "name=value" part, dropping attributes such as Path.
            $cookie = explode(';', $matches[1])[0];
        }
        $this->cookie = trim($cookie);

        return $re;
    }

    /**
     * Download the captcha image.
     *
     * @param string $url Captcha servlet URL.
     * @return string|false Raw image bytes, or false when nothing was returned.
     */
    private function verifyCode($url)
    {
        $re = $this->post_curl($url);
        return $re['data'] ? $re['data'] : false;
    }

    /**
     * Download the captcha, store it as a JPEG and run it through VerifyCode.
     *
     * @param string $codeUrl Captcha servlet URL.
     * @param string $imgPath Temporary file used to hand the image to VerifyCode.
     * @return string Recognised text, or '' when the image could not be fetched or decoded.
     */
    private function recognizeCaptcha($codeUrl, $imgPath)
    {
        $imgSource = $this->verifyCode($codeUrl);
        if (!is_string($imgSource) || $imgSource === '') {
            return '';
        }

        $im = imagecreatefromstring($imgSource);
        if ($im === false) {
            return '';
        }
        imagejpeg($im, $imgPath);
        imagedestroy($im);

        $code = VerifyCode::Do($imgPath);
        if (is_file($imgPath)) {
            unlink($imgPath);
        }

        return is_string($code) ? $code : '';
    }

    /**
     * Log in and register the resulting cookie in the pool.
     *
     * Any cookie currently held is removed from the pool first. Attempts are repeated
     * until one succeeds or $count exceeds $maxCount.
     *
     * @return bool true once a login succeeds, false when the attempt budget is used up.
     */
    public function login()
    {
        if ($this->cookie != '') {
            $this->delCurlCookie($this->cookie);
        }

        $loginIspUrl = 'http://ip:port/baf/jsp/uiframe/login.jsp'; // login page url
        $loginUrl = 'http://ip:port/jf/login/checkLogin'; // login action url
        $codeUrl = 'http://ip:port/servlet/ValidateCodeServlet'; // captcha url

        while ($this->count <= $this->maxCount) {
            $this->count++;
            $verifyImgName = './verifyImage/code' . rand(1, 99999) . '.jpg';  // captcha image path

            $this->start($loginIspUrl);
            $code = $this->recognizeCaptcha($codeUrl, $verifyImgName);
            if (strlen($code) != 4) {
                // The captcha is expected to be four characters; anything else is a misread.
                continue;
            }

            $re = $this->post_curl($loginUrl, [
                'loginName' => 'XXXX',
                'password' => 'XXXXXXXXXXXXXXXXXX',
                'picCode' => $code
            ]);
            $qwe = is_string($re['data']) ? json_decode($re['data'], true) : null;
            if ($re['data'] && !is_null($qwe) && isset($qwe['status']) && $qwe['status'] == '1') {
                $this->setCurlCookie();
                return true;
            }

            // Login rejected: wait briefly before the next attempt.
            sleep(1);
        }

        return false;
    }

    /**
     * Fetch one cookie from the pool that is not marked as in use.
     *
     * The database is used instead of a file so that concurrently started scripts do not
     * need their own file locking.
     * NOTE(review): this only reads; marking the row as used happens in a separate
     * updateCurlCookie() call, so two processes could presumably pick the same row.
     *
     * @return string Cookie value, or '' when no free cookie exists.
     */
    function getCurlCookie()
    {
        $row = $this->db->selectOne(self::TABLE_COOKIE, ['USING' => '0', 'SITE_FLAG' => self::SITE_FLAG]);
        if ($row) {
            return $row['JS'];
        }
        return '';
    }

    /**
     * Insert the current cookie into the pool, marked as in use.
     *
     * @return bool Always true.
     */
    function setCurlCookie()
    {
        $this->db->insert(self::TABLE_COOKIE, [
            'JS' => $this->cookie,
            'USING' => '1',
            'SITE_FLAG' => self::SITE_FLAG
        ]);
        return true;
    }

    /**
     * Set the in-use flag of a pooled cookie.
     *
     * @param string $cookie Cookie value to look up.
     * @param mixed  $using  Truthy marks the cookie as used, falsy releases it.
     * @return bool Always true, also when the cookie is not in the pool.
     */
    function updateCurlCookie($cookie, $using)
    {
        $flag = $using ? '1' : '0';
        $row = $this->db->selectOne(self::TABLE_COOKIE, ['JS' => $cookie, 'SITE_FLAG' => self::SITE_FLAG]);
        if ($row) {
            $this->db->update(self::TABLE_COOKIE, ['SID' => $row['SID']], ['USING' => $flag]);
        }
        return true;
    }

    /**
     * Remove a cookie from the pool.
     *
     * @param string $cookie Cookie value to delete.
     * @return bool Always true.
     */
    function delCurlCookie($cookie)
    {
        $this->db->delete(self::TABLE_COOKIE, ['JS' => $cookie, 'SITE_FLAG' => self::SITE_FLAG]);
        return true;
    }

    /**
     * Release the held cookie back to the pool when the object goes away.
     */
    function __destruct()
    {
        // Nothing to release when no cookie was ever acquired or no database is attached.
        if ($this->db === null || $this->cookie == '') {
            return;
        }
        $this->updateCurlCookie($this->cookie, false);
    }
}

爬取数据方法 基类

<?php 
include_once './md.php';
/**
 * Base class for crawl routines: owns the process-pool helper and offers small
 * filesystem / console utilities.
 */
class FuncBase
{
    /** @var md|null Process-pool helper created in the constructor. */
    public $md = null;

    /**
     * @param int $num Passed to the md constructor as the maximum number of processes.
     */
    public function __construct($num)
    {
        $this->md = new md($num);
    }

    /**
     * Create a directory together with any missing parent directories.
     *
     * @param string $dir Directory path.
     * @return bool true when the directory exists afterwards, false otherwise.
     */
    public function createDir($dir)
    {
        // An empty path can never be created; the original recursion never terminated on it.
        if (!is_string($dir) || $dir === '') {
            return false;
        }
        // The trailing is_dir() covers the case where another process created the
        // directory between the first check and mkdir().
        return is_dir($dir) || mkdir($dir, 0777, true) || is_dir($dir);
    }

    /**
     * Print a coloured, 50-cell progress bar on the current console line.
     *
     * The cursor is hidden while the bar is updating and shown again once
     * $current reaches $total.
     *
     * @param int|float $total   Total amount of work; nothing is printed when it is not positive.
     * @param int|float $current Work done so far; clamped to the range 0..$total.
     * @return void
     */
    public function progress($total, $current)
    {
        if ($total <= 0) {
            return;
        }
        if ($current > $total) {
            $current = $total;
        }
        if ($current < 0) {
            $current = 0;
        }

        $percent = $current / $total * 100;
        $filled = (int) floor($percent / 2);   // one cell per 2 percent
        $remaining = 50 - $filled;

        echo "\033[?25l\033[42m" . str_repeat(' ', $filled) . "\033[0m"
            . ($remaining ? "\033[41m" . str_repeat(' ', $remaining) . "\033[0m" : "")
            . " \033[32m" . sprintf('%.2f', $percent) . "%\033[0m\r";

        if ($current == $total) {
            // Finished: restore the cursor and move to the next line.
            echo "\033[?25h" . PHP_EOL;
        }
    }
}

运维监控系统的爬取数据方法 实现 类

<?php
include_once realpath(__DIR__.'/../PC/PCYW.php');
include_once realpath(__DIR__.'/../FuncBase.php');
use voku\helper\HtmlDomParser;

/**
 * Crawl routines for the operations-monitoring system.
 *
 * NOTE(review): sendCurl() is called below but is not defined in the visible part of
 * this class; it is presumably among the methods omitted from this excerpt.
 */
class YWFunc extends FuncBase
{
    /** @var PCYW|null HTTP client that also manages the shared cookie pool. */
    public $curl = null;
    /** @var mixed Database helper handed to PCYW. */
    public $db = null;
    /** @var string Value of the page's javax.faces.ViewState field, '' when not found. */
    public $j_id = '';
    const ERROR_DIR = './error';

    /**
     * Take a cookie from the pool, mark it as used and read the ViewState token from
     * the performance-data page.
     *
     * @param mixed $db  Database helper.
     * @param int   $num Maximum number of processes, forwarded to FuncBase.
     */
    function __construct($db, $num = 4)
    {
        parent::__construct($num);
        $this->db = $db;
        $this->curl = new PCYW($this->db);

        // Warm-up request with a pooled cookie.
        $cookie = $this->curl->getCurlCookie();
        $this->curl->cookie = $cookie;
        $this->curl->updateCurlCookie($cookie, true);
        $this->sendCurl('http://ip:port/baf/jsp/bulletin/viewBulletin.xhtml', [], 'get');

        $url = 'http://ip:port/business/resMge/pwMge/performanceMge/perfdata.xhtml';
        $res1 = $this->sendCurl($url, [], 'get');

        // The page may be missing or lack the hidden field (e.g. expired session);
        // keep the default '' in that case instead of reading an undefined offset.
        $html = (is_array($res1) && isset($res1['data']) && is_string($res1['data'])) ? $res1['data'] : '';
        if (preg_match("/id=\"javax.faces.ViewState\" value=\"(\w+)\"/", $html, $matchArr)) {
            $this->j_id = $matchArr[1];
        }
    }

    /**
     * Invoke $fn once with the remaining retry count, provided that count is positive.
     *
     * NOTE(review): despite the name, no further attempts are made; the follow-up
     * retry on a falsy result was disabled in the original code.
     *
     * @param int      $retry Remaining attempts; nothing happens when it is not positive.
     * @param callable $fn    Callback receiving $retry as its only argument.
     * @return void
     */
    function reTry($retry, $fn)
    {
        if ($retry > 0) {
            call_user_func($fn, $retry);
        }
    }

    /**
     * Placeholder: the body is omitted in this excerpt.
     */
    function getMonitorData()
    {
        // ...
    }

}

多进程实现

<?php

/**
 * Fork a fixed number of worker processes and wait for all of them to finish.
 *
 * Requires the pcntl and posix extensions.
 */
class md
{
    /** @var int Number of worker processes to fork. */
    public $max = 5;
    /** @var int Number of workers forked so far. */
    private $current = 0;
    /** @var int Number of workers that have exited and been reaped. */
    private $quitProcess = 0;
    /** @var int[] PIDs of all forked workers. */
    private $allProcess = [];
    /** @var callable|null Callback run in the parent after every worker has exited. */
    private $endFn = null;
    /** @var int|string PID of the process that created this object. */
    private $key = '';

    public function __construct(int $max = 3)
    {
        $this->max = $max;
        $this->key = posix_getpid();
    }

    /**
     * Fork the workers and supervise them.
     *
     * In each child, $fn is called with the worker index (0-based) and start() then
     * returns, so the child continues with the caller's code. In the parent, start()
     * does not return: once every worker has been forked and has exited, the onEnd()
     * callback (if any) is run and the parent process exits with status 0.
     *
     * @param callable $fn
     * @return void
     */
    public function start(callable $fn): void
    {
        // Reap children when SIGCHLD is dispatched; the signal also cuts the sleep() below short.
        pcntl_signal(SIGCHLD, function ($sig) {
            $this->reapChildren();
        });

        while (true) {
            pcntl_signal_dispatch();
            // Signals can coalesce, so also poll directly; a child is only ever reported once.
            $this->reapChildren();

            // Finish only after ALL workers were forked. Comparing against the number forked
            // so far would end the run as soon as the first worker exits early.
            if ($this->current >= $this->max && $this->quitProcess >= $this->current) {
                $this->finish();
            }

            $pid = -1;
            if ($this->current < $this->max) {
                $pid = pcntl_fork();
            }

            if ($pid > 0) {
                // Parent process.
                $this->current++;
                $this->allProcess[] = $pid;
            } elseif ($pid === 0) {
                // Child process.
                call_user_func($fn, $this->current);
                return;
            } else {
                // Either all workers are running or the fork failed: wait and look again.
                sleep(2);
            }
        }
    }

    /**
     * Register the callback run in the parent once all workers have exited.
     *
     * @param callable $fn
     * @return void
     */
    public function onEnd(callable $fn): void
    {
        $this->endFn = $fn;
    }

    /**
     * Collect every child that has already exited, without blocking.
     *
     * @return void
     */
    private function reapChildren(): void
    {
        $status = 0;
        while (pcntl_waitpid(-1, $status, WNOHANG) > 0) {
            $this->quitProcess++;
        }
    }

    /**
     * Run the end callback, if one is set, and terminate the parent process.
     *
     * @return void
     */
    private function finish(): void
    {
        if (is_callable($this->endFn)) {
            call_user_func($this->endFn);
        }
        exit(0);
    }
}

// $t1 = new md(4);
// $t1->start(function ($i) {
//     sleep(rand(0, 10));
//     echo $i . PHP_EOL;
//     // echo posix_getpid() . PHP_EOL;
// });

验证码识别

<?php 
require_once realpath(__DIR__.'/../../server/lib/baidu_api/text/AipOcr.php'); //百度识别
/**
 * Captcha recognition through the Baidu OCR client (AipOcr).
 */
class VerifyCode
{
    public static $Db = null;
    const APP_ID = 'XXXXXX';
    const API_KEY = 'XXXXXX';
    const SECRET_KEY = 'XXXXXXX';

    /** @var object|null Lazily created OCR client; see GetApi(). */
    public static $API = null;

    /**
     * Recognise the text in an image file.
     *
     * @param string $file Path of the image to recognise.
     * @return string Text of the first recognised line, or '' when the file cannot be
     *                read or the OCR response contains no result.
     */
    public static function Do($file)
    {
        static::GetApi();

        $image = file_get_contents($file);
        if ($image === false || $image === '') {
            return '';
        }

        $result = static::$API->basicAccurate($image);

        // Error responses carry no 'words_result'; report them as "nothing recognised"
        // so callers can simply check the length of the returned string.
        $words = is_array($result) ? ($result['words_result'][0]['words'] ?? '') : '';

        return is_string($words) ? $words : '';
    }

    /**
     * Create the shared OCR client on first use.
     *
     * @return void
     */
    public static function GetApi()
    {
        if (is_null(static::$API)) {
            static::$API = new AipOcr(static::APP_ID, static::API_KEY, static::SECRET_KEY);
        }
    }
}
  2. 解析:使用 HtmlDomParser 解析已保存到文件系统的页面,保留有效数据。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值