爬取和解析分离:先将数据爬取到文件系统以便留存,再解析并保留有效数据。
- 爬取:登录验证码图片采用百度 OCR 识别。
发送 curl 请求的类
<?php
/**
 * Minimal cURL helper that issues GET/POST requests and attaches the cookie
 * stored in $cookie to every request.
 */
class PhpCurl
{
    /** @var string Value sent as the Cookie request header on every request. */
    public $cookie = '';

    /**
     * Send a POST request.
     *
     * @param string $url          Target URL.
     * @param array  $params       Body fields; JSON-encoded when $isJson is true,
     *                             otherwise sent form-urlencoded.
     * @param int    $isShowHeader Non-zero to prepend the response headers to the returned data.
     * @param bool   $isJson       Send the body as JSON.
     * @param array  $header       Extra request headers as "Name: value" strings.
     * @return array ['code' => HTTP status as reported by cURL (0 if no response was received),
     *                'data' => response string, or false when the transfer failed]
     */
    public function post_curl($url, $params = [], $isShowHeader = 0, $isJson = false, $header = [])
    {
        if ($isJson) {
            $body = json_encode($params);
            // Without an explicit header cURL labels every POST body as
            // application/x-www-form-urlencoded, which mislabels a JSON payload.
            // A caller-supplied Content-Type always wins.
            if (!$this->hasHeader($header, 'Content-Type')) {
                $header[] = 'Content-Type: application/json';
            }
        } else {
            $body = http_build_query($params);
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, $isShowHeader);
        curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36');
        // NOTE(review): the 10 s total timeout below also covers the connect
        // phase, so this 30 s connect timeout can never actually be reached.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // Keep the outgoing request headers available through curl_getinfo() for debugging.
        curl_setopt($ch, CURLINFO_HEADER_OUT, true);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
        if (!empty($header)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
        }
        curl_setopt($ch, CURLOPT_COOKIE, $this->cookie);

        $response = curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);

        return [
            'code' => $info['http_code'],
            'data' => $response,
        ];
    }

    /**
     * Send a GET request.
     *
     * @param string $url Target URL.
     * @return array ['code' => HTTP status as reported by cURL (0 if no response was received),
     *                'data' => response body, or false when the transfer failed]
     */
    public function get_curl($url)
    {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, 0);
        curl_setopt($curl, CURLOPT_TIMEOUT, 10);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): certificate and host verification are disabled here, so
        // HTTPS responses are not authenticated. Kept as-is for compatibility.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($curl, CURLINFO_HEADER_OUT, true);
        curl_setopt($curl, CURLOPT_COOKIE, $this->cookie);

        $data = curl_exec($curl);
        $info = curl_getinfo($curl);
        curl_close($curl);

        return [
            'code' => $info['http_code'],
            'data' => $data,
        ];
    }

    /**
     * Tell whether a list of "Name: value" header lines already contains $name.
     *
     * @param array  $headers Header lines.
     * @param string $name    Header name, compared case-insensitively.
     * @return bool
     */
    private function hasHeader($headers, $name)
    {
        foreach ((array) $headers as $line) {
            if (is_string($line) && stripos(ltrim($line), $name . ':') === 0) {
                return true;
            }
        }
        return false;
    }
}
爬取站点类继承自 PhpCurl,并维护一个共享的有效 cookie 池:数据库中存储有效的 JSESSIONID cookie;发送爬取请求之前,先获取一条未被占用的 cookie,并将其标记为使用中。
<?php
include_once realpath(__DIR__.'/../PhpCurl.php');
include_once realpath(__DIR__.'/../VerifyCode.php');
/**
 * Login flow for the operations-monitoring site, plus a database-backed pool
 * of session cookies.
 *
 * The pool lives in a database table rather than a file because several
 * scripts may start at the same time and a shared file would need explicit
 * locking.
 *
 * NOTE(review): $db is assumed to expose selectOne($table, $where),
 * insert($table, $row), update($table, $where, $row) and delete($table, $where);
 * that API is not defined in this file.
 */
class PCYW extends PhpCurl
{
    public $db = null;
    /** @var int Number of login attempts made so far by this instance. */
    public $count = 0;
    /** @var int login() gives up once $count exceeds this value. */
    public $maxCount = 30;

    const TABLE_COOKIE = 'T_CURL_COOKIE';
    const SITE_FLAG = 'XXXXX';

    public function __construct($db)
    {
        $this->db = $db;
    }

    /**
     * Request the login page and store the first Set-Cookie value in $this->cookie.
     *
     * $this->cookie is set to '' when the request fails or the response
     * carries no Set-Cookie header.
     *
     * @param string $url Login page URL.
     * @return array The post_curl() result (['code' => ..., 'data' => ...]).
     */
    private function start($url)
    {
        $re = $this->post_curl($url, [], 1);

        $raw = is_string($re['data']) ? $re['data'] : '';
        // Headers and body are separated by the first blank line.
        $header = explode("\r\n\r\n", $raw, 2)[0];

        $this->cookie = '';
        if (preg_match("/set\-cookie:([^\r\n]*)/i", $header, $matches) === 1) {
            // Keep only "name=value", dropping attributes such as Path / HttpOnly.
            $this->cookie = trim(explode(';', $matches[1])[0]);
        }
        return $re;
    }

    /**
     * Download the captcha image.
     *
     * @param string $url Captcha URL.
     * @return string|false Raw image bytes, or false when nothing was returned.
     */
    private function verifyCode($url)
    {
        $re = $this->post_curl($url);
        return $re['data'] ? $re['data'] : false;
    }

    /**
     * Fetch the captcha, store it as a temporary JPEG and run it through VerifyCode.
     *
     * @param string $codeUrl Captcha URL.
     * @return string|null Recognised text, or null when the image could not be
     *                     fetched, decoded or written to disk.
     */
    private function recognizeCaptcha($codeUrl)
    {
        $imgSource = $this->verifyCode($codeUrl);
        if (!is_string($imgSource)) {
            return null;
        }
        $im = imagecreatefromstring($imgSource);
        if ($im === false) {
            return null;
        }

        // The PID keeps concurrently running crawler processes from
        // overwriting each other's captcha file.
        $verifyImgName = './verifyImage/code' . getmypid() . '_' . rand(1, 99999) . '.jpg';
        $saved = imagejpeg($im, $verifyImgName);
        imagedestroy($im);
        if (!$saved) {
            return null;
        }

        $code = VerifyCode::Do($verifyImgName);
        if (is_file($verifyImgName)) {
            unlink($verifyImgName);
        }
        return (string) $code;
    }

    /**
     * Log in and register the resulting session cookie in the pool.
     *
     * Retries until a login succeeds or $count exceeds $maxCount.
     *
     * @return bool true on success, false once the attempt budget is exhausted.
     */
    public function login()
    {
        $loginIspUrl = 'http://ip:port/baf/jsp/uiframe/login.jsp'; // login page
        $loginUrl = 'http://ip:port/jf/login/checkLogin'; // login action
        $codeUrl = 'http://ip:port/servlet/ValidateCodeServlet'; // captcha image

        while (true) {
            if ($this->cookie != '') {
                $this->delCurlCookie($this->cookie);
                // Drop the stale cookie so the login page is requested without
                // it; otherwise the server may see an existing session and not
                // send a fresh Set-Cookie.
                $this->cookie = '';
            }
            if ($this->count > $this->maxCount) {
                return false;
            }
            $this->count++;

            $this->start($loginIspUrl);
            if ($this->cookie === '') {
                // No session was issued (request failed or no Set-Cookie).
                sleep(1);
                continue;
            }

            $code = $this->recognizeCaptcha($codeUrl);
            if ($code === null) {
                sleep(1);
                continue;
            }
            if (strlen($code) != 4) {
                // The login request is only sent for a 4-character reading;
                // anything else is discarded and a new captcha is fetched.
                continue;
            }

            $re = $this->post_curl($loginUrl, [
                'loginName' => 'XXXX',
                'password' => 'XXXXXXXXXXXXXXXXXX',
                'picCode' => $code
            ]);
            $reply = $re['data'] ? json_decode($re['data'], true) : null;
            // Loose comparison on purpose: status may arrive as 1 or "1".
            if (is_array($reply) && isset($reply['status']) && $reply['status'] == '1') {
                $this->setCurlCookie();
                return true;
            }
            sleep(1);
        }
    }

    /**
     * Fetch a cookie from the pool that is not marked as in use.
     *
     * NOTE(review): the row is only marked as in use later, via
     * updateCurlCookie(); two processes calling this at the same moment could
     * receive the same cookie unless $db serialises the calls - confirm.
     *
     * @return string Cookie string, or '' when no free cookie exists.
     */
    public function getCurlCookie()
    {
        $res = $this->db->selectOne(self::TABLE_COOKIE, ['USING' => '0', 'SITE_FLAG' => self::SITE_FLAG]);
        return $res ? $res['JS'] : '';
    }

    /**
     * Insert the current cookie into the pool, marked as in use.
     *
     * @return bool Always true.
     */
    public function setCurlCookie()
    {
        $this->db->insert(self::TABLE_COOKIE, [
            'JS' => $this->cookie,
            'USING' => '1',
            'SITE_FLAG' => self::SITE_FLAG
        ]);
        return true;
    }

    /**
     * Set the in-use flag of a pooled cookie.
     *
     * @param string $cookie Cookie string to look up.
     * @param bool   $using  Truthy to mark as in use, falsy to release.
     * @return bool Always true, including when the cookie is not in the pool.
     */
    public function updateCurlCookie($cookie, $using)
    {
        $using = $using ? '1' : '0';
        $res = $this->db->selectOne(self::TABLE_COOKIE, ['JS' => $cookie, 'SITE_FLAG' => self::SITE_FLAG]);
        if ($res) {
            $this->db->update(self::TABLE_COOKIE, ['SID' => $res['SID']], ['USING' => $using]);
        }
        return true;
    }

    /**
     * Remove a cookie from the pool.
     *
     * @param string $cookie Cookie string to delete.
     * @return bool Always true.
     */
    public function delCurlCookie($cookie)
    {
        $this->db->delete(self::TABLE_COOKIE, ['JS' => $cookie, 'SITE_FLAG' => self::SITE_FLAG]);
        return true;
    }

    /**
     * Release the cookie held by this instance so it can be handed out again.
     */
    public function __destruct()
    {
        if ($this->cookie !== '') {
            $this->updateCurlCookie($this->cookie, false);
        }
    }
}
爬取数据方法 基类
<?php
include_once './md.php';
/**
 * Base class for the crawl routines: holds the multi-process launcher and a
 * few filesystem / console helpers.
 */
class FuncBase
{
    /** @var md|null Multi-process launcher created in the constructor. */
    public $md = null;

    /**
     * @param int $num Passed to the md constructor (its process limit).
     */
    public function __construct($num)
    {
        $this->md = new md($num);
    }

    /**
     * Create a directory together with any missing parent directories.
     *
     * @param string $dir Directory path.
     * @return bool true when the directory exists afterwards, false otherwise
     *              (including for an empty path).
     */
    public function createDir($dir)
    {
        if (is_dir($dir)) {
            return true;
        }
        // dirname('') is '', so walking up from an empty path would never terminate.
        if ((string) $dir === '') {
            return false;
        }
        // The trailing is_dir() covers another process creating the same
        // directory between the check above and mkdir().
        return mkdir($dir, 0777, true) || is_dir($dir);
    }

    /**
     * Draw a coloured, 50-cell progress bar on the current console line.
     *
     * The cursor is hidden while the bar is updating and shown again, followed
     * by a newline, once $current reaches $total.
     *
     * @param int|float $total   Total amount of work; nothing is printed when it is 0.
     * @param int|float $current Work done so far; clamped to the range 0..$total.
     * @return void
     */
    public function progress($total, $current)
    {
        if (0 == $total) {
            return;
        }
        if ($current > $total) {
            $current = $total;
        }
        if ($current < 0) {
            $current = 0;
        }

        $percent = $current / $total * 100;
        // Each cell stands for 2%.
        $doneCells = (int) floor($percent / 2);
        $remainCells = 50 - $doneCells;

        // str_repeat() yields an empty string for 0 cells, so the bar is
        // always exactly 50 cells wide.
        echo "\033[?25l\033[42m" . str_repeat(' ', $doneCells) . "\033[0m"
            . ($remainCells > 0 ? "\033[41m" . str_repeat(' ', $remainCells) . "\033[0m" : "")
            . " \033[32m" . sprintf('%.2f', $percent) . "%\033[0m\r";

        if ($current == $total) {
            // Restore the cursor hidden above before moving to the next line.
            echo "\033[?25h" . PHP_EOL;
        }
    }
}
运维监控系统的爬取数据方法 实现 类
<?php
include_once realpath(__DIR__.'/../PC/PCYW.php');
include_once realpath(__DIR__.'/../FuncBase.php');
use voku\helper\HtmlDomParser;
/**
 * Crawl routines for the operations-monitoring site.
 *
 * NOTE(review): sendCurl() is called below but is not defined in the visible
 * part of this class or in FuncBase - confirm where it lives.
 */
class YWFunc extends FuncBase
{
    /** @var PCYW|null HTTP client / cookie pool for the site. */
    public $curl = null;
    public $db = null;
    /** @var string javax.faces.ViewState value scraped from the performance page ('' if not found). */
    public $j_id = '';

    const ERROR_DIR = './error';

    /**
     * Take a free cookie from the pool, mark it as in use, and read the JSF
     * ViewState value from the performance-data page.
     *
     * @param mixed $db  Database handle, passed on to PCYW.
     * @param int   $num Passed to FuncBase::__construct().
     */
    function __construct($db, $num = 4)
    {
        parent::__construct($num);
        $this->db = $db;
        $this->curl = new PCYW($this->db);

        $cookie = $this->curl->getCurlCookie();
        $this->curl->cookie = $cookie;
        $this->curl->updateCurlCookie($cookie, true);

        // Initial request with the pooled cookie before the real page is
        // fetched (marked "ping" by the original author).
        $this->sendCurl('http://ip:port/baf/jsp/bulletin/viewBulletin.xhtml', [], 'get');

        $url = 'http://ip:port/business/resMge/pwMge/performanceMge/perfdata.xhtml';
        $res1 = $this->sendCurl($url, [], 'get');

        // The page may come back empty or without the field (failed request,
        // expired session); leave j_id as '' instead of reading a missing match.
        $html = (is_array($res1) && isset($res1['data']) && is_string($res1['data'])) ? $res1['data'] : '';
        $found = preg_match("/id=\"javax.faces.ViewState\" value=\"(\w+)\"/", $html, $matchArr);
        $this->j_id = ($found === 1) ? $matchArr[1] : '';
    }

    /**
     * Invoke $fn once, passing it $retry, when $retry is positive.
     *
     * Despite the name no retry takes place: the recursive retry-on-failure
     * logic was disabled by the original author, and the callback's return
     * value is discarded.
     *
     * @param int      $retry Remaining attempts; $fn is skipped when this is <= 0.
     * @param callable $fn    Callback receiving $retry.
     * @return void
     */
    function reTry($retry, $fn)
    {
        if ($retry > 0) {
            call_user_func($fn, $retry);
        }
    }

    function getMonitorData()
    {
        // ...
    }
}
多进程实现
<?php
/**
 * Forks a fixed number of worker processes and reacts once all of them have exited.
 */
class md
{
    /** @var int Number of worker processes to fork. */
    public $max = 5;
    /** @var int Number of workers forked so far. */
    private $current = 0;
    /** @var int Number of workers that have exited and been reaped. */
    private $quitProcess = 0;
    /** @var int[] PIDs of the forked workers. */
    private $allProcess = [];
    /** @var callable|null Callback run in the parent after all workers have exited. */
    private $endFn = null;
    /** @var int|string PID of the process that created this object. */
    private $key = '';

    public function __construct(int $max = 3)
    {
        $this->max = $max;
        $this->key = posix_getpid();
    }

    /**
     * Fork $max workers, run $fn in each, and wait for all of them to exit.
     *
     * In a worker, $fn receives the worker's 0-based index and start()
     * returns when $fn returns. In the parent, once every worker has exited,
     * the onEnd() callback is invoked and start() returns; if no callback was
     * registered the parent calls exit(0).
     *
     * @param callable $fn Worker body, called as $fn(int $index).
     * @return void
     */
    public function start(callable $fn): void
    {
        // Reap exited workers. WNOHANG keeps the handler from blocking when
        // other workers are still running; the loop drains every child that
        // has already exited, since several exits may be reported by one signal.
        pcntl_signal(SIGCHLD, function ($sig) {
            while (pcntl_waitpid(-1, $status, WNOHANG) > 0) {
                $this->quitProcess++;
            }
        });

        while (true) {
            pcntl_signal_dispatch();

            // Finished only when every worker has been forked AND reaped.
            // Comparing against the forked count alone could fire too early,
            // when the first worker exits before the remaining ones are forked.
            if ($this->current >= $this->max && $this->quitProcess >= $this->current) {
                if (is_callable($this->endFn)) {
                    call_user_func($this->endFn);
                    return;
                }
                exit(0);
            }

            if ($this->current < $this->max) {
                $pid = pcntl_fork();
                if ($pid > 0) {
                    // Parent: record the worker and go on to fork the next one.
                    $this->current++;
                    $this->allProcess[] = $pid;
                    continue;
                }
                if ($pid === 0) {
                    // Worker: $this->current has not been incremented for this
                    // fork yet, so it is the worker's 0-based index.
                    call_user_func($fn, $this->current);
                    return;
                }
                // Fork failed: fall through, wait, and try again.
            }

            // All workers are running (or fork failed): wait before polling again.
            sleep(2);
        }
    }

    /**
     * Register the callback the parent runs after all workers have exited.
     *
     * @param callable $fn
     * @return void
     */
    public function onEnd(callable $fn): void
    {
        $this->endFn = $fn;
    }
}
// $t1 = new md(4);
// $t1->start(function ($i) {
// sleep(rand(0, 10));
// echo $i . PHP_EOL;
// // echo posix_getpid() . PHP_EOL;
// });
验证码识别
<?php
require_once realpath(__DIR__.'/../../server/lib/baidu_api/text/AipOcr.php'); //百度识别
/**
 * Captcha recognition through the Baidu OCR client (AipOcr).
 */
class VerifyCode
{
    static $Db = null;

    const APP_ID = 'XXXXXX';
    const API_KEY = 'XXXXXX';
    const SECRET_KEY = 'XXXXXXX';

    /** @var object|null Lazily created OCR client; see GetApi(). */
    static $API = null;

    /**
     * Recognise the text in an image file.
     *
     * @param string $file Path of the image file.
     * @return string Text of the first recognised line, or '' when the file
     *                cannot be read or the OCR response contains no text
     *                (for example an error response).
     */
    static function Do($file)
    {
        $image = is_file($file) ? file_get_contents($file) : false;
        if ($image === false) {
            return '';
        }

        static::GetApi();
        $result = static::$API->basicAccurate($image);

        // The response may lack words_result or contain an empty list, so
        // every level is checked before it is read.
        if (!is_array($result) || !isset($result['words_result'][0]['words'])) {
            return '';
        }
        return (string) $result['words_result'][0]['words'];
    }

    /**
     * Create the shared OCR client on first use.
     *
     * @return void
     */
    static function GetApi()
    {
        if (is_null(static::$API)) {
            static::$API = new AipOcr(static::APP_ID, static::API_KEY, static::SECRET_KEY);
        }
    }
}
- 解析:使用 HtmlDomParser 解析已保存的页面。