目录:
robots.php
iptable.conf
./robots
iptable.conf
<<-------- 内容 ---------
http://www.baidu.com
http://www.sogou.com
http://www.iapall.com
http://www.oschina.net
--------- 内容 -------->>
1.[代码][PHP]代码
/**
 * Fetch a URL over a plain TCP socket and return the response body.
 *
 * Sends a minimal HTTP/1.1 GET request with "Connection: Close", skips the
 * response status line and headers, and returns everything that follows.
 *
 * NOTE(review): the body is returned exactly as received; a response using
 * chunked transfer encoding is not decoded here.
 *
 * @param string $url     Absolute URL to request (scheme://host[:port][/path][?query]).
 * @param string $spider  Value sent in the custom "Spider" request header.
 * @param int    $port    TCP port to connect to when the URL does not name one.
 * @param int    $timeout Connect and read timeout, in seconds.
 *
 * @return string The response body on success; otherwise an error message
 *                (missing host, connection failure, or failed send).
 */
function spider($url, $spider = 'Blankspider', $port = 80, $timeout = 15) {
    $resolve = parse_url($url);
    // parse_url() returns false for seriously malformed URLs and leaves out
    // the 'host' key when the URL has none, so check both before indexing.
    if (!is_array($resolve) || empty($resolve['host'])) {
        return 'Requested host name can\'t be empty';
    }
    $host = $resolve['host'];

    // A port written in the URL is more specific than the default argument.
    if (!empty($resolve['port'])) {
        $port = $resolve['port'];
    }

    // Build the request target. The query string is appended whether or not
    // the URL carried a path, so "http://host?x=1" requests "/?x=1".
    $path = empty($resolve['path']) ? '/' : $resolve['path'];
    if (isset($resolve['query']) && $resolve['query'] !== '') {
        $path .= '?'.$resolve['query'];
    }

    $fp = fsockopen($host, $port, $errno, $errstr, $timeout);
    if (!$fp) {
        return $errstr;
    }

    // The Host header must carry the port when it is not the HTTP default.
    $hostHeader = ($port == 80) ? $host : $host.':'.$port;

    $request = "GET $path HTTP/1.1\r\n";
    $request.= "Accept: */*\r\n";
    $request.= "Host: $hostHeader\r\n";
    $request.= "Spider: $spider\r\n";
    $request.= "Connection: Close\r\n\r\n";

    stream_set_blocking($fp, 1);
    stream_set_timeout($fp, $timeout);

    if (fwrite($fp, $request) === false) {
        fclose($fp);
        return 'Failed to send request';
    }

    // Discard the status line and headers; they end at the first blank line.
    // fgets() returns false at EOF or on timeout, which also ends the loop.
    while (($line = fgets($fp)) !== false) {
        if ($line === "\r\n" || $line === "\n") {
            break;
        }
    }

    // Everything after the blank line is the body.
    $content = '';
    while (($chunk = fgets($fp, 8192)) !== false) {
        $content .= $chunk;
    }

    fclose($fp);
    return $content;
}
/**
 * Crawl every URL listed in a file and archive each response body on disk.
 *
 * The list file holds one URL per line; blank and whitespace-only lines are
 * ignored. For each URL the body returned by spider() is written to
 * robots/<name>/<Y.m.d.H.i.s>.txt, where <name> is the URL without its
 * scheme and "www." prefix, and with any character outside [A-Za-z0-9._-]
 * replaced by "_" so that the result is always a single directory directly
 * under robots/. The "robots" directory is resolved relative to the current
 * working directory and must already exist and be writable.
 *
 * @param string $iptable Path to the URL list file.
 * @param int    $sleep   Seconds to wait between two consecutive requests.
 *
 * @return string JSON object with "status" ("ok" or "error") and "description".
 */
function cron2spider($iptable, $sleep = 5) {
    set_time_limit(0);
    date_default_timezone_set('PRC');

    if (!file_exists($iptable)) {
        return json_encode(array(
            'status'      => 'error',
            'description' => 'iptable.conf file not exists'));
    }

    // Collect the usable entries: strip all whitespace (as the request always
    // did) and drop lines that end up empty, e.g. a trailing newline.
    $urls  = array();
    $lines = file($iptable);
    if (is_array($lines)) {
        foreach ($lines as $line) {
            $url = preg_replace('/\s/', '', $line);
            if ($url !== '') {
                $urls[] = $url;
            }
        }
    }
    if (empty($urls)) {
        return json_encode(array(
            'status'      => 'error',
            'description' => 'iptable.conf can\'t be empty'));
    }

    $last = count($urls) - 1;
    foreach ($urls as $i => $url) {
        // Re-checked for every entry because a run can last a long time.
        if (!file_exists('robots') || !is_writable('robots')) {
            return json_encode(array(
                'status'      => 'error',
                'description' => 'directory doesn\'t exist or don\'t have write permissions'));
        }

        // Derive a flat, filesystem-safe directory name from the URL.
        $name = preg_replace('/(https?\:\/\/)|(www\.)/', '', $url);
        $name = preg_replace('/[^A-Za-z0-9._-]/', '_', $name);
        if ($name === '' || $name === '.' || $name === '..') {
            // Would resolve to robots/ itself or its parent.
            $name = '_';
        }
        $dir = 'robots/'.$name;

        // The second is_dir() covers a directory created concurrently.
        if (!is_dir($dir) && !mkdir($dir) && !is_dir($dir)) {
            return json_encode(array(
                'status'      => 'error',
                'description' => 'directory doesn\'t exist or don\'t have write permissions'));
        }

        file_put_contents($dir.'/'.date('Y.m.d.H.i.s').'.txt', spider($url));

        // Pause between requests only; nothing follows the last one.
        if ($i < $last) {
            sleep($sleep);
        }
    }

    return json_encode(array(
        'status'      => 'ok',
        'description' => 'robots program execution success'));
}
// Script entry point: crawl every URL listed in iptable.conf and print the
// JSON status string returned by cron2spider().
print(cron2spider('iptable.conf'));