// A full crawl can take arbitrarily long, so lift PHP's execution time limit.
set_time_limit(0);

// Site prefix that every crawled path is appended to (placeholder value).
$url = 'xxxxxxxxxxxx';
// First file to fetch; the crawl fans out from the links it contains.
$file = 'index.html';

$config = array(
    'site'  => $url,
    'queue' => array($file),
);

$robot = new Spider($config);
$robot->begin();
/**
 * Crawls a site: pulls paths from Queue, downloads each one through File,
 * and pushes the links found in non-asset documents back onto the queue.
 */
class Spider
{
    /** @var string URL prefix that queued paths are appended to. */
    protected $site = '';

    /** @var string[] Lower-case extensions that are saved but never scanned for links. */
    protected $extension = array('gif', 'jpg', 'png', 'eot', 'svg', 'ttf', 'woff', 'woff2', 'js');

    /**
     * @param array $config 'site' => URL prefix, 'queue' => initial path(s) to crawl.
     */
    public function __construct($config = array())
    {
        // Missing keys fall back to empty values instead of raising notices.
        $this->site = isset($config['site']) ? $config['site'] : '';
        Queue::push(isset($config['queue']) ? $config['queue'] : array());
    }

    /**
     * Processes the queue until it is exhausted.
     *
     * @return void
     */
    public function begin()
    {
        while (true) {
            $file = Queue::pull();
            // Only "no more entries" ends the crawl; a falsy path such as "0"
            // or a stray empty string must not stop it prematurely.
            if ($file === null || $file === false) {
                break;
            }
            if ($file === '') {
                continue;
            }

            $url = $this->comple($file);
            $content = File::save($file, $url);
            if (empty($content) || !$this->is_match($url)) {
                continue;
            }

            $document = new Content($content);
            $links = array();
            foreach ($document->links() as $k => $link) {
                $path = $this->relative($url, $link);
                // A link that resolves to the site root itself names no file.
                if ($path !== '') {
                    $links[$k] = $path;
                }
            }

            Queue::push($links);
            log_message('debug', 'contents', array(
                'url' => $url,
                'links' => $links,
            ));
        }
    }

    /**
     * Tells whether the content behind $url should be scanned for links.
     *
     * @param string $url
     * @return bool False when the extension is one of the listed asset types.
     */
    protected function is_match($url)
    {
        $extension = pathinfo($url, PATHINFO_EXTENSION);
        if ($extension === '') {
            return true;
        }

        // Compare case-insensitively so "PHOTO.JPG" is treated like "photo.jpg".
        return !in_array(strtolower($extension), $this->extension, true);
    }

    /**
     * Resolves $link, found in the document at $url, to a path relative to the site prefix.
     *
     * Every "../" segment climbs one directory (never above the site root),
     * "./" and empty segments are ignored, and a link starting with "/" is
     * resolved from the site root instead of the current directory.
     *
     * @param string $url  Full URL of the document containing the link.
     * @param string $link Link as written in the document.
     * @return string Site-relative path without a leading slash.
     */
    protected function relative($url, $link)
    {
        $segments = explode('/', (string) substr($url, strlen($this->site)));
        // Drop the document's own file name to get its directory.
        array_pop($segments);

        if (substr($link, 0, 1) === '/') {
            $segments = array();
        }

        foreach (explode('/', $link) as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }
            if ($segment === '..') {
                array_pop($segments);
                continue;
            }
            $segments[] = $segment;
        }

        return implode('/', $segments);
    }

    /**
     * Builds the full URL for a site-relative path.
     *
     * @param string $url Site-relative path.
     * @return string
     */
    protected function comple($url)
    {
        return $this->site . $url;
    }
}
/**
 * Extracts crawlable links (href, src and CSS url() references) from a document body.
 */
class Content
{
    /** @var string Raw document text to scan. */
    protected $content;

    /**
     * @param string $content Document text (HTML, CSS, ...).
     */
    public function __construct($content)
    {
        $this->content = $content;
    }

    /**
     * Returns the unique local links found in the document.
     *
     * Surrounding quotes/whitespace, query strings and fragments are removed;
     * empty links and links rejected by filter() are dropped.
     *
     * @return string[] Links keyed by their position in the raw match list (keys may have gaps).
     */
    public function links()
    {
        $links = array_merge($this->href(), $this->src(), $this->url());

        foreach ($links as $k => $link) {
            // url( 'x.png' ) captures the quotes and padding along with the path.
            $link = trim($link, " \t\n\r\0\x0B'\"");
            // Keep only the path: cut at the first "?" (query) or "#" (fragment).
            $link = (string) substr($link, 0, strcspn($link, '?#'));

            // An empty link names no file and must never reach the crawl queue.
            if ($link === '' || $this->filter($link)) {
                unset($links[$k]);
                continue;
            }
            $links[$k] = $link;
        }

        return array_unique($links);
    }

    /**
     * @return string[] Values of every href attribute.
     */
    protected function href()
    {
        return $this->attribute('href');
    }

    /**
     * @return string[] Values of every src attribute.
     */
    protected function src()
    {
        return $this->attribute('src');
    }

    /**
     * @return string[] Raw arguments of every CSS url(...) reference (quotes included).
     */
    protected function url()
    {
        preg_match_all('/url\((.*?)\)/', $this->content, $matches);
        return $matches[1];
    }

    /**
     * Collects the values of a quoted attribute, accepting single or double quotes.
     *
     * @param string $name Attribute name.
     * @return string[]
     */
    protected function attribute($name)
    {
        $pattern = '/' . preg_quote($name, '/') . '\s*=\s*(["\'])(.*?)\1/i';
        preg_match_all($pattern, $this->content, $matches);
        return $matches[2];
    }

    /**
     * Tells whether a link must be skipped (external, non-file or template placeholder).
     *
     * @param string $link
     * @return bool True when the link should be dropped.
     */
    protected function filter($link)
    {
        // Schemes carry their colon so that relative paths which merely start
        // with the same letters (e.g. "javascript/app.js", "telephone.html")
        // are still crawled.
        $prefixes = array(
            '#', '+', '%', '//', 'http:', 'https:', 'data:', 'javascript:', 'tel:', 'mailto:', '{{',
        );
        foreach ($prefixes as $prefix) {
            if (strncasecmp($link, $prefix, strlen($prefix)) === 0) {
                return true;
            }
        }

        // Bare references to the current/root directory name no file.
        return in_array($link, array('/', './'), true);
    }
}
/**
 * Downloads a URL (through the cache when possible) and stores it on disk.
 */
class File
{
    /**
     * Fetches $url, writes it to $file and returns the content for link scanning.
     *
     * @param string $file Local destination path.
     * @param string $url  URL to download.
     * @return string Content with every whitespace run collapsed to one space,
     *                or '' when the download failed.
     */
    public static function save($file, $url)
    {
        $content = self::read($url);

        // A failed download must not leave an empty file behind.
        if (!is_string($content)) {
            log_message('debug', 'download failed', array(
                'url' => $url,
            ));
            return '';
        }

        self::write($file, $content);
        log_message('debug', 'save file', array(
            'url' => $url,
        ));

        return preg_replace("/\s+/", " ", $content);
    }

    /**
     * Returns the content of $url from the cache, via Http, or via file_get_contents().
     *
     * @param string $url
     * @return string|false Content, or false when every source failed.
     */
    protected static function read($url)
    {
        $cached = Cache::get($url);
        if (is_string($cached) && $cached !== '') {
            return $cached;
        }

        $content = Http::request($url);
        if (!$content) {
            // Best-effort fallback; the warning is suppressed because failure is reported via the return value.
            $content = @file_get_contents($url);
        }

        // Only cache real content so a failed request is retried on the next run.
        if (is_string($content) && $content !== '') {
            Cache::set($url, $content);
        }

        return $content;
    }

    /**
     * Writes $content to $file, creating the parent directory when needed.
     *
     * @param string $file
     * @param string $content
     * @return int|false Bytes written, or false on failure.
     */
    protected static function write($file, $content)
    {
        if (!self::mkdir($file)) {
            return false;
        }

        return file_put_contents($file, $content);
    }

    /**
     * Ensures the directory that will contain $filename exists.
     *
     * @param string $filename
     * @return bool True when the directory exists after the call.
     */
    protected static function mkdir($filename)
    {
        $dir = dirname($filename);
        if (is_dir($dir)) {
            return true;
        }

        // Re-check is_dir() so a directory created concurrently still counts as success.
        return mkdir($dir, 0777, true) || is_dir($dir);
    }
}
/**
 * Process-wide FIFO work queue that remembers which entries were already handed out.
 */
class Queue
{
    /** @var array Entries waiting to be pulled, in insertion order. */
    protected static $data = array();

    /** @var array Entries that have already been pulled. */
    protected static $worked = array();

    /**
     * Removes and returns the oldest pending entry, marking it as worked.
     *
     * @return mixed|null The entry, or null when the queue is empty.
     */
    public static function pull()
    {
        // Do not record a null "entry" as worked when nothing is pending.
        if (empty(self::$data)) {
            return null;
        }

        $row = array_shift(self::$data);
        self::$worked[] = $row;

        return $row;
    }

    /**
     * Appends entries that are neither pending nor already worked.
     *
     * Null and empty-string entries are ignored.
     *
     * @param mixed $list A single entry or an array of entries.
     * @return bool Always true.
     */
    public static function push($list)
    {
        foreach ((array) $list as $v) {
            if ($v === null || $v === '') {
                continue;
            }
            // Checking the pending list as well keeps an entry from being queued twice.
            if (self::is_worked($v) || in_array($v, self::$data, true)) {
                continue;
            }
            self::$data[] = $v;
        }

        return true;
    }

    /**
     * Tells whether an entry has already been pulled.
     *
     * @param mixed $url
     * @return bool
     */
    public static function is_worked($url)
    {
        // Strict comparison: loosely, numeric-looking strings such as "1e3" and "1000" are equal.
        return in_array($url, self::$worked, true);
    }
}
/**
 * File-backed cache: each entry is stored in the cache directory under the
 * MD5 hash of its name.
 */
class Cache
{
    /** @var string Directory holding the cache files (with trailing slash). */
    protected static $dir = './cache/';

    /**
     * Stores $value under $name, creating the cache directory on first use.
     *
     * @param string $name
     * @param mixed  $value Data handed to file_put_contents().
     * @return void
     */
    public static function set($name, $value)
    {
        $directory = self::$dir;
        if (is_dir($directory) === false) {
            mkdir($directory);
        }

        $target = $directory . md5($name);
        file_put_contents($target, $value);
    }

    /**
     * Reads the entry stored under $name.
     *
     * @param string $name
     * @return string|false|null File content, or null when no entry exists.
     */
    public static function get($name)
    {
        $path = self::$dir . md5($name);

        return file_exists($path) ? file_get_contents($path) : null;
    }
}
/**
 * Minimal cURL-based HTTP client.
 */
class Http
{
    /**
     * Performs an HTTP request and returns the response body.
     *
     * @param string       $url        Target URL.
     * @param array|string $params     Query parameters (GET/DELETE) or body fields (POST).
     * @param string       $method     'GET', 'POST' or 'DELETE' (case-insensitive).
     * @param array|bool   $multi      Truthy to send POST fields as multipart/form-data;
     *                                 an array maps field names to file paths to upload.
     * @param array        $extheaders Extra request headers.
     * @return string|false Response body, or false when the transfer failed.
     */
    public static function request($url, $params = array(), $method = 'GET', $multi = false, $extheaders = array())
    {
        if (!function_exists('curl_init')) {
            exit('Need to open the curl extension');
        }

        $method = strtoupper($method);
        $ci = curl_init();
        curl_setopt($ci, CURLOPT_USERAGENT, '');
        curl_setopt($ci, CURLOPT_CONNECTTIMEOUT, 3);
        // Uploads get a longer overall timeout than plain requests.
        $timeout = $multi ? 30 : 3;
        curl_setopt($ci, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ci, CURLOPT_RETURNTRANSFER, true);
        // NOTE(review): TLS certificate and host verification are disabled here,
        // so HTTPS responses are not authenticated. Kept as-is for compatibility.
        curl_setopt($ci, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ci, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ci, CURLOPT_HEADER, false);

        $headers = (array) $extheaders;

        switch ($method) {
            case 'POST':
                curl_setopt($ci, CURLOPT_POST, true);
                if (!empty($params)) {
                    if ($multi) {
                        if (is_array($multi)) {
                            foreach ($multi as $key => $file) {
                                // The legacy "@path" upload prefix is no longer honoured by
                                // current PHP versions; CURLFile is the supported way.
                                $params[$key] = class_exists('CURLFile')
                                    ? new CURLFile($file)
                                    : '@' . $file;
                            }
                        }
                        curl_setopt($ci, CURLOPT_POSTFIELDS, $params);
                        // Empty Expect header stops cURL from waiting on "100 Continue".
                        $headers[] = 'Expect: ';
                    } else {
                        curl_setopt($ci, CURLOPT_POSTFIELDS, http_build_query($params));
                    }
                }
                break;

            case 'DELETE':
            case 'GET':
                if ($method === 'DELETE') {
                    curl_setopt($ci, CURLOPT_CUSTOMREQUEST, 'DELETE');
                }
                if (!empty($params)) {
                    // strpos() returns 0 for a leading "?", so compare against false explicitly.
                    $separator = strpos($url, '?') !== false ? '&' : '?';
                    $url = $url . $separator
                        . (is_array($params) ? http_build_query($params) : $params);
                }
                break;
        }

        curl_setopt($ci, CURLINFO_HEADER_OUT, true);
        curl_setopt($ci, CURLOPT_URL, $url);
        if ($headers) {
            curl_setopt($ci, CURLOPT_HTTPHEADER, $headers);
        }

        $response = curl_exec($ci);
        curl_close($ci);

        return $response;
    }
}
/**
 * Prints a timestamped log line and appends it to ./log.log.
 *
 * @param string $level   Severity label; accepted but not written to the output.
 * @param string $message Log text.
 * @param array  $context Optional extra data, appended as JSON when non-empty.
 * @return void
 */
function log_message( $level, $message, $context = array())
{
    $parts = array(date('Y-m-d H:i:s'), $message);
    if ($context) {
        $parts[] = '[context] ' . json_encode($context);
    }

    $line = implode(' - ', $parts) . "\n";

    echo $line;
    file_put_contents('./log.log', $line, FILE_APPEND);
}
// Crawler script.
// Source-page footer: "latest recommended article published on 2021-05-21 19:52:26".