抓取脚本

// Entry point: mirror a site starting from a single seed page.

// A crawl has no predictable duration, so lift PHP's execution time limit.
set_time_limit(0);

// Base URL of the site to mirror (placeholder value). Spider builds every
// request URL as $url . <queued path>, so the separator must be part of $url.
$url  = 'xxxxxxxxxxxx';

// First path to fetch, relative to $url.
$file = 'index.html';

$robot = new Spider(array('site' => $url, 'queue' => array($file)));
$robot->begin();

/**
 * Crawl loop: pulls site-relative paths from Queue, downloads each one through
 * File::save(), extracts links from parseable documents with Content, and
 * pushes the resolved links back onto the queue.
 */
class Spider
{
    /** @var string Base URL that is prepended to every queued path. */
    protected $site = '';

    /** @var string[] Lower-case extensions that are saved but never scanned for links. */
    protected $extension = array('gif', 'jpg', 'png', 'eot', 'svg', 'ttf', 'woff', 'woff2', 'js');

    /**
     * @param array $config 'site'  => base URL (string),
     *                      'queue' => initial path or list of paths to fetch.
     */
    public function __construct($config = array())
    {
        $this->site = isset($config['site']) ? $config['site'] : '';
        Queue::push(isset($config['queue']) ? $config['queue'] : array());
    }

    /**
     * Run until the queue is empty.
     */
    public function begin()
    {
        // Compare with null explicitly: a falsy path such as "0" must not end the crawl.
        while (($file = Queue::pull()) !== null) {
            if ($file === '' || $file === false) {
                continue; // nothing to fetch or to name a local file after
            }

            $url = $this->comple($file);
            $content = File::save($file, $url);
            if (empty($content) || !$this->is_match($url)) {
                continue;
            }

            $page = new Content($content);
            $links = array();
            foreach ($page->links() as $link) {
                $links[] = $this->relative($url, $link);
            }
            Queue::push($links);

            log_message('debug', 'contents', array(
                'url'   => $url,
                'links' => $links,
            ));
        }
    }

    /**
     * Decide whether a downloaded URL should be scanned for further links.
     *
     * @param string $url
     * @return bool false when the URL's extension is in $this->extension.
     */
    protected function is_match($url)
    {
        $info = pathinfo($url);
        if (empty($info['extension'])) {
            return true;
        }
        // Extensions are compared case-insensitively so "LOGO.PNG" is skipped as well.
        return !in_array(strtolower($info['extension']), $this->extension, true);
    }

    /**
     * Resolve a link found in the document at $url to a path relative to the site root.
     *
     * Every ".." segment removes one directory, "." and empty segments are
     * ignored, and a link starting with "/" is resolved from the site root
     * instead of the current directory.
     *
     * @param string $url  Absolute URL of the document containing the link.
     * @param string $link Link as written in the document.
     * @return string Site-relative path without a leading slash.
     */
    protected function relative($url, $link)
    {
        if (substr($link, 0, 1) === '/') {
            $segments = array();
        } else {
            $segments = explode('/', (string) substr($url, strlen($this->site)));
            array_pop($segments); // drop the document's own file name
        }

        foreach (explode('/', $link) as $part) {
            if ($part === '' || $part === '.') {
                continue;
            }
            if ($part === '..') {
                array_pop($segments); // popping an empty list is harmless: never climbs above the root
                continue;
            }
            $segments[] = $part;
        }

        return implode('/', $segments);
    }

    /**
     * Build the absolute URL for a site-relative path.
     *
     * @param string $url Site-relative path.
     * @return string
     */
    protected function comple($url)
    {
        return $this->site . $url;
    }
}

/**
 * Extracts candidate links (href / src attributes and CSS url() references)
 * from a document held as a string.
 */
class Content
{
    /** @var string Raw document text. */
    protected $content;

    /**
     * @param string $content Document text to scan.
     */
    public function __construct($content)
    {
        $this->content = $content;
    }

    /**
     * Collect the unique local links referenced by the document.
     *
     * Each candidate is stripped of surrounding quotes/whitespace, of its query
     * string and of its fragment; empty results and anything rejected by
     * filter() are dropped.
     *
     * @return string[] Sequentially indexed list of links.
     */
    public function links()
    {
        $candidates = array_merge($this->href(), $this->src(), $this->url());

        $links = array();
        foreach ($candidates as $link) {
            // url( "x" ) keeps its inner spaces and quotes in the capture, so trim both.
            $link = trim($link, " \t\n\r\0\x0B'\"");
            if ($link === '') {
                continue;
            }

            // Neither the query string nor the fragment belongs in a local file name.
            $link = (string) substr($link, 0, strcspn($link, '?#'));
            if ($link === '' || $this->filter($link)) {
                continue;
            }
            $links[] = $link;
        }

        return array_values(array_unique($links));
    }

    /**
     * @return string[] Values of all href attributes.
     */
    protected function href()
    {
        return $this->attribute('href');
    }

    /**
     * @return string[] Values of all src attributes.
     */
    protected function src()
    {
        return $this->attribute('src');
    }

    /**
     * @return string[] Raw arguments of all CSS url(...) references (may still be quoted).
     */
    protected function url()
    {
        preg_match_all('/url\((.*?)\)/', $this->content, $matches);
        return $matches[1];
    }

    /**
     * Values of every NAME="..." or NAME='...' occurrence in the document.
     *
     * @param string $name Attribute name.
     * @return string[]
     */
    protected function attribute($name)
    {
        $pattern = '/' . preg_quote($name, '/') . '\s*=\s*(["\'])(.*?)\1/i';
        preg_match_all($pattern, $this->content, $matches);
        return $matches[2];
    }

    /**
     * Tell whether a link must be ignored by the crawler.
     *
     * @param string $link
     * @return bool true for anchors, template placeholders, protocol-relative
     *              or scheme-qualified URLs (http:, data:, javascript:, tel:,
     *              mailto:, ...) and for links to the current directory or root.
     */
    protected function filter($link)
    {
        $prefixes = array('#', '+', '%', '//', '{{');
        foreach ($prefixes as $prefix) {
            if (strncmp($link, $prefix, strlen($prefix)) === 0) {
                return true;
            }
        }

        // Require the colon so that relative paths which merely start with a
        // scheme-like word ("javascript/app.js", "television.html") are kept.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link) === 1) {
            return true;
        }

        return in_array($link, array('/', './'), true);
    }
}

/**
 * Downloads a URL (through the cache when possible) and stores it on disk.
 */
class File
{
    /**
     * Fetch $url, write it to $file and return its text for link extraction.
     *
     * When nothing could be fetched no file is written and an empty string is
     * returned, so a failed download does not leave an empty file behind.
     *
     * @param string $file Local path to write to.
     * @param string $url  Absolute URL to download.
     * @return string Content with every whitespace run collapsed to one space,
     *                or '' when the download failed.
     */
    public static function save($file, $url)
    {
        $content = self::read($url);
        if ($content === false || $content === null || $content === '') {
            log_message('debug', 'fetch failed', array(
                'url'   => $url,
            ));
            return '';
        }

        self::write($file, $content);
        log_message('debug', 'save file', array(
            'url'   => $url,
        ));

        $collapsed = preg_replace("/\s+/", " ", $content);
        // preg_replace() yields null on a PCRE error; fall back to the raw content.
        return $collapsed === null ? $content : $collapsed;
    }

    /**
     * Return the body for $url from the cache, else via Http::request(), else
     * via file_get_contents().
     *
     * @param string $url
     * @return string|false|null Body, or a falsy value when every source failed.
     */
    protected static function read($url)
    {
        $cached = Cache::get($url);
        if ($cached) {
            return $cached;
        }

        $content = Http::request($url);
        if (!$content) {
            // Best-effort fallback; warnings are silenced because failure is reported by the return value.
            $content = @file_get_contents($url);
        }

        // Only successful downloads are cached, so a transient failure is retried next time.
        if ($content) {
            Cache::set($url, $content);
        }
        return $content;
    }

    /**
     * Write $content to $file, creating the parent directory when needed.
     *
     * @param string $file
     * @param string $content
     * @return int|false Bytes written, or false on failure.
     */
    protected static function write($file, $content)
    {
        if (!self::mkdir($file)) {
            return false;
        }
        return file_put_contents($file, $content);
    }

    /**
     * Ensure the directory that will contain $filename exists.
     *
     * @param string $filename
     * @return bool true when the directory exists after the call.
     */
    protected static function mkdir($filename)
    {
        $dir = dirname($filename);
        if (is_dir($dir)) {
            return true;
        }
        // Re-check after a failed mkdir(): the directory may have been created concurrently.
        return mkdir($dir, 0777, true) || is_dir($dir);
    }
}

/**
 * Process-wide FIFO of paths still to fetch, plus the list of paths already
 * handed out, so that no path is queued or processed twice.
 */
class Queue
{
    /** @var array Pending items, oldest first. */
    protected static $data = array();

    /** @var array Items already returned by pull(). */
    protected static $worked = array();

    /**
     * Remove and return the oldest pending item, marking it as worked.
     *
     * @return mixed|null The item, or null when the queue is empty.
     */
    public static function pull()
    {
        if (empty(self::$data)) {
            return null; // do not record a bogus null entry in the worked list
        }
        $row = array_shift(self::$data);
        self::$worked[] = $row;
        return $row;
    }

    /**
     * Append items that are neither already worked nor already pending.
     *
     * @param mixed $list A single item or an array of items.
     * @return bool Always true.
     */
    public static function push($list)
    {
        foreach ((array) $list as $item) {
            if (self::is_worked($item) || in_array($item, self::$data, true)) {
                continue;
            }
            self::$data[] = $item;
        }
        return true;
    }

    /**
     * @param mixed $url
     * @return bool true when the item has already been returned by pull().
     */
    public static function is_worked($url)
    {
        return in_array($url, self::$worked, true);
    }
}

/**
 * Flat file cache: each entry is stored in the cache directory under the MD5
 * hash of its name.
 */
class Cache
{
    /** @var string Cache directory, with trailing slash. */
    protected static $dir = './cache/';

    /**
     * Store $value under $name, creating the cache directory on first use.
     *
     * @param string $name
     * @param mixed  $value Data handed to file_put_contents().
     */
    public static function set($name, $value)
    {
        $directory = self::$dir;
        if (is_dir($directory) === false) {
            mkdir($directory);
        }

        $target = $directory . md5($name);
        file_put_contents($target, $value);
    }

    /**
     * Fetch the entry stored under $name.
     *
     * @param string $name
     * @return string|false|null File contents, or null when no entry exists.
     */
    public static function get($name)
    {
        $path = self::$dir . md5($name);

        return file_exists($path) ? file_get_contents($path) : null;
    }
}

/**
 * Minimal cURL wrapper.
 */
class Http
{
    /**
     * Perform an HTTP request and return the response body.
     *
     * @param string       $url
     * @param array|string $params     POST fields, or query parameters for GET/DELETE.
     * @param string       $method     'GET', 'POST' or 'DELETE' (case-insensitive).
     * @param bool|array   $multi      Truthy to send $params as multipart form data; an
     *                                 array maps field names to local file paths to upload.
     * @param array        $extheaders Extra request header lines.
     * @return string|false Response body, or false when the transfer failed.
     */
    public static function request($url, $params = array(), $method = 'GET', $multi = false, $extheaders = array())
    {
        if (!function_exists('curl_init')) exit('Need to open the curl extension');
        $method = strtoupper($method);
        $ci = curl_init();
        curl_setopt($ci, CURLOPT_USERAGENT, '');
        curl_setopt($ci, CURLOPT_CONNECTTIMEOUT, 3);
        // Multipart requests get a longer total timeout than plain ones.
        $timeout = $multi ? 30 : 3;
        curl_setopt($ci, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ci, CURLOPT_RETURNTRANSFER, true);
        // NOTE(review): TLS peer and host verification are switched off, so HTTPS
        // responses are not authenticated. Kept as-is to preserve behaviour.
        curl_setopt($ci, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ci, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ci, CURLOPT_HEADER, false);
        $headers = (array) $extheaders;

        switch ($method) {
            case 'POST':
                curl_setopt($ci, CURLOPT_POST, true);
                if (!empty($params)) {
                    if ($multi) {
                        if (is_array($multi)) {
                            foreach ($multi as $key => $file) {
                                // The "@path" upload syntax is no longer honoured by
                                // current PHP versions; CURLFile is its replacement.
                                $params[$key] = class_exists('CURLFile')
                                    ? new CURLFile($file)
                                    : '@' . $file;
                            }
                        }
                        curl_setopt($ci, CURLOPT_POSTFIELDS, $params);
                        // Empty Expect header suppresses cURL's "Expect: 100-continue".
                        $headers[] = 'Expect: ';
                    } else {
                        curl_setopt($ci, CURLOPT_POSTFIELDS, http_build_query($params));
                    }
                }
                break;
            case 'DELETE':
            case 'GET':
                $method == 'DELETE' && curl_setopt($ci, CURLOPT_CUSTOMREQUEST, 'DELETE');
                if (!empty($params)) {
                    // strpos() may legitimately return 0, so compare against false.
                    $separator = strpos($url, '?') !== false ? '&' : '?';
                    $url = $url . $separator
                        . (is_array($params) ? http_build_query($params) : $params);
                }
                break;
        }
        curl_setopt($ci, CURLINFO_HEADER_OUT, true);
        curl_setopt($ci, CURLOPT_URL, $url);
        if ($headers) {
            curl_setopt($ci, CURLOPT_HTTPHEADER, $headers);
        }

        $response = curl_exec($ci);
        // curl_close() has no effect from PHP 8.0 on; only call it where it frees the handle.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ci);
        }
        return $response;
    }
}

/**
 * Print a timestamped log line and append the same line to ./log.log.
 *
 * @param string $level   Severity label; accepted but not used in the output.
 * @param string $message Log text.
 * @param array  $context Optional data, appended JSON-encoded when non-empty.
 */
function log_message( $level, $message, $context = array())
{
    $parts = array(date('Y-m-d H:i:s'), $message);
    if ($context) {
        $parts[] = '[context] ' . json_encode($context);
    }

    $line = implode(' - ', $parts) . "\n";

    echo $line;
    file_put_contents('./log.log', $line, FILE_APPEND);
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值