获取前端网页 php爬虫 get_html.php

最新推荐文章于 2024-03-08 21:59:45 发布

brain_ning

最新推荐文章于 2024-03-08 21:59:45 发布

阅读量2.3k

点赞数

本文链接：https://blog.csdn.net/gongpeng1966/article/details/52733845

版权

<!DOCTYPE html>
<html>
<head>
<title>spider</title>
</head>
<body>
<form method="get" action="get_html.php">
crawl web html address:<input type="text" name="url" >
<input type="submit" value="crawl">
</form>
<?php
// Fix the timezone used by the date() call in dump(); 'PRC' is a legacy timezone identifier
// (NOTE(review): presumably equivalent to Asia/Shanghai — confirm before replacing it).
date_default_timezone_set('PRC');
/**
 * Debug helper: prints a var_dump() of $var wrapped in a <pre> block and then
 * terminates the script, emitting the current timestamp as the exit message.
 *
 * @param mixed $var Value to inspect.
 */
function dump($var){
	echo "<pre>";
	var_dump($var);
	// Close the block: the original emitted a second opening <pre>, leaving the markup unbalanced.
	echo "</pre>";
	exit(date("Y-m-d H:i:s",time()));
}
/**
 * Saves a web page and the resources it references (stylesheets, scripts,
 * images) into a local directory next to this script.
 *
 * Layout produced under dirname(__FILE__)/<host>/ :
 *   <page>.html     rewritten page (see formate_html())
 *   style/css/      downloaded *.css files
 *   style/js/       downloaded *.js files
 *   style/img/      downloaded gif/png/jpg files
 *
 * NOTE(review): every created file and directory is chmod'ed to 0777, as in
 * the original implementation; tighten this if the deployment allows it.
 */
class spider{
	/** @var string Address of the page, exactly as passed to the constructor. */
	public $url;
	/** @var string Scheme of the page address: "http" or "https". */
	public $http;
	/** @var string Host name extracted from the page address. */
	public $host;
	/** @var string Raw HTML of the downloaded page. */
	public $html;
	/** @var string Local directory that receives the mirrored files. */
	public $path;
	/** @var string Name of the output directory (currently the host name). */
	public $title;

	/**
	 * Downloads the page and prepares the output directories.
	 *
	 * Terminates the script with a message when the address is missing,
	 * is not an http(s) address, or the page cannot be downloaded.
	 *
	 * @param string $url        Page address; must contain an http(s)://host part.
	 * @param string $imagesPath Unused; kept for backward compatibility.
	 */
	function __construct($url,$imagesPath=''){
		set_time_limit(60);

		if(!is_string($url) || !$url){
			exit("Please input url!");
		}

		// Validate BEFORE reading $matches: without a match the host would be empty and
		// files would be written directly into the script directory. A host made only
		// of dots ("..") would escape that directory, so it is rejected as well.
		$matches=array();
		if(!preg_match('#(https?)\s?:\s?//([\w\.-]+)/?#', $url,$matches) || trim($matches[2],'.')===''){
			exit('Invalid url: an http(s) address is required.');
		}

		$this->http=$matches[1];
		$this->host=$matches[2];
		$this->url=$url;
		$this->title=$this->host;
		$this->html=$this->fetch($url);

		if(!$this->html){
			exit('could not load html webpage.');
		}

		$this->path=dirname(__FILE__).'/'.$this->title;
		$this->ensure_dir($this->path);
		$this->ensure_dir($this->path.'/style');
	}

	/**
	 * Downloads each address in $url_array into the root of the output directory.
	 *
	 * @param array $url_array List of absolute resource addresses.
	 */
	function get_resource($url_array){
		foreach ($url_array as $url) {
			$this->save_resource($url, $this->path);
		}
	}

	/**
	 * Downloads every gif/png/jpg referenced by an <img src> into style/img.
	 */
	function get_image(){
		// [^>] / [^'\">] keep the match inside a single tag and a single attribute value.
		$this->download_all("/<img\b[^>]*?src=['\"]([^'\">]*?\/[\w-]+\.(?:gif|png|jpg))[^'\">]*['\"]/i", '/style/img');
		echo "<br />get image over.";
	}

	/**
	 * Downloads every stylesheet referenced by a <link href> into style/css.
	 */
	function get_css(){
		$this->download_all("/<link\b[^>]*?href=['\"]([^'\">]*?\.css\b)[^>]*>/i", '/style/css');
		echo "<br />get css over.";
	}

	/**
	 * Downloads every script referenced by a <script src> into style/js.
	 */
	function get_js(){
		// .js addresses often carry a query-string parameter so that browsers do not
		// serve a stale cached version; the capture stops at the extension to drop it.
		$this->download_all("/<script\b[^>]*?src=['\"]([^'\">]*?\.js\b)[^>]*>/i", '/style/js');
		echo "<br />get js over.";
	}

	/**
	 * Rewrites the resource references in the downloaded page so they point at
	 * the local style/ directories and writes the result to <path>/<name>.html,
	 * where <name> is the last segment of the page address without its
	 * extension, query string or fragment.
	 */
	function formate_html(){
		$res=(string)$this->html;

		$res=preg_replace("/<link\b[^>]*?href=['\"](?:[^'\">]*\/)?([\w.-]+\.css\b)[^>]*>/i", '<link href="./style/css/$1" rel="stylesheet" type="text/css" />', $res);
		// "[\w.-]" instead of the original "[\w-\.]": the latter is rejected by PCRE2
		// (PHP 7.3+) as an invalid range, which made preg_replace() return null.
		$res=preg_replace("/<script\b[^>]*?src=['\"](?:[^'\">]*\/)?([\w.-]+\.js\b)[^>]*>/i", '<script type="text/javascript" src="./style/js/$1">', $res);
		$res=preg_replace_callback("/<img\b[^>]*?src=['\"][^'\">]*?\/([\w-]+\.(?:gif|png|jpg))[^>]*>/i",
			function ($m){
				// Only the src attribute is replaced; the rest of the tag is kept.
				return preg_replace("/src=['\"][^'\">]*?\/([\w-]+\.(?:gif|png|jpg))[^'\">]*['\"]/i", "src='./style/img/".$m[1]."'", $m[0]);
			}
			, $res);

		// Drop "?query" / "#fragment" first so they do not end up in the file name.
		$file_name=preg_replace('/\.\w+$/', '', basename(preg_replace('/[?#].*$/', '', $this->url)));
		if($file_name===''){
			$file_name='index';
		}

		$target=$this->path.'/'.$file_name.'.html';
		file_put_contents($target, $res);
		chmod($target, 0777);
	}

	/**
	 * Turns a resource reference found in the page into an absolute address.
	 *
	 * - "//host/x"  (protocol-relative) gets the page's scheme;
	 * - "/x"        is resolved against the page's host;
	 * - "x"         (no "//" anywhere) is resolved against the root of the page's host;
	 * - anything else is returned unchanged.
	 *
	 * @param string $url Reference as written in the HTML.
	 * @return string Absolute address.
	 */
	public function resolve_url($url){
		// Must be tested before the single-slash case, which it would otherwise satisfy.
		if(strpos($url, '//')===0){
			return $this->http.':'.$url;
		}
		if(strpos($url, '/')===0){
			return $this->http."://".$this->host.$url;
		}
		if(strpos($url, '//')===false){
			return $this->http."://".$this->host.'/'.$url;
		}
		return $url;
	}

	/**
	 * Finds all references matching $pattern (capture group 1 is the address)
	 * in the page and saves each distinct one into <path><subdir>.
	 *
	 * @param string $pattern PCRE pattern whose first group captures the address.
	 * @param string $subdir  Target directory relative to the output directory.
	 */
	private function download_all($pattern,$subdir){
		$matches=array();
		if(!preg_match_all($pattern, (string)$this->html, $matches)){
			return;
		}
		// The same resource is frequently referenced several times; fetch it once.
		foreach (array_unique($matches[1]) as $url) {
			$this->save_resource($this->resolve_url($url), $this->path.$subdir);
		}
	}

	/**
	 * Downloads $url and stores it in $dir under its base name.
	 *
	 * @param string $url Absolute resource address.
	 * @param string $dir Existing or creatable target directory.
	 * @return bool True when the file was written, false otherwise.
	 */
	private function save_resource($url,$dir){
		$this->ensure_dir($dir);

		$name=basename($url);
		if($name===''){
			return false;
		}

		$res=$this->fetch($url);
		// A failed transfer used to be written out as an empty file; skip it instead.
		if($res===false){
			return false;
		}

		$file=$dir.'/'.$name;
		if(file_put_contents($file, $res)===false){
			return false;
		}
		chmod($file, 0777);
		return true;
	}

	/**
	 * Performs an HTTP(S) GET with a 5 second timeout.
	 *
	 * @param string $url Address to download.
	 * @return string|false Response body, or false when the transfer failed.
	 */
	private function fetch($url){
		$ch=curl_init();
		if($ch===false){
			return false;
		}
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_HEADER, 0);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($ch, CURLOPT_TIMEOUT, 5);
		// Addresses come from user input and from remote HTML: never let them reach
		// file://, gopher:// and similar handlers.
		curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		$res=curl_exec($ch);
		curl_close($ch);
		return $res;
	}

	/**
	 * Creates $dir (mode 0777) when it does not exist yet.
	 *
	 * @param string $dir Directory path.
	 */
	private function ensure_dir($dir){
		if(!file_exists($dir)){
			mkdir($dir);
			chmod($dir,0777);
		}
	}
}

/**
 * Mirrors the page at $url: downloads its stylesheets, scripts and images,
 * then writes the rewritten HTML, in that order.
 *
 * @param string $url Address of the page to mirror.
 */
function crawl($url){
	$page=new spider($url);
	$steps=array('get_css','get_js','get_image','formate_html');
	foreach ($steps as $step) {
		$page->$step();
	}
}

// Start crawling only when the form was submitted with a usable address.
// The value must be a string: "?url[]=x" yields an array, which the URL
// parsing and cURL calls further down cannot handle. Surrounding whitespace
// is dropped so that a blank submission is ignored instead of being crawled.
$requested_url = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';
if ($requested_url) {
	crawl($requested_url);
}

?>
</body>
</html>

brain_ning

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
获取前端网页 php爬虫 get_html.php

spidercrawl web html address:date_default_timezone_set('PRC');function dump($var){echo "";var_dump($var);echo "";exit(date("Y-m-d H:i:s",time()));}//catetory html resourc
复制链接

扫一扫