<!DOCTYPE html>
<html>
<head>
<title>spider</title>
</head>
<body>
<form method="get" action="get_html.php">
crawl web html address:<input type="text" name="url" >
<input type="submit" value="crawl">
</form>
<?php
// Use China Standard Time for every date() call in this script.
// 'Asia/Shanghai' is the canonical identifier for the zone formerly selected
// through the backward-compatibility alias 'PRC' (same UTC+8 offset).
date_default_timezone_set('Asia/Shanghai');
/**
 * Debug helper: print a value inside a <pre> block, then stop the script.
 *
 * The script terminates via exit(), emitting the current timestamp
 * (Y-m-d H:i:s) as its final output, so this function never returns.
 *
 * @param mixed $var Value to inspect with var_dump().
 */
function dump($var){
    echo "<pre>";
    var_dump($var);
    // Close the element; the previous code emitted a second opening <pre>.
    echo "</pre>";
    exit(date("Y-m-d H:i:s"));
}
/**
 * Mirrors a single web page into a local directory next to this script.
 *
 * The constructor downloads the page and creates "<script dir>/<host>/style".
 * get_css(), get_js() and get_image() download the assets referenced by the
 * page into style/css, style/js and style/img, and formate_html() saves a copy
 * of the page whose asset links point at those local copies.
 *
 * The constructor terminates the script with exit() when the url is missing,
 * is not a plain http(s) url, cannot be downloaded, or the output directory
 * cannot be created.
 */
class spider{
    /** Mode applied to every directory and file created (world-writable, as before). */
    const FILE_MODE = 0777;

    // The three patterns share the named groups "q" (quote char) and "url"
    // (attribute value up to and including the file extension, without any
    // query string). The look-ahead requires the extension to end the path so
    // that "jquery.json.js" is not cut short at ".js" of ".json".
    const CSS_PATTERN = '~<link\b[^>]*?\shref\s*=\s*(?P<q>["\'])(?P<url>[^"\']*?\.css)(?=[?#"\'])[^"\']*(?P=q)[^>]*>~i';
    const JS_PATTERN  = '~<script\b[^>]*?\ssrc\s*=\s*(?P<q>["\'])(?P<url>[^"\']*?\.js)(?=[?#"\'])[^"\']*(?P=q)[^>]*>~i';
    // "head" is everything from "<img" up to the "=" of src, so the rest of the tag survives rewriting.
    const IMG_PATTERN = '~(?P<head><img\b[^>]*?\ssrc\s*=\s*)(?P<q>["\'])(?P<url>[^"\']*?\.(?:gif|png|jpe?g))(?=[?#"\'])[^"\']*(?P=q)~i';

    /** @var string Url of the mirrored page. */
    public $url;
    /** @var string Scheme of the page url: "http" or "https". */
    public $http;
    /** @var string Host name of the page url. */
    public $host;
    /** @var string|false Downloaded page source. */
    public $html;
    /** @var string Local directory everything is written to. */
    public $path;
    /** @var string Name of the output directory (the host name). */
    public $title;

    /** @var string ":<port>" when the page url carries an explicit port, otherwise "". */
    private $port = '';
    /** @var string Directory part of the page's url path, always ending in "/". */
    private $base_dir = '/';

    /**
     * Download the page and prepare the output directories.
     *
     * @param string $url        Absolute http(s) url of the page to mirror.
     * @param string $imagesPath Unused; kept so existing callers keep working.
     */
    function __construct($url,$imagesPath=''){
        set_time_limit(60);
        $url = is_string($url) ? trim($url) : '';
        if ($url === '') {
            exit("Please input url!");
        }

        $parts  = parse_url($url);
        $scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';
        $host   = (is_array($parts) && isset($parts['host'])) ? $parts['host'] : '';
        // The host becomes a directory name, so it must start with a letter or
        // digit: this rejects "." and ".." which would escape the script directory.
        if (($scheme !== 'http' && $scheme !== 'https') || !preg_match('/^[A-Za-z0-9][\w.-]*\z/', $host)) {
            exit('could not load html webpage.');
        }
        $this->http = $scheme;
        $this->host = $host;
        $this->port = isset($parts['port']) ? ':'.$parts['port'] : '';
        $page_path  = isset($parts['path']) ? $parts['path'] : '/';
        $last_slash = strrpos($page_path, '/');
        $this->base_dir = ($last_slash === false) ? '/' : substr($page_path, 0, $last_slash + 1);

        $res = $this->fetch($url);
        $this->html  = $res;
        $this->url   = $url;
        $this->title = $this->host;
        if (!$res) {
            exit('could not load html webpage.');
        }

        $this->path = dirname(__FILE__).'/'.$this->title;
        if (!$this->ensure_dir($this->path) || !$this->ensure_dir($this->path.'/style')) {
            exit('could not create output directory.');
        }
    }

    /**
     * Download each url into the root of the output directory.
     * Urls that cannot be downloaded are skipped.
     *
     * @param array $url_array Absolute urls to download.
     */
    function get_resource($url_array){
        if (!is_array($url_array)) {
            return;
        }
        foreach ($url_array as $url) {
            // Drop any query string / fragment so it does not end up in the file name.
            $name = basename(preg_replace('/[?#].*$/s', '', (string)$url));
            if ($name === '') {
                continue;
            }
            $body = $this->fetch((string)$url);
            if ($body !== false) {
                $this->write_file($this->path.'/'.$name, $body);
            }
        }
    }

    /** Download every gif/png/jpg/jpeg referenced by an <img src> into style/img. */
    function get_image(){
        $this->save_assets(self::IMG_PATTERN, 'img');
        echo "<br />get image over.";
    }

    /** Download every .css file referenced by a <link href> into style/css. */
    function get_css(){
        $this->save_assets(self::CSS_PATTERN, 'css');
        echo "<br />get css over.";
    }

    /**
     * Download every .js file referenced by a <script src> into style/js.
     * A query string after ".js" (commonly a cache-busting version) is ignored.
     */
    function get_js(){
        $this->save_assets(self::JS_PATTERN, 'js');
        echo "<br />get js over.";
    }

    /**
     * Save the page as "<name>.html" in the output directory with its css, js
     * and image references rewritten to the local ./style/... copies.
     *
     * <name> is the last segment of the page url without query string,
     * fragment and extension.
     */
    function formate_html(){
        $rewriters = array(
            // The whole <link> tag is replaced by a canonical stylesheet link.
            self::CSS_PATTERN => function ($m){
                return '<link href="./style/css/'.basename($m['url']).'" rel="stylesheet" type="text/css" />';
            },
            // Only the opening <script> tag is replaced; the closing tag stays.
            self::JS_PATTERN => function ($m){
                return '<script type="text/javascript" src="./style/js/'.basename($m['url']).'">';
            },
            // Only the src value is replaced; other <img> attributes are kept.
            self::IMG_PATTERN => function ($m){
                return $m['head']."'./style/img/".basename($m['url'])."'";
            },
        );

        $res = (string)$this->html;
        foreach ($rewriters as $pattern => $rewriter) {
            $rewritten = preg_replace_callback($pattern, $rewriter, $res);
            // null signals a PCRE failure; keep the page rather than saving nothing.
            if ($rewritten !== null) {
                $res = $rewritten;
            }
        }

        $clean_url = preg_replace('/[?#].*$/s', '', (string)$this->url);
        $file_name = preg_replace('/\.\w+$/', '', basename($clean_url));
        if ($file_name === '') {
            $file_name = 'index';
        }
        $this->write_file($this->path.'/'.$file_name.'.html', $res);
    }

    /**
     * Find the assets matching $pattern in the page and store them in style/$subdir.
     * Each distinct url is downloaded once; failed downloads are skipped.
     */
    private function save_assets($pattern, $subdir){
        if (!is_string($this->html) || !preg_match_all($pattern, $this->html, $found)) {
            return;
        }
        $dir = $this->path.'/style/'.$subdir;
        if (!$this->ensure_dir($dir)) {
            return;
        }
        foreach (array_unique($found['url']) as $url) {
            $body = $this->fetch($this->absolute_url($url));
            if ($body !== false) {
                // Same basename() that formate_html() writes into the rewritten links.
                $this->write_file($dir.'/'.basename($url), $body);
            }
        }
    }

    /** Turn an asset reference found in the page into an absolute url. */
    private function absolute_url($url){
        if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $url)) {
            return $url;                                   // already absolute
        }
        if (strpos($url, '//') === 0) {
            return $this->http.':'.$url;                   // protocol-relative
        }
        $origin = $this->http.'://'.$this->host.$this->port;
        if (strpos($url, '/') === 0) {
            return $origin.$url;                           // relative to the site root
        }
        return $origin.$this->base_dir.$url;               // relative to the page's directory
    }

    /**
     * GET $url with a 5 second timeout.
     *
     * @return string|false Response body, or false when the transfer failed.
     */
    private function fetch($url){
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        // Urls come from user input and from remote pages: never let curl
        // read file://, gopher:// and friends.
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        $body = curl_exec($ch);
        curl_close($ch);
        return $body;
    }

    /** Create $dir (and missing parents) if needed; returns whether it exists afterwards. */
    private function ensure_dir($dir){
        if (is_dir($dir)) {
            return true;
        }
        if (!mkdir($dir, self::FILE_MODE, true) && !is_dir($dir)) {
            return false;
        }
        // mkdir() is subject to the umask, so set the mode explicitly.
        chmod($dir, self::FILE_MODE);
        return true;
    }

    /** Write $body to $file and apply FILE_MODE; returns whether the write succeeded. */
    private function write_file($file, $body){
        if (file_put_contents($file, $body) === false) {
            return false;
        }
        chmod($file, self::FILE_MODE);
        return true;
    }
}
/**
 * Mirror the page at $url into a local directory.
 *
 * Builds a spider for the url, then runs its steps in a fixed order:
 * stylesheets, scripts, images, and finally the rewritten html page.
 *
 * @param string $url Address of the page to mirror.
 */
function crawl($url){
    $mirror = new spider($url);
    $steps = array('get_css', 'get_js', 'get_image', 'formate_html');
    foreach ($steps as $step) {
        $mirror->$step();
    }
}
// Start a crawl only when the form was submitted with a non-empty url.
// A request such as ?url[]=x makes $_GET['url'] an array, which the string
// handling in the crawler cannot process, so anything but a string is ignored.
if (!empty($_GET['url']) && is_string($_GET['url'])) {
    crawl($_GET['url']);
}
?>
</body>
</html>
获取前端网页 php爬虫 get_html.php
最新推荐文章于 2024-03-08 21:59:45 发布