php数据采集(2)

一个网页只有一篇文章

<?php 

header('Content-type:text/html;charset=utf-8');

// Connect to the database. mysql_connect() returns false on failure, so the
// returned link is tested directly instead of relying on mysql_errno().
$link  = @mysql_connect('localhost','root','root');
if($link === false){
	exit('数据库连接失败'.mysql_error());
}

// Select the database and stop early if it is not available; otherwise every
// later INSERT would fail without any visible reason.
if(!mysql_select_db('555', $link)){
	exit('数据库选择失败'.mysql_error($link));
}

// Name of this script without directory and extension. pathinfo() handles both
// "\" and "/" separators on Windows and "/" elsewhere, unlike searching for "\\".
$jid = pathinfo(__FILE__, PATHINFO_FILENAME);

// Import every file found below the download directory.
showDir("F:/FeigeDownload/2922");
/**
 * Imports a file, or every file below a directory, through getDBData().
 *
 * Directories are walked recursively. Each imported path is echoed so the
 * progress is visible in the browser.
 *
 * @param string $filedir Path of a directory to walk or of a single file.
 * @return void
 */
function showDir( $filedir ){
    if (!is_dir($filedir)) {
        // A plain file was passed: getDBData() needs the bare file name as
        // second argument, so derive it from the path.
        getDBData($filedir, basename($filedir));
        echo  $filedir;
        return;
    }

    $dir = @dir($filedir);
    if (!$dir) {
        // dir() returns false when the directory cannot be opened; report it
        // and skip instead of calling read() on a non-object.
        trigger_error('showDir: cannot open directory '.$filedir, E_USER_WARNING);
        return;
    }

    while (($file = $dir->read()) !== false) {
        // Skip the self and parent entries.
        if ($file === '.' || $file === '..') {
            continue;
        }

        $path = $filedir.'/'.$file;
        if (is_dir($path)) {
            showDir($path);
        } else {
            getDBData($path, $file);
            echo  $path.'<br>';
        }
    }
    $dir->close();
}
/**
 * Parses one saved HTML page and inserts the extracted fields into table `55`.
 *
 * @param string      $filename Path of the HTML file to read.
 * @param string|null $file     Bare file name such as 100000.html or 100000.HTML;
 *                              the digits in front of ".html" are stored in the
 *                              `ids` column. Defaults to basename($filename).
 * @return void
 */
function getDBData($filename, $file = null)
{
	if ($file === null) {
		$file = basename($filename);
	}

	// Read the whole page; an unreadable file is reported and skipped rather
	// than inserted as an empty row.
	$urls = @file_get_contents($filename);
	if ($urls === false) {
		trigger_error('getDBData: cannot read '.$filename, E_USER_WARNING);
		return;
	}

	// Case-insensitive so that both ".html" and ".HTML" names yield an id.
	preg_match('/(\d{0,20})\.html/i', $file, $ids);

	// Column name => extracted value, in the column order of the INSERT.
	$row = array(
		'ids'        => isset($ids[1]) ? $ids[1] : '',
		'title'      => gettitle($urls),
		'img_href'   => getimg_href($urls),
		'Geo'        => getGeo($urls),
		'img_title'  => getimg_title($urls),
		'Veg'        => getVeg($urls),
		'Family'     => getFamily($urls),
		'Addname'    => getAddname($urls),
		'Plant'      => getPlant($urls),
		'Flowering'  => getFlowering($urls),
		'Leaf'       => getLeaf($urls),
		'Habitat'    => getHabitat($urls),
		'book_title' => getbook_title($urls),
		'book_href'  => getbook_href($urls),
		'page_url'   => getpage_url($urls),
	);

	$values = array();
	foreach ($row as $column => $value) {
		if ($column === 'img_title') {
			// getimg_title() returns its text already escaped by clean();
			// escaping it a second time would store literal backslashes.
			$values[] = (string) $value;
		} else {
			// Scraped text may contain quotes or backslashes; escape it so it
			// cannot break out of the SQL string literal.
			$values[] = mysql_real_escape_string((string) $value);
		}
	}

	$sql = 'insert into `55`('.implode(',', array_keys($row)).") values('".implode("','", $values)."')";

	if (mysql_query($sql) === false) {
		trigger_error('getDBData: insert failed for '.$filename.': '.mysql_error(), E_USER_WARNING);
	}
}


/**
 * Extracts the text between <title> and </title>.
 *
 * @param string $data Raw HTML of the page.
 * @return string The title text, or '' when the page has no <title> element.
 */
function gettitle($data)
{
	// Only read the capture group when the pattern actually matched.
	if (preg_match('/<title>([\s\S]*?)<\/title>/', $data, $title)) {
		return $title[1];
	}

	return '';
}
	
/**
 * Extracts the page URL from the og:url meta tag.
 *
 * @param string $data Raw HTML of the page.
 * @return string The content attribute of the og:url meta tag, or '' when the
 *                tag is not present.
 */
function getpage_url($data)
{
	$found = preg_match('/<meta property="og:url" content="([\s\S]*?)" \/>/', $data, $page_url);

	// Without a match there is no capture group to return.
	return $found ? $page_url[1] : '';
}

/**
 * Collects the link texts listed under the "Geogr. District" heading.
 *
 * Works in two passes: first the list that follows the heading is isolated,
 * then every link text inside it is collected.
 *
 * @param string $data Raw HTML of the page.
 * @return string Every link text followed by the separator "@@$$||", or ''
 *                when the heading or its links are missing.
 */
function getGeo($data)
{
	if (!preg_match('/<h3>Geogr. District<\/h3>([\s\S]*?)<\/ul>/', $data, $section)) {
		// Section not on this page: return an empty string, not null.
		return '';
	}

	preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $section[1], $links);

	$joined = '';
	if (!empty($links[1])) {
		foreach ($links[1] as $name) {
			$joined .= $name.'@@$$||';
		}
	}

	return $joined;
}

/**
 * Collects the link texts listed under the "Vegetation Units" heading.
 *
 * @param string $data Raw HTML of the page.
 * @return string Every link text followed by the separator "@@$$||", or ''
 *                when the heading or its links are missing.
 */
function getVeg($data)
{
	$separator = '@@$$||';

	if (!preg_match('/<h3>Vegetation Units<\/h3>([\s\S]*?)<\/ul>/', $data, $section)) {
		// Section not on this page: return an empty string, not null.
		return '';
	}

	preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $section[1], $links);
	$names = isset($links[1]) ? $links[1] : array();

	// The stored format ends every entry with the separator, including the last.
	return $names ? implode($separator, $names).$separator : '';
}
/**
 * Collects the href of the first link inside every <div class="slide">.
 *
 * @param string $data Raw HTML of the page.
 * @return string Every href followed by the separator "@@$$||"; an empty
 *                string when no slide link is found.
 */
function getimg_href($data)
{
	preg_match_all('/<div class="slide">\s+<a href="([\s\S]*?)"\s+title/', $data, $found);

	if (empty($found)) {
		return "";
	}

	// Append the separator to each href, then glue the pieces together.
	$pieces = array_map(
		function ($href) {
			return $href."@@$$||";
		},
		$found[1]
	);

	return implode("", $pieces);
}

/**
 * Collects the title attribute of every slide image link.
 *
 * Each title is followed by the separator "@@$$||", so the stored value can be
 * split on that separator to recover the individual titles. The joined string
 * is passed through clean() before it is returned.
 *
 * @param string $data Raw HTML of the page.
 * @return string The cleaned, separator-terminated list of titles; an empty
 *                string when the match array is empty.
 */
function getimg_title($data)
{
	preg_match_all('/<div class="slide">[\s\S]*?title="([\s\S]*?)"\s+rel/', $data, $captions);

	if (empty($captions)) {
		return "";
	}

	$joined = "";
	$total  = count($captions[1]);
	for ($i = 0; $i < $total; $i++) {
		$joined = $joined.$captions[1][$i]."@@$$||";
	}

	return clean($joined);
}

/**
 * Extracts the family name: the text of the first link in the list that
 * follows the "Family" heading.
 *
 * @param string $data Raw HTML of the page.
 * @return string The family name, or '' when the heading is not found.
 */
function getFamily($data)
{
	$pattern = '/<h2>Family<\/h2>\s+<ul>\s+<li><a href="[\s\S]*?">([\s\S]*?)<\/a>/';

	// Only read the capture group when the pattern actually matched.
	if (preg_match($pattern, $data, $Family)) {
		return $Family[1];
	}

	return '';
}
/**
 * Extracts the additional name: the "Hebrew with Vowels" entry of the
 * "Additional Names" list.
 *
 * @param string $data Raw HTML of the page.
 * @return string The content of the <dd class="dir-rtl"> element, or '' when
 *                the entry is not found.
 */
function getAddname($data)
{
	$matched = preg_match(
		'/<h2>Additional Names<\/h2>\s+<dl>\s+<dt>Hebrew with Vowels:<\/dt><dd class="dir-rtl">([\s\S]*?)<\/dd>/',
		$data,
		$Addname
	);

	// Without a match there is no capture group to return.
	return $matched ? $Addname[1] : '';
}

/**
 * Collects the link texts of the first of the four info panels (div#tab1).
 *
 * Sample from the original author's debug dump of the inner match: the
 * captured texts were "chamaephyte", "absent", "leaf succulent", "perennating".
 *
 * @param string $data Raw HTML of the page.
 * @return string Every link text followed by the separator "@@$$||", or ''
 *                when the panel or its links are missing.
 */
function getPlant($data)
{
	if (!preg_match('/<div id="tab1">\s+<dl class="info">([\s\S]*?)<\/dl>/', $data, $panel)) {
		// Panel not on this page: return an empty string, not null.
		return '';
	}

	preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $panel[1], $links);

	$result = '';
	if (!empty($links[1])) {
		foreach ($links[1] as $text) {
			$result .= $text.'@@$$||';
		}
	}

	return $result;
}
	
/**
 * Collects the link texts of the second info panel (div#tab2).
 *
 * @param string $data Raw HTML of the page.
 * @return string Every link text followed by the separator "@@$$||", or ''
 *                when the panel or its links are missing.
 */
function getFlowering($data)
{
	$separator = '@@$$||';

	if (!preg_match('/<div id="tab2">\s+<dl class="info">([\s\S]*?)<\/dl>/', $data, $panel)) {
		// Panel not on this page: return an empty string, not null.
		return '';
	}

	preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $panel[1], $links);

	// Decide on the link texts themselves, not on the outer panel match.
	if (empty($links[1])) {
		return '';
	}

	return implode($separator, $links[1]).$separator;
}
/**
 * Collects the link texts of the third info panel (div#tab3).
 *
 * @param string $data Raw HTML of the page.
 * @return string Every link text followed by the separator "@@$$||", or ''
 *                when the panel or its links are missing.
 */
function getLeaf($data)
{
	$found = preg_match('/<div id="tab3">\s+<dl class="info">([\s\S]*?)<\/dl>/', $data, $panel);
	if (!$found) {
		// Panel not on this page: return an empty string, not null.
		return '';
	}

	preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $panel[1], $links);
	$texts = isset($links[1]) ? $links[1] : array();

	// Terminate every entry, including the last, with the separator.
	$terminated = array();
	foreach ($texts as $text) {
		$terminated[] = $text.'@@$$||';
	}

	return implode('', $terminated);
}
/**
 * Collects the link texts of the fourth info panel (div#tab4).
 *
 * @param string $data Raw HTML of the page.
 * @return string Every link text followed by the separator "@@$$||", or ''
 *                when the panel or its links are missing.
 */
function getHabitat($data)
{
	$output = '';

	if (preg_match('/<div id="tab4">\s+<dl class="info">([\s\S]*?)<\/dl>/', $data, $panel)) {
		preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $panel[1], $links);

		if (!empty($links[1])) {
			foreach ($links[1] as $text) {
				$output .= $text.'@@$$||';
			}
		}
	}

	// Always a string: '' when the panel is absent or holds no links.
	return $output;
}

/**
 * Collects the titles (link texts) of the references listed under the
 * "Books" heading.
 *
 * The matching hrefs are returned by getbook_href(), in the same order.
 *
 * @param string $data Raw HTML of the page.
 * @return string Every title followed by the separator "@@$$||", or '' when
 *                the heading or its links are missing.
 */
function getbook_title($data)
{
	if (!preg_match('/<h3 class="floraBookPart">Books <\/h3>\s+<ul>([\s\S]*?)<\/ul>/', $data, $list)) {
		// No "Books" list on this page: return an empty string, not null.
		return '';
	}

	preg_match_all('/<a href="[\s\S]*?">([\s\S]*?)<\/a>/', $list[1], $links);

	$titles = '';
	if (!empty($links[1])) {
		foreach ($links[1] as $title) {
			$titles .= $title.'@@$$||';
		}
	}

	return $titles;
}
/**
 * Collects the hrefs of the references listed under the "Books" heading.
 *
 * The capture group sits on the href attribute; capturing the link text here
 * would simply duplicate getbook_title().
 *
 * @param string $data Raw HTML of the page.
 * @return string Every href followed by the separator "@@$$||", or '' when
 *                the heading or its links are missing.
 */
function getbook_href($data)
{
	$separator = '@@$$||';

	if (!preg_match('/<h3 class="floraBookPart">Books <\/h3>\s+<ul>([\s\S]*?)<\/ul>/', $data, $list)) {
		// No "Books" list on this page: return an empty string, not null.
		return '';
	}

	// Capture the attribute value; [^>]* tolerates further attributes after href.
	preg_match_all('/<a href="([^"]*)"[^>]*>/', $list[1], $links);

	if (empty($links[1])) {
		return '';
	}

	return implode($separator, $links[1]).$separator;
}


	/**
	 * Strips HTML tags from a string, normalises its whitespace and escapes it
	 * for use inside a quoted SQL string literal.
	 *
	 * Escaping is done once, as the last step. Replacing the apostrophe a
	 * second time after it has been escaped would corrupt the text, and
	 * typographic quotes need no escaping at all.
	 *
	 * @param string $str Text that may contain HTML tags and quotes.
	 * @return string Tag-free, trimmed text with runs of whitespace collapsed to
	 *                one space and ', ", \ and NUL backslash-escaped.
	 */
	function clean($str){
		$str = strip_tags($str);
		// Collapse runs of two or more whitespace characters into one space.
		$str = preg_replace('/\s\s+/', ' ', $str);
		$str = trim($str);
		return addslashes($str);
	}
 ?>
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值