博客已经搬家,请访问如下地址:http://www.czhphp.com
抓取页面时,正则不会写该怎么办呢?有个好工具推荐给大家:simple_html_dom.php。
直接贴代码了,哈哈
<?php
// Database connection settings used by the import loop below.
// All four are blank here and must be filled in before the script is run.
define("DBHOST", "");
define("DBUSER", "");
define("DBPW", "");
define("DBNAME", "");

// HTML parser that provides file_get_html() / str_get_html().
require "simple_html_dom.php";
// Sources to scrape. Each entry describes one saved page:
//   site    - path handed to file_get_html()
//   func    - name of the scraper function to call with the entry
//   f       - selector for the group headings
//   s       - selector for the element holding each group's links
//   classid - top-level category id stored with every row
//   url     - address associated with the saved page
$sfarr = array(
    // game
    array(
        'site'    => 'game.hao123.htm',
        'func'    => 'snap',
        'f'       => 'dl.fav_container dt.fav_tl',
        's'       => 'dl.fav_container dd.fav_links',
        'classid' => 4,
        'url'     => 'http://game.hao123.com/'
    ),
    // book
    array(
        'site'    => 'book.hao123.htm',
        'func'    => 'snap',
        'f'       => 'dl.favsites-list dt.fav-title',
        's'       => 'dl.favsites-list dd.fav-links',
        'classid' => 3,
        'url'     => 'http://book.hao123.com/'
    ),
    // shopping
    array(
        'site'    => 'gouwu.hao123.htm',
        'func'    => 'snap',
        'f'       => 'dl.netbuy-box dt',
        's'       => 'dl.netbuy-box',
        'classid' => 1,
        'url'     => 'http://gouwu.hao123.com/sc/'
    ),
    // video
    array(
        'site'    => 'video.hao123.htm',
        'func'    => 'snap',
        'f'       => 'div.content-con h2',
        's'       => 'div.content-con ul.content-link',
        'classid' => 2,
        'url'     => 'http://www.hao123.com/video'
    ),
);
// Import loop: run the configured scraper for every source and store the
// result. Each scraped group becomes one `subclass` row, and every link in
// the group becomes one `site` row pointing at that subclass.
//
// ext/mysql (mysql_connect and friends) was removed in PHP 7, so this block
// uses mysqli, which is also available on PHP 5.

// Connect once for the whole run rather than once per scraped group.
$con = mysqli_connect(DBHOST, DBUSER, DBPW, DBNAME);
if (!$con) {
    die('Could not connect: ' . mysqli_connect_error());
}
mysqli_set_charset($con, 'utf8');

// Scraped titles and URLs are untrusted text, so they are bound as
// parameters instead of being concatenated into the SQL string.
$subStmt = mysqli_prepare($con, "insert into subclass (name, classid) values (?, ?)");
$siteStmt = mysqli_prepare($con, "insert into site (name, url, classid, subclassid) values (?, ?, ?, ?)");
if (!$subStmt || !$siteStmt) {
    die('Could not prepare statements: ' . mysqli_error($con));
}

foreach ($sfarr as $var) {
    // 'func' names the scraper to call; skip entries whose scraper is missing.
    if (!is_callable($var['func'])) {
        echo $var['site'] . ' has no callable scraper </br>';
        continue;
    }
    $data = $var['func']($var);
    if (empty($data)) {
        continue;
    }

    foreach ($data as $group) {
        $sub = !empty($group['sub']) ? $group['sub'] : '';
        $classid = !empty($group['classid']) ? (int) $group['classid'] : 0;
        $links = !empty($group['site']) ? $group['site'] : array();

        // Groups without a heading are not stored.
        if (!$sub) {
            continue;
        }

        // Headings may be wrapped in square brackets; store the bare name.
        $sub = str_replace(array("[", "]"), '', $sub);

        mysqli_stmt_bind_param($subStmt, 'si', $sub, $classid);
        if (!mysqli_stmt_execute($subStmt)) {
            // Without a subclass id the links would be orphaned, so skip them.
            echo 'subclass insert failed in:' . $var['site'] . ' ' . mysqli_stmt_error($subStmt) . '</br>';
            continue;
        }
        $subclassid = (int) mysqli_insert_id($con);
        echo 'subclassid '. $subclassid.' has create succ in:'.$var['site'].'</br>';

        foreach ($links as $link) {
            $name = $link['text'];
            $href = $link['href'];
            mysqli_stmt_bind_param($siteStmt, 'ssii', $name, $href, $classid, $subclassid);
            if (!mysqli_stmt_execute($siteStmt)) {
                echo 'site insert failed in:' . $var['site'] . ' ' . mysqli_stmt_error($siteStmt) . '</br>';
            }
        }
        echo $var['site'].'website snap succ </br>';
    }
}

mysqli_stmt_close($subStmt);
mysqli_stmt_close($siteStmt);
mysqli_close($con);
/**
 * Scrape one page into groups of links.
 *
 * Every element matched by $data['f'] is treated as a group heading, and the
 * element at the same position among those matched by $data['s'] is treated
 * as that group's link container.
 *
 * @param array $data Source description with the keys 'site' (passed to
 *                    file_get_html()), 'f' (heading selector), 's' (link
 *                    container selector) and 'classid'.
 * @return array List of groups, each
 *               array('sub' => heading text, 'classid' => $data['classid'],
 *               'site' => list of array('text' => ..., 'href' => ...)).
 *               Empty when the page cannot be loaded or has no headings.
 */
function snap($data){
    // Always return an array so callers can iterate the result directly.
    $all = array();

    $html = file_get_html($data['site']);
    if (!$html) {
        return $all;
    }

    $dts = $html->find($data['f']);
    // Run the container selector once instead of once per heading.
    $containers = $html->find($data['s']);

    foreach ($dts as $i => $dt) {
        $ss = array();
        $ss['sub'] = trim($dt->plaintext);
        $ss['classid'] = $data['classid'];
        // Present even when the group has no links, so callers can read it safely.
        $ss['site'] = array();

        // A heading may have no container at the same position; keep the
        // group but leave its link list empty.
        if (isset($containers[$i])) {
            // Search the node directly; re-parsing its inner HTML is unnecessary.
            foreach ($containers[$i]->find("a") as $a) {
                if ($a->href && $a->plaintext) {
                    $tmp = array();
                    $tmp['text'] = trim($a->plaintext);
                    $tmp['href'] = $a->href;
                    $ss['site'][] = $tmp;
                }
            }
        }

        $all[] = $ss;
    }

    return $all;
}
?>