这是我以前写的一个采集程序，比较简单。
<?php
// Bootstrap: load the project setup file and open a database handle.
include('global.php');
$conn = new db();

// Optional query-string parameters. Each one falls back to an empty string
// when it is absent from the request.
//   company  - stored as the record's author
//   memberid - stored on each record and used for the duplicate check
//   zj_num   - result page number on the remote listing index
//   zj_code  - agent code on the remote listing index
$company  = isset($_GET['company'])  ? $_GET['company']  : '';
$memberid = isset($_GET['memberid']) ? $_GET['memberid'] : '';
$zj_num   = isset($_GET['zj_num'])   ? $_GET['zj_num']   : '';
$zj_code  = isset($_GET['zj_code'])  ? $_GET['zj_code']  : '';
// Listing index page: $zj_num selects the result page and $zj_code the agent.
// Both come straight from the query string, so they are URL-encoded to keep the
// request well-formed when they contain '&', '#', spaces and similar characters.
$url="http://zsb.house365.com/main.php?infotype=0&price=0&buildarea=0&district=0&keyword=&order_=1&page=".urlencode($zj_num)."&agentcode=".urlencode($zj_code)."&pkind=selllist&roomtype=&topic=&order=";
$text = @file_get_contents($url);
if ($text === false) {
    // Download failed: treat it as an empty page so that no detail links are found.
    $text = '';
}
// For every listing row: capture group 1 is the detail-page URL, group 2 the link title.
preg_match_all('/<td align="center" valign="middle"><a href=\'(.*?)\' target=\'_blank\' title="(.*?)"><img src="http:\/\/sell.house365.com\/images\/sellesflist_12.gif" width="77" height="18" border="0" \/><\/a><\/td>/i',$text,$row);
// Number of listings inserted during this run.
$num_all = 0;
// Number of detail links found on this index page.
$len = isset($row[1]) ? count($row[1]) : 0;
// Visit each listing's detail page and build one record ($mrent) per listing.
for($i=0;$i<$len;$i++)
{
$url1 = $row[1][$i];
$text1 = @file_get_contents($url1);
if ($text1 === false) {
    // The detail page could not be downloaded, so there is nothing to parse.
    // Skip it instead of running every pattern against a failed fetch.
    continue;
}
// Fixed column values shared by every scraped record, plus the caller-supplied
// member id and company name. (The original assigned 'menuid' twice with the
// same value; it is set once here.)
$mrent = array(
    'coltype'    => 'second',
    'menuid'     => '10',
    'memberid'   => $memberid,
    'memberprop' => '2',
    'shangquan'  => '0',
    'infotype'   => 'sale',
    'jz'         => '1',
    'author'     => $company, // stored as received, without charset conversion
);
// Basic listing fields. Each pattern is matched against the detail page and only
// the first match is used. When a pattern does not match, the field falls back to
// an empty string (null for the layout digits) instead of reading a missing index.

// Listing title, converted from GBK to UTF-8 before it is stored.
preg_match_all('/<td colspan="2" class="fy_name"><h1 style="text-align:center;font-size:20px;font-family:黑体;font-weight:normal">(.*?)<\/h1><\/td>/i',$text1,$name_arr);
$louopan = isset($name_arr[1][0]) ? trim($name_arr[1][0]) : '';
$mrent['loupan'] =iconv( "GBK","UTF-8", $louopan);
// Sale price (the page labels the unit as 万元).
preg_match_all('/<td width="215" class="dash_line">售价:<span>(.*?)<\/span> 万元/i',$text1,$rentall_arr);
$rentall = isset($rentall_arr[1][0]) ? trim($rentall_arr[1][0]) : '';
$mrent['rentall'] = $rentall;
// Floor area (the page labels the unit as square metres).
preg_match_all('/<td class="dash_line">面积:<span>(.*?)<\/span> 平方米<\/td>/i',$text1,$area_arr);
$area = isset($area_arr[1][0]) ? trim($area_arr[1][0]) : '';
$mrent['area'] = $area;
// Unit price (yuan per square metre according to the page label).
preg_match_all('/<td class="dash_line">单价:(\d*) 元\/平方米<\/td>/i',$text1,$rentavg_arr);
$rentaverage = isset($rentavg_arr[1][0]) ? trim($rentavg_arr[1][0]) : '';
$mrent['rentaverage'] =$rentaverage;
// Layout: one digit each for rooms, halls, bathrooms and balconies.
preg_match_all('/<td class="dash_line">户型:(\d)房(\d)厅(\d)卫(\d)阳台<\/td>/i',$text1,$type_arr);
$shi  = isset($type_arr[1][0]) ? $type_arr[1][0] : null;
$ting = isset($type_arr[2][0]) ? $type_arr[2][0] : null;
$wei  = isset($type_arr[3][0]) ? $type_arr[3][0] : null;
$tai  = isset($type_arr[4][0]) ? $type_arr[4][0] : null;
$mrent['shi'] = $shi;
$mrent['ting'] = $ting;
$mrent['wei'] = $wei;
$mrent['tai'] = $tai;
// Floor: the listing's own floor and the building's total number of floors.
preg_match_all('/<td class="dash_line">楼层:(\d*)楼,总高(\d*)层<\/td>/i',$text1,$floor_arr);
$nowfloor  = isset($floor_arr[1][0]) ? trim($floor_arr[1][0]) : '';
$allfloors = isset($floor_arr[2][0]) ? trim($floor_arr[2][0]) : '';
$mrent['nowfloor'] = $nowfloor;
$mrent['allfloors'] = $allfloors;
// Year built.
preg_match_all('/<td class="dash_line">年代:(\d*)年<\/td>/i',$text1,$buildtime_arr);
$buildtime = isset($buildtime_arr[1][0]) ? trim($buildtime_arr[1][0]) : '';
$mrent['buildtime'] = $buildtime;
// Decoration level: map the page's label to the numeric fitment code.
// An unknown label, or a page without this cell, falls back to code 1.
// NOTE(review): the labels are compared with the unconverted page text, so this
// file is presumably saved in the same encoding as the page - confirm.
preg_match_all('/<td class="dash_line">装修:(.*?)<\/td>/i',$text1,$upholster_arr);
$upholster = isset($upholster_arr[1][0]) ? trim($upholster_arr[1][0]) : '';
$fitment_codes = array(
    "毛坯" => 1,
    "简装" => 2,
    "精装" => 3,
    "豪华装" => 4,
);
$fitment = isset($fitment_codes[$upholster]) ? $fitment_codes[$upholster] : 1;
$mrent['fitment'] = $fitment;
// Orientation, converted from GBK to UTF-8. Empty when the cell is missing.
preg_match_all('/<td class="dash_line">朝向:(.*?)<\/td>/i',$text1,$face_arr);
$face_to = isset($face_arr[1][0]) ? trim($face_arr[1][0]) : '';
$mrent['chaoxiang'] = iconv( "GBK","UTF-8", $face_to);
// Property type: the link text is stored (as UTF-8) and also mapped to a
// category id.
preg_match_all('/<td class="dash_line">类型:<a (.*?)>(.*?)<\/a><\/td>/i',$text1,$housetype_arr);
$housetype = isset($housetype_arr[2][0]) ? trim($housetype_arr[2][0]) : '';
$mrent['housetype'] = iconv( "GBK","UTF-8", $housetype);
// Category id => property type label as it appears on the page.
$housetype_codes =array(
"2" => "住宅" ,
"31" =>"写字楼" ,
"32" =>"商铺" ,
"33" =>"别墅" ,
);
// A single strict search replaces the original in_array + array_search pair.
// Unknown types get category 0.
$catid = array_search($housetype,$housetype_codes,true);
if($catid === false){
    $catid = 0;
}
$mrent['catid'] = $catid;
// Update time, parsed with strtotime(). The result is false when the cell is
// missing or the text cannot be parsed.
preg_match_all('/<td class="dash_line">更新时间:(.*?)<\/td>/i',$text1,$uptime_arr);
$uptime = strtotime(isset($uptime_arr[1][0]) ? trim($uptime_arr[1][0]) : '');
$mrent['uptime'] = $uptime;
// District: the link text is looked up in the table below to get the zone id.
// The match goes into its own variable rather than reusing $cat_arr.
preg_match_all('/<td width="220" class="dash_line">区属:<a (.*?)>(.*?)<\/a>\s<\/td>/i',$text1,$district_arr);
$cat = isset($district_arr[2][0]) ? trim($district_arr[2][0]) : '';
// Zone id => district name as it appears on the page.
$district =array(
"46" => "玄武区",
"45" => "鼓楼区",
"48" => "白下区",
"49" => "建邺区",
"47" => "秦淮区",
"44" => "下关区",
"51" => "雨花台区",
"50" => "栖霞区",
"52" => "江宁区",
"53" => "浦口区",
"54" => "六合区",
"55" => "溧水县",
"56" => "高淳县",
"60" => "其它",
);
// false when the district is missing or not in the table.
$zoneid = array_search($cat,$district,true);
$mrent['zoneid'] = $zoneid;
// Board (sub-area inside the district): resolved to a row id of the quyu table.
preg_match_all('/<td width="150" class="dash_line">板块:<a (.*?)>(.*?)<\/a> <\/td>/i',$text1,$board_arr);
$board = isset($board_arr[2][0]) ? trim($board_arr[2][0]) : '';
$board = iconv( "GBK","UTF-8", $board);
$quyu = false;
if($zoneid !== false){
    // Only query when the district is known; with $zoneid === false the original
    // statement became "sort = and ..." and could not run.
    // The scraped board name is escaped before it goes into the statement.
    // NOTE(review): if the db class offers its own escaping or prepared
    // statements, prefer those over addslashes().
    $sql = "SELECT id FROM `quyu` where sort =".(int)$zoneid." and `name` like '%".addslashes($board)."%'";
    $quyu = $conn-> Query2SingleRowArray($sql);
}
$mrent['quyu'] = isset($quyu['id']) ? $quyu['id'] : null;
// Residential community: group 2 is the community name, group 3 the cell that
// follows it, which is stored as the address. Missing cells yield empty strings.
preg_match_all('/<td class="dash_line">小区:<a (.*?)>(.*?)<\/a><\/td>\s*<td class="dash_line">(.*?)<\/td>/i',$text1,$xiaoqu_arr);
$xq_name = isset($xiaoqu_arr[2][0]) ? trim($xiaoqu_arr[2][0]) : ''; // parsed but not stored in $mrent
$xq_address = isset($xiaoqu_arr[3][0]) ? trim($xiaoqu_arr[3][0]) : '';
$mrent['address'] = iconv( "GBK","UTF-8", $xq_address);
// Contact: group 1 is the phone cell, group 2 the contact name.
preg_match_all('/<td width="245" align="center"><strong><span>(.*?)<\/span><\/strong><\/td>\s*<td width="185">联系人:<span>(.*?)<\/span><\/td>/i',$text1,$lxr_arr);
$lxr_tel = isset($lxr_arr[1][0]) ? trim($lxr_arr[1][0]) : '';
$lxr_name = isset($lxr_arr[2][0]) ? trim($lxr_arr[2][0]) : '';
$mrent['lxr'] = iconv( "GBK","UTF-8", $lxr_name);
// Strip the inner <span> markup that wraps part of the phone number.
$th_tel = array('<span style="font-size:14px">','</span>');
$lxr_tel = str_replace($th_tel,"",$lxr_tel);
// The number is split on '-' and the LAST segment is stored. The original loop
// tested for an 11-character segment, but both branches wrote the same field,
// so its net effect was exactly this.
// NOTE(review): for a number like "area-local" this drops the area code -
// confirm that is intended.
$tel_arr = explode("-",$lxr_tel);
$mrent['lxdh'] = end($tel_arr);
// Detail text: the content of the infor_fp_con block, split on <br>.
// The first segment is stored as fbdate; the remaining segments are scanned for
// labelled lines (transport routes, basic amenities, attached facilities).
preg_match_all('/<div class="infor_fp_con">\s*(.*?)\s*<\/div>/i',$text1,$info_arr);
$info = explode("<br>", isset($info_arr[1][0]) ? $info_arr[1][0] : '');
$fbdate = trim($info[0]);

// Labels in the order: transport routes, basic amenities, attached facilities.
// Values start empty for every listing, so nothing carries over from the
// previous loop iteration.
$detail_labels = array("交通线路:", "基础配套:", "附属设施:");
$detail_values = array('', '', '');
$info_count = count($info);
for($j=1;$j<$info_count;$j++)
{
    $line = trim($info[$j]);
    foreach($detail_labels as $k => $label)
    {
        // Compare against the label's own byte length. The original compared a
        // fixed 10-byte prefix of the untrimmed line, which only works when each
        // label is exactly 10 bytes in this file's encoding and the line has no
        // leading whitespace.
        $label_len = strlen($label);
        if(strncmp($line,$label,$label_len) === 0)
        {
            $detail_values[$k] = (string)substr($line,$label_len);
            break;
        }
    }
}
list($bus,$base,$attach) = $detail_values;

$mrent['fbdate'] = iconv( "GBK","UTF-8", $fbdate);
// Transport routes ($bus) are parsed but currently not stored:
// $mrent['froute'] = iconv( "GBK","UTF-8", $bus);

// Facility name as it appears in the comma-separated amenity lists => column.
$facility_columns = array(
    "宽带" => 'facnet',
    "管道煤气" => 'facgas',
    "有线电视" => 'factvnet',
    "电话" => 'facphone',
    "冰箱" => 'facfridge',
    "电视机" => 'factv',
    "洗衣机" => 'facwasher',
    "热水器" => 'facwheater',
    "空调" => 'facaircon',
    "家具" => 'facfurniture',
);
// Every flag starts at 0 for each listing and is set to 1 only when that exact
// facility is listed. The original switch had no break statements, so one match
// also set every flag below it, and flags leaked into later listings.
$facility_flags = array(
    'facfurniture' => 0,
    'factvnet' => 0,
    'factv' => 0,
    'facnet' => 0,
    'facphone' => 0,
    'facwheater' => 0,
    'facaircon' => 0,
    'facwasher' => 0,
    'facfridge' => 0,
    'facgas' => 0,
);
foreach(explode(",",$base.','.$attach) as $jc_value)
{
    $jc_value = trim($jc_value);
    if(isset($facility_columns[$jc_value]))
    {
        $facility_flags[$facility_columns[$jc_value]] = 1;
    }
}
foreach($facility_flags as $facility_column => $facility_flag)
{
    $mrent[$facility_column] = $facility_flag;
}
// Store the listing unless this member already has a record with the same title.
// Listings without a title are never stored.
if($mrent['loupan']!='')
{
    // The original statement lacked "and" between its two conditions, so the
    // duplicate check could never run successfully. The member id is used
    // unquoted, so it is cast to an integer, and the scraped title is escaped.
    // NOTE(review): if the db class offers its own escaping or prepared
    // statements, prefer those over addslashes().
    $sql = "SELECT id FROM `hou_mrent` where memberid =".(int)$memberid." and loupan ='".addslashes($mrent['loupan'])."'";
    $chk = $conn-> Query2SingleRowArray($sql);
    if(!isset($chk['id']) || $chk['id']==''){
        // The third argument also assigns the script-level $debug, as in the
        // original; its meaning is defined by db::insert, which is not visible here.
        $conn ->insert('hou_mrent',$mrent,$debug = true);
        $rows = $conn->GetQueryAffectedRows();
        if($rows>0)
        {
            // Count only listings that were actually written.
            $num_all++;
        }
    }
}
}
// Summary message containing the number of listings inserted by this run,
// converted from GBK to UTF-8. The script below writes it into the parent
// document's 'note' element and clears the disabled state of the 'btn_sc' and
// 'btn_zq' elements.
$note = iconv("GBK", "UTF-8", "抓取完成,本页一共抓取".$num_all."条房源");
?>
<script type="text/javascript">
parent.document.all('note').innerHTML="<?php echo $note; ?>";
parent.document.all('btn_sc').disabled="";
parent.document.all('btn_zq').disabled="";
</script>