<?php
/**
* 抓取“华强电子网”供应商主程序
* author Lee.
* Last modify $Date: 2012-2-2 12:55:35 $
*/
require_once './config.inc.php';
/**
 * Crawler that collects supplier contact details from the hqew.com IC search.
 *
 * Call go($key) with a part number: the search result pages for that part are
 * walked one by one, every supplier shop linked from them has its
 * "contact.html" page fetched and parsed, and each company that is not yet
 * stored is inserted into the `huaqiang` table through the project's Model class.
 */
class huaqiang
{
    /** Markup that checkIsExistsData() looks for on the first result page. */
    const CURRENT_PAGE_MARKER = '<span class="s_curr g_vm">1</span>';

    /**
     * Markup whose presence ends the page walk.
     * NOTE(review): presumably the site renders the "next page" control as this
     * inert span on the final page - confirm against the live markup.
     */
    const LAST_PAGE_MARKER = '<span class="g_vm s_f0f s_f0f1" title="下一页">';

    /** Safety cap so a changed page layout can never cause an endless crawl. */
    const MAX_PAGES = 500;

    private $key;     // part number currently being searched
    private $pageNum; // number of the last result page that was processed

    /**
     * Entry point: crawl every supplier listed for one part number.
     *
     * @param string $key part number to search for
     */
    public function go($key)
    {
        $this->key = $key;
        $this->pageNum = 0;
        $this->getInfo();
    }

    /**
     * Download one search result page for the current part number.
     *
     * @param int $page 1-based page number
     * @return string|false page body, or false when the download failed
     */
    private function getContent($page = 1)
    {
        return $this->getUrlInfo($this->getUrl($this->key, $page));
    }

    /**
     * Tell whether a first result page actually lists data.
     *
     * @param string|false $html body of the first result page
     * @return bool true when the current-page marker is present
     */
    private function checkIsExistsData($html)
    {
        return is_string($html) && stripos($html, self::CURRENT_PAGE_MARKER) !== false;
    }

    /**
     * Tell whether a result page carries the last-page marker.
     *
     * @param string $html body of a result page
     * @return bool
     */
    private function isLastPage($html)
    {
        return stripos($html, self::LAST_PAGE_MARKER) !== false;
    }

    /**
     * Walk the result pages once, storing every supplier found on them.
     *
     * Each page is downloaded a single time; the walk stops at the first page
     * that cannot be fetched, at the page carrying the last-page marker, or
     * after MAX_PAGES pages, whichever comes first.
     */
    private function getInfo()
    {
        for ($page = 1; $page <= self::MAX_PAGES; $page++) {
            $html = $this->getContent($page);
            if ($html === false) {
                break; // download failed: without the page we cannot find the end marker either
            }
            if ($page === 1 && !$this->checkIsExistsData($html)) {
                return; // the search produced no results
            }
            $this->pageNum = $page;

            $shopUrls = $this->shopAddContact($this->shopUrlMatchReArr($html));
            foreach ($shopUrls as $shopUrl) {
                $infoArr = $this->getInfoByShopUrl($shopUrl);
                // Pages that could not be fetched or parsed yield null and are skipped.
                if ($infoArr !== null && $this->execAdd($infoArr)) {
                    echo 'Add Success!!';
                }
                $this->sleep();
            }
            $this->sleep();

            if ($this->isLastPage($html)) {
                break;
            }
        }
    }

    /**
     * Insert one supplier record unless a row with the same company exists.
     *
     * @param array $infoArr record as produced by parseShopInfo()
     * @return mixed 0 when the company is already stored, otherwise whatever
     *               Model::insert() returns
     */
    private function execAdd($infoArr)
    {
        $columns = array(
            'company', 'mobile', 'phone', 'fax', 'region', 'address', 'website',
            'zip', 'email', 'qq', 'msn', 'market', 'shopUrl',
        );
        $values = array();
        foreach ($columns as $column) {
            $values[] = isset($infoArr[$column]) ? $infoArr[$column] : '';
        }

        $m = new Model();
        // The company name is scraped text that ends up inside a quoted SQL
        // literal, so a quote in it must not terminate that literal.
        // NOTE(review): Model is not visible from this file; if it offers bound
        // parameters or a driver-level escape, prefer that over addslashes().
        $where = "company='" . addslashes($values[0]) . "'";
        if ($m->isExists('huaqiang', $where)) {
            return 0;
        }
        return $m->insert('huaqiang', $columns, $values);
    }

    /**
     * Fetch a supplier contact page and parse it into a record.
     *
     * @param string $url contact page URL
     * @return array|null parsed record, or null when the page could not be
     *                    fetched or holds no company name
     */
    private function getInfoByShopUrl($url)
    {
        $html = $this->getUrlInfo($url);
        if ($html === false) {
            return null;
        }
        return $this->parseShopInfo($html, $url);
    }

    /**
     * Build a supplier record from the HTML of a contact page.
     *
     * Every field is looked up on its own, so a page that lacks some of the
     * optional rows still yields the fields that are present.
     *
     * @param string $html contact page body
     * @param string $url  URL the page was fetched from (stored as shopUrl)
     * @return array|null record keyed by column name, or null without a company
     */
    private function parseShopInfo($html, $url)
    {
        $company = $this->extractField($html, '公司名称');
        if ($company === '') {
            return null;
        }
        return array(
            'company' => $company,
            'mobile'  => $this->extractField($html, '手机'),
            'phone'   => trim($this->stripPhoneTags($this->extractField($html, '电话'))),
            'fax'     => $this->extractField($html, '传真'),
            'region'  => $this->extractField($html, '所在地区'),
            'address' => $this->extractField($html, '详细地址'),
            'zip'     => $this->extractField($html, '邮政编码'),
            'email'   => $this->extractField($html, '电子邮箱', 'g_fl cont cor'),
            'qq'      => $this->extractField($html, 'QQ'),
            'msn'     => $this->extractField($html, 'MSN'),
            'market'  => $this->extractField($html, '所属电子市场'),
            'website' => $this->stripATags($this->extractField($html, '网址')),
            'shopUrl' => $url,
        );
    }

    /**
     * Return the content cell that follows a "<label>:" title cell.
     *
     * @param string $html         contact page body
     * @param string $label        title text without the trailing colon
     * @param string $contentClass class attribute of the content <li>
     * @return string trimmed cell content, '' when the row is absent
     */
    private function extractField($html, $label, $contentClass = 'g_fl cont')
    {
        $pattern = '/<li class="g_fl tit">' . preg_quote($label, '/') . ':<\/li>'
            . '<li class="' . preg_quote($contentClass, '/') . '">(.*)<\/li>/Usi';
        if (preg_match($pattern, $html, $match) !== 1) {
            return '';
        }
        return trim($match[1]);
    }

    /**
     * Turn shop base URLs into the URLs of their contact pages.
     *
     * @param array $arr shop base URLs
     * @return array list of URLs ending in "/contact.html"
     */
    private function shopAddContact($arr)
    {
        $contactUrls = array();
        foreach ($arr as $shopUrl) {
            // Drop a trailing slash first so the result never contains "//contact.html".
            $contactUrls[] = rtrim($shopUrl, '/') . '/contact.html';
        }
        return $contactUrls;
    }

    /**
     * Replace every <a ...>text</a> in a string by its text.
     *
     * @param string $site HTML fragment
     * @return string fragment without anchor tags
     */
    private function stripATags($site)
    {
        // Non-greedy on purpose: each anchor is unwrapped individually.
        $clean = preg_replace('/<a\b[^>]*>(.*?)<\/a>/si', '$1', $site);
        return $clean === null ? $site : $clean;
    }

    /**
     * Remove the markup that surrounds phone numbers.
     *
     * @param string $phone HTML fragment
     * @return string fragment with <span> and <br /> removed and </span>
     *                turned into a space
     */
    private function stripPhoneTags($phone)
    {
        return str_replace(
            array('<span>', '</span>', '<br />'),
            array('', ' ', ''),
            $phone
        );
    }

    /**
     * Collect the distinct supplier shop URLs linked from a result page.
     *
     * @param string $re result page body
     * @return array list of shop URLs without duplicates
     */
    private function shopUrlMatchReArr($re)
    {
        preg_match_all('/<li class="col3"><a class=\"company\" target=\"\_blank\" href=\"(.+)\" value=\".+\">.+<\/a>/Usi', $re, $arr);
        return array_values(array_unique($arr[1]));
    }

    /**
     * Build the search result URL for a part number and page.
     *
     * @param string $str  part number
     * @param int    $page page number
     * @return string
     */
    private function getUrl($str, $page = 1)
    {
        return "http://www.hqew.com/ic/{$str}_____0_00_0_{$page}.html";
    }

    /**
     * Download a URL.
     *
     * @param string $url
     * @return string|false body, or false when the download failed
     */
    private function getUrlInfo($url)
    {
        return file_get_contents($url);
    }

    /**
     * Pause between requests; five seconds unless told otherwise.
     *
     * @param int $seconds
     */
    private function sleep($seconds = 5)
    {
        sleep($seconds);
    }
}
/**
* 使用方法:1、先实例化一个类;2、调用 go($param) 方法,$param 为型号
* 程序运行思路:根据“华强电子网”的IC搜索功能,输入型号进行搜索,然后抓取供应商信息
*/
/**
* 数据库结构
*
CREATE TABLE `huaqiang` (
`id` mediumint(8) unsigned NOT NULL auto_increment,
`company` varchar(500) NOT NULL,
`mobile` varchar(500) NOT NULL,
`phone` varchar(500) NOT NULL,
`fax` varchar(500) NOT NULL,
`region` varchar(500) NOT NULL,
`address` varchar(500) NOT NULL,
`website` varchar(200) NOT NULL,
`zip` varchar(100) NOT NULL,
`email` varchar(500) NOT NULL,
`qq` varchar(200) NOT NULL,
`msn` varchar(200) NOT NULL,
`market` varchar(500) NOT NULL,
`shopUrl` varchar(200) NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
*/
// Part numbers to search for; array_unique() drops the entries that are
// listed more than once.
$partNumbers = array_unique(array('MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA', 'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E', 'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N', 'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232', 'STM32F103', 'LM358'));

// Crawl the suppliers of each part number in turn.
$crawler = new huaqiang();
foreach ($partNumbers as $partNumber) {
    $crawler->go($partNumber);
}
// The closing PHP tag is intentionally omitted: anything after it, including
// stray text, would be sent to the output verbatim.