PHP 抓取 ASP.NET 页面:按关键字抓取 www.icbase.com 站点的示例代码(带 ASP.NET 表单参数)

/**

* HOST: www.icbase.com

*/

//set_time_limit(0);

// base function

/**
 * Perform an HTTP GET request with cURL.
 *
 * @param string       $url     Target URL; $data is appended to it as a query string.
 * @param array|string $data    Query parameters. An array is encoded with
 *                              http_build_query(); a string is appended as-is.
 * @param array        $header  Extra request headers passed to CURLOPT_HTTPHEADER.
 * @param int          $timeout Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT).
 * @param int          $port    Port to force on the connection. The default of 80 is
 *                              not forwarded to cURL, so the port implied by the URL
 *                              (443 for https, or an explicit ":port") is respected.
 * @param string       $reffer  Referer header value; skipped when empty.
 * @param string       $proxy   Proxy host; no proxy is used when empty.
 * @return array 'result' holds the response body (or false when the transfer failed);
 *               'error' is present only when cURL reported an error.
 */
function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
{
    if (!empty($data)) {
        $query = is_array($data) ? http_build_query($data) : $data;
        // strpos() returns 0 for a match at offset 0, which is falsy; compare with false explicitly.
        $url .= (strpos($url, '?') !== false ? '&' : '?') . $query;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // fetch the page a redirect points to

    // Forcing port 80 breaks https URLs and URLs that carry their own port, and it is
    // already what cURL picks for plain http, so only a non-default port is forwarded.
    if ((int) $port !== 80) {
        curl_setopt($ch, CURLOPT_PORT, (int) $port);
    }

    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }

    if ($reffer) {
        curl_setopt($ch, CURLOPT_REFERER, $reffer);
    }

    if ($proxy) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
        // NOTE(security): proxy credentials are hard-coded in source; they should be
        // moved to configuration that is kept out of version control.
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
    }

    $result = array();
    $result['result'] = curl_exec($ch);

    if (0 != curl_errno($ch)) {
        $result['error'] = "Error:\n" . curl_error($ch);
    }

    curl_close($ch);

    return $result;
}

/**
 * Send an HTTP POST request with cURL.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Request body handed to CURLOPT_POSTFIELDS unchanged.
 * @param array        $header  Extra request headers; only applied when non-empty.
 * @param int          $timeout Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT).
 * @param int          $port    Accepted for signature compatibility; this function
 *                              does not pass it to cURL.
 * @return array 'result' holds the value returned by curl_exec(); 'error' is present
 *               only when cURL reported an error.
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 5, $port = 80)
{
    // Collect the options first, in the order they are applied to the handle.
    $options = array(
        array(CURLOPT_URL, $url),
        array(CURLOPT_RETURNTRANSFER, true),
        array(CURLOPT_CONNECTTIMEOUT, $timeout),
    );

    if (!empty($header)) {
        $options[] = array(CURLOPT_HTTPHEADER, $header);
    }

    $options[] = array(CURLOPT_POST, 1);
    $options[] = array(CURLOPT_POSTFIELDS, $data);

    $handle = curl_init();

    foreach ($options as $option) {
        curl_setopt($handle, $option[0], $option[1]);
    }

    $response = array('result' => curl_exec($handle));

    if (curl_errno($handle) != 0) {
        $response['error'] = "Error:\n" . curl_error($handle);
    }

    curl_close($handle);

    return $response;
}

/**
 * Fetch the HTML source of one search-result list page.
 *
 * Page 1 is fetched with a GET request. While doing so, the ASP.NET postback fields
 * (__VIEWSTATE, __PREVIOUSPAGE, __EVENTVALIDATION) are captured into global constants
 * of the same names. Later pages are fetched by POSTing those values back with
 * __EVENTTARGET set to "pager" and __EVENTARGUMENT set to the page number.
 *
 * Because the values live in constants they are captured once per process: a later
 * search for a different keyword reuses the values of the first capture.
 *
 * NOTE(review): the three extraction patterns were truncated in this copy of the file.
 * The pattern used below is rebuilt from the usual WebForms hidden-input markup
 * (an input tag whose name attribute precedes its value attribute) - verify it
 * against the live page.
 *
 * @param string $keywords search keywords
 * @param int    $page     page number; 0 is treated as 1
 * @return boolean|string the page HTML, or false when $page is negative, the request
 *                        fails, or the postback fields are not available
 */
function getListHtml($keywords, $page = 1)
{
    if ($page < 0) {
        return false;
    }

    $page = $page == 0 ? 1 : intval($page);

    $stateFields = array('__VIEWSTATE', '__PREVIOUSPAGE', '__EVENTVALIDATION');

    if ($page == 1) {
        $response = curl_get('http://www.icbase.com/ProResult.aspx', array('ProKey' => $keywords));

        if (isset($response['error'])) {
            return false;
        }

        $html = $response['result'];

        // Capture the data ASP.NET expects in a postback.
        foreach ($stateFields as $field) {
            // An already captured field is not an error; the previous code returned
            // false here, which made every second call for page 1 fail.
            if (defined($field)) {
                continue;
            }

            $pattern = '/<input\b[^>]*\bname="' . preg_quote($field, '/') . '"[^>]*\bvalue="([^"]*)"/i';

            if (!preg_match($pattern, $html, $matches)) {
                return false;
            }

            define($field, $matches[1]);
        }

        return $html;
    }

    $data = array(
        '__EVENTTARGET' => 'pager',
        '__EVENTARGUMENT' => $page,
    );

    foreach ($stateFields as $field) {
        // Page 1 has to be fetched first; without its state there is nothing to post.
        if (!defined($field)) {
            return false;
        }

        $data[$field] = constant($field);
    }

    // Encode the keyword the same way http_build_query() does for the page 1 request.
    $response = curl_post('http://www.icbase.com/ProResult.aspx?ProKey=' . urlencode($keywords), $data);

    if (isset($response['error'])) {
        return false;
    }

    return $response['result'];
}

/**
 * Collect the link URLs found on a list page.
 *
 * @param string $html HTML source of the list page
 * @return array the values captured by group 1 of the pattern, or an empty array
 *               when the pattern does not match
 */
function getListHref($html)
{
    // NOTE(review): this pattern has no capture group, so $matches[1] is not filled
    // on a match. The expression looks truncated - restore it from the original.
    $pattern = '/[\s\n]*]\/>/isU';

    $found = preg_match_all($pattern, $html, $matches);

    if (!$found) {
        // nothing matched
        return array();
    }

    return $matches[1];
}

/**
 * Read the number of the next page from a list page.
 *
 * @param string $html HTML source of the list page
 * @return number intval() of capture group 1 when the pattern matches, otherwise -1
 */
function getListNextPage($html)
{
    // NOTE(review): this pattern has no capture group and looks truncated - restore
    // it from the original. The line breaks inside the literal are part of the pattern.
    $pattern = '/

]>.+ >/isU';

    return preg_match($pattern, $html, $matches) ? intval($matches[1]) : -1;
}

/**
 * Collect the hrefs of every list page for a search.
 *
 * Fetches the first page, then keeps following the next-page number reported by
 * getListNextPage() until it is no longer positive.
 *
 * @param string $keywords search keywords
 * @return boolean|array false when $keywords is empty, an empty array when the
 *                       first page yields no links, otherwise all collected hrefs
 */
function getListHrefAll($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $pageHtml = getListHtml($keywords);
    $allHrefs = getListHref($pageHtml);

    if (empty($allHrefs)) {
        // no results
        return array();
    }

    for ($next = getListNextPage($pageHtml); $next > 0; $next = getListNextPage($pageHtml)) {
        $pageHtml = getListHtml($keywords, $next);
        $allHrefs = array_merge($allHrefs, getListHref($pageHtml));
    }

    return $allHrefs;
}

/**
 * Extract product details from a detail page.
 *
 * When $is_url is truthy, $url is treated as a path on www.icbase.com: it is
 * normalised to start with a single "/", fetched with curl_get() (the host is passed
 * without a scheme), and the script exit()s with the cURL error message if the
 * request fails. When $is_url is falsy, $url is used directly as the HTML to parse.
 *
 * The fields assigned by the visible code are mfg_part, mfg_name, para, desc,
 * pdf_url, sup_stock, price, img_url and page_url; the remaining keys keep their
 * empty-string defaults.
 *
 * NOTE(review): several regex literals in this function are cut off in this copy of
 * the file (for example the mfg_part, pdf_url and img_url patterns never close), so
 * the function does not parse as shown. The patterns must be restored from the
 * original source before it can run.
 *
 * @param string $url    detail page path, or the already fetched HTML (see $is_url)
 * @param int    $is_url 1: $url is a path to fetch; 0: $url is HTML source
 * @return boolean|array false when $url is empty, an empty array when the mfg_part
 *                       pattern does not match, otherwise the detail array
 */

function getDetail($url, $is_url = 1)

{

if ( empty($url) )

{

return false;

}

$host = 'www.icbase.com';

$html = $url;

if ($is_url) {

$url = '/' . ltrim($url, '/');

$result = curl_get($host . $url);

if ( isset($result['error']) )

{

exit($result['error']);

}

$html = $result['result'];

}

$result = array(

'sup_part' => '', // supplier part number

'sup_id' => '', // supplier ID

'mfg_part' => '', // manufacturer part number

'mfg_name' => '', // manufacturer name

'cat_name' => '', // category name

'para' => '', // attributes

'desc' => '', // description

'pdf_url' => '', // PDF URL

'sup_stock' => '', // stock

'min_purch' => '', // minimum order quantity

'price' => '', // price

'img_url' => '', // image URL

'createtime' => '', // creation time

'datacode' => '', // batch / date code

'package' => '', // package

'page_url' => '', // page URL

);

// NOTE(review): from here on the file is left exactly as found, because the
// damaged pattern literals make it unsafe to touch anything between them.

// mfg_part

$pattern = '/

产品型号(.[^

if (preg_match($pattern, $html, $matches))

{

$result['mfg_part'] = trim($matches[1]);

} else {

// 此项木有,说明也没处处了

return array();

}

// mfg_name

$pattern = '/

厂商[\s\n]*(.+)/isU';

if (preg_match($pattern, $html, $matches))

{

$result['mfg_name'] = trim($matches[1]);

}

// para

$pattern = '/

(.+)/isU';

if (preg_match($pattern, $html, $matches))

{

if (preg_match_all('/

(.+)/isU', $matches[1], $matches))

{

$count = count($matches[1]);

$count = intval($count / 2 );

foreach ($matches[1] as $k=>$v)

{

if ($k >= $count)

{

break;

}

if (trim($v) == '描述')

{

// desc

$result['desc'] = trim($matches[1][$count + $k]);

continue;

}

$v = trim($v);

$result['para'][$v] = trim($matches[1][$count + $k]);

}

}

}

// pdf_url

$pattern = '/

详细资料

if (preg_match($pattern, $html, $matches))

{

$result['pdf_url'] = trim($matches[1]);

}

// sup_stock

$pattern = '/

库存数量[\s\n]*(\d+)/isU';

if (preg_match($pattern, $html, $matches))

{

$result['sup_stock'] = trim($matches[1]);

}

// price

$pattern = '/

]+>(\d+)\+]+>.[^\d]*([\d.]+)/isU';

if (preg_match_all($pattern, $html, $matches))

{

foreach ($matches[1] as $k=>$v)

{

$result['price'][$v] = '¥' . $matches[2][$k];

}

}

//img_url

 $pattern = '/

图片

if (preg_match($pattern, $html, $matches))

{

$result['img_url'] = trim($matches[1]);

}

// page_url

if ($is_url)

{

$result['page_url'] = $host . $url;

}

return $result;

}

/**
 * Entry point: search for the keywords and return the details of every hit.
 *
 * @param string $keywords search keywords
 * @return array one getDetail() result per link found; empty when the search
 *               produced no link list
 */
function getData($keywords)
{
    $hrefList = getListHrefAll($keywords);

    // getListHrefAll() returns false for empty keywords; iterating over false
    // raises a warning, so bail out with the documented array return instead.
    if (!is_array($hrefList)) {
        return array();
    }

    $result = array();

    foreach ($hrefList as $href) {
        $result[] = getDetail($href);
    }

    return $result;
}

// Test Script

// Read the search keyword from the query string. A missing parameter or an array
// value (keywords[]=...) falls back to an empty string instead of reaching trim().
$keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';

$result = getData($keywords);

print_r($result);

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值