帮女友采集天猫商品数据,搞了一天才解决了基本原理,以下为主要api采集。
查看天猫中商品的源码,会在底部js部分发现initApi,changeLocationApi,initExtensionApi,initExtraApi,selectCityApi这几个api和它们的链接,以下为源码:
"initApi" : "http://mdskip.taobao.com/core/initItemDetail.htm?trialErrNum=0&isSpu=false&isIFC=false&isNewDecorate=true&sellerUserTag4=1152921509608588675&notAllowOriginPrice=false&sellerUserTag2=18020085583054856&sellerUserTag3=144185556855718016&isAreaSell=false&isForbidBuyItem=false&rtk=rstime_sid66761675&isMeizTry=false&tmallBuySupport=true&itemTags=385,775,907,1163,1478,1483,1803,2049,2882,3586,3974,4166,4801,5698,6146,6401,6849,7041,7810,7938,8258,8578,12226,12738,12930,14082,14210&household=false&sellerUserTag=303632416&queryMemberRight=true&tgTag=false&isRegionLevel=false&itemId=13845151308&isUseInventoryCenter=true&isSecKill=false&renderTime=1376820705059&isApparel=false&showTagPrice=true&lastVersionMD5=ae44c374fdd61aa24f78ae2fc5bc6462&releaseTime=1376730616000&hitPrizeAuction=false&showShopProm=true&service3C=false&cartEnable=true", "changeLocationApi":"http://mdskip.taobao.com/core/changeLocation.htm?itemTags=385,775,907,1163,1478,1483,1803,2049,2882,3586,3974,4166,4801,5698,6146,6401,6849,7041,7810,7938,8258,8578,12226,12738,12930,14082,14210&sellerUserTag=303632416&household=false&tgTag=false&isSpu=false&isRegionLevel=false&itemId=13845151308&isUseInventoryCenter=true&isSecKill=false&sellerUserTag4=1152921509608588675&notAllowOriginPrice=false&sellerUserTag2=18020085583054856&sellerUserTag3=144185556855718016&showTagPrice=true&isAreaSell=false&hitPrizeAuction=false&showShopProm=true&service3C=false&cartEnable=true",
"initExtensionApi": "http://ext.mdskip.taobao.com/extension/initExtension.htm?showBreadCrumb=false&sellerId=705777015&brand=%B1%A6%BC%D2%BD%E0&showShopProm=true&showSpuMaintainer=false" ,
"initExtraApi":"http://ext.mdskip.taobao.com/extension/initExtra.htm",
"selectCityApi":"http://mdskip.taobao.com/core/selectCity.htm?isAreaSell=false&itemId=13845151308"
根据名称可以大致猜到，这些是商品基本资料和城市相关的 API。可是直接打开这些链接，得到的却都是 404 页面，可见接口做了访问限制。基于经验猜想，应该是判断了来源地址（Referer），于是用 PHP 的 curl 进行采集并伪造来源链接，成功获得了 JSON 格式的商品数据。可是用 PHP 的 json_decode 进行解析时，却发现了很多问题，我搞了一天才搞好，以下为源码。
<?php
// Fetches the Tmall "initApi" item-detail data for one item, repairs the
// not-quite-JSON response, decodes it and prints the resulting PHP array.
// Flow: HTTP request -> convert to UTF-8 -> strip control whitespace ->
// quote bare numeric object keys -> json_decode -> print_r.

// Cookie jar: cURL writes the cookies it received into this file when the
// handle is closed. The jar is only written here, never read back.
$cookie_file = tempnam('./temp','cookie');
// The "&notAllowOriginPrice" parameter must be spelled out in full: when the URL
// is copied through an HTML-entity decoder, "&not" is turned into the single
// character "¬", which silently merges this parameter into the previous one.
$url='http://mdskip.taobao.com/core/initItemDetail.htm?trialErrNum=0&isSpu=false&isIFC=false&isNewDecorate=false&sellerUserTag4=1152921509608588675&notAllowOriginPrice=false&sellerUserTag2=18020085583054856&sellerUserTag3=144185556855718016&isAreaSell=false&isForbidBuyItem=false&rtk=rstime_sid66761675&isMeizTry=false&tmallBuySupport=false&itemTags=385,775,907,1163,1478,1483,1803,2049,2882,3586,3974,4166,4801,5698,6146,6401,6849,7041,7810,7938,8258,8578,12226,12738,12930,14082,14210&household=false&sellerUserTag=303632416&queryMemberRight=false&tgTag=false&isRegionLevel=false&itemId=13845151308&isUseInventoryCenter=false&isSecKill=false&renderTime=1376730619454&isApparel=false&showTagPrice=false&lastVersionMD5=ae44c374fdd61aa24f78ae2fc5bc6462&releaseTime=1376730616000&hitPrizeAuction=false&showShopProm=false&service3C=false&cartEnable=false';
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
// Referer header: set to the item detail page the API URL was taken from.
curl_setopt($ch,CURLOPT_REFERER,"http://detail.tmall.com/item.htm?spm=a1z10.5.w4011-3291539650.55.Sd2bee&id=13845151308&rn=b0a1efacfc070bf8c7a2af009faea807");
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.0; zh-CN; rv:1.8.1.3)");
curl_setopt($ch,CURLOPT_HEADER,0);
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
// NOTE(review): the request is sent as a POST without a body; every parameter
// travels in the query string. Kept as in the original -- confirm it is required.
curl_setopt($ch,CURLOPT_POST,1);
if($cookie_file!==false){
    curl_setopt($ch,CURLOPT_COOKIEJAR,$cookie_file);
}
$result = curl_exec($ch);
$info = curl_getinfo($ch);
$error = curl_error($ch);
curl_close($ch);

// Stop with a message instead of silently printing nothing when the fetch fails.
if($result===false){
    exit('Request failed: '.$error."\n");
}
if($info['http_code']!=200){
    exit('Unexpected HTTP status: '.$info['http_code']."\n");
}

// json_decode() only accepts UTF-8, so convert first. GBK is a superset of
// GB2312, so characters outside GB2312 no longer make the conversion fail.
// Converting before the regex step also guarantees that every multi-byte
// character consists of bytes >= 0x80 and cannot be mistaken for "{", "," or ":".
$result=iconv('GBK','UTF-8',$result);
if($result===false){
    exit("Could not convert the response from GBK to UTF-8\n");
}

// Remove CR, LF and TAB characters (spaces are kept).
$result=str_replace(array("\r","\n","\t"),'',$result);

// The response uses bare numeric object keys, e.g. 28523678201:{"areaSold":1},
// which json_decode() rejects; turn them into "28523678201":{"areaSold":1}.
// The first alternative consumes complete string literals so that digits
// followed by ":" inside a string value (times, ratios, ...) are left alone;
// the second alternative matches a digit run that directly follows "{" or ","
// and precedes ":", i.e. an unquoted key.
$result=preg_replace_callback(
    '/"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"|([{,]\s*)([0-9]+)(\s*:)/s',
    function($m){
        // Groups 1-3 are only present when the bare-key alternative matched.
        if(isset($m[2])){
            return $m[1].'"'.$m[2].'"'.$m[3];
        }
        return $m[0];
    },
    $result
);
if($result===null){
    exit("Quoting numeric keys failed (PCRE error)\n");
}

// Once the text is UTF-8, json_decode() handles Chinese strings directly, so
// no placeholder substitution (and no arr_foreach() post-processing) is needed.
$result=json_decode($result,true);
if($result===null && json_last_error()!==JSON_ERROR_NONE){
    exit('json_decode failed, error code '.json_last_error()."\n");
}
print_r($result);exit;
/**
 * Recursively replaces leaf values of an array using a lookup table.
 *
 * Every non-array, non-empty leaf of $arr that exists as a key in $str and
 * maps to a truthy value is replaced by that value. Nested arrays are
 * processed recursively; keys are never changed.
 *
 * @param mixed $arr Array to walk.
 * @param array $str Lookup table: leaf value => replacement.
 * @return array|false Copy of $arr with replacements applied, or false when
 *                     $arr is not an array.
 */
function arr_foreach ($arr,$str)
{
    if (!is_array($arr)) {
        return false;
    }
    foreach ($arr as $key => $val) {
        if (is_array($val)) {
            $arr[$key] = arr_foreach($val, $str);
            continue;
        }
        if (empty($val)) {
            continue;
        }
        // Only strings and integers are valid array keys. Checking the type and
        // using isset() avoids the "undefined index/key" notice that a bare
        // $str[$val] raised for every leaf without a mapping, and avoids using
        // floats or booleans as array offsets.
        if ((is_string($val) || is_int($val)) && isset($str[$val]) && $str[$val]) {
            $arr[$key] = $str[$val];
        }
    }
    return $arr;
}
?>
主要问题是 API 返回的数据并不是严格的 JSON 格式。经过半天的排查，发现问题主要在于：其中某些数据以纯数字作为 key，例如 28523678201:{"areaSold":1}；而 JSON 中的 key 必须用双引号 "" 包围，所以必须将 28523678201:{"areaSold":1} 转为 "28523678201":{"areaSold":1}，否则 json_decode 会出现错误。
解决了上面的问题后，发现 json_decode 还是报错，于是又找了半天，当时的结论是 JSON 数据中不能直接带中文（更准确地说，json_decode 只接受 UTF-8 编码的文本，而上面的代码是把接口数据按 gb2312 转成 utf-8 的）。于是设法先将数据转为 utf-8，并把其中的中文先转译、解析之后再还原，如此便成功了。下面是 initApi 解析后的数组：
Array
(
[isSuccess] => 1
[defaultModel] => Array
(
[deliveryDO] => Array
(
[deliveryAddress] => 浙江嘉兴
[deliverySkuMap] => Array
(
[28523678201] => Array
(
[0] => Array
(
[money] =>
[name] =>
[postage] => 商家承担运费
[postageFree] => 1
[serviceLinkText] =>
[serviceLinkUrl] =>
[signText] =>
[tagImage] =>
[tips] =>
[type] => 0
)
)
[28523678202] => Array
(
[0] => Array
(
[money] =>
[name] =>
[postage] => 商家承担运费
[postageFree] => 1
[serviceLinkText] =>
[serviceLinkUrl] =>
[signText] =>
[tagImage] =>
[tips] =>
[type] => 0
)
)
[default] => Array
(
[0] => Array
(
[money] =>
[name] =>
[postage] => 商家承担运费
[postageFree] => 1
[serviceLinkText] =>
[serviceLinkUrl] =>
[signText] =>
[tagImage] =>
[tips] =>
[type] => 0
)
)
)
[hasHomeDeliveryService] =>
[otherServiceList] => Array
(
)
)
[gatewayDO] => Array
(
[changeLocationGateway] => Array
(
[queryDelivery] => 1
[queryProm] =>
)
[trade] => Array
(
[addToBuyNow] => Array
(
)
[addToCart] => Array
(
)
)
)
[inventoryDO] => Array
(
[icTotalQuantity] => 7814
[skuQuantity] => Array
(
[28523678201] => Array
(
[quantity] => 1238
[type] => 1
)
[28523678202] => Array
(
[quantity] => 6576
[type] => 1
)
)
[success] => 1
[totalQuantity] => 7814
[type] => 1
)
[itemPriceResultDO] => Array
(
[areaId] => 330100
[campaignInfo] =>
[largeScalePromOfficial] =>
[largeScalePromPeriod] => -1
[largeScalePromUnOfficial] =>
[largeScalePromUnderFiftyPOff] =>
[priceInfo] => Array
(
[28523678201] => Array
(
[areaSold] => 1
[price] => 186.80
[promotionList] => Array
(
[0] => Array
(
[add] =>
[amountRestriction] =>
[gift] =>
[limitProm] =>
[limitTime] =>
[price] => 99.00
[promText] =>
[promType] => normal
[start] =>
[status] => 2
[type] => 酷暑热卖
[typeUrl] =>
)
[1] => Array
(
[add] =>
[amountRestriction] =>
[gift] =>
[limitProm] =>
[limitTime] =>
[price] => 183.06
[promText] => 登录后确认是否享有此优惠
[promType] => normal
[start] =>
[status] => 2
[type] => 店铺vip
[typeUrl] =>
)
[2] => Array
(
[add] =>
[amountRestriction] =>
[gift] =>
[limitProm] =>
[limitTime] =>
[price] => 99.00
[promText] => 登录后确认是否享有此优惠
[promType] => normal
[start] =>
[status] => 2
[type] => VIP价格
[typeUrl] =>
)
)
[tagPrice] =>
[umpBigPromotionDisplayPrice] =>
)
[28523678202] => Array
(
[areaSold] => 1
[price] => 168.00
[promotionList] => Array
(
[0] => Array
(
[add] =>
[amountRestriction] =>
[gift] =>
[limitProm] =>
[limitTime] =>
[price] => 89.04
[promText] =>
[promType] => normal
[start] =>
[status] => 2
[type] => 酷暑热卖
[typeUrl] =>
)
[1] => Array
(
[add] =>
[amountRestriction] =>
[gift] =>
[limitProm] =>
[limitTime] =>
[price] => 164.64
[promText] => 登录后确认是否享有此优惠
[promType] => normal
[start] =>
[status] => 2
[type] => 店铺vip
[typeUrl] =>
)
[2] => Array
(
[add] =>
[amountRestriction] =>
[gift] =>
[limitProm] =>
[limitTime] =>
[price] => 89.04
[promText] => 登录后确认是否享有此优惠
[promType] => normal
[start] =>
[status] => 2
[type] => VIP价格
[typeUrl] =>
)
)
[tagPrice] =>
[umpBigPromotionDisplayPrice] =>
)
)
[promType] =>
[queryProm] =>
[tmallShopProm] =>
[umpBigPromotionItem] =>
[wanrentuanInfo] =>
)
[memberRightDO] => Array
(
[activeName] =>
[activityId] =>
[activityType] => 0
[gradeName] =>
[level] => 0
[postageFree] =>
[shopMember] =>
[success] =>
[time] => 0
[times] => 0
[value] => 0
)
[miscDO] => Array
(
[bucketId] => 12
[city] => 杭州
[cityId] => 330100
[region] => 上城区
[regionId] => 330102
[sellCountDown] => 0
[systemTime] => 1376830108189
)
[sellCountDO] => Array
(
[cspuSellCountMap] => Array
(
)
[sellCount] => 1939
)
[specialServiceList] => Array
(
)
[tradeResult] => Array
(
[cartEnable] =>
[cartType] => 2
[miniTmallCartEnable] => 1
[param] =>
[tradeDisableTypeEnum] =>
[tradeEnable] => 1
[tradeType] =>
)
[userInfoDO] => Array
(
[juKeBuyerLogin] =>
[loginCC] =>
[loginUserType] => buyer
[nicker] =>
[userId] => 0
)
)
)