今天爬取的是某二手车网站
首先，进行抓包分析：
然后点击进去看看数据:
关键在于clue_id怎么获取,发现在其上一个网页中有clue_id,于是就解决了。
代码:
import requests
import time
import re
import json
# Shared request headers for the Guazi API.
# NOTE(review): both values are empty placeholders — fill in a real browser
# User-Agent and a Referer before running, or the API will likely reject
# or throttle the requests.
headers = {
'User-Agent':'',
'Referer':''
}
def get_page(i):
    """Fetch page *i* of the used-car listing API.

    Returns the raw response body (JSON text) on HTTP 200, or None on
    any non-200 status or network failure.
    """
    url = "https://mapi.guazi.com/car-source/carList/pcList?versionId=0.0.0.0&sourceFrom=wap&deviceId=323816f4-f012-4cf5-8471-8b4737ea22e5&osv=Windows+10&minor=&sourceType=&ec_buy_car_list_ab=&location_city=&district_id=&tag=-1&license_date=&auto_type=&driving_type=&gearbox=&road_haul=&air_displacement=&emission=&car_color=&guobie=&bright_spot_config=&seat=&fuel_type=&order=&priceRange=0,-1&tag_types=&diff_city=&intention_options=&initialPriceRange=&monthlyPriceRange=&transfer_num=&car_year=&carid_qigangshu=&carid_jinqixingshi=&cheliangjibie=&page="+str(i)+"&pageSize=20&city_filter=12&city=12&guazi_city=12&qpres=&platfromSource=wap"
    try:
        # timeout prevents the script from hanging forever on a stalled
        # connection; RequestException covers DNS/connection/timeout errors.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_fado(num):
    """Fetch the configuration detail for one car, identified by clue id *num*.

    Returns the raw response body (JSON text) on HTTP 200, or None on
    any non-200 status or network failure.
    """
    url = "https://mapi.guazi.com/car-source/carRecord/pcConfigurations?versionId=0.0.0.0&sourceFrom=wap&deviceId=323816f4-f012-4cf5-8471-8b4737ea22e5&osv=Windows+10&clueId="+str(num)+"&platfromSource=wap"
    try:
        # same hardening as get_page: bounded wait, no crash on network errors
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
if __name__ == "__main__":
for i in range(1,30):
html = get_page(i)
data = json.loads(html)
print("正在提取第"+str(i)+"页")
print("-------------------"*5)
for j in range(0,20):
print(data["data"]["postList"][j]["title"])
print("-------------------"*3)
time.sleep(1)
num = data["data"]["postList"][j]["clue_id"]
html2 = get_fado(num)
data2 = json.loads(html2)
for dit in data2["data"]["list"]:
print(dit["title"])
print("-------------------"*2)
for dit2 in dit["children"]:
print(dit2["title"])
print(dit2[list(dit2.keys())[-1]])
print("-------------------")
print("第"+str(i)+"页提取完毕")
结果:
最近新开了公众号,请大家关注一下。