"""Scrape the dealer list published on the FAW Benteng official website.

I had wanted to crawl some car makers' official websites to learn about their
dealerships in China and how they are distributed. I tried this site before
without success; a recent second attempt worked. The data I need from this
site is mostly stored as JSON: first fetch the province ids, then build a URL
from each province id to fetch the city ids, then build a URL from each city
id to fetch the content I want. A visualization will follow later.
"""
import ast
import json

import lxml
from lxml import etree
import numpy as np
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36"
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Value returned by getCityNum when a province has no city entries.
NO_DEALER_MESSAGE = "该网址没有经销店"


def _parse_jsonp(text):
    """Return the value wrapped by a ``callback(...)`` response, or None.

    The original code removed the wrapper with fixed offsets (``[5:-2]``) and
    ran ``eval`` on the remainder, which executes whatever the server sends.
    Here the payload is located by its outermost parentheses and parsed as
    data only.

    NOTE(review): the exact wire format was not visible when this was
    written; a ``name(payload)`` shape is assumed -- confirm against a live
    response.
    """
    start = text.find("(")
    end = text.rfind(")")
    payload = text[start + 1:end] if 0 <= start < end else text
    payload = payload.strip()
    if not payload:
        return None
    try:
        return json.loads(payload)
    except ValueError:
        # eval() also accepted Python-style literals (e.g. single quotes);
        # literal_eval keeps that tolerance without executing code.
        return ast.literal_eval(payload)


def _id_name_map(parsed):
    """Build ``{id: name}`` from one entry or a sequence of entries."""
    if parsed is None:
        return {}
    if isinstance(parsed, dict):
        # A lone object is treated as a one-element list.
        parsed = [parsed]
    return {entry["id"]: entry["name"] for entry in parsed}


# Fetch province ids
def get_province_Num():
    """Download the province list and return it as ``{province_id: name}``."""
    url = "http://api.faw-benteng.com/ajax/v1_getprov.php?callback=p11"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return _id_name_map(_parse_jsonp(response.content.decode('utf-8')))


# Fetch city ids
def getCityNum(url, provname):
    """Download the city list of one province.

    Args:
        url: city-list endpoint for the province, e.g.
            ``.../v1_getcity.php?callback=p12&prov=11``.
        provname: province name, passed through to the result.

    Returns:
        ``(provname, {city_id: city_name})`` when the province has cities,
        otherwise the message string ``NO_DEALER_MESSAGE``. Callers must
        check for a tuple before unpacking.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    cities = _id_name_map(_parse_jsonp(response.content.decode('utf-8')))
    if not cities:
        return NO_DEALER_MESSAGE
    return provname, cities


# Extract the wanted fields
def getcarurl(pageprov, cityname, url):
    """Yield one dict per dealer listed at ``url``.

    Args:
        pageprov: province name copied into every record.
        cityname: city name copied into every record.
        url: dealer-list endpoint for the city.

    Yields:
        Dicts with the keys ``pageprov``, ``cityname``, ``attr``,
        ``biz_name``, ``sale_phone``, ``serv_phone``, ``address``, ``zip``.
        Nothing is yielded for a non-200 response or an empty body.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        return
    body = response.content.decode('utf-8').strip()
    if not body:
        return
    data = json.loads(body)
    if isinstance(data, dict):
        records = data.values()
    elif isinstance(data, list):
        records = data
    else:
        # e.g. ``null`` -- nothing to report for this city.
        return
    for item in records:
        yield {
            'pageprov': pageprov,
            'cityname': cityname,
            'attr': item['attr'],
            'biz_name': item['biz_name'],
            'sale_phone': item['sale_phone'],
            'serv_phone': item['serv_phone'],
            'address': item['address'],
            'zip': item['zip'],
        }


# Save
def save_to_txt(pageprov=None, cityname=None, url=None):
    """Append the dealers of one city to ``奔腾经销商.txt``, one per line.

    Args:
        pageprov: province name.
        cityname: city name.
        url: dealer-list endpoint for the city.

    Any argument left as ``None`` is read from the module-level variable of
    the same name, which is how the original zero-argument version worked.
    Fields are comma-separated; the original omitted the comma between
    ``serv_phone`` and ``address``.
    """
    module_globals = globals()
    if pageprov is None:
        pageprov = module_globals['pageprov']
    if cityname is None:
        cityname = module_globals['cityname']
    if url is None:
        url = module_globals['url']

    with open('奔腾经销商.txt', 'a+', encoding='utf-8', errors='ignore') as f:
        for item in getcarurl(pageprov, cityname, url):
            print(item)
            fields = (
                pageprov,
                cityname,
                item['attr'],
                item['biz_name'],
                item['sale_phone'],
                item['serv_phone'],
                item['address'],
                item['zip'],
            )
            # str() guards against non-string values coming back from the API.
            f.write(','.join('' if v is None else str(v) for v in fields) + '\n')


def main():
    """Walk provinces -> cities -> dealers and save every dealer found."""
    provinces = get_province_Num()
    # Province id and province name
    for provid, provname in provinces.items():
        city_url = f"http://api.faw-benteng.com/ajax/v1_getcity.php?callback=p12&prov={int(provid)}"
        result = getCityNum(city_url, provname)
        if not isinstance(result, tuple):
            # Province without any city entry: nothing to fetch.
            continue
        province_name, cities = result
        # City id and city name
        for cityid, cityname in cities.items():
            dealer_url = f"http://api.faw-benteng.com/ajax/v1_getdealer.php?brand=benteng&city={int(cityid)}"
            save_to_txt(province_name, cityname, dealer_url)


if __name__ == '__main__':
    main()
爬取奔腾官网汽车经销商(上)
最新推荐文章于 2024-04-24 09:50:44 发布