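# This article is intended for learning and exchange only; do not use it for
# any other purpose, otherwise you bear the consequences yourself.
# Environment: Windows + PyCharm + Anaconda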
import re
import csv
import requests
from lxml import etree
from user_agent import UA
# Request headers passed as `headers=` to the HTTP requests in this script.
# `UA` is imported from the local `user_agent` module (presumably a
# User-Agent string -- confirm in that module).
head = {
    "User-Agent": UA,
    # Conventional marker for AJAX (XMLHttpRequest) calls.
    "X-Requested-With": "XMLHttpRequest",
    "Referer": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}
# Left-menu endpoint that lists every brand.
base_url = 'https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId=0%20&fctId=0%20&seriesId=0'

# Prefix for the site-relative hrefs found in the menus.
_SITE_ROOT = 'https://car.autohome.com.cn'

# Left-menu endpoint for a single brand; `{}` receives the brand id.
_BRAND_MENU_URL = 'https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1&brandId={}&fctId=0&seriesId=0'

# Brand <li> ids are matched with this pattern; group 1 is used as the brand id.
_BRAND_ID_RE = re.compile(r'b(.*)')

# Output file; rows are appended so earlier runs are preserved.
_CSV_PATH = 'car.csv'

# XPath for each text column of a series page, in CSV column order
# (the series title comes first and the image URL last).
_FIELD_XPATHS = (
    # rating
    r'.//a[@class="font-bold"]/div[@class="score-cont"]/text()|.//a[@class="font-bold"]/div[@class="score-cont"]//span[@class="score-number"]/text()',
    # vehicle class / level
    r'.//ul[@class="lever-ul"]/li/span[@class="info-gray"]/text()',
    # body structure, or driving range
    r'.//ul[@class="lever-ul"]/li[2]/a/text()|.//ul[@class="lever-ul"]/li[2]/span/text()',
    # engine
    r'.//ul[@class="lever-ul"]/li[3]//a/text()|.//ul[@class="lever-ul"]/li[3]/span/text()',
    # transmission, or charging time
    r'.//ul[@class="lever-ul"]/li[4]//a/text()|.//ul[@class="lever-ul"]/li[4]/span/text()',
    # colours
    r'.//ul[@class="lever-ul"]/li[5]//div[@class="tip-content"]/text()',
    # guide price
    r'.//div/span[@class="lever-price red"]/span[@class="font-arial"]/text()',
)

# Image sources on a series page (scheme-less, hence the 'https:' prefix below).
_IMAGE_XPATH = r'.//div[@class="list-cont-img"]//img/@src'


def _fetch_tree(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    Returns None (after printing a notice) when the request fails, the
    server answers with an HTTP error status, or the body is empty, so a
    single bad page does not abort the whole crawl.
    """
    try:
        response = requests.get(url=url, headers=head, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('skipped {}: {}'.format(url, exc))
        return None
    # NOTE: decoding relies on the charset requests selects for `.text`.
    if not response.text.strip():
        print('skipped {}: empty response'.format(url))
        return None
    return etree.HTML(response.text)


def _brand_ids(menu_tree):
    """Yield the brand id taken from each <li id="..."> in the brand menu."""
    for li_id in menu_tree.xpath(r".//ul/li/@id"):
        found = _BRAND_ID_RE.findall(li_id)
        if found:  # ids that do not match the pattern are not brands
            yield found[0]


def _series_links(brand_tree):
    """Yield (title, absolute_url) for every series anchor of a brand menu.

    Title and href are read from the same <a> element so the two can never
    get out of step with each other.
    """
    for anchor in brand_tree.xpath(r".//dl/dd/a"):
        href = anchor.get('href')
        if not href:
            continue
        yield "".join(anchor.xpath("text()")), _SITE_ROOT + href


def _scrape_series(title, url):
    """Return the CSV row for one series page, or None if it cannot be loaded."""
    tree = _fetch_tree(url)
    if tree is None:
        return None
    row = [title]
    row.extend(" ".join(tree.xpath(xpath)) for xpath in _FIELD_XPATHS)
    # Prefix each source separately: no image yields '' rather than a bare
    # 'https:', and several images stay individually valid URLs.
    row.append(" ".join('https:' + src for src in tree.xpath(_IMAGE_XPATH)))
    return row


def main():
    """Crawl every brand and series and append one row per series to car.csv."""
    menu_tree = _fetch_tree(base_url)
    if menu_tree is None:
        raise SystemExit('could not load the brand menu: ' + base_url)

    # newline='' is required by the csv module; without it every record is
    # followed by a blank line on Windows. The file is opened once instead
    # of once per row.
    with open(_CSV_PATH, 'a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        for brand_id in _brand_ids(menu_tree):
            brand_tree = _fetch_tree(_BRAND_MENU_URL.format(brand_id))
            if brand_tree is None:
                continue
            for title, url in _series_links(brand_tree):
                row = _scrape_series(title, url)
                if row is None:
                    continue
                writer.writerow(row)
                print(*row)


if __name__ == "__main__":
    main()