# 任务:爬取某公开网站(http://tousu.315che.com/tousulist/serial/55467/)上的汽车用户消费投诉数据,字段包含品牌、投诉内容、品牌车型、单号、投诉问题、投诉时间和经销商,为后续项目做好数据准备。
# 导入相应库
import requests
from lxml import etree
# Collect, for the first few brand tabs, the links to individual complaint pages.
#
# Results left at module level:
#   brand          -- link texts of the brand tab list on the landing page
#   href           -- link targets of the same tabs (first entry replaced by `url`)
#   brand_complain -- one list of complaint-page links per crawled brand page
url = 'http://tousu.315che.com/tousulist/serial/55467/'  # landing page of the complaint list
REQUEST_TIMEOUT = 10  # seconds; requests.get() without a timeout may block indefinitely
MAX_BRAND_PAGES = 2  # number of brand tabs to crawl

# Fetch and parse the landing page; fail loudly on an HTTP error instead of
# parsing an error page and silently getting empty XPath results.
req = requests.get(url, timeout=REQUEST_TIMEOUT)
req.raise_for_status()
req.encoding = 'UTF-8'
html = etree.HTML(req.text)

brand = html.xpath('//*[@id="letterTabList"]/div/a/text()')  # brand names
href = html.xpath('//*[@id="letterTabList"]/div/a/@href')  # brand page links
if href:
    # The first tab is pointed at the landing page itself.
    # NOTE(review): presumably its own href is not a usable list URL -- confirm.
    href[0] = url

brand_complain = []
# Slicing (rather than indexing 0 and 1) tolerates pages with fewer tabs.
for brand_url in href[:MAX_BRAND_PAGES]:
    req = requests.get(brand_url, timeout=REQUEST_TIMEOUT)
    req.raise_for_status()
    req.encoding = 'UTF-8'  # same decoding as the landing page
    html = etree.HTML(req.text)
    # Links to the individual complaint pages listed on this brand page.
    brand_complain.append(
        html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[2]/div[2]/ul/li/a/@href')
    )
# Visit every complaint page referenced in `brand_complain` and collect its
# fields into parallel lists: index k across all six lists is one complaint.
content = []  # complaint text
model = []  # vehicle model
number = []  # complaint ticket number
problem = []  # reported problem
# NOTE: `time` shadows the stdlib module name; kept because code later in the
# file reads the list under this name.
time = []  # complaint time
store = []  # dealer

DETAIL_TIMEOUT = 10  # seconds; requests.get() without a timeout may block indefinitely
_DETAIL_ROOT = '/html/body/div[1]/div[2]/div[2]/div[1]/div[1]/div'
# (target list, XPath of the text node, number of leading characters to drop).
# NOTE(review): the dropped characters are presumably the field's label prefix
# on the page -- confirm against the live markup.
_DETAIL_FIELDS = (
    (content, _DETAIL_ROOT + '/div[1]/p/text()', 0),
    (model, _DETAIL_ROOT + '/div[3]/p[1]/text()', 5),
    (number, _DETAIL_ROOT + '/div[3]/p[2]/text()', 3),
    (problem, _DETAIL_ROOT + '/div[3]/p[3]/text()', 5),
    (time, _DETAIL_ROOT + '/div[3]/p[4]/text()', 5),
    (store, _DETAIL_ROOT + '/div[3]/p[5]/text()', 4),
)

for complaint_links in brand_complain:
    for complaint_url in complaint_links:
        req = requests.get(complaint_url, timeout=DETAIL_TIMEOUT)
        req.raise_for_status()
        req.encoding = 'UTF-8'
        html = etree.HTML(req.text)

        # Extract every field before appending anything, so a page with a
        # missing node cannot leave the parallel lists with different lengths.
        # A missing field is recorded as an empty string instead of raising
        # IndexError and aborting the whole crawl.
        values = []
        for _target, field_xpath, prefix_len in _DETAIL_FIELDS:
            texts = html.xpath(field_xpath)
            values.append(texts[0][prefix_len:] if texts else '')

        for (target, _field_xpath, _prefix_len), value in zip(_DETAIL_FIELDS, values):
            target.append(value)
# Print one labelled record per scraped complaint, each followed by a blank line.
for idx, issue in enumerate(problem):
    record_lines = (
        "车牌型号:" + model[idx],
        "单号:" + number[idx],
        "问题:" + issue,
        "时间:" + time[idx],
        "经销商:" + store[idx],
    )
    print("\n".join(record_lines) + "\n")