# 1. Set up the PyCharm environment.
# 2. The code is as follows:
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import random
import time
import re
import os
def Brank():
    """Crawl the Sina auto database: brands -> manufacturers -> models -> trims.

    Walks the brand index page, then for every manufacturer and every model
    page extracts trim id/name/year/price and appends each record as one line
    to a txt file via saveDataTxt().  URLs whose pages fail to fetch or parse
    are logged to error files under output/ and the crawl continues.
    """
    url = "http://**********************/"
    print(url)
    page = geturl_detil(url)
    # One <dl> per brand entry in the letter-grouped brand index.
    brands = page.xpath("//div[@class='leter-class con']/ul/li/dl")
    for brand in brands:
        # NOTE: one shared record dict is reused down the whole hierarchy, so
        # keys from a previous level persist until overwritten (original behavior).
        record = {
            'makeid': brand.xpath("./dt/a/@data-id")[0],
            'makename': brand.xpath("./dt/a/text()")[0],
        }
        for maker in brand.xpath(".//dd[@class='maker ']"):
            record['manufacturerid'] = maker.xpath("./a/@data-id")[0]
            record['manufacturername'] = maker.xpath("./a/text()")[0]
            # hrefs are protocol-relative ("//db.auto..."); prepend the scheme.
            manufacturerUrl = "http:" + maker.xpath("./a/@href")[0]
            print(manufacturerUrl)
            try:
                _crawl_manufacturer(manufacturerUrl, record)
            except Exception:
                # Best-effort crawl: log the failing URL, move to the next maker.
                with open('output/ManuerrorUrl_url.txt', 'a') as ER:
                    ER.write(manufacturerUrl + '\n')


def _crawl_manufacturer(manufacturerUrl, record):
    """Visit one manufacturer page and crawl every model listed on it."""
    page = geturl_detil(manufacturerUrl)
    for model in page.xpath("//li[@class='fL']/p[@class='title']"):
        modelHref = model.xpath("./a/@href")[0]
        record['modelname'] = model.xpath("./a/text()")[0]
        record['modelid'] = re.findall(r"//db.auto.sina.com.cn/(\d*)/", modelHref)[0]
        modelUrl = "http:" + modelHref
        print("@" * 30)
        print(modelUrl)
        try:
            _crawl_model(modelUrl, record)
        except Exception:
            with open('output/ModeerrorUrl_url.txt', 'a') as ER:
                ER.write(modelUrl + '\n')


def _crawl_model(modelUrl, record):
    """Visit one model page and save every trim found on it.

    Model pages come in two layouts (a regular table vs. a "green"/list
    layout), so both xpaths are tried and whichever matches is parsed.
    """
    page = geturl_detil(modelUrl)
    trimCommon = page.xpath("//div[@class='cartype_list lump']/table/tbody/tr")
    trimGreen = page.xpath("//div[@class='cartype_list']/div[@class='tab_con_cartype_list']/ul[@class='listul clearfix']/li")
    print(len(trimCommon), len(trimGreen))
    if trimCommon:
        for row in trimCommon:
            trimidHref = row.xpath("./td[1]/a[1]/@href")[0]
            record['trimname'] = row.xpath("./td[1]/a[1]/span[@class='s']/text()")[0]
            record['trimid'] = re.findall(r"http://db.auto.sina.com.cn/car/(\d*)/.*", trimidHref)[0]
            # Model year is the digits before the "款" (model-year) character.
            record['year'] = re.findall(r"(\d*)款.*?", record['trimname'])[0]
            price = row.xpath("./td[4]/a[1]/span/text()")[0]
            record['price'] = price.replace("\r", "").replace("\n", "").replace(" ", "")
            print(record)
            saveDataTxt(record, "xinlangCarBrank")
    elif trimGreen:
        for item in trimGreen:
            trimHrefG = item.xpath("./a/@href")[0]
            record['trimid'] = re.findall(r"http://db.auto.sina.com.cn/car/(\d*)/.*", trimHrefG)[0]
            record['trimname'] = item.xpath("./a[@class='txt']/span/text()")[0]
            record['price'] = item.xpath("./span[@class='zhidaojia']/text()")[0]
            record['year'] = re.findall(r"(\d*)款.*?", record['trimname'])[0]
            print(record)
            saveDataTxt(record, "Brank")
    else:
        # Neither layout matched: remember the URL for manual inspection.
        # (The original's final "otherError" else-branch was unreachable,
        # since "both lists empty" already covered the remaining case.)
        print('None')
        with open('output/None_modelUrl.txt', 'a') as ER:
            ER.write(modelUrl + '\n')
# 保存为txt格式,作为暂时存储。
# Save as txt format, used as temporary storage.
def saveDataTxt(concent, txt):
    """Append one record to output/<txt>.txt as a stringified line.

    Parameters:
        concent: the record to persist (typically a dict); written via str().
        txt: base file name, without the '.txt' suffix.

    I/O errors are reported to stdout rather than raised, so a single bad
    write does not abort the surrounding crawl.
    """
    folder_name = 'output'
    # Ensure the destination folder exists (no-op when already present).
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    file_path = folder_name + '/' + txt + '.txt'
    try:
        # 'with' already closes the file on every path.  The original also
        # called fp.close() in a finally clause, which was redundant and
        # raised NameError whenever open() itself failed (fp never bound).
        with open(file_path, 'a', encoding='utf-8') as fp:
            fp.write(str(concent) + "\n")
    except IOError as err:
        print("error:" + str(err))
def geturl_detil(url):
    """Fetch *url* with a randomly chosen desktop User-Agent and parse it.

    Returns the lxml element tree (etree.HTML) of the response body.
    Raises RuntimeError when the HTTP status is not 200, and propagates
    requests exceptions (timeouts, connection errors) to the caller.
    Sleeps 2s after each request as a politeness delay.
    """
    # Rotate across several desktop browser User-Agents to look less bot-like.
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        # BUGFIX: the original list was missing the comma after this entry,
        # so Python's implicit string concatenation fused it with the Chrome
        # UA below into one bogus User-Agent and shrank the rotation pool.
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    ]
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Host": "db.auto.sina.com.cn",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    }
    r = requests.get(url=url, headers=headers, timeout=15)
    print(r.status_code)
    time.sleep(2)  # politeness delay between consecutive requests
    # `assert` is stripped under `python -O`; raise explicitly instead.
    if r.status_code != 200:
        raise RuntimeError("unexpected HTTP status %s for %s" % (r.status_code, url))
    return etree.HTML(r.content)
if __name__ == "__main__":
    # Run the crawler only when executed as a script, not on import.
    Brank()