# 抓取并解析二手车信息：Python爬虫实例-CSDN博客
#
# 本文链接：https://blog.csdn.net/u010132177/article/details/121933081

import os
import re

import pandas as pd
import requests as r
from lxml import etree

# Browser-like request headers sent along with the listing-page request.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Connection': 'keep-alive',
    'Referer': 'http://www.baidu.com/'
}


# Alternative URL kept from the original script:
# url='https://www.che168.com/china/qirui/ruihu5/'
url='https://www.che168.com/china/qirui/ruihu5/a0_0msdgscncgpi1ltocsp2exx0/?pvareaid=102179#currengpostion'

# Actually send the headers defined above (they were previously built but
# never passed to the request), bound the wait with a timeout so the script
# cannot hang forever, and fail fast on an HTTP error instead of going on to
# parse an error page as if it were the listing page.
res = r.get(url, headers=headers, timeout=10)
res.raise_for_status()
# The bare string below is disabled code, not executed: it saves the response
# to 'ruihu5.html' and reads it back, apparently for working from a local copy.
'''
with open('ruihu5.html','w',encoding='utf-8') as f:
    f.write(res.text)
with open('ruihu5.html','r',encoding='utf-8') as f:
    htm=f.read()
'''

# Build an element tree from the response body.
# (Disabled alternative: parse the saved file instead.)
# html = etree.parse('./ruihu5.html', etree.HTMLParser())
page_source = res.text
html = etree.HTML(page_source)

# Select every <li name="lazyloadcpc"> directly under <ul class="viewlist_ul">;
# each selected node is later processed as one car listing.
listing_xpath = '//ul[@class="viewlist_ul"]/li[@name="lazyloadcpc"]'
lis = html.xpath(listing_xpath)
car_count = len(lis)
print('共得到车个数：', car_count)
# Disabled debugging probe for the first listing's name:
# ns = lis[0].xpath('/a/div[2]/h4[@class="card-name"]/text()')
# print(ns)

# The summary line holds four fields separated by full-width slashes, in the
# order: mileage / year / address / dealer.
_SUMMARY_RE = re.compile(r'(.*?)／(.*?)／(.*?)／(.*)')
_CSV_PATH = '二手车表.csv'


def _first_text(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _pick_price(texts):
    """Choose the price string from the text nodes of the price span.

    With two or more nodes: if the second-to-last node is the '抢购价' label,
    the price is the last node; otherwise the price is the second-to-last
    node. With a single node that node is returned (presumably the bare
    price -- verify against the page markup); with none, ''.
    """
    if len(texts) >= 2:
        return texts[-1] if texts[-2] == '抢购价' else texts[-2]
    return _first_text(texts)


def _parse_listing(item):
    """Extract one car record (a dict) from a listing ``<li>`` element.

    Missing fields become '' instead of raising IndexError, so one
    incomplete listing does not abort the whole run. This matches how the
    optional tag field was already handled.
    """
    # Relative (".//") queries search inside this <li> directly, so there is
    # no need to serialise the element and re-parse it as a new document.
    name = _first_text(item.xpath('.//h4[@class="card-name"]/text()'))
    summary = _first_text(item.xpath('.//p[@class="cards-unit"]/text()'))

    match = _SUMMARY_RE.search(summary)
    licheng, year, address, shangjia = match.groups() if match else ('', '', '', '')

    # The class name "pirce" is spelled exactly as in the original query.
    price = _pick_price(item.xpath('.//span[@class="pirce"]//text()'))
    price2 = _first_text(item.xpath('.//div[@class="cards-price-box"]/s/text()'))
    tag = _first_text(
        item.xpath('.//div[@class="cards-price-box"]/span[@class="tags"]/i/text()')
    )

    return {
        '车名': name, '里程': licheng, '年份': year, '地址': address, '商家': shangjia,
        '价格': price, '原价': price2, '其它': tag,
    }


cars = [_parse_listing(item) for item in lis]

df = pd.DataFrame(cars)
print(df)

if cars:
    # Append mode is kept so repeated runs accumulate rows, but the header row
    # is written only when the file is new or empty. Otherwise every run
    # would insert another header line in the middle of the data.
    write_header = not os.path.exists(_CSV_PATH) or os.path.getsize(_CSV_PATH) == 0
    df.to_csv(_CSV_PATH, mode='a', header=write_header, encoding='utf-8-sig', index=False)