爬虫之二手车

import csv

import requests
from lxml import etree

# Request headers sent with every HTTP request: a desktop-Chrome User-Agent to
# look like a real browser, and a session Cookie copied from a logged-in visit
# (guazi.com blocks cookieless scrapers). NOTE(review): the cookie is
# session-specific and will expire — it must be refreshed manually.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3941.4 Safari/537.36',
    'Cookie':'antipas=55v581405369424H929nae24; uuid=96ce9e94-82a7-491d-9a27-528083315979; cityDomain=wh; clueSourceCode=%2A%2300; user_city_id=194; Hm_lvt_936a6d5df3f3d309bda39e92da3dd52f=1591926097; ganji_uuid=3813796056650654274555; sessionid=4d3eb87b-fee1-4d7c-e053-7c5f698cd529; lg=1; close_finance_popup=2020-06-12; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22default%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%2296ce9e94-82a7-491d-9a27-528083315979%22%2C%22ca_city%22%3A%22wh%22%2C%22sessionid%22%3A%224d3eb87b-fee1-4d7c-e053-7c5f698cd529%22%7D; preTime=%7B%22last%22%3A1591926266%2C%22this%22%3A1591926094%2C%22pre%22%3A1591926094%7D; Hm_lpvt_936a6d5df3f3d309bda39e92da3dd52f=1591926269'
}

def get_urls(url):
    """Fetch a listing page and return the absolute URLs of every car detail page.

    :param url: URL of one listing (index) page, e.g. a page of search results.
    :return: list of absolute detail-page URLs found on that page.
    """
    resp = requests.get(url, headers=headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    # Hrefs on the listing page are site-relative ("/wh/..."); prefix the host.
    hrefs = html.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@href')
    return ['https://www.guazi.com' + href for href in hrefs]

def parse_detail_page(url):
    """Fetch one car detail page and extract its headline fields.

    :param url: absolute URL of a detail page (as produced by ``get_urls``).
    :return: dict with keys ``title``, ``km``, ``l``, ``speed``.
    """
    resp = requests.get(url, headers=headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//div[@class="product-textbox"]/h2/text()')[0]
    # BUG FIX: the original used a raw string r'\r\n', which replaces the
    # literal backslash sequence "\r\n" instead of an actual CR/LF pair.
    # Use a normal string so real line breaks are stripped from the title.
    title = title.replace('\r\n', '').strip()
    info = html.xpath('//div[@class="product-textbox"]/ul/li/span/text()')
    # Positional fields on the page: index 2 = mileage, 3 = displacement,
    # 4 = transmission. NOTE(review): assumes the page layout is stable —
    # verify indices if guazi.com changes its markup.
    infos = {}
    infos['title'] = title
    infos['km'] = info[2]
    infos['l'] = info[3]
    infos['speed'] = info[4]
    return infos

def save_data(infos, f):
    """Append one car record to an open CSV file.

    Uses ``csv.writer`` instead of hand-built ``'{},{},...'`` formatting so
    that fields containing commas (common in car titles) are properly quoted
    and do not corrupt the CSV columns.

    :param infos: dict with keys ``title``, ``km``, ``l``, ``speed``.
    :param f: writable text file object.
    """
    # lineterminator='\n' keeps the original one-'\n'-per-row output.
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow([infos['title'], infos['km'], infos['l'], infos['speed']])



def main():
    """Crawl listing pages 1-5, parse every detail page, and append rows to CSV."""
    base_url = 'https://www.guazi.com/wh/buy/o{}'

    with open('guazi_cs.csv','a',encoding='utf-8') as f:
        # One listing page per iteration; each yields many detail pages.
        for page in range(1, 6):
            listing_url = base_url.format(page)
            for detail_url in get_urls(listing_url):
                save_data(parse_detail_page(detail_url), f)

if __name__ == '__main__':
    main()

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值