Multi-threaded scraping of Maodou new-car detail pages

# Crawl the detail pages with multiple threads (producer/consumer pattern)
import requests
from lxml import etree
from queue import Queue, Empty
import threading
import json

num = 0                  # running counter for saved records, shared by the consumer threads
lock = threading.Lock()  # guards num and the output file against concurrent writes

class shengchanshang(threading.Thread):
    """Producer: takes detail-page URLs from car_queue, downloads the HTML,
    and puts the page source onto url_queue for the consumers."""
    def __init__(self, car_queue):
        threading.Thread.__init__(self)
        self.car_queue = car_queue

    def run(self):
        while True:
            if self.car_queue.empty():
                break
            url = self.car_queue.get()
            self.get_response(url)

    def get_response(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        response = requests.get(url, headers=headers).content.decode('utf-8')
        url_queue.put(response)



class xiaofeizhe(threading.Thread):
    """Consumer: takes page HTML from url_queue, parses it, and appends the
    record to che.txt as one JSON object per line."""
    def run(self):
        while True:
            # Stop once the producers are done (flag) and nothing is left to parse.
            if url_queue.empty() and flag:
                break
            try:
                response = url_queue.get(timeout=4)
            except Empty:
                continue
            self.get_data(response)

    def get_data(self, response):
        tree = etree.HTML(response)
        photo = tree.xpath('//img[@class="lazy-slider"]/@src')
        title = tree.xpath('//h2[@class="banner-tit"]/text()')
        price = tree.xpath('//p[@class="price "]/text()')
        # Vehicle configuration: option names and their descriptions, matched by index.
        peizhi_name = tree.xpath('//p[@class="config-tit-top"]/text()')
        peizhi_jieshao = tree.xpath('//p[@class="config-tit-foot"]/text()')
        global num
        with lock:                       # one thread at a time updates num and the file
            num += 1
            data = {
                "num": num,
                "车名": title[0],        # car name
                "图片": photo[0],        # image URL
                "价格": price[0],        # price
                "车辆配置": [{peizhi_name[i]: peizhi_jieshao[i]} for i in range(len(peizhi_name))]  # configuration
            }
            print(data)
            # Append (not overwrite) one JSON line per car, keeping Chinese text readable.
            with open('che.txt', 'a', encoding='utf-8') as fp:
                fp.write(json.dumps(data, ensure_ascii=False) + '\n')

# Extract every detail-page link from one list page.
def biaoqian(list_url):
    response = requests.get(list_url).content.decode('utf-8')
    tree = etree.HTML(response)
    car = tree.xpath('//div[@class="list-wrap clearfix"]/a/@href')
    return car

flag = False          # set to True once every producer thread has finished
url_queue = Queue()   # downloaded page HTML waiting to be parsed
if __name__ == '__main__':
    car_queue = Queue()
    # Build the ten list-page URLs and collect every detail-page link.
    wangye = ['https://www.maodou.com/car/list/all/pg' + str(page) for page in range(1, 11)]
    for i in wangye:
        for j in biaoqian(i):
            car_queue.put(j)

    # Start three producer threads.
    shengchan = []
    for cre in range(3):
        craw1 = shengchanshang(car_queue)
        craw1.start()
        shengchan.append(craw1)

    # Start three consumer threads.
    xiaofei = []
    for cus in range(3):
        custom = xiaofeizhe()
        custom.start()
        xiaofei.append(custom)

    # Wait for the producers, tell the consumers that no more pages are coming,
    # then wait for the consumers to drain url_queue.
    for p in shengchan:
        p.join()
    flag = True
    for c in xiaofei:
        c.join()
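
The flag-plus-timeout shutdown above works because flag is only set after every producer has joined, but it is easy to get wrong (the original version checked car_queue instead of url_queue and could exit with pages still unparsed). A common alternative is to push one sentinel value per consumer onto the work queue once the producers finish. The following is a minimal, self-contained sketch of that pattern under illustrative names (consumer, NUM_CONSUMERS, SENTINEL); it is not part of the original script.

import threading
from queue import Queue

NUM_CONSUMERS = 3
SENTINEL = None                      # one per consumer signals "no more work"

def consumer(work_queue):
    while True:
        item = work_queue.get()      # blocks until an item or a sentinel arrives
        if item is SENTINEL:
            break                    # clean, deterministic exit
        print('processing', item)    # stand-in for parsing and saving a page

if __name__ == '__main__':
    q = Queue()
    for i in range(10):              # stand-in for the downloaded detail pages
        q.put('page-%d' % i)

    threads = [threading.Thread(target=consumer, args=(q,)) for _ in range(NUM_CONSUMERS)]
    for t in threads:
        t.start()

    for _ in range(NUM_CONSUMERS):   # exactly one sentinel per consumer thread
        q.put(SENTINEL)
    for t in threads:
        t.join()

With this scheme the consumers can block on get() indefinitely, so no timeout or shared flag is needed, and each thread exits exactly once it has seen its own sentinel. Since the consumers above now write one JSON object per line, che.txt can later be read back by calling json.loads on each line.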