多线程+队列--爬虫

最新推荐文章于 2024-01-24 20:46:06 发布

可爱的大崔儿

最新推荐文章于 2024-01-24 20:46:06 发布

阅读量591

点赞数 1

参考文章：原文链接我找不到了，这里贴出原博主的代码，写得特别详尽。

# coding=utf-8
import requests
from lxml import etree
import threading
from queue import Queue

# https://docs.python.org/3/library/queue.html#module-queue
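# Quick reference for queue.Queue methods
# q.qsize() returns the approximate number of items in the queue
# q.empty() returns True if the queue is empty, otherwise False
# q.full() returns True if the queue is full, otherwise False
# q.full() is relative to the maxsize passed to the Queue constructor
# q.get(block=True, timeout=None) removes and returns an item; timeout is the maximum time to wait
# q.get_nowait() is equivalent to q.get(False)
# q.put(item, block=True, timeout=None) adds an item; timeout is the maximum time to wait for a free slot
# q.put_nowait(item) is equivalent to q.put(item, False)
# q.task_done() tells the queue that one item previously fetched with get() has been fully processed
# q.join() blocks until every item that was put() has been fetched and marked with task_done()
#          (an empty queue alone is not enough: each get() must be matched by a task_done())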


class QiubaiSpdier:
    """Multi-threaded crawler for the qiushibaike "8hr" index pages.

    Work flows through three queues, each drained by its own worker threads:
    ``url_queue`` (page URLs) -> ``html_queue`` (decoded page HTML) ->
    ``content_queue`` (one list of parsed item dicts per page).
    """

    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        """Put the URLs of index pages 1 through 13 on ``url_queue``."""
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        """Worker loop: fetch every queued URL and queue the decoded HTML.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            url = self.url_queue.get()
            try:
                print(url)
                # A timeout keeps one stalled connection from blocking
                # url_queue.join() forever.
                response = requests.get(url, headers=self.headers, timeout=30)
                # Queue the page *before* task_done() so html_queue already
                # has pending work when url_queue.join() returns.
                self.html_queue.put(response.content.decode())
            except Exception as exc:
                # Thread top-level boundary: report the page and keep going;
                # a dead worker would leave queued URLs unprocessed.
                print("failed to fetch {}: {!r}".format(url, exc))
            finally:
                # Every get() must be matched by task_done(), even on
                # failure, otherwise url_queue.join() never returns.
                self.url_queue.task_done()

    @staticmethod
    def _parse_item(div):
        """Build the item dict for one post ``div`` element.

        Fields whose XPath matches nothing are set to None (``content`` is
        an empty list in that case).
        """
        item = {}
        texts = div.xpath(".//div[@class='content']/span/text()")
        item["content"] = [text.replace("\n", "") for text in texts]
        gender = div.xpath(".//div[contains(@class,'articleGender')]/@class")
        # The last class name looks like "<gender>Icon"; strip the suffix.
        item["author_gender"] = gender[0].split(" ")[-1].replace("Icon", "") if gender else None
        age = div.xpath(".//div[contains(@class,'articleGender')]/text()")
        # NOTE: the misspelled key "auhtor_age" is kept as-is so the emitted
        # dicts stay unchanged.
        item["auhtor_age"] = age[0] if age else None
        content_img = div.xpath(".//div[@class='thumb']/a/img/@src")
        item["content_img"] = "https:" + content_img[0] if content_img else None
        author_img = div.xpath(".//div[@class='author clearfix']//img/@src")
        item["author_img"] = "https:" + author_img[0] if author_img else None
        votes = div.xpath(".//span[@class='stats-vote']/i/text()")
        item["stats_vote"] = votes[0] if votes else None
        return item

    def get_content_list(self):
        """Worker loop: parse each queued HTML page into a list of item dicts.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            html_str = self.html_queue.get()
            try:
                html = etree.HTML(html_str)
                # One child div of #content-left per post.
                div_list = html.xpath("//div[@id='content-left']/div")
                content_list = [self._parse_item(div) for div in div_list]
                self.content_queue.put(content_list)
            except Exception as exc:
                # Thread top-level boundary: skip the unparsable page.
                print("failed to parse page: {!r}".format(exc))
            finally:
                self.html_queue.task_done()

    def save_content_list(self):
        """Worker loop: "save" each parsed page (currently just prints items).

        Never returns; intended to run in a daemon thread.
        """
        while True:
            content_list = self.content_queue.get()
            try:
                for item in content_list:
                    print(item)
            finally:
                self.content_queue.task_done()

    def run(self):
        """Crawl all index pages and return once every queue is drained."""
        # Fill the URL queue before anything is joined. If this ran in its
        # own thread, url_queue.join() could see a still-empty queue and let
        # the main thread finish before any work had been queued.
        self.get_url_list()

        # Workers take no arguments; they pull their input from the queues.
        thread_list = [threading.Thread(target=self.parse_url) for _ in range(20)]
        thread_list += [threading.Thread(target=self.get_content_list) for _ in range(2)]
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # The workers loop forever, so they must be daemon threads or the
            # process could never exit. Must be set before start().
            # (Thread.setDaemon() is deprecated since Python 3.10.)
            t.daemon = True
            t.start()

        # Join in pipeline order: each stage queues its output before calling
        # task_done(), so the next queue is non-empty when the previous
        # join() returns.
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()

        print("主线程结束")



if __name__ == '__main__':
    # Script entry point: build the spider and crawl until all queues drain.
    QiubaiSpdier().run()

我自己的代码在下面展示，这里主要讲解一下我遇到的问题
问题一：self.dateInfo.task_done()。每从队列里 get() 取出一项并处理完之后，都要调用一次 task_done()，告诉队列这一项任务已经完成；否则队列的未完成任务计数永远不会归零，主线程会一直卡在 q.join() 上，无限挂起。
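问题二：t.setDaemon(True) 把子线程设置为守护线程。守护线程不会阻止程序退出：主线程结束时，子线程随之结束。因为这里的工作线程都是 while True 循环，不会自己结束，所以必须设为守护线程，程序才能在主线程结束后退出。这行代码一定要写在 start() 之前。它的作用刚好与 Thread.join() 相反：join 是让主线程等待子线程结束，守护线程则是让主线程不必等待子线程。（Python 3.10 起 setDaemon() 已弃用，推荐写成 t.daemon = True。）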

from cui.mybaseCode import *
from queue import Queue
import time
import threading

class Batch:
    """Fetch per-date data from the databank API and insert it into a database.

    ``dateInfo`` holds the dates still to be fetched, ``content_queue`` holds
    fetched rows waiting to be inserted, and ``cateInfo`` is filled by
    ``getCrowInfo`` (nothing in this class consumes it yet).
    """

    def __init__(self,cookie,dates):
        self.cookie = cookie

        # NOTE(review): the database password is hard-coded in source; it
        # should come from configuration or the environment instead.
        self.pgdb_conn = pg.connect(database="cui", user="postgres", password="4OZ5EvxekT", host="127.0.0.1",
                                    port="5432")
        self.cur = self.pgdb_conn.cursor()
        self.endindex = 0
        self.dates=dates
        self.dateInfo = Queue()
        self.cateInfo =Queue()
        self.content_queue = Queue()

    def getCrowInfo(self):
        """Fetch the category list and queue ``[bizId, name]`` pairs on ``cateInfo``.

        A final ``['', '全部']`` entry is appended after the fetched ones.
        """
        three_url = 'https://databank.yushanfang.com/api/paasapi?path=/api/dimension/listChildDimension&type=CATEGORY&id='
        response = requests_method(self.cookie, 'get', three_url, {})
        info = json.loads(response)

        for item in info['data']:
            self.cateInfo.put([item['bizId'], item['name']])
        self.cateInfo.put(['', '全部'])
        print(self.cateInfo)

    def test(self):
        """Print a marker when ``content_queue`` is currently empty."""
        if self.content_queue.empty():
            print('cui')

    def putDate(self):
        """Queue every entry of ``self.dates`` on ``dateInfo``."""
        for mydate in self.dates:
            self.dateInfo.put(mydate)

    def getData(self):
        """Take ONE date from ``dateInfo``, fetch its data and queue the rows.

        Blocks until a date is available. Each queued row is
        ``[name, cnt, str(date) + category_name]``.
        """
        print('开始爬取')
        # Only the catch-all category is used for now, so the cateId branch
        # below is currently never taken.
        cate=['','全部']
        date =  self.dateInfo.get()
        try:
            first_url = 'https://databank.yushanfang.com/api/ecapi'
            params={
                'path': '/databank/crowdFullLink/detail',
                'beginTheDate': date,
                # NOTE(review): int(date)+1 assumes an integer-like date
                # string and does not roll over month ends -- confirm the
                # format produced by days().
                'endTheDate':str(int(date)+1)
            }
            if cate[0] != '':
                print(cate[0])
                params['cateId']=str(cate[0])
            res =json.loads(requests_method(self.cookie, 'get', first_url, params))

            # The first four entries are skipped -- presumably not wanted
            # here; TODO confirm against the API response.
            for item in res['data'][4:]:
                self.content_queue.put([item['name'], item['cnt'], str(date) +str(cate[1])])
        finally:
            # Every get() must be matched by task_done(), even when the
            # request fails, otherwise dateInfo.join() never returns.
            self.dateInfo.task_done()
        time.sleep(random.random())
        print(self.dateInfo.empty())

    def _getDataLoop(self):
        """Worker loop: keep calling ``getData`` so all queued dates get processed."""
        while True:
            try:
                self.getData()
            except Exception as exc:
                # Thread top-level boundary: report the failed date and keep
                # the worker alive for the remaining ones.
                print('getData failed: %r' % (exc,))

    def save_content_list(self):
        """Worker loop: insert every row from ``content_queue`` into ``rightconner``.

        Never returns; intended to run in a daemon thread.
        """
        print('开始存储')
        while True:
            mycontent = self.content_queue.get()
            try:
                print('mycontent---------------:%s'%(mycontent))
                self.cur.execute("insert into rightconner values(%s,%s,%s)",
                    (mycontent[0],mycontent[1],mycontent[2]))
            except Exception as exc:
                # Report and skip the row so the queue can still drain.
                print('insert failed: %r' % (exc,))
            finally:
                self.content_queue.task_done()
            print('储存数据---------:%s'%(self.content_queue.empty()))

    def run(self):
        """Fetch all dates, store the results, commit, and release the DB handles."""
        # Queue the dates exactly once, before any join(). Running putDate in
        # several threads enqueued every date once per thread, and joining a
        # queue its producer has not filled yet returns immediately.
        self.putDate()

        # TODO: categories from getCrowInfo() are not consumed yet.
        thread_list = [threading.Thread(target=self._getDataLoop) for _ in range(3)]
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # The workers loop forever, so they must be daemon threads or the
            # process could never exit. Must be set before start().
            # (Thread.setDaemon() is deprecated since Python 3.10.)
            t.daemon = True
            t.start()

        try:
            # Rows are queued before dateInfo.task_done(), so content_queue
            # already holds its pending work once dateInfo.join() returns.
            for q in [self.dateInfo, self.content_queue]:
                q.join()
            self.pgdb_conn.commit()
        finally:
            self.cur.close()
            self.pgdb_conn.close()
        print("主线程结束")



if __name__ == '__main__':
    # Script entry point: run one batch over 2017-09-01 .. 2017-09-10.
    # NOTE(review): the cookie string is hard-coded here; it presumably
    # belongs to a real session and should not live in source.
    session_cookie = 'cna=Xl4OFI+aFD8CAdINZPa5UoXn; _tb_token_=QNhOPtgFPRg96H1MLojb; bs_n_lang=zh_CN; c_token=6de723623112a374f649f951c25a6eab; ck2=09c36266afd1c4b7d35b734d859c4f55; an=5LiK5rW354K55q2j5LqS6IGU572R56eR5oqA; lg=true; sg=A09; lvc=sAhojP%2BY2S2dOA%3D%3D; isg=BJSUSroks7nCWicMTbkpDcFLZdIMtbhenvNPcC51Mp-iGTRjVvi0ZdafHVEk4fAv'
    first_day, last_day = calculateDays('2017-09-01', '2017-09-10', '1')
    batch = Batch(session_cookie, days(first_day, last_day))
    batch.run()

可爱的大崔儿

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
多线程+队列--爬虫

参考文章：连接我找不到了，这里有原博主的代码，写的特别详尽# coding=utf-8import requestsfrom lxml import etreeimport threadingfrom queue import Queue# https://docs.python.org/3/library/queue.html#module-queue# 队列使用方法简介# q...
复制链接

扫一扫