多线程爬虫案例(浅)一

单线程爬虫:

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import time


class bdjSpider():
    """Single-threaded crawler for the text list pages of budejie.com.

    Pages are downloaded one after another from ``start_url + page_number``
    and every text node matched by the XPath in :meth:`get_content` is
    printed to stdout.
    """

    def __init__(self, max_page=50, timeout=10):
        """Set up the crawl target and the request settings.

        Args:
            max_page: Number of the last list page to crawl; pages are
                numbered from 1. Defaults to 50.
            timeout: Timeout in seconds handed to ``requests.get``.
                Defaults to 10.
        """
        self.start_url = 'http://www.budejie.com/text/'
        # Browser-like User-Agent header sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/62.0.3202.62 Safari/537.36'
        }
        self.max_page = max_page
        self.timeout = timeout

    def get_url(self):
        """Return the page URLs for pages 1 through ``max_page`` as a list."""
        return [self.start_url + str(page_num)
                for page_num in range(1, self.max_page + 1)]

    def parse(self, url):
        """Download *url* and return the raw response body.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the timeout expires.
        """
        # Without a timeout, requests waits indefinitely on a stalled server.
        ret = requests.get(url, headers=self.headers, timeout=self.timeout)
        return ret.content

    def get_content(self, response):
        """Parse *response* as HTML and return the matched text nodes.

        The XPath selects the text of ``<a>`` elements that are direct
        children of ``<div class="j-r-list-c-desc">``.
        """
        ret = etree.HTML(response)
        return ret.xpath('//div[@class="j-r-list-c-desc"]/a/text()')

    def run(self):
        """Crawl every page in order and print each extracted text."""
        for url in self.get_url():
            response = self.parse(url)
            for joke in self.get_content(response):
                print(joke)


if __name__ == '__main__':
    # Time one full crawl and print the elapsed wall-clock seconds.
    began_at = time.time()
    spider = bdjSpider()
    spider.run()
    print(time.time() - began_at)

多线程爬虫

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import time
from queue import Queue
import threading


class bdjSpider():
    """Multi-threaded crawler for the text list pages of budejie.com.

    The work is split into three stages connected by two queues:

    * one producer thread puts page URLs on ``url_list_queue``;
    * ``parse_workers`` threads download pages and put the raw bodies on
      ``response_queue``;
    * one thread extracts the post texts and prints them.
    """

    def __init__(self, max_page=50, parse_workers=3, timeout=10):
        """Set up the crawl target, request settings and the queues.

        Args:
            max_page: Number of the last list page to crawl; pages are
                numbered from 1. Defaults to 50.
            parse_workers: Number of download threads started by
                :meth:`run`. Defaults to 3.
            timeout: Timeout in seconds handed to ``requests.get``.
                Defaults to 10.
        """
        self.start_url = 'http://www.budejie.com/text/'
        # Browser-like User-Agent header sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/62.0.3202.62 Safari/537.36'
        }
        self.max_page = max_page
        self.parse_workers = parse_workers
        self.timeout = timeout
        self.url_list_queue = Queue()  # URLs waiting to be downloaded
        self.response_queue = Queue()  # page bodies waiting to be parsed

    def get_url(self):
        """Put the URLs of pages 1 through ``max_page`` on the URL queue."""
        for page_num in range(1, self.max_page + 1):
            self.url_list_queue.put(self.start_url + str(page_num))

    def parse(self):
        """Download queued URLs forever, forwarding bodies to the next stage.

        Intended to run in a daemon thread: the loop never returns.
        """
        while True:
            url = self.url_list_queue.get()
            try:
                # Without a timeout a stalled server would block this
                # worker indefinitely.
                ret = requests.get(url, headers=self.headers,
                                   timeout=self.timeout)
                # Enqueue the body before task_done() so the response queue
                # already counts it when the URL queue reports completion.
                self.response_queue.put(ret.content)
            except Exception as exc:
                # Worker boundary: report and keep going. If the thread
                # died here, the remaining URLs would never be processed
                # and run() would block in join() forever.
                print('failed to fetch {}: {!r}'.format(url, exc))
            finally:
                # Always balance the get(); join() waits on this count.
                self.url_list_queue.task_done()

    def get_content(self):
        """Parse queued page bodies forever and print the matched texts.

        Intended to run in a daemon thread: the loop never returns.
        """
        while True:
            response = self.response_queue.get()
            try:
                ret = etree.HTML(response)
                content_list = ret.xpath(
                    '//div[@class="j-r-list-c-desc"]/a/text()')
                for content in content_list:
                    print(content)
            except Exception as exc:
                # Worker boundary: one unparseable page must not kill the
                # only parser thread, or join() would never return.
                print('failed to parse a page: {!r}'.format(exc))
            finally:
                self.response_queue.task_done()

    def run(self):
        """Start all stages, wait until every page is handled, then return."""
        # daemon=True replaces the deprecated Thread.setDaemon(); the worker
        # loops never return, so they must not keep the process alive.
        producer = threading.Thread(target=self.get_url, daemon=True)
        workers = [threading.Thread(target=self.parse, daemon=True)
                   for _ in range(self.parse_workers)]
        workers.append(threading.Thread(target=self.get_content, daemon=True))

        producer.start()
        for worker in workers:
            worker.start()

        # Wait for the producer first: joining a queue that has not been
        # filled yet returns immediately and would end the crawl early.
        producer.join()
        # Order matters: once the URL queue is done, every response has
        # already been enqueued, so the second join covers all of them.
        self.url_list_queue.join()
        self.response_queue.join()
        print('运行结束')


if __name__ == '__main__':
    # Measure how long the threaded crawl takes, in wall-clock seconds.
    t_begin = time.time()
    crawler = bdjSpider()
    crawler.run()
    t_end = time.time()
    elapsed = t_end - t_begin
    print(elapsed)

对比

多线程运行速度快于单线程(废话)

此处采用队列(Queue)和线程(threading)

在类中初始化队列容器

例如:

self.response_queue = Queue()

在每个方法中在队列中get数据

并向队列put数据

采用task_done() 标记取出的任务已处理完成(未完成任务计数减一;数据在get()时已经从队列中取出,task_done()本身并不删除数据)

def parse(self):  # main request loop
        # Loops forever; each iteration handles one URL taken from the queue.
        while True:
            # Take the next URL to download from the URL queue.
            url = self.url_list_queue.get()
            ret = requests.get(url, headers=self.headers)
            response = ret.content
            # Hand the raw body to the next stage through the response queue.
            self.response_queue.put(response)
            # Mark this URL as finished so that url_list_queue.join() can
            # eventually return.
            self.url_list_queue.task_done()
在逻辑运行run函数中

建立线程 threading.Thread(target=函数名)

把所有线程加入到线程列表thread_list

for t in thread_list:

# 设置守护线程

t.setDaemon(True)

# 运行线程

t.start()

按照以上写法,主线程结束后,守护的子线程也会随之结束,这不是我想要的

所以采用

for q in [self.url_list_queue, self.response_queue]:
	q.join()  
join()会阻塞主线程,直到队列中的所有任务都被task_done()标记完成;之后主线程结束,守护线程随之退出




 

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值