爬虫 - 多线程、进程池、协程

1 篇文章 0 订阅
1 篇文章 0 订阅

进程池(此处实际使用的是 multiprocessing.dummy 提供的线程池)

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
#import threading

# multiprocessing.dummy is the thread-based module of the multiprocessing package; its Pool class is a thread pool.
from multiprocessing.dummy import Pool
import requests
import Queue
import time

class Douban(object):
    """Scrape the title and rating of the Douban Top 250 movies with a thread pool.

    Every listing page is downloaded and parsed by a worker of a
    ``multiprocessing.dummy.Pool``; the records are collected in a queue and
    printed once all workers have finished.
    """

    def __init__(self, timeout=10):
        """Prepare the page URLs, the request headers and the result queue.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per listing page: start=0, 25, ..., 225.
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.timeout = timeout

        # Queue that collects the scraped records from all workers.
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        """Download one listing page and pass its HTML to ``parse_page``.

        A failed request (network error, timeout or HTTP error status) is
        reported and skipped so that the remaining pages are still processed.

        Args:
            url: Address of the listing page to download.
        """
        print(url)
        try:
            # Without a timeout a stalled connection would block this worker forever.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
        except requests.RequestException as err:
            print("[ERROR]: request failed for %s: %s" % (url, err))
            return
        # NOTE(review): this pause runs after the request was already sent, so it
        # only delays parsing; kept from the original script - confirm it is wanted.
        time.sleep(1)
        self.parse_page(response.content)

    def parse_page(self, html):
        """Extract one record per movie entry and put it on the result queue.

        Each record is the rating, a tab character and the title.

        Args:
            html: Raw HTML of one listing page.
        """
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")

        for node in node_list:
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Skip an entry that lacks a title or a rating instead of raising IndexError.
            if not titles or not scores:
                continue

            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all pages concurrently, then print every record and the total."""
        # One pool worker (a thread) per page.
        pool = Pool(len(self.url_list))
        try:
            pool.map(self.send_request, self.url_list)
        finally:
            # Always release the workers, even if one of them raised.
            pool.close()
            pool.join()

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1

        print("\n%d" % self.count)


if __name__ == "__main__":
    # Build the crawler before starting the clock so only the crawl is timed.
    crawler = Douban()
    began_at = time.time()
    crawler.start_work()

    elapsed = time.time() - began_at
    print("[INFO]: Useing %f secend" % elapsed)

多线程

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
import requests
import threading
import Queue
import time

class Douban(object):
    """Scrape the title and rating of the Douban Top 250 movies with one thread per page.

    Every listing page is downloaded and parsed in its own
    ``threading.Thread``; the records are collected in a queue and printed
    once all threads have finished.
    """

    def __init__(self, timeout=10):
        """Prepare the page URLs, the request headers and the result queue.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per listing page: start=0, 25, ..., 225.
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.timeout = timeout

        # Queue that collects the scraped records from all threads.
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        """Download one listing page and pass its HTML to ``parse_page``.

        A failed request (network error, timeout or HTTP error status) is
        reported and skipped so that the thread ends cleanly.

        Args:
            url: Address of the listing page to download.
        """
        print(url)
        try:
            # Without a timeout a stalled connection would block this thread forever.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
        except requests.RequestException as err:
            print("[ERROR]: request failed for %s: %s" % (url, err))
            return
        # NOTE(review): this pause runs after the request was already sent, so it
        # only delays parsing; kept from the original script - confirm it is wanted.
        time.sleep(1)
        self.parse_page(response.content)

    def parse_page(self, html):
        """Extract one record per movie entry and put it on the result queue.

        Each record is the rating, a tab character and the title.

        Args:
            html: Raw HTML of one listing page.
        """
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")

        for node in node_list:
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Skip an entry that lacks a title or a rating instead of raising IndexError.
            if not titles or not scores:
                continue

            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all pages in parallel threads, then print every record and the total."""
        thread_list = []
        for url in self.url_list:
            worker = threading.Thread(target=self.send_request, args=(url,))
            worker.start()
            thread_list.append(worker)

        # Block until every worker thread has finished before reading the queue.
        for worker in thread_list:
            worker.join()

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1

        print("\n%d" % self.count)


if __name__ == "__main__":
    # Build the crawler before starting the clock so only the crawl is timed.
    crawler = Douban()
    began_at = time.time()
    crawler.start_work()

    elapsed = time.time() - began_at
    print("[INFO]: Useing %f secend" % elapsed)

协程

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
import requests
import Queue
import time

import gevent
from gevent import monkey
# NOTE(review): gevent's documentation advises patching as early as possible;
# here requests, Queue and time are imported before this call - confirm that
# the order is intentional.
monkey.patch_all()
# gevent lets asynchronous programs be written with synchronous-looking syntax.
# monkey.patch_all() patches the networking libraries (socket, select) at run time
# so that they become asynchronous.
# As a result the program's network operations are carried out asynchronously.

class Douban(object):
    """Scrape the title and rating of the Douban Top 250 movies with gevent coroutines.

    Every listing page is downloaded and parsed by its own greenlet; the
    records are collected in a queue and printed once all greenlets have
    finished.
    """

    def __init__(self, timeout=10):
        """Prepare the page URLs, the request headers and the result queue.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per listing page: start=0, 25, ..., 225.
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.timeout = timeout

        # Queue that collects the scraped records from all greenlets.
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        """Download one listing page and pass its HTML to ``parse_page``.

        A failed request (network error, timeout or HTTP error status) is
        reported and skipped so that the greenlet ends cleanly.

        Args:
            url: Address of the listing page to download.
        """
        print(url)
        try:
            # Without a timeout a stalled connection would keep this greenlet waiting forever.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
        except requests.RequestException as err:
            print("[ERROR]: request failed for %s: %s" % (url, err))
            return
        # NOTE(review): this pause runs after the request was already sent, so it
        # only delays parsing; kept from the original script - confirm it is wanted.
        time.sleep(1)
        self.parse_page(response.content)

    def parse_page(self, html):
        """Extract one record per movie entry and put it on the result queue.

        Each record is the rating, a tab character and the title.

        Args:
            html: Raw HTML of one listing page.
        """
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")

        for node in node_list:
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Skip an entry that lacks a title or a rating instead of raising IndexError.
            if not titles or not scores:
                continue

            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all pages in concurrent greenlets, then print every record and the total."""
        # Spawn one greenlet per page and wait until all of them have finished.
        job_list = [gevent.spawn(self.send_request, url) for url in self.url_list]
        gevent.joinall(job_list)

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1

        print("\n%d" % self.count)


if __name__ == "__main__":
    # Build the crawler before starting the clock so only the crawl is timed.
    crawler = Douban()
    began_at = time.time()
    crawler.start_work()

    elapsed = time.time() - began_at
    print("[INFO]: Useing %f secend" % elapsed)

 

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值