线程的并发

一、单线程

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import requests
from Queue import Queue
from lxml import etree
import time

class Douban(object):
    """Sequential crawler for the Douban Top 250 movie listing.

    Pages are fetched one at a time; every movie found is queued as a
    "score<TAB>title" string and printed after the last page is done.
    """

    # Upper bound, in seconds, on how long a single HTTP request may block.
    request_timeout = 10

    def __init__(self):
        # Ten listing pages (start=0, 25, ..., 225).
        self.start_urls = ["https://movie.douban.com/top250?start=" + str(num) for num in range(0, 226, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        # Holds the "score<TAB>title" strings produced by parse_page.
        self.data_queue = Queue()

    def send_request(self, url):
        """Fetch one listing page and pass the response to parse_page.

        :param url: address of the listing page to download.
        """
        print("[INFO]: 正在处理%s " % url)
        # Without a timeout, requests would wait indefinitely on a stalled server.
        response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
        # Pause for one second after every request.
        time.sleep(1)
        self.parse_page(response)

    def parse_page(self, response):
        """Queue the rating and title of every movie found on a page.

        :param response: object whose ``content`` attribute holds the page HTML.
        """
        html_obj = etree.HTML(response.content)

        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title
            titles = node.xpath("./div[@class='hd']/a/span[1]/text()")
            # Movie rating
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Skip an entry missing either field instead of aborting the
            # whole page with an IndexError.
            if not titles or not scores:
                continue
            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Crawl every start URL in order, then print the collected rows."""
        for url in self.start_urls:
            self.send_request(url)

        while not self.data_queue.empty():
            print(self.data_queue.get())

if __name__ == "__main__":
    # Time the whole run, from building the spider to the last printed row.
    began_at = time.time()
    Douban().start_work()
    elapsed = time.time() - began_at
    print("[INFO]: Useing time %f" % elapsed)
    # Sample output recorded by the author: [INFO]: Useing time 12.071644

二、多线程 threading


#!/usr/bin/env python
# -*- coding:utf-8 -*-

import requests
from Queue import Queue
from lxml import etree
import time

import threading

class Douban(object):
    """Thread-per-page crawler for the Douban Top 250 movie listing.

    Each listing page is downloaded and parsed in its own thread; every
    movie found is queued as a "score<TAB>title" string and printed by the
    main thread once all workers have finished.
    """

    # Upper bound, in seconds, on how long a single HTTP request may block.
    request_timeout = 10

    def __init__(self):
        # Ten listing pages (start=0, 25, ..., 225).
        self.start_urls = ["https://movie.douban.com/top250?start=" + str(num) for num in range(0, 226, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        # Holds the "score<TAB>title" strings produced by parse_page.
        self.data_queue = Queue()

    def send_request(self, url):
        """Fetch one listing page and pass the response to parse_page.

        :param url: address of the listing page to download.
        """
        print("[INFO]: 正在处理%s " % url)
        # Without a timeout, requests would wait indefinitely on a stalled server.
        response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
        # Pause for one second after every request.
        time.sleep(1)
        self.parse_page(response)

    def parse_page(self, response):
        """Queue the rating and title of every movie found on a page.

        :param response: object whose ``content`` attribute holds the page HTML.
        """
        html_obj = etree.HTML(response.content)

        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title
            titles = node.xpath("./div[@class='hd']/a/span[1]/text()")
            # Movie rating
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Skip an entry missing either field instead of aborting the
            # whole page with an IndexError.
            if not titles or not scores:
                continue
            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all start URLs concurrently, one thread each, then print the rows."""
        # Single-threaded alternative: call self.send_request(url) for each
        # URL in turn instead of starting threads.
        workers = []
        for url in self.start_urls:
            worker = threading.Thread(target=self.send_request, args=(url,))
            worker.start()
            workers.append(worker)

        # Make the main thread wait for every worker before it continues.
        for worker in workers:
            worker.join()

        while not self.data_queue.empty():
            print(self.data_queue.get())

if __name__ == "__main__":
    # Measure wall-clock time for the threaded crawl, spider creation included.
    t0 = time.time()
    crawler = Douban()
    crawler.start_work()
    duration = time.time() - t0
    print("[INFO]: Useing time %f" % duration)
    # Sample output recorded by the author: [INFO]: Useing time 1.2071644

三、多线程:multiprocessing.dummy

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import requests
from Queue import Queue
from lxml import etree
import time

#import threading
# 多进程类库里的多线程模块 dummy
from multiprocessing.dummy import Pool

class Douban(object):
    """Thread-pool crawler for the Douban Top 250 movie listing.

    Listing pages are downloaded and parsed by a ``multiprocessing.dummy``
    pool; every movie found is queued as a "score<TAB>title" string and
    printed once the pool has finished.
    """

    # Upper bound, in seconds, on how long a single HTTP request may block.
    request_timeout = 10

    def __init__(self):
        # Ten listing pages (start=0, 25, ..., 225).
        self.start_urls = ["https://movie.douban.com/top250?start=" + str(num) for num in range(0, 226, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        # Holds the "score<TAB>title" strings produced by parse_page.
        self.data_queue = Queue()

    def send_request(self, url):
        """Fetch one listing page and pass the response to parse_page.

        :param url: address of the listing page to download.
        """
        print("[INFO]: 正在处理%s " % url)
        # Without a timeout, requests would wait indefinitely on a stalled server.
        response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
        # Pause for one second after every request.
        time.sleep(1)
        self.parse_page(response)

    def parse_page(self, response):
        """Queue the rating and title of every movie found on a page.

        :param response: object whose ``content`` attribute holds the page HTML.
        """
        html_obj = etree.HTML(response.content)

        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title
            titles = node.xpath("./div[@class='hd']/a/span[1]/text()")
            # Movie rating
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Skip an entry missing either field instead of aborting the
            # whole page with an IndexError.
            if not titles or not scores:
                continue
            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all start URLs through a thread pool, then print the rows."""
        # Alternatives to the pool used here:
        #   * single-threaded: call self.send_request(url) for each URL in turn;
        #   * threading: start one threading.Thread per URL and join() them all.

        # Pool() rejects a size of zero, so only build one when there is work.
        if self.start_urls:
            # One pool thread per URL.
            pool = Pool(len(self.start_urls))
            try:
                # Run send_request for every URL; map() returns when all are done.
                pool.map(self.send_request, self.start_urls)
            finally:
                # Shut the pool down even when a request raised, so its
                # worker threads are not left behind.
                pool.close()
                # Wait for the pool threads to exit before continuing.
                pool.join()

        while not self.data_queue.empty():
            print(self.data_queue.get())

if __name__ == "__main__":
    # Measure wall-clock time for the pooled crawl, spider creation included.
    started = time.time()
    Douban().start_work()
    took = time.time() - started
    print("[INFO]: Useing time %f" % took)
    # Sample output recorded by the author: [INFO]: Useing time 1.2071644

四、协程 gevent


#!/usr/bin/env python
# -*- coding:utf-8 -*-

# gevent has to patch the standard library before any other module is
# imported. Modules imported earlier (requests and the socket/ssl code it
# pulls in) may otherwise keep references to the unpatched, blocking versions.
import gevent
from gevent import monkey
monkey.patch_all()
# From here on the patched network and sleep calls yield to other greenlets
# instead of blocking, so the requests issued below run cooperatively.

import requests
from Queue import Queue
from lxml import etree
import time

#import threading
# Thread-pool API provided by the multiprocessing package ("dummy" module)
#from multiprocessing.dummy import Pool

class Douban(object):
    """gevent-based crawler for the Douban Top 250 movie listing.

    One greenlet is spawned per listing page; every movie found is queued
    as a "score<TAB>title" string and printed once all greenlets are done.
    The module-level ``monkey.patch_all()`` call is what lets the requests
    and sleeps below overlap.
    """

    # Upper bound, in seconds, on how long a single HTTP request may block.
    request_timeout = 10

    def __init__(self):
        # Ten listing pages (start=0, 25, ..., 225).
        self.start_urls = ["https://movie.douban.com/top250?start=" + str(num) for num in range(0, 226, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        # Holds the "score<TAB>title" strings produced by parse_page.
        self.data_queue = Queue()

    def send_request(self, url):
        """Fetch one listing page and pass the response to parse_page.

        :param url: address of the listing page to download.
        """
        print("[INFO]: 正在处理%s " % url)
        # Without a timeout, requests would wait indefinitely on a stalled server.
        response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
        # Pause for one second after every request.
        time.sleep(1)
        self.parse_page(response)

    def parse_page(self, response):
        """Queue the rating and title of every movie found on a page.

        :param response: object whose ``content`` attribute holds the page HTML.
        """
        html_obj = etree.HTML(response.content)

        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title
            titles = node.xpath("./div[@class='hd']/a/span[1]/text()")
            # Movie rating
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Skip an entry missing either field instead of aborting the
            # whole page with an IndexError.
            if not titles or not scores:
                continue
            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all start URLs in gevent greenlets, then print the rows."""
        # Alternatives to the greenlets used here:
        #   * single-threaded: call self.send_request(url) for each URL in turn;
        #   * threading: start one threading.Thread per URL and join() them all;
        #   * multiprocessing.dummy: Pool(n).map(self.send_request, urls),
        #     followed by close() and join().

        # Create one greenlet per URL ...
        job_list = [gevent.spawn(self.send_request, url) for url in self.start_urls]
        # ... and wait until every one of them has finished.
        gevent.joinall(job_list)

        while not self.data_queue.empty():
            print(self.data_queue.get())

if __name__ == "__main__":
    # Measure wall-clock time for the gevent crawl, spider creation included.
    clock_start = time.time()
    douban = Douban()
    douban.start_work()
    print("[INFO]: Useing time %f" % (time.time() - clock_start))
    # Sample output recorded by the author: [INFO]: Useing time 1.2071644
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值