python3 threading Thread 多线程模板 queue

Python
import queue import threading import re import requests def source(): for x in range(0, 728): yield "url" def do_work(item): try: r = requests.get(item) r.encoding = "utf-8" html = r.text if html: pattern = re.compile('<title>(.*?)</title>', re.S) t = re.search(pattern, html).group(1) # print(t) print(item, t) except requests.exceptions.RequestException as e: print(e) pass def worker(): while True: item = q.get() if item is None: break do_work(item) q.task_done() q = queue.Queue() threads = [] num_worker_threads = 80 for i in range(num_worker_threads): t = threading.Thread(target=worker) t.start() threads.append(t) for item in source(): q.put(item) # block until all tasks are done q.join() # stop workers for _ in range(num_worker_threads): q.put(None) for t in threads: t.join()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import queue
import threading
 
import re
import requests
 
 
def source():
    """Yield the stream of work items for the pool (728 placeholder URLs).

    NOTE(review): "url" is a template placeholder — substitute real URL
    construction (e.g. paginated links) when adapting this template.
    """
    yield from ("url" for _ in range(728))
 
 
def do_work(item):
    """Fetch *item* (a URL) and print its page title.

    Network errors are reported and swallowed so the calling worker
    thread keeps running; pages without a <title> are skipped silently.
    """
    try:
        r = requests.get(item)
        r.encoding = "utf-8"
        html = r.text
        if html:
            match = re.search('<title>(.*?)</title>', html, re.S)
            # Guard the match: the original called .group(1) directly, so a
            # page without a <title> raised AttributeError — an exception the
            # RequestException handler below does not catch, which killed the
            # worker thread that called us.
            if match:
                print(item, match.group(1))
    except requests.exceptions.RequestException as e:
        print(e)
 
 
def worker():
    """Worker-thread loop: consume items from the global queue `q`.

    Exits when it receives a ``None`` sentinel (put by the main script
    during shutdown).
    """
    while True:
        item = q.get()
        if item is None:
            # Shutdown sentinel — do not call task_done() for it; the main
            # script only join()s the queue for real work items.
            break
        try:
            do_work(item)
        finally:
            # Always acknowledge the item, even if do_work() raises —
            # otherwise q.join() in the main script would block forever.
            q.task_done()
 
 
# Shared work queue and worker pool configuration.
q = queue.Queue()
threads = []
num_worker_threads = 80

# Spin up the pool of daemonless worker threads.
for _ in range(num_worker_threads):
    worker_thread = threading.Thread(target=worker)
    worker_thread.start()
    threads.append(worker_thread)

# Feed every work item into the queue.
for job in source():
    q.put(job)

# block until all tasks are done
q.join()

# stop workers: one None sentinel per thread, then reap them all.
for _ in range(num_worker_threads):
    q.put(None)
for worker_thread in threads:
    worker_thread.join()

Python 多线程抓取 豆瓣 top250

Python
#!/usr/bin/env python3 # -*- coding=utf-8 -*- from multiprocessing import Process, Queue import time from lxml import etree import requests class DouBanSpider(Process): def __init__(self, url, q): # 重写写父类的__init__方法 super(DouBanSpider, self).__init__() self.url = url self.q = q self.headers = { 'Host': 'movie.douban.com', 'Referer': 'https://movie.douban.com/top250?start=225&filter=', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36', } def run(self): self.parse_page() def send_request(self,url): ''' 用来发送请求的方法 :return: 返回网页源码 ''' # 请求出错时,重复请求3次, i = 0 while i <= 3: try: print("[INFO]请求url:"+url) return requests.get(url=url,headers=self.headers).content except Exception as e: print('[INFO] %s%s'% (e,url)) i += 1 def parse_page(self): ''' 解析网站源码,并采用xpath提取 电影名称和平分放到队列中 :return: ''' response = self.send_request(self.url) html = etree.HTML(response) # 获取到一页的电影数据 node_list = html.xpath("//div[@class='info']") for move in node_list: # 电影名称 title = move.xpath('.//a/span/text()')[0] # 评分 score = move.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0] # 将每一部电影的名称跟评分加入到队列 self.q.put(score + "\t" + title) def main(): # 创建一个队列用来保存进程获取到的数据 q = Queue() base_url = 'https://movie.douban.com/top250?start=' # 构造所有url url_list = [base_url+str(num) for num in range(0,225+1,25)] # 保存进程 Process_list = [] # 创建并启动进程 for url in url_list: p = DouBanSpider(url,q) p.start() Process_list.append(p) # 让主进程等待子进程执行完成 for i in Process_list: i.join() while not q.empty(): print(q.get()) if __name__=="__main__": start = time.time() main() print('[info]耗时:%s'%(time.time()-start))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
# -*- coding=utf-8 -*-
from multiprocessing import Process , Queue
import time
from lxml import etree
import requests
class DouBanSpider(Process):
    """Worker process: fetch one douban top250 page, extract each movie's
    title and rating, and push "score\\ttitle" strings onto a shared queue.
    """

    def __init__(self, url, q):
        # Initialize the Process machinery before setting our own state.
        super().__init__()
        self.url = url  # the single page URL this process handles
        self.q = q      # multiprocessing queue shared with the parent
        self.headers = {
            'Host': 'movie.douban.com',
            'Referer': 'https://movie.douban.com/top250?start=225&filter=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        }

    def run(self):
        """Process entry point (invoked by .start())."""
        self.parse_page()

    def send_request(self, url):
        """GET *url* and return the raw response body (bytes).

        Makes 1 initial attempt plus up to 3 retries on any error
        (matching the original comment "请求出错时,重复请求3次").
        Returns None when every attempt fails.
        """
        for _attempt in range(4):
            try:
                print("[INFO]请求url:" + url)
                return requests.get(url=url, headers=self.headers).content
            except Exception as e:
                print('[INFO] %s%s' % (e, url))
        return None

    def parse_page(self):
        """Parse the fetched page with lxml/xpath and enqueue results."""
        response = self.send_request(self.url)
        if response is None:
            # All retries failed. The original passed None straight into
            # etree.HTML(), which raises and crashes this worker process.
            return
        html = etree.HTML(response)
        # One 'info' div per movie on the page.
        for movie in html.xpath("//div[@class='info']"):
            title = movie.xpath('.//a/span/text()')[0]
            score = movie.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
            # Queue one "score<TAB>title" line per movie.
            self.q.put(score + "\t" + title)
def main():
    """Spawn one DouBanSpider process per top250 page and print the results.

    Results are drained from the queue WHILE the workers run: the original
    join()ed every child first, but a child that still has undelivered queue
    items blocks on its feeder pipe until they are consumed, so joining
    before draining can deadlock (see the multiprocessing programming
    guideline "Joining processes that use queues").
    """
    from queue import Empty  # exception raised by Queue.get(timeout=...)

    # Queue shared with the worker processes for collecting results.
    q = Queue()
    base_url = 'https://movie.douban.com/top250?start='
    # Page offsets 0, 25, ..., 225 — the ten pages of the top 250.
    url_list = [base_url + str(num) for num in range(0, 225 + 1, 25)]

    # Create and start one worker process per page.
    process_list = []
    for url in url_list:
        p = DouBanSpider(url, q)
        p.start()
        process_list.append(p)

    # Drain results as they arrive until all workers have exited and the
    # queue looks empty.
    while any(p.is_alive() for p in process_list) or not q.empty():
        try:
            print(q.get(timeout=0.1))
        except Empty:
            pass

    for p in process_list:
        p.join()

    # q.empty() is not fully reliable across processes; sweep up any
    # stragglers that landed after the last check.
    while not q.empty():
        print(q.get())
if __name__ == "__main__":
    # Time the full crawl and report the elapsed seconds.
    started_at = time.time()
    main()
    print('[info]耗时:%s' % (time.time() - started_at))



  • zeropython 微信公众号 5868037 QQ号 5868037@qq.com QQ邮箱
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值