```python
import queue
import re
import threading

import requests

# Compile the <title> pattern once instead of on every request.
TITLE_PATTERN = re.compile('<title>(.*?)</title>', re.S)


def source():
    # Yield the URLs to fetch; "url" is a placeholder for real addresses.
    for x in range(0, 728):
        yield "url"


def do_work(item):
    # Fetch one URL and print its <title> text.
    try:
        r = requests.get(item)
        r.encoding = "utf-8"
        html = r.text
        if html:
            match = TITLE_PATTERN.search(html)
            if match:  # guard: a page without a <title> would otherwise raise
                print(item, match.group(1))
    except requests.exceptions.RequestException as e:
        print(e)


def worker():
    # Pull items off the queue until the None sentinel arrives.
    while True:
        item = q.get()
        if item is None:
            break
        do_work(item)
        q.task_done()


q = queue.Queue()
threads = []
num_worker_threads = 80

for i in range(num_worker_threads):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

for item in source():
    q.put(item)

# Block until all queued tasks have been marked done.
q.join()

# Stop the workers: one None sentinel per thread.
for _ in range(num_worker_threads):
    q.put(None)
for t in threads:
    t.join()
```
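The same fan-out can be written more compactly with the standard library's concurrent.futures, which starts the worker threads and waits for them itself, so no sentinel or join() bookkeeping is needed. A minimal sketch under the same assumptions (placeholder "url" entries, 80 workers; the timeout is an added safeguard, not in the original):

```python
import re
from concurrent.futures import ThreadPoolExecutor

import requests

TITLE_PATTERN = re.compile('<title>(.*?)</title>', re.S)


def fetch_title(url):
    # Fetch one page and print its <title>, logging request errors.
    try:
        r = requests.get(url, timeout=10)  # timeout so a hung server cannot stall a worker
        r.encoding = "utf-8"
        match = TITLE_PATTERN.search(r.text)
        if match:
            print(url, match.group(1))
    except requests.exceptions.RequestException as e:
        print(e)


urls = ["url"] * 728  # placeholder URLs, as in the source() generator above

with ThreadPoolExecutor(max_workers=80) as pool:
    pool.map(fetch_title, urls)
```

On leaving the with-block the executor waits until every submitted task has finished, which replaces both q.join() and the per-thread join() calls above.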
Scraping the Douban Top 250 with Python multiprocessing
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from multiprocessing import Process, Queue
import time

import requests
from lxml import etree


class DouBanSpider(Process):
    def __init__(self, url, q):
        # Call the parent class's __init__ before adding our own state.
        super(DouBanSpider, self).__init__()
        self.url = url
        self.q = q
        self.headers = {
            'Host': 'movie.douban.com',
            'Referer': 'https://movie.douban.com/top250?start=225&filter=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        }

    def run(self):
        self.parse_page()

    def send_request(self, url):
        '''
        Send the request and return the page source,
        retrying up to three times on error.
        '''
        i = 0
        while i <= 3:
            try:
                print("[INFO] requesting url: " + url)
                return requests.get(url=url, headers=self.headers).content
            except Exception as e:
                print('[INFO] %s %s' % (e, url))
                i += 1

    def parse_page(self):
        '''
        Parse the page source with XPath and push each movie's
        title and rating onto the queue.
        '''
        response = self.send_request(self.url)
        if response is None:  # every retry failed; give up on this page
            return
        html = etree.HTML(response)
        # Each listing page holds the data for 25 movies.
        node_list = html.xpath("//div[@class='info']")
        for movie in node_list:
            # Movie title.
            title = movie.xpath('.//a/span/text()')[0]
            # Rating.
            score = movie.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
            # Queue up each movie's rating and title.
            self.q.put(score + "\t" + title)


def main():
    # A queue to collect the data gathered by the child processes.
    q = Queue()
    base_url = 'https://movie.douban.com/top250?start='
    # Build the URLs of all ten listing pages (start=0, 25, ..., 225).
    url_list = [base_url + str(num) for num in range(0, 225 + 1, 25)]

    # Create and start one process per page.
    process_list = []
    for url in url_list:
        p = DouBanSpider(url, q)
        p.start()
        process_list.append(p)

    # Wait for the child processes to finish.  Caveat: joining a process
    # that still has unflushed items buffered in its queue can deadlock;
    # it works here because each page contributes only 25 short strings.
    for p in process_list:
        p.join()

    while not q.empty():
        print(q.get())


if __name__ == "__main__":
    start = time.time()
    main()
    print('[INFO] elapsed: %s' % (time.time() - start))
```
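The explicit Process subclass and shared Queue can also be replaced with multiprocessing.Pool, which returns each worker's result directly and sidesteps the join-before-drain caveat noted above. A minimal sketch, assuming the same page layout; the single User-Agent header is an illustrative stand-in for the full header set used earlier:

```python
from multiprocessing import Pool

import requests
from lxml import etree

BASE_URL = 'https://movie.douban.com/top250?start='
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # Douban tends to reject bare requests


def scrape_page(url):
    # Fetch one listing page and return its "score<TAB>title" lines.
    html = etree.HTML(requests.get(url, headers=HEADERS, timeout=10).content)
    rows = []
    for movie in html.xpath("//div[@class='info']"):
        title = movie.xpath('.//a/span/text()')[0]
        score = movie.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
        rows.append(score + "\t" + title)
    return rows


if __name__ == "__main__":
    urls = [BASE_URL + str(num) for num in range(0, 250, 25)]
    with Pool(4) as pool:
        # map() returns the per-page lists in order; there is no shared
        # queue to drain and no unflushed buffers at join time.
        for page in pool.map(scrape_page, urls):
            for line in page:
                print(line)
```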