```python
import queue
import re
import threading

import requests

# Compile the <title> pattern once instead of on every request.
TITLE_PATTERN = re.compile('<title>(.*?)</title>', re.S)


def source():
    # Yield the URLs to fetch; "url" is a placeholder for real addresses.
    for x in range(0, 728):
        yield "url"


def do_work(item):
    # Fetch one URL and print its <title> text.
    try:
        r = requests.get(item)
        r.encoding = "utf-8"
        html = r.text
        if html:
            match = TITLE_PATTERN.search(html)
            if match:  # guard: a page without a <title> would otherwise raise
                print(item, match.group(1))
    except requests.exceptions.RequestException as e:
        print(e)


def worker():
    # Pull items off the queue until the None sentinel arrives.
    while True:
        item = q.get()
        if item is None:
            break
        do_work(item)
        q.task_done()


q = queue.Queue()
threads = []
num_worker_threads = 80

for i in range(num_worker_threads):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

for item in source():
    q.put(item)

# Block until all queued tasks have been marked done.
q.join()

# Stop the workers: one None sentinel per thread.
for _ in range(num_worker_threads):
    q.put(None)
for t in threads:
    t.join()
```
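The same fan-out can be written more compactly with the standard library's concurrent.futures, which starts the worker threads and waits for them itself, so no sentinel or join() bookkeeping is needed. A minimal sketch under the same assumptions (placeholder "url" entries, 80 workers; the timeout is an added safeguard, not in the original):

```python
import re
from concurrent.futures import ThreadPoolExecutor

import requests

TITLE_PATTERN = re.compile('<title>(.*?)</title>', re.S)


def fetch_title(url):
    # Fetch one page and print its <title>, logging request errors.
    try:
        r = requests.get(url, timeout=10)  # timeout so a hung server cannot stall a worker
        r.encoding = "utf-8"
        match = TITLE_PATTERN.search(r.text)
        if match:
            print(url, match.group(1))
    except requests.exceptions.RequestException as e:
        print(e)


urls = ["url"] * 728  # placeholder URLs, as in the source() generator above

with ThreadPoolExecutor(max_workers=80) as pool:
    pool.map(fetch_title, urls)
```

On leaving the with-block the executor waits until every submitted task has finished, which replaces both q.join() and the per-thread join() calls above.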
Scraping the Douban Top 250 with Python multiprocessing
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from multiprocessing import Process, Queue
import time

import requests
from lxml import etree


class DouBanSpider(Process):
    def __init__(self, url, q):
        # Call the parent class's __init__ before adding our own state.
        super(DouBanSpider, self).__init__()
        self.url = url
        self.q = q
        self.headers = {
            'Host': 'movie.douban.com',
            'Referer': 'https://movie.douban.com/top250?start=225&filter=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        }

    def run(self):
        self.parse_page()

    def send_request(self, url):
        '''
        Send the request and return the page source,
        retrying up to three times on error.
        '''
        i = 0
        while i <= 3:
            try:
                print("[INFO] requesting url: " + url)
                return requests.get(url=url, headers=self.headers).content
            except Exception as e:
                print('[INFO] %s %s' % (e, url))
                i += 1

    def parse_page(self):
        '''
        Parse the page source with XPath and push each movie's
        title and rating onto the queue.
        '''
        response = self.send_request(self.url)
        if response is None:  # every retry failed; give up on this page
            return
        html = etree.HTML(response)
        # Each listing page holds the data for 25 movies.
        node_list = html.xpath("//div[@class='info']")
        for movie in node_list:
            # Movie title.
            title = movie.xpath('.//a/span/text()')[0]
            # Rating.
            score = movie.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
            # Queue up each movie's rating and title.
            self.q.put(score + "\t" + title)


def main():
    # A queue to collect the data gathered by the child processes.
    q = Queue()
    base_url = 'https://movie.douban.com/top250?start='
    # Build the URLs of all ten listing pages (start=0, 25, ..., 225).
    url_list = [base_url + str(num) for num in range(0, 225 + 1, 25)]

    # Create and start one process per page.
    process_list = []
    for url in url_list:
        p = DouBanSpider(url, q)
        p.start()
        process_list.append(p)

    # Wait for the child processes to finish.  Caveat: joining a process
    # that still has unflushed items buffered in its queue can deadlock;
    # it works here because each page contributes only 25 short strings.
    for p in process_list:
        p.join()

    while not q.empty():
        print(q.get())


if __name__ == "__main__":
    start = time.time()
    main()
    print('[INFO] elapsed: %s' % (time.time() - start))
```
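The explicit Process subclass and shared Queue can also be replaced with multiprocessing.Pool, which returns each worker's result directly and sidesteps the join-before-drain caveat noted above. A minimal sketch, assuming the same page layout; the single User-Agent header is an illustrative stand-in for the full header set used earlier:

```python
from multiprocessing import Pool

import requests
from lxml import etree

BASE_URL = 'https://movie.douban.com/top250?start='
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # Douban tends to reject bare requests


def scrape_page(url):
    # Fetch one listing page and return its "score<TAB>title" lines.
    html = etree.HTML(requests.get(url, headers=HEADERS, timeout=10).content)
    rows = []
    for movie in html.xpath("//div[@class='info']"):
        title = movie.xpath('.//a/span/text()')[0]
        score = movie.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
        rows.append(score + "\t" + title)
    return rows


if __name__ == "__main__":
    urls = [BASE_URL + str(num) for num in range(0, 250, 25)]
    with Pool(4) as pool:
        # map() returns the per-page lists in order; there is no shared
        # queue to drain and no unflushed buffers at join time.
        for page in pool.map(scrape_page, urls):
            for line in page:
                print(line)
```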