# 今天在工作中遇到爬虫效率问题,在此记录多进程、多线程的对比测试脚本
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Seven'
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time
def gcd(pair):
    """Return the greatest common divisor of a pair of positive integers.

    Deliberately implemented as a brute-force downward scan rather than
    math.gcd / Euclid's algorithm, so each call stays CPU-bound for the
    thread-vs-process benchmarks below.

    Args:
        pair: a 2-tuple ``(a, b)`` of positive integers.

    Returns:
        The largest integer dividing both ``a`` and ``b``.

    Raises:
        ValueError: if either value is not positive (the original loop
            silently returned ``None`` in that case).
    """
    a, b = pair
    if a <= 0 or b <= 0:
        # range(min(a, b), 0, -1) would be empty and the function would
        # fall off the end returning None; fail loudly instead.
        raise ValueError('gcd expects a pair of positive integers')
    for candidate in range(min(a, b), 0, -1):
        if a % candidate == 0 and b % candidate == 0:
            return candidate
# Benchmark workload: six pairs of large integers, sized so each gcd()
# call burns a noticeable amount of pure-Python CPU time.
numbers = [
(1963309, 2265973), (1879675, 2493670), (2030677, 3814172),
(1551645, 2229620), (1988912, 4736670), (2198964, 7876293)
]
def thread_map_test():
    """Benchmark gcd over `numbers` with ThreadPoolExecutor.map and print timing."""
    # perf_counter() is the recommended clock for measuring intervals:
    # monotonic and higher resolution than time.time().
    start_time = time.perf_counter()
    with ThreadPoolExecutor(max_workers=4) as pool:
        results = list(pool.map(gcd, numbers))
    end_time = time.perf_counter()
    print(f'运行结果:{results}')
    print(f'多线程map运行时长:{end_time - start_time}')
def thread_submit_test():
    """Benchmark gcd over `numbers` with ThreadPoolExecutor.submit and print timing.

    Futures are collected in submission order, so the printed results
    line up with `numbers`.
    """
    # perf_counter() is monotonic and higher resolution than time.time(),
    # making it the right clock for interval benchmarking.
    start_time = time.perf_counter()
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(gcd, pair) for pair in numbers]
        results = [future.result() for future in futures]
    end_time = time.perf_counter()
    print(f'运行结果:{results}')
    print(f'多线程submit运行时长:{end_time - start_time}')
def process_map_test():
    """Benchmark gcd over `numbers` with ProcessPoolExecutor.map and print timing."""
    # perf_counter() is the recommended clock for measuring intervals:
    # monotonic and higher resolution than time.time().
    start_time = time.perf_counter()
    with ProcessPoolExecutor(max_workers=4) as pool:
        results = list(pool.map(gcd, numbers))
    end_time = time.perf_counter()
    print(f'运行结果:{results}')
    print(f'多进程map运行时长:{end_time - start_time}')
def process_submit_test():
    """Benchmark gcd over `numbers` with ProcessPoolExecutor.submit and print timing.

    Futures are collected in submission order, so the printed results
    line up with `numbers`.
    """
    # perf_counter() is monotonic and higher resolution than time.time(),
    # making it the right clock for interval benchmarking.
    start_time = time.perf_counter()
    with ProcessPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(gcd, pair) for pair in numbers]
        results = [future.result() for future in futures]
    end_time = time.perf_counter()
    print(f'运行结果:{results}')
    print(f'多进程submit运行时长:{end_time - start_time}')
if __name__ == '__main__':
    # Run every benchmark variant in sequence.
    for benchmark in (thread_map_test, thread_submit_test,
                      process_map_test, process_submit_test):
        benchmark()
# 当多进程/多线程传参时,若一个参数对每次调用固定不变、另一个参数逐项变化,
# 可用 functools.partial 先绑定固定参数,再对变化参数做 map,示例如下:
#
#     with ProcessPoolExecutor(max_workers=4) as pool:
#         pool.map(partial(data_crawl, variable_constant=variable_constant), variable_changed)