import time
import inspect
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed
class CreateThreadPool:
    """Thread pool that collects ALL task results before processing them.

    Results accumulate on ``self.result``, so reuse the same instance across
    several ``run_thread`` calls only if you want the lists merged.
    """

    def __init__(self, t_p=10):
        # t_p: maximum number of worker threads.
        self.result = []
        self.thread_pool = ThreadPoolExecutor(t_p)

    def run_thread(self, fun, data_list: list):
        """Run ``fun(**data)`` once per dict in ``data_list`` and wait for all.

        Notes on the futures API used here:
          - submit() is non-blocking and returns a Future immediately.
          - future.done() checks task state; future.result() fetches the value.
          - wait(tasks, return_when=ALL_COMPLETED, timeout=...) blocks until the
            condition is met (default ALL_COMPLETED = every task finished).

        Returns the accumulated result list — one entry per task, in completion
        order; a task that raised contributes the sentinel ``(None, None)``.
        """
        all_task = []
        for data in data_list:
            future = self.thread_pool.submit(fun, **data)
            # Callback fires as each task finishes, appending its result.
            future.add_done_callback(self.result_done)
            all_task.append(future)
        wait(all_task, return_when=ALL_COMPLETED)
        return self.result

    def result_done(self, res):
        """Callback: store a finished future's result (sentinel on failure)."""
        try:
            self.result.append(res.result())
        except Exception:
            # Was a bare `except:` — that also trapped SystemExit and
            # KeyboardInterrupt. Keep one entry per task even when it raised.
            self.result.append((None, None))
class CreateThreadPool1:
    """Thread pool that returns as soon as one result satisfies a condition."""

    def __init__(self, t_p=3):
        self.result = []
        self.thread_pool = ThreadPoolExecutor(t_p)

    def run_thread(self, fun, data_list: list):
        """Submit ``fun(**data)`` per dict and return the first result equal to
        ``"test_def"``; tasks still pending keep running in the background.

        Returns None when no task produces a matching result. Tasks that raise
        are skipped.
        """
        all_task = [self.thread_pool.submit(fun, **data) for data in data_list]
        for future in as_completed(all_task):
            try:
                # result() re-raises any exception from the task, so it must be
                # inside the try block — the original called it *before* the
                # try, which made the except clause unreachable and let a
                # failing task crash the whole loop.
                result = future.result()
            except Exception:
                continue
            if result == "test_def":
                return result
            print(f"ccc:{result}")

    def map_def(self, fun, data_list):
        """Run ``fun`` over ``data_list`` via Executor.map and print each result.

        map(fn, *iterables, timeout=None):
          - fn: the function each worker thread executes;
          - iterables: one element is passed to ``fn`` per call;
          - timeout: like wait()'s timeout, but because map yields results it
            raises TimeoutError if a result isn't ready in time.
        Unlike submit(), map yields results in the same order as the input.
        """
        for res in self.thread_pool.map(fun, data_list):
            print(f"res:{res}.")
def get_func_name():
    """Return the name of the calling function, or None on failure."""
    try:
        # stack()[1] is the caller's FrameInfo; ``.function`` is its name
        # (same value as the magic positional index [3] used before).
        return inspect.stack()[1].function
    except Exception:
        # Was a bare `except:`; narrow it so SystemExit etc. still propagate.
        return None
def test_def(a=None):
    """Demo task: sleep 3 s, then return ``a`` — or, when ``a`` is falsy,
    the name of this function as reported by ``get_func_name``."""
    time.sleep(3)
    return a if a else get_func_name()
# Demo payload: one keyword-argument dict per submitted task.
data = [{"a": 0}, {"a": ""}, {"a": "d"}, {"a": "c"}]
# res1 = CreateThreadPool().run_thread(test_def, data)
# print(res1)
# Stops at the first task returning "test_def" (falsy "a" values trigger it).
res2 = CreateThreadPool1().run_thread(test_def, data)
print(res2)
# Example: scraping web pages with a thread pool
# coding: utf-8
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import json
from requests import adapters
from proxy import get_proxies
# Request headers mimicking a desktop Chrome browser; the target site
# presumably rejects requests without a browser-like User-Agent — TODO confirm.
headers = {
    "Host": "splcgk.court.gov.cn",
    "Origin": "https://splcgk.court.gov.cn",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}
# Court hearing-announcement listing endpoint (page number goes in POST data).
url = "https://splcgk.court.gov.cn/gzfwww/ktgglist?pageNo=1"
def spider(page):
    """Fetch one page of the listing through a rotating proxy.

    Retries up to 5 times; returns the decoded JSON payload, or {} when
    every attempt fails.
    """
    data = {
        "bt": "",
        "fydw": "",
        "pageNum": page,
    }
    for _ in range(5):
        try:
            response = requests.post(url, headers=headers, data=data, proxies=get_proxies())
            json_data = response.json()
        except (ValueError, requests.RequestException):
            # ValueError covers JSON decode failures (requests' own
            # JSONDecodeError subclasses it); RequestException covers SSL,
            # proxy and connection errors. The original caught only
            # json.JSONDecodeError and adapters.SSLError, so a plain
            # ConnectionError from a dead proxy aborted the whole retry loop.
            continue
        else:
            break
    else:
        # All 5 attempts failed.
        return {}
    return json_data
def main():
    """Fan spider() out over pages 1-14 and print each payload as it finishes,
    followed by the total elapsed time."""
    with ThreadPoolExecutor(max_workers=8) as pool:
        begin = time.time()
        futures = [pool.submit(spider, page) for page in range(1, 15)]
        for fut in as_completed(futures):
            print(fut.result())
            print('*' * 50)
        print(time.time() - begin)
# Run the crawl only when executed as a script, not on import.
if __name__ == "__main__":
    main()
# Source: https://blog.csdn.net/weixin_42522389/article/details/112535616