import concurrent
import functools
import threading
import time
from concurrent import futures
from multiprocessing import Pool

import pandas as pd
import requests

# import current  # BUG: no module named `current` exists — typo for `concurrent` (imported above)
# Decorator: print the wrapped function's execution time.
def gettime(func):
    """Wrap *func* so each call prints a banner with the elapsed wall time.

    Fixes vs. original: forwards **kwargs (they were silently dropped),
    returns the wrapped function's result (it was discarded), and uses
    functools.wraps so the wrapped function keeps its name/docstring.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print("=" * 50)
        print(func.__name__, 'Start...')
        starttime = time.time()
        result = func(*args, **kwargs)  # original called func(*args) only
        endtime = time.time()
        spendtime = endtime - starttime
        print(func.__name__, "End...")
        print("Spend", spendtime, "s totally")
        print("=" * 50)
        return result  # original returned None regardless of func's result
    return wrapper
# Read the first n test-site URLs from a CSV file.
def get_urls_from_files(n, path='TestUrls.csv'):
    """Return the first *n* entries of the 'url' column of *path*.

    The CSV path is now a parameter; the default preserves the original
    hard-coded 'TestUrls.csv' behavior.
    """
    df = pd.read_csv(path)
    return list(df['url'])[:n]
# Request a page and return its body text (None on failure).
def getdata(url, retries=3):
    """GET *url* and return the response body as text.

    Returns None when the connection fails (after exhausting retries on
    5xx responses). Fixes vs. original: the recursive retry's result is
    now returned (it was computed and thrown away, after which `.text`
    was read from the failed 5xx response), and a None response can no
    longer reach `.text`.
    """
    headers = {}
    try:
        html = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        print('下载出错,错误原因:', e)
        html = None
    if html is None:
        return None
    # 5xx is a server-side error — safe to retry a limited number of times.
    if 500 <= html.status_code < 600 and retries:
        print("服务器出错正在重试...")
        return getdata(url, retries - 1)
    return html.text
# Serial version: fetch the URLs one after another.
@gettime
def Mynormal():
    """Download every entry of the module-level `urls` list sequentially."""
    for target in urls:
        getdata(target)
# Process-pool version: fan the downloads out across worker processes.
@gettime
def MyprocessPool(num=10):
    """Map getdata over the module-level `urls` with *num* worker processes."""
    worker_pool = Pool(num)
    fetched = worker_pool.map(getdata, urls)
    worker_pool.close()
    worker_pool.join()
    return fetched
# Thread-pool version: overlap the I/O waits with a thread pool.
@gettime
def Myfutures(num_of_max_works=10):
    """Map getdata over the module-level `urls` with a thread pool.

    Bug fix: the executor class is `ThreadPoolExecutor` (capital T);
    `concurrent.futures.threadPoolExecutor` raises AttributeError.
    Also materializes and returns the results, matching MyprocessPool.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_of_max_works) as executor:
        return list(executor.map(getdata, urls))
if __name__ == '__main__':
    # Bug fix: the helper is named get_urls_from_files (plural);
    # get_urls_from_file raised NameError.
    urls = get_urls_from_files(100)
    # Serial
    Mynormal()
    # Process pool
    MyprocessPool(10)
    # Thread pool
    Myfutures(100)
# python笔记--多进程与多线程 (blog footer: "Python notes — multiprocessing & multithreading")
# 最新推荐文章于 2024-07-20 17:12:48 发布 (CSDN article metadata, not code — commented out so the file parses)