2021.04.08
线程和进程的作用
直接通过Thread创建子线程
Thread(target=需要在子线程中调用的函数,args=(函数的实参列表))
from threading import Thread
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from threading import current_thread
def download(url, delay=1):
    """Simulate downloading ``url`` and report progress on stdout.

    Prints the thread the call is running on, then a start line and an end
    line stamped with the current time, sleeping in between to stand in for
    the transfer.

    Args:
        url: Label of the item being "downloaded"; only used in the output.
        delay: Seconds to sleep between the start and end messages.
            Defaults to 1, the duration that used to be hard-coded.
    """
    print(f'=========={current_thread()}=========')
    print(f'{url}开始下载:{datetime.now()}')
    time.sleep(delay)
    print(f'{url}下载结束:{datetime.now()}')
1) 在主线程中下载三个电影:所需时间是三个电影下载时间的总和
# Sequential baseline: every call blocks the main thread, so the three
# downloads run one after another.
for title in ('肖生克的救赎', '摔跤吧,爸爸', '霸王别姬'):
    download(title)
2) 多线程下载三个电影
# One worker thread per film, each running download() with its own title.
t1, t2, t3 = (
    Thread(target=download, args=(title,))
    for title in ('肖生克的救赎', '摔跤吧,爸爸', '霸王别姬')
)
# Start them in order; the main thread does not wait for them here.
for worker in (t1, t2, t3):
    worker.start()
- 线程池
# Thread pool demo: at most two downloads run at the same time.
# The ``with`` block shuts the pool down (waiting for queued work) on exit,
# including when an exception is raised while tasks are being submitted.
with ThreadPoolExecutor(max_workers=2) as pool:
    # a. Add a single task (the callable may be any function).
    pool.submit(download, '肖生克的救赎')
    # b. Add one task per item of the iterable.
    pool.map(download, ['霸王别姬', '阿甘正传', '这个杀手不太冷', 'V字仇杀队', '沉默的羔羊'])
创建线程子类
from threading import Thread
from datetime import datetime
import time
class DownloadThread(Thread):
    """Thread subclass that simulates downloading one item.

    Args:
        url: Label of the item being "downloaded"; only used in the output.
        delay: Seconds the simulated download sleeps. Defaults to 1, the
            duration that used to be hard-coded in ``run``.
    """

    def __init__(self, url, delay=1):
        super().__init__()
        self.url = url
        self.delay = delay

    def run(self) -> None:
        """Work executed on the new thread once ``start()`` is called."""
        print(f'{self.url}开始下载:{datetime.now()}')
        time.sleep(self.delay)
        print(f'{self.url}下载结束:{datetime.now()}')
# Build one DownloadThread per film, then start them in creation order.
t1, t2, t3 = (
    DownloadThread(title)
    for title in ('肖生克的救赎', '阿甘正传', '霸王别姬')
)
for worker in (t1, t2, t3):
    worker.start()
数据分析岗位爬取
import requests
from re import search
import json
from concurrent.futures import ThreadPoolExecutor
from threading import Thread
from datetime import datetime
import csv
# 多线程的Queue
from queue import Queue
def an_data(html: str):
    """Extract the embedded search results from a result page.

    The page is expected to assign a JSON object to
    ``window.__SEARCH_RESULT__`` inside a ``<script>`` element; the text
    between the ``=`` and the next ``</script>`` is parsed as JSON.

    Args:
        html: Full text of the page.

    Returns:
        The value stored under ``'engine_search_result'``, or an empty list
        when the page does not contain the marker (previously this case
        raised ``AttributeError`` on ``None.group``).

    Raises:
        json.JSONDecodeError: If the captured text is not valid JSON.
        KeyError: If the JSON object has no ``'engine_search_result'`` key.
    """
    # The dot is escaped so that only the literal property access matches.
    match = search(r'(?s)window\.__SEARCH_RESULT__\s*=(.+?)</script>', html)
    if match is None:
        return []
    return json.loads(match.group(1))['engine_search_result']
def get_data(url, timeout=10):
    """Fetch one search page and return the listings parsed from it.

    Args:
        url: Address of the page to request.
        timeout: Seconds passed to ``requests.get`` as its timeout so that a
            stalled connection cannot block a worker thread forever.

    Returns:
        Whatever ``an_data`` extracts from the response body when the status
        code is 200; otherwise ``None`` (the response object is printed).

    Raises:
        Any exception raised by ``requests.get`` or ``an_data`` propagates to
        the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return an_data(response.text)
    print(response)
    return None
# Module-level list available for accumulating scraped rows in memory.
all_data = []
# Queue that save2() reads batches of rows from until it receives 'end'.
q = Queue()
def get_all():
    """Crawl the paginated "数据分析" job search until an empty page is found.

    Pages 1 through 1999 are requested via ``get_data`` on a thread pool, one
    batch of pages at a time, and the results are inspected in page order.
    The crawl stops at the first page whose result is falsy (an empty list,
    or ``None`` for a non-200 response). Because a whole batch is submitted
    before its results are checked, up to one batch of pages past the last
    real page may be requested.

    The start and end times are printed. Fetched rows are currently
    discarded; enable one of the storage options marked below to keep them.
    """
    # "&degreefrom" had been corrupted into "°reefrom" (the "&deg" prefix was
    # decoded as an HTML entity); the query string is restored here.
    url_template = (
        'https://search.51job.com/list/090200,000000,0000,00,9,99,数据分析,2,{page}.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
        '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    max_workers = 200
    stop_page = 2000  # exclusive upper bound, as in the original loop
    reached_end = False
    print(f'start:{datetime.now()}')
    with ThreadPoolExecutor(max_workers) as pool:
        for batch_start in range(1, stop_page, max_workers):
            batch_stop = min(batch_start + max_workers, stop_page)
            # Submit the whole batch first so the requests actually overlap;
            # waiting on each future right after submit() runs them serially.
            futures = [
                pool.submit(get_data, url_template.format(page=page))
                for page in range(batch_start, batch_stop)
            ]
            for fu in futures:
                data = fu.result()
                if not data:
                    reached_end = True
                    break
                # ---- storage options (all disabled) ----
                # Option 1: save1(data) - reopens the file for every page.
                # Option 2: all_data.extend(data), then write everything once
                #           after the crawl - higher peak memory use.
                # Option 3: q.put(data) with save2() running on another
                #           thread; put 'end' on the queue when finished.
            if reached_end:
                # Drop requests of this batch that have not started yet.
                for pending in futures:
                    pending.cancel()
                break
    print(f'end:{datetime.now()}')
def save1(data):
    """Append one batch of rows to ``files/数据分析.csv``.

    The file is opened in append mode on every call. A header row is written
    only when the file is still empty, so repeated calls produce a single
    header followed by all rows.

    Args:
        data: Sequence of dicts; the keys of the first dict define the
            columns. An empty sequence is ignored (previously ``data[0]``
            raised ``IndexError`` after the file had already been created).
    """
    if not data:
        return
    with open('files/数据分析.csv', 'a', newline='', encoding='utf-8') as f:
        print('打开文件')
        writer = csv.DictWriter(f, list(data[0].keys()))
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means nothing has been written yet.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(data)
        print('写入数据!')
def save2():
    """Write batches taken from the module-level queue ``q`` to a CSV file.

    Opens ``files/数据分析3.csv`` for writing and keeps calling ``q.get()``
    until the string ``'end'`` is received. The header row is derived from
    the first non-empty batch. If ``'end'`` arrives before any batch, the
    file is left empty instead of failing on ``'end'[0].keys()``.
    """
    with open('files/数据分析3.csv', 'w', encoding='utf-8', newline='') as f:
        print('=======打开文件')
        writer = None
        while True:
            data = q.get()  # blocks until the producer supplies an item
            if data == 'end':
                break
            if not data:
                # An empty batch has no first row to take column names from.
                continue
            if writer is None:
                writer = csv.DictWriter(f, data[0].keys())
                writer.writeheader()
            writer.writerows(data)
            print('========写入文件')
# Disabled: run save2() on its own thread as the consumer of the queue.
# t = Thread(target=save2)
# t.start()
# Start the crawl.
get_all()
# Disabled: run save2() on the main thread after the crawl.
# save2()
线程间和进程间通信
同一个进程中的多个线程数据可以直接共享,不同进程中的数据不能直接共享
from threading import Thread, current_thread
from multiprocessing import Process, current_process
多线程之间的数据通信
# List shared by all threads of this process.
a = []


def func1():
    """Append 'A' to the shared list."""
    a.append('A')


def func2():
    """Append 'B' to the shared list."""
    a.append('B')


t1, t2 = Thread(target=func1), Thread(target=func2)
for worker in (t1, t2):
    worker.start()
# Wait for both threads so the list is complete before it is printed.
for worker in (t1, t2):
    worker.join()
print(a)
多进程之间的数据通信
# Module-level list; func1/func2 append to it in whichever process runs them.
a = []


def func1(data1, data2):
    """Report the current process and append 'A' to ``a``."""
    print('进程1:', current_process())
    a.append('A')


def func2(data1):
    """Report the current process and append 'B' to ``a``."""
    print('进程2:', current_process())
    a.append('B')


if __name__ == '__main__':
    print('主进程:', current_process())
    # Create the process objects; positional arguments are passed via args.
    p1 = Process(target=func1, args=(100, 200))
    p2 = Process(target=func2, args=(111,))
    for proc in (p1, p2):
        proc.start()
    for proc in (p1, p2):
        proc.join()
    # Prints the parent's copy of ``a``; appends made in the children are
    # not expected to show up here (processes do not share plain objects).
    print('任务全部完成:', a)
进程间有效通信
使用多进程队列时需要注意:1.将队列定义为全局变量 2.队列对象必须通过子进程所调用函数的参数传递到子进程中去
import time
from multiprocessing import Process
# 多进程的队列
from multiprocessing import Queue
def download(name, queue: Queue, delay=1):
    """Simulate a download and hand its result back through ``queue``.

    Args:
        name: Label of the item being "downloaded".
        queue: Queue that receives the result. It is passed in as an
            argument so the child process uses the same queue as the parent.
        delay: Seconds to sleep between the start and finish messages.
            Defaults to 1, the duration that used to be hard-coded.
    """
    print(f'{name}开始下载!')
    time.sleep(delay)
    print(f'{name}下载完成!')
    # A return value would not reach the parent process, so the data is put
    # on the queue instead.
    queue.put([f'{name}10', f'{name}20'])
if __name__ == '__main__':
    q = Queue()
    p1, p2 = (
        Process(target=download, args=(title, q))
        for title in ('肖生克的救赎', '阿甘正传')
    )
    for proc in (p1, p2):
        proc.start()
    # Collect the item produced by each of the two processes. get() blocks
    # while the queue is empty, until data arrives (or a timeout expires
    # when one is given), so no join() is needed before reading.
    for _ in range(2):
        print(q.get())
线程间有效通信
线程间通信:定义一个全局的容器,在子线程的函数中直接在全局容器中添加数据
from queue import Queue
from threading import Thread
import time
def download(name):
    """Simulate a one-second download and publish its result on ``q``.

    ``q`` is the module-level queue created by the main script; threads of
    the same process can use it directly without it being passed in.

    Args:
        name: Label of the item being "downloaded".
    """
    started, finished = f'{name}开始下载', f'{name}下载结束'
    print(started)
    time.sleep(1)
    print(finished)
    payload = f'{name}数据'
    q.put(payload)
if __name__ == '__main__':
    # Module-level queue that download() puts its result on.
    q = Queue()
    t1, t2 = (
        Thread(target=download, args=(title,))
        for title in ('沉默的羔羊', '怦然心动')
    )
    for worker in (t1, t2):
        worker.start()
    # get() blocks until a worker has put its item, so the threads do not
    # have to be joined before the results are read.
    for _ in range(2):
        print(q.get())
进程池
from multiprocessing import Pool
import time
def download(name, delay=1):
    """Simulate downloading ``name`` and report progress on stdout.

    Args:
        name: Label of the item being "downloaded".
        delay: Seconds to sleep between the start and end messages.
            Defaults to 1, the duration that used to be hard-coded.
    """
    print(f'{name}开始下载')
    time.sleep(delay)
    print(f'{name}下载结束')
if __name__ == '__main__':
    # 1. Create the pool (maxtasksperchild=3 is passed through unchanged).
    pool = Pool(maxtasksperchild=3)
    # 2. Add tasks.
    #    apply       - add one task and run it synchronously
    #    apply_async - add one task and run it asynchronously
    #    map         - add several tasks at once, e.g.
    #                  pool.map(download, ['V字仇杀队', '恐怖游轮', '林中小屋'])
    for title in ('肖生克的救赎', '触不可及'):
        pool.apply_async(download, args=(title,))
    # 3. Close the pool: no further tasks may be added after this point.
    pool.close()
    # 4. Wait for the queued tasks to finish.
    pool.join()