大家好,我是天空之城,今天给大家带来多进程的知识介绍
1. 进程介绍
进程:正在执行的程序
程序:没有执行的代码,是一个静态的
2. 线程和进程之间的对比
进程:能够完成多任务,一台电脑上可以同时运行多个QQ
线程:能够完成多任务,一个QQ中的多个聊天窗口
根本区别:进程是操作系统资源分配的基本单位,而线程是任务调度和执行的基本单位
3. 进程之间的通信
• Queue-队列 先进先出
• 共享全局变量不适用于多进程编程
4. 进程池之间的通信
当需要创建的子进程数量不多时,可以直接利用multiprocessing中的Process动态生成多个进程;但如果有上百甚至上千个目标,手动创建进程的工作量巨大,此时就可以使用multiprocessing模块提供的Pool(进程池)
from multiprocessing import Pool
import os,time,random
def worker(msg):
    """Simulate one unit of work and report how long it took.

    Prints a start line containing the current process id, sleeps for a
    random interval below two seconds, then prints the elapsed time.

    Args:
        msg: Label identifying the task; it is printed in both messages.
    """
    began = time.time()
    print('%s开始执行,进程号为%d' % (msg, os.getpid()))
    # A random pause in [0, 2) seconds stands in for real work.
    time.sleep(2 * random.random())
    elapsed = time.time() - began
    print(msg, "执行完成,耗时%0.2f" % elapsed)
if __name__ == '__main__':
    # A pool of three worker processes: at most three tasks run at once.
    pool = Pool(3)
    # Queue ten tasks without waiting for any of them to finish.
    for task_id in range(10):
        pool.apply_async(worker, (task_id,))
    print("--start--")
    # close() stops further submissions; join() then waits until every
    # queued task has completed.
    pool.close()
    pool.join()
    print("--end--")
--------------------------------------------------------------------------------------------------
进程简介
import multiprocessing
import threading
import time
def demo1():
    """Print ``--1--`` once per second, forever."""
    pause_seconds = 1
    while True:
        print('--1--')
        time.sleep(pause_seconds)
def demo2():
    """Print ``--2--`` once per second, forever."""
    pause_seconds = 1
    while True:
        print('--2--')
        time.sleep(pause_seconds)
def main():
    """Run ``demo1`` and ``demo2`` concurrently, each in its own process.

    Two alternatives for comparison:
      * threads: ``threading.Thread(target=demo1)`` / ``Thread(target=demo2)``
        followed by ``start()`` on each;
      * plain calls: ``demo1()`` then ``demo2()``, which run one after the
        other in the current process.
    """
    # Build both Process objects first, then start them in the same order.
    procs = [multiprocessing.Process(target=fn) for fn in (demo1, demo2)]
    for proc in procs:
        proc.start()
# Start the demo exactly once, and only when this file is run as a script.
# The guard matters for multiprocessing: a child process may import this
# module again, and an unguarded main() call would start new processes there.
if __name__ == '__main__':
    main()
进程通信
import multiprocessing
from queue import Queue
def download(q):
    """Put the sample data set onto ``q``, one item at a time.

    Args:
        q: Queue-like object whose ``put`` method receives each item.
    """
    # Fixed sample values standing in for downloaded data.
    for value in (11, 22, 33, 44):
        q.put(value)
    print('数据下载完成,保存到队列当中......')
def parse_data(q):
    """Drain ``q`` into a list and print that list.

    Blocks until at least one item is available, then keeps reading until
    ``q.empty()`` reports true.

    Args:
        q: Queue-like object providing ``get`` and ``empty``.
    """
    # The first get() blocks, so at least one item is always collected.
    received = [q.get()]
    # NOTE(review): empty() is only a snapshot. If a producer is still
    # running, this loop may stop before every item has arrived.
    while not q.empty():
        received.append(q.get())
    print(received)
def main():
    """Run the downloader and the parser in two separate processes.

    The processes exchange data through a ``multiprocessing.Queue``; this
    function returns after both of them have exited.
    """
    # queue.Queue is meant for threads inside one process. Data that has to
    # cross a process boundary needs multiprocessing.Queue instead.
    q = multiprocessing.Queue()
    producer = multiprocessing.Process(target=download, args=(q,))
    consumer = multiprocessing.Process(target=parse_data, args=(q,))
    # start() runs the target in a child process. Calling run() directly
    # would execute the target in the current process instead.
    producer.start()
    consumer.start()
    producer.join()
    consumer.join()


if __name__ == '__main__':
    main()
进程之间共享全局变量
import multiprocessing
# Module-level value used to show what happens to a global across processes.
a = 1


def demo1():
    """Increase the module-level ``a`` by one."""
    global a
    a = a + 1


def demo2():
    """Print the current value of the module-level ``a``."""
    print(a)
if __name__ == '__main__':
    # Create both processes first, then start them in the same order:
    # demo1 modifies the global, demo2 prints it.
    procs = [multiprocessing.Process(target=fn) for fn in (demo1, demo2)]
    for proc in procs:
        proc.start()
进程队列
# from queue import Queue # 普通线程的队列
from multiprocessing import Queue # 进程的队列
# q = Queue(3)
# 存数据
# q.put(1)
# q.put(2)
# q.put(3)
# # q.put(4) # 堵塞
# # q.put_nowait(4) # queue.Full 报错
#
# # 取值
# print(q.get())
# print(q.get())
# print(q.get())
# print(q.get()) # 堵塞
# q.empty()
进程池之间的通信
# from multiprocessing import Pool
#
# import os, time, random
#
#
# def worker(msg):
# t_start = time.time()
# print('%s开始执行,进程号为%d' % (msg, os.getpid()))
#
# time.sleep(random.random() * 2)
# t_stop = time.time()
# print(msg, "执行完成,耗时%0.2f" % (t_stop - t_start))
#
# def demo():
# pass
# if __name__ == '__main__':
# po = Pool(3) # 定义一个进程池
# for i in range(0, 10):
# po.apply_async(worker, (i,))
#
# print("--start--")
# po.close()
# po.apply_async(demo)
# po.join()
# print("--end--")
import multiprocessing
def demo1(q):
    """Print ``1`` and then put the string ``'a'`` on ``q``.

    Args:
        q: Queue-like object whose ``put`` method receives the item.
    """
    print(1)
    payload = 'a'
    q.put(payload)
def demo2(q):
    """Print ``2``, then take one item from ``q`` and print it.

    Args:
        q: Queue-like object whose ``get`` method supplies the item; the
            call blocks until an item is available.
    """
    print(2)
    item = q.get()
    print(item)
if __name__ == '__main__':
    # The queue handed to pool tasks comes from a Manager, not from
    # multiprocessing.Queue(); the Manager queue is what lets processes in a
    # pool communicate with each other.
    manager = multiprocessing.Manager()
    shared_q = manager.Queue()
    pool = multiprocessing.Pool(2)
    # Submit the writer first, then the reader; both receive the same queue.
    for task in (demo1, demo2):
        pool.apply_async(task, args=(shared_q,))
    pool.close()
    pool.join()
多线程爬取图片实例
# https://www.doutula.com/photo/list/?page=1 第一页
# https://www.doutula.com/photo/list/?page=2 第二页
# https://www.doutula.com/photo/list/?page=3 第三页
import requests
from lxml import etree
import os
from time import *
from queue import Queue
import threading
# Start time of the run; time() is available through `from time import *`.
start = time()
# 定义生产者对象
class Procuder(threading.Thread):
    """Producer thread: turns listing-page URLs into image download jobs.

    Each thread repeatedly takes a page URL from ``page_queue``, finds the
    non-GIF images on that page and puts one ``(img_url, filename)`` tuple
    per image on ``img_queue``. Downloading the images is left to whoever
    reads ``img_queue``.
    """

    # Browser-like User-Agent sent with every page request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two shared queues.

        Args:
            page_queue: Queue of listing-page URLs to parse.
            img_queue: Queue that receives ``(img_url, filename)`` tuples.
            *args: Forwarded to ``threading.Thread``.
            **kwargs: Forwarded to ``threading.Thread``.
        """
        super(Procuder, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse page URLs until ``page_queue`` has none left."""
        from queue import Empty

        while True:
            # get_nowait() closes the check-then-get race: with several
            # producers, another thread could take the last URL between an
            # empty() check and a blocking get(), leaving this thread stuck.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    @staticmethod
    def _build_filename(alt, img_url):
        """Return a file name built from the alt text and the URL's extension.

        Characters that are not allowed in file names on common platforms
        are removed from ``alt``. If nothing usable remains, the base name
        of ``img_url`` is used instead.
        """
        import re

        suffix = os.path.splitext(img_url)[1]
        stem = re.sub(r'[\\/:*?"<>|\r\n\t]', '', alt or '')
        if not stem:
            stem = os.path.splitext(os.path.basename(img_url))[0]
        return stem + suffix

    def parse_page(self, url):
        """Fetch one listing page and queue a download job for each image.

        Args:
            url: Address of the listing page.
        """
        response = requests.get(url, headers=self.headers)
        html = etree.HTML(response.text)
        # Select the images inside the content container, skipping those
        # whose class is "gif".
        imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
        for img in imgs:
            img_url = img.get('data-original')
            if not img_url:
                # No image address on this element; nothing to download.
                continue
            filename = self._build_filename(img.get('alt'), img_url)
            self.img_queue.put((img_url, filename))
# Consumer definition
class Consumer(threading.Thread):
    """Consumer thread: downloads the images described on ``img_queue``.

    Every queue item is an ``(img_url, filename)`` tuple; the image is saved
    as ``img/<filename>``.
    """

    # Seconds to wait for a new job before checking whether work remains.
    # NOTE(review): a producer that has already taken the last page but has
    # not queued its images yet is invisible to that check, so this value
    # should exceed the time one page takes to fetch and parse.
    idle_timeout = 5

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two shared queues.

        Args:
            page_queue: Queue of listing-page URLs still to be parsed.
            img_queue: Queue supplying ``(img_url, filename)`` tuples.
            *args: Forwarded to ``threading.Thread``.
            **kwargs: Forwarded to ``threading.Thread``.
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until no work is left."""
        from queue import Empty

        while True:
            # A timed get() never blocks forever, unlike empty() followed by
            # a blocking get() when another consumer takes the last item.
            try:
                img_url, filename = self.img_queue.get(timeout=self.idle_timeout)
            except Empty:
                # Nothing arrived for a while: stop only when no pages are
                # waiting to be parsed, otherwise keep waiting.
                if self.page_queue.empty():
                    break
                continue
            # Download first, so a failed request leaves no empty file.
            img_rep = requests.get(img_url)
            os.makedirs('img', exist_ok=True)
            with open(os.path.join('img', filename), 'wb') as f:
                f.write(img_rep.content)
def main():
    """Crawl the listing pages with producer and consumer threads.

    Fills the page queue, starts five producers and five consumers, and
    waits for all of them to finish. Because of that wait, code that runs
    after ``main()`` (such as an elapsed-time report) covers the whole run.
    """
    # Queue of listing-page URLs.
    page_queue = Queue(100)
    # Queue of image download jobs.
    img_queue = Queue(500)

    # Images are saved below img/, so make sure the directory exists.
    os.makedirs('img', exist_ok=True)

    for x in range(1, 2):
        page_queue.put('https://www.doutula.com/photo/list/?page=%d' % x)

    # Start the producers first, then the consumers, keeping every thread
    # so that all of them can be joined afterwards.
    workers = []
    for thread_cls in (Procuder, Consumer):
        for _ in range(5):
            t = thread_cls(page_queue, img_queue)
            t.start()
            workers.append(t)

    for t in workers:
        t.join()
if __name__ == '__main__':
    main()
    # Seconds elapsed since `start` was recorded.
    elapsed = time() - start
    print('程序共花费了:', elapsed)