多线程
1. threading
# Example: running functions in parallel with threading.Thread.
import threading
import time  # required by time.sleep below; was missing from this snippet


def targets(second):
    """Sleep for *second* seconds, logging the worker thread's lifecycle.

    :param second: how long (in seconds) the worker should sleep.
    """
    print(f'Threading {threading.current_thread().name} is running')
    print(f'Threading {threading.current_thread().name} sleep {second}s')
    time.sleep(second)
    print(f'Threading {threading.current_thread().name} is ended')


if __name__ == '__main__':
    # Guarded so importing this module does not spawn sleeping threads.
    print(f'Threading {threading.current_thread().name} is running')
    t = []
    for i in [1, 5]:
        # Each worker gets its own sleep duration via args.
        thread = threading.Thread(target=targets, args=[i])
        t.append(thread)
        thread.start()
    for worker in t:
        worker.join()  # wait for every worker before the main thread exits
    print(f'Threading {threading.current_thread().name} is ended')
2. 线程池
1名员工 10双鞋
10名员工 100双
10名员工 半天 50双
5名员工 一天 50双
给线程一个数量 只有这么多人
# Thread-pool example: cap concurrency at a fixed number of workers.
from concurrent.futures import ThreadPoolExecutor


def crawl(url):
    """Stand-in crawl task: just print the url it was given."""
    print(url)


if __name__ == '__main__':
    base_url = 'https://jobs.51job.com/pachongkaifa/p{}/'
    # At most 10 tasks execute at once; extra submissions queue up.
    with ThreadPoolExecutor(10) as pool:
        for page in range(1, 15):
            pool.submit(crawl, url=base_url.format(page))
多线程采集案例
1、先提取json文件 获取英雄的ID
2、根据ID 找到英雄的详情页地址
3、从详情页里面提取头像地址 构造皮肤地址
import requests
import os
import json
import threading
from lxml import etree
import time
h=[]  # global list of worker threads; filled and joined by duo()
s=time.time()  # script start timestamp; elapsed time is printed in __main__
#1. Parse the herolist.json file to obtain each hero's ID
def duo():
    """Fetch the hero list and scrape every hero in its own thread.

    Downloads herolist.json, spawns one thread per hero record
    (running ``pa``), tracks the threads in the global list ``h``,
    and blocks until all of them have finished.

    :return: None
    """
    listing = requests.get('https://pvp.qq.com/web201605/js/herolist.json')
    heroes = json.loads(listing.text)
    for hero in heroes:
        worker = threading.Thread(target=pa, args=(hero,))
        worker.start()
        h.append(worker)
    # Wait for every scraper thread before returning.
    for worker in h:
        worker.join()
#2. Use the hero ID to build the detail-page URL and download the skins
def pa(j):
    """Download every skin image for one hero record *j*.

    :param j: one entry from herolist.json; reads j['ename'] (numeric
        hero id used in URLs) and j['cname'] (display name used as the
        output folder). Images land in ./王者荣耀/<cname>/<skin>.jpg.

    Runs concurrently in many threads (see duo()).
    """
    num = j['ename']   # numeric hero id for both detail and image URLs
    name = j['cname']  # hero display name, used as the folder name
    res2 = requests.get("https://pvp.qq.com/web201605/herodetail/{}.shtml".format(num))
    res2_decode = res2.content.decode('gbk')  # detail page is gbk-encoded
    _element = etree.HTML(res2_decode)
    # data-imgname holds all skin names joined by '|'
    element_img = _element.xpath('//div[@class="pic-pf"]/ul/@data-imgname')
    name_img = element_img[0].split('|')
    for i in range(10):  # probe up to 10 skins; stop at the first miss
        res1 = requests.get(
            "https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{0}/{0}-bigskin-{1}.jpg".format(num, i + 1))
        if res1.status_code != 200:
            break  # no more skins for this hero
        # Skin labels look like "name&digits"; keep the part before '&'.
        aa = name_img[i].find('&')
        bb = name_img[i][:aa]
        folder = './王者荣耀/' + str(name)
        # makedirs(..., exist_ok=True) creates parent and hero folders in
        # one call and is safe when many threads race here — the previous
        # exists()+mkdir pair could raise FileExistsError under concurrency.
        os.makedirs(folder, exist_ok=True)
        #3. Write the skin image bytes from the response to disk
        b = folder + '/' + bb + '.jpg'
        with open(b, "wb") as f:
            f.write(res1.content)
        print(name, bb)
# Entry point: scrape everything, then report total elapsed wall-clock time.
if __name__=='__main__':
    duo()
    g=time.time()  # end timestamp; s was captured at import time above
    print("用时:",g-s,"秒")
多进程
multiprocessing
from multiprocessing import Pool #进程池
import multiprocessing
from multiprocessing import Pool
import requests
def process(index):
    """Print which worker process is handling *index*.

    :param index: task number passed by the spawning code.
    """
    # Fixed output typo: 'Proess' -> 'Process'.
    print(f'Process:{index}')
def scrape(url):
    """GET *url* and report success or connection failure.

    :param url: address to fetch; the response body is discarded.
    """
    try:
        requests.get(url)
    except requests.ConnectionError:
        print(f'URL {url} not Scraped')
    else:
        # Only reached when the request raised nothing.
        print(f'URL {url} Scraped')
if __name__ == '__main__':
    # Process-per-task alternative, kept for reference:
    # for i in range(5):
    #     p = multiprocessing.Process(target=process,args=(i,))
    #     p.start()
    pool = Pool(processes=3)  # at most 3 worker processes at once
    urls = [
        'https://www.baidu.com',
        'http://www.meituan.com/',
        'http://blog.csdn.net/',
        'http://xxxyxxx.net'  # unreachable host — exercises the error branch
    ]
    # map blocks until scrape() has run for every url.
    pool.map(scrape, urls)
    pool.close()
异步协程
#异步函数声明 async
实例
import asyncio
import time
import httpx
async def req(client, i):
    """Issue one GET through the shared *client* and return the response.

    :param client: an httpx.AsyncClient shared by all tasks.
    :param i: zero-based attempt index; printed as 1-based.
    :return: the httpx response object.
    """
    res = await client.get('https://www.example.com')
    print(f'第{i + 1}次请求,status_code = {res.status_code}')
    return res
async def main():
    """Fire 50 concurrent requests through one shared AsyncClient."""
    async with httpx.AsyncClient() as client:
        # Wrapping each coroutine in a Task schedules it immediately,
        # so all 50 requests run concurrently.
        task_list = [asyncio.create_task(req(client, i)) for i in range(50)]
        # Suspend here until every task has completed.
        await asyncio.gather(*task_list)
# Entry point: time how long 50 concurrent requests take end to end.
if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())  # drive the event loop until main() completes
    end = time.time()
    print(f'异步发送50次请求,耗时:{end - start}')