多线程
1. threading
# Example: running functions in parallel with threading.Thread.
import threading
import time  # required by time.sleep below; was missing from this snippet


def targets(second):
    """Sleep for *second* seconds, logging the worker thread's lifecycle.

    :param second: how long (in seconds) the worker should sleep.
    """
    print(f'Threading {threading.current_thread().name} is running')
    print(f'Threading {threading.current_thread().name} sleep {second}s')
    time.sleep(second)
    print(f'Threading {threading.current_thread().name} is ended')


if __name__ == '__main__':
    # Guarded so importing this module does not spawn sleeping threads.
    print(f'Threading {threading.current_thread().name} is running')
    t = []
    for i in [1, 5]:
        # Each worker gets its own sleep duration via args.
        thread = threading.Thread(target=targets, args=[i])
        t.append(thread)
        thread.start()
    for worker in t:
        worker.join()  # wait for every worker before the main thread exits
    print(f'Threading {threading.current_thread().name} is ended')
2. 线程池
1名员工 10双鞋
10名员工 100双
10名员工 半天 50双
5名员工 一天 50双
给线程一个数量 只有这么多人
# Thread-pool example: cap concurrency at a fixed number of workers.
from concurrent.futures import ThreadPoolExecutor


def crawl(url):
    """Stand-in crawl task: just print the url it was given."""
    print(url)


if __name__ == '__main__':
    base_url = 'https://jobs.51job.com/pachongkaifa/p{}/'
    # At most 10 tasks execute at once; extra submissions queue up.
    with ThreadPoolExecutor(10) as pool:
        for page in range(1, 15):
            pool.submit(crawl, url=base_url.format(page))
多线程采集案例
1、先提取json文件 获取英雄的ID
2、根据ID 找到英雄的详情页地址
3、从详情页里面提取头像地址 构造皮肤地址
import requests
import os
import json
import threading
from lxml import etree
import time
h=[]  # global list of worker threads; filled and joined by duo()
s=time.time()  # script start timestamp; elapsed time is printed in __main__
#1. Parse the herolist.json file to obtain each hero's ID
def duo():
    """Fetch the hero list and scrape every hero in its own thread.

    Downloads herolist.json, spawns one thread per hero record
    (running ``pa``), tracks the threads in the global list ``h``,
    and blocks until all of them have finished.

    :return: None
    """
    listing = requests.get('https://pvp.qq.com/web201605/js/herolist.json')
    heroes = json.loads(listing.text)
    for hero in heroes:
        worker = threading.Thread(target=pa, args=(hero,))
        worker.start()
        h.append(worker)
    # Wait for every scraper thread before returning.
    for worker in h:
        worker.join()
#2. Use the hero ID to build the detail-page URL and download the skins
def pa(j):
    """Download every skin image for one hero record *j*.

    :param j: one entry from herolist.json; reads j['ename'] (numeric
        hero id used in URLs) and j['cname'] (display name used as the
        output folder). Images land in ./王者荣耀/<cname>/<skin>.jpg.

    Runs concurrently in many threads (see duo()).
    """
    num = j['ename']   # numeric hero id for both detail and image URLs
    name = j['cname']  # hero display name, used as the folder name
    res2 = requests.get("https://pvp.qq.com/web201605/herodetail/{}.shtml".format(num))
    res2_decode = res2.content.decode('gbk')  # detail page is gbk-encoded
    _element = etree.HTML(res2_decode)
    # data-imgname holds all skin names joined by '|'
    element_img = _element.xpath('//div[@class="pic-pf"]/ul/@data-imgname')
    name_img = element_img[0].split('|')
    for i in range(10):  # probe up to 10 skins; stop at the first miss
        res1 = requests.get(
            "https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{0}/{0}-bigskin-{1}.jpg".format(num, i + 1))
        if res1.status_code != 200:
            break  # no more skins for this hero
        # Skin labels look like "name&digits"; keep the part before '&'.
        aa = name_img[i].find('&')
        bb = name_img[i][:aa]
        folder = './王者荣耀/' + str(name)
        # makedirs(..., exist_ok=True) creates parent and hero folders in
        # one call and is safe when many threads race here — the previous
        # exists()+mkdir pair could raise FileExistsError under concurrency.
        os.makedirs(folder, exist_ok=True)
        #3. Write the skin image bytes from the response to disk
        b = folder + '/' + bb + '.jpg'
        with open(b, "wb") as f:
            f.write(res1.content)
        print(name, bb)
# Entry point: scrape everything, then report total elapsed wall-clock time.
if __name__=='__main__':
    duo()
    g=time.time()  # end timestamp; s was captured at import time above
    print("用时:",g-s,"秒")
多进程
multiprocessing
from multiprocessing import Pool #进程池
import multiprocessing
from multiprocessing import Pool
import requests
def process(index):
    """Print which worker process is handling *index*.

    :param index: task number passed by the spawning code.
    """
    # Fixed output typo: 'Proess' -> 'Process'.
    print(f'Process:{index}')
def scrape(url):
    """GET *url* and report success or connection failure.

    :param url: address to fetch; the response body is discarded.
    """
    try:
        requests.get(url)
    except requests.ConnectionError:
        print(f'URL {url} not Scraped')
    else:
        # Only reached when the request raised nothing.
        print(f'URL {url} Scraped')
if __name__ == '__main__':
    # Process-per-task alternative, kept for reference:
    # for i in range(5):
    #     p = multiprocessing.Process(target=process,args=(i,))
    #     p.start()
    pool = Pool(processes=3)  # at most 3 worker processes at once
    urls = [
        'https://www.baidu.com',
        'http://www.meituan.com/',
        'http://blog.csdn.net/',
        'http://xxxyxxx.net'  # unreachable host — exercises the error branch
    ]
    # map blocks until scrape() has run for every url.
    pool.map(scrape, urls)
    pool.close()
异步协程
#异步函数声明 async
实例
import asyncio
import time
import httpx
async def req(client, i):
    """Issue one GET through the shared *client* and return the response.

    :param client: an httpx.AsyncClient shared by all tasks.
    :param i: zero-based attempt index; printed as 1-based.
    :return: the httpx response object.
    """
    res = await client.get('https://www.example.com')
    print(f'第{i + 1}次请求,status_code = {res.status_code}')
    return res
async def main():
    """Fire 50 concurrent requests through one shared AsyncClient."""
    async with httpx.AsyncClient() as client:
        # Wrapping each coroutine in a Task schedules it immediately,
        # so all 50 requests run concurrently.
        task_list = [asyncio.create_task(req(client, i)) for i in range(50)]
        # Suspend here until every task has completed.
        await asyncio.gather(*task_list)
# Entry point: time how long 50 concurrent requests take end to end.
if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())  # drive the event loop until main() completes
    end = time.time()
    print(f'异步发送50次请求,耗时:{end - start}')