Web Scraping -- Multithreading, Thread Pools, and Async Coroutines

This post covers multithreading and thread pools in Python, with examples showing how to create and manage threads. It then introduces asynchronous programming: coroutines with the asyncio library and asynchronous HTTP requests. Finally, a practical example of downloading files asynchronously shows the efficiency gains that async programming brings.
# 1. Multithreading
from threading import Thread


def func():
    for i in range(1000):
        print("Thread 1", i)


def func2():
    for i in range(1000):
        print("Thread 2", i)


if __name__ == '__main__':
    # Pass the function object itself, not func() -- calling it would run the loop
    # in the main thread and hand its return value (None) to Thread as the target.
    t = Thread(target=func)
    t.start()   # spawn a thread; actual scheduling is up to the CPU/OS
    t2 = Thread(target=func2)
    t2.start()  # spawn a second thread
    for i in range(1000):
        print("Main thread", i)
# Thread pool: create a batch of threads at once

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor


def func(name):
    for i in range(1000):
        print(name, i)


if __name__ == '__main__':
    # create a thread pool with 50 workers
    with ThreadPoolExecutor(50) as t:
        for i in range(100):
            t.submit(func, name=f"Thread {i}")
    # leaving the with-block waits for every task in the pool to finish
    # before the next statement runs
    print("ok")

Asynchronous coroutines: a single thread switches between tasks while each one is waiting on I/O.

import asyncio
import time


# 4. Crawler application -- this works as a reusable template
async def downLoad(url):
    print("start download")
    await asyncio.sleep(2)  # simulate a 2-second download
    print("download finished")


async def main():
    urls = [
        "https://www.baidu.com",
        "https://www.baida.com",
        "https://www.aidu.com"
    ]
    tasks = []
    for url in urls:
        # wrap each coroutine in a Task; newer Python (3.11+) no longer
        # accepts bare coroutines in asyncio.wait()
        tasks.append(asyncio.create_task(downLoad(url)))
    await asyncio.wait(tasks)


if __name__ == '__main__':
    t1 = time.time()
    asyncio.run(main())
    t2 = time.time()
    print(t2 - t1)  # elapsed time: about 2 s, not 6 s, because the waits overlap
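
asyncio.gather is a common alternative to asyncio.wait here: it accepts coroutines directly (no create_task needed), runs them concurrently, preserves result order, and propagates exceptions. A minimal sketch of how main() could be written with it:

async def main():
    urls = ["https://www.baidu.com", "https://www.baida.com", "https://www.aidu.com"]
    # gather wraps each coroutine in a task and waits for all of them
    await asyncio.gather(*(downLoad(url) for url in urls))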
import asyncio
import aiohttp


# 5. Example: download images concurrently

async def urlDownload(url):
    # the last path segment already contains the file name and extension
    name = url.rsplit('/', 1)[1]
    # aiohttp.ClientSession() plays the role of requests
    # inside a coroutine, `with` must be written as `async with`
    # the with-block closes the session automatically, no manual close needed
    async with aiohttp.ClientSession() as session:
        # session.get / session.post are the async counterparts of requests.get / requests.post
        # on the response object:
        #   resp.content.read()  ~ requests' resp.content
        #   resp.text()          ~ requests' resp.text
        #   resp.json()          ~ requests' resp.json()
        async with session.get(url) as resp:
            with open(name, mode="wb") as f:
                f.write(await resp.content.read())  # reading the body is async, so it must be awaited


async def main():
    urls = [
        "https://image.sitapix.com/index-thumb/sitapix-photo-2290543-via-sitapix-com.jpeg",
        "https://image.sitapix.com/index-thumb/bloom-blooming-blossom-130168-via-sitapix-com.jpeg",
        "https://image.sitapix.com/index-thumb/albums-antique-audio-1181789-via-sitapix-com.jpeg"
    ]
    tasks = [asyncio.create_task(urlDownload(url)) for url in urls]
    await asyncio.wait(tasks)


if __name__ == '__main__':
    asyncio.run(main())
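
In practice it is worth checking the HTTP status before writing the file, otherwise a failed request gets saved as a broken image. A minimal sketch of how the inner part of urlDownload could guard against that (the status check is an addition, not part of the original example):

        async with session.get(url) as resp:
            resp.raise_for_status()  # abort on 4xx/5xx instead of saving an error page
            with open(name, mode="wb") as f:
                f.write(await resp.content.read())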
Asynchronous file I/O with aiofiles -- the synchronous equivalent is shown in the comment:
async with aiofiles.open(f"./novel/{n}.txt", mode="w", encoding="utf-8") as f:
    await f.write(message)
# with open(f"./novel/{n}.txt", mode="w", encoding="utf-8") as f:
#     f.write(message)
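
A minimal, self-contained sketch of the same pattern, assuming aiofiles is installed (the file name and content are placeholders):

import asyncio
import aiofiles


async def save(path, text):
    # the write is awaited, so other coroutines can run while the disk is busy
    async with aiofiles.open(path, mode="w", encoding="utf-8") as f:
        await f.write(text)


if __name__ == '__main__':
    asyncio.run(save("demo.txt", "hello"))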
import asyncio
import aiohttp
import requests
from lxml import etree
import aiofiles
import time
import random

# silence the InsecureRequestWarning raised for unverified HTTPS requests
requests.packages.urllib3.disable_warnings()


# 1. synchronous part: fetch the chapter list
# 2. asynchronous part: download each chapter
def getBookid(url_1):
    url = "http://quanxiaoshuo.com"
    resp = requests.get(url_1, verify=False)  # verify=False skips SSL certificate verification
    et = etree.HTML(resp.text)
    list_1 = et.xpath("//div[@class='chapter']")
    data = []
    for item in list_1:
        getData = url + item.xpath("./a/@href")[0]
        data.append(getData)
    return data


async def downLoad(url, n):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            txt = await resp.text()
            et = etree.HTML(txt)
            # ''.join() concatenates the list of text fragments into one string
            message = ''.join(et.xpath('//*[@id="content"]/text()'))
            message = ''.join(message.split())  # strip all whitespace
            # the ./novel directory must already exist
            async with aiofiles.open(f"./novel/{n}.txt", mode="w", encoding="utf-8") as f:
                await f.write(message)
            # synchronous equivalent:
            # with open(f"./novel/{n}.txt", mode="w", encoding="utf-8") as f:
            #     f.write(message)


async def main(urls):
    n = 1
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(downLoad(url, n)))
        print("Chapter %d" % n)
        n += 1
    await asyncio.wait(tasks)


if __name__ == '__main__':
    a = 0
    b = 100
    url_1 = "http://quanxiaoshuo.com/179092/"
    data = getBookid(url_1)
    asyncio.run(main(data[:5]))
    # download in batches to avoid hammering the server:
    # for i in range(10):
    #     # time.sleep(random.randint(1, 4))
    #     asyncio.run(main(data[a:b]))
    #     a += 5
    #     b += 5
    print("ok")
