Scraper 7: Multithreading and Coroutines

Multithreading

A process is the unit of resource allocation; every process contains at least one thread.
A thread is the unit of execution.
Creating a thread is much cheaper than creating a process, so threads are preferred here.

We can use a thread pool: open a batch of threads up front, submit tasks directly to the pool, and leave the scheduling of those tasks to the pool.

from threading import Thread
from multiprocessing import Process
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

class MyThread(Thread):
    def run(self):  # fixed entry point: when the thread starts, run() is what gets executed
        print(1)

def func(name):
    print(name)

if __name__ == '__main__':
    t = MyThread()
    t.start()

    t2 = Thread(target=func, args=('wahaha',))
    t2.start()

    # create a thread pool
    with ThreadPoolExecutor(50) as t:
        for i in range(100):
            t.submit(func, name=f"thread-{i}")
    # the with-block blocks here until every task in the pool has finished
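submit() also hands back a Future, so return values and exceptions can be collected instead of being fired and forgotten. A minimal sketch, using a hypothetical square() task purely for illustration:

from concurrent.futures import ThreadPoolExecutor, as_completed

def square(n):  # hypothetical task, just for illustration
    return n * n

with ThreadPoolExecutor(8) as pool:
    futures = [pool.submit(square, i) for i in range(10)]
    for fut in as_completed(futures):   # yields each future as it finishes
        print(fut.result())             # result() re-raises any exception raised in the task
    print(list(pool.map(square, range(10))))  # map() is the order-preserving alternative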

Scraping Beijing Xinfadi with multiple threads

"""
1. 如何提取单个页面的数据
2. 上线程池,多个页面同时抓取
"""

import requests
from lxml import etree
import csv
f = open('data.csv', mode='w', encoding='utf-8', newline='')  # newline='' avoids blank rows from csv on Windows
csvwriter = csv.writer(f)

def download_one_page(url):
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    # xpath() returns a list, so take the first match; the exact path depends on the page
    table = html.xpath("/html/body")[0]
    trs = table.xpath("./tr[position()>1]")  # skip the header row
    for tr in trs:
        txt = tr.xpath("./td/text()")
        txt = (item.replace("\\", "").replace("/", "") for item in txt)  # strip stray slashes
        # write the row out to the csv file
        csvwriter.writerow(txt)

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    with ThreadPoolExecutor(50) as t:
        for i in range(1, 10):
            url = f"https://www.qiushibaike.com/8hr/page/{i}/"
            t.submit(download_one_page, url)

    print('All downloads finished!')
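One caveat about the code above: all 50 worker threads funnel rows through one shared csv.writer, and the csv module makes no thread-safety promises. Guarding the write with a lock is a cheap safeguard; a minimal sketch (the Lock and save_row() are my additions, not part of the original):

from threading import Lock

lock = Lock()  # shared by every worker thread

def save_row(row):
    # only one thread at a time may touch the shared writer
    with lock:
        csvwriter.writerow(row)

download_one_page() would then call save_row(txt) instead of writing directly.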

Coroutines

import time

"""
input()程序也是处于阻塞状态
requests.get()在网络请求返回数据之前,程序也是处于阻塞状态
一般情况下,程序处于IO操作时候,线程都会处于阻塞状态

协程:当程序遇见IO操作的时候,可以选择性地切换到其他任务上
在微观上是一个任务一个任务进行切换,切换条件一般就是IO操作
在宏观上,能看到的是多个任务一起执行
多任务异步操作

上方所讲的一切,
"""
import asyncio
async def func1():
    # 异步
    print("我爱黎明")
    # time.sleep(3)  # 当程序出现了同步操作,异步就中断了
    await asyncio.sleep(3) # 异步操作的代码
    print("我爱番茄")

async def func2():
    # 异步
    print("我爱诺手")
    # time.sleep(2)
    await asyncio.sleep(2)
    print("我爱嘻嘻")

async def func3():
    # 异步
    print("我爱周杰伦")
    # time.sleep(4)
    await asyncio.sleep(4)
    print("我爱鸡蛋")

async def main():
    """
    await: the suspension goes in front of the awaitable object
    """
    tasks = [
        asyncio.create_task(func1()),  # wrap each coroutine in a Task
        asyncio.create_task(func2()),
        asyncio.create_task(func3())
    ]
    await asyncio.wait(tasks)  # hand the tasks to asyncio.wait

async def download(url):
    print('start downloading')

async def create():
    urls = [
        'www.baidu.com',
        'www.qq.com',
        'www.163.com'
    ]

    tasks = []
    for url in urls:
        d = download(url)
        tasks.append(asyncio.create_task(d))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    # f1 = asyncio.create_task(func1())  # func1 is a coroutine function; calling it returns a coroutine object
    # f2 = asyncio.create_task(func2())  # note: create_task() needs a running event loop, so it belongs inside a coroutine
    # f3 = asyncio.create_task(func3())
    # tasks = [f1, f2, f3]

    # launch several tasks (coroutines) at once
    # asyncio.run(main())
    asyncio.run(create())
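asyncio.wait() returns (done, pending) sets and, from Python 3.11 on, accepts only Task objects. asyncio.gather() is a common alternative that schedules the coroutines itself and preserves return values in order; a minimal sketch reusing func1/func2/func3 from above:

async def main_gather():
    # gather() wraps the coroutines in Tasks itself and returns their results in order
    results = await asyncio.gather(func1(), func2(), func3())
    print(results)  # [None, None, None] here, since the funcs return nothing

# asyncio.run(main_gather())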

Asynchronous HTTP requests with aiohttp

import asyncio
import aiohttp

urls = [
    'http://kr.shanghai-jiuxin.com/file/2020/1031/191468637cab2f0206f7d1d9b175ac81.jpg',
]

async def aiodownload(url):
    """
    s = aiohttp.ClientSession()  <==> requests
    requests.get(), post()
    s.get(), post()

    Send the request,
    get the image content,
    save it to a file.

    :param url:
    :return:
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # reading the response body is asynchronous as well
            # for asynchronous file writes, look into the aiofiles module
            content = await resp.content.read()
            with open('images/' + url.split('/')[-1], mode='wb') as f:
                f.write(content)

async def main():
    tasks = []
    for url in urls:
        # asyncio.wait needs Task objects (bare coroutines are rejected since Python 3.11)
        tasks.append(asyncio.create_task(aiodownload(url)))

    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
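The comment above points at aiofiles; with it, the disk write becomes awaitable too, so it no longer blocks the event loop. A minimal sketch of the same download body (aiodownload2 is just an illustrative name):

import aiofiles

async def aiodownload2(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            content = await resp.content.read()
            # the file write is now asynchronous as well
            async with aiofiles.open('images/' + url.split('/')[-1], mode='wb') as f:
                await f.write(content)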

Scraping a whole Baidu novel with aiohttp

import json

urls = [
    'https://dushu.baidu.com/api/pc/getCatalog?data={book_id:4306063500}',  # the full chapter list (title, cid)
    # the content inside a single chapter
    'https://dushu.baidu.com/api/pc/getChapterContent?data={book_id:4306063500,cid:4306063500|1569782244,need_bookinfo:1}',
]

import requests
import asyncio
import aiohttp
import aiofiles

"""
1. 同步操作: 访问getCatalog 拿到所有章节的cid和名称
2. 异步操作:访问getChapterContent 拿到每个章节的内容
"""
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
    "cookie" : '',
}

async def aiodownload(cid, b_id, title):
    data = {
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    }
    data = json.dumps(data)
    url = f'https://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    url = url.replace('"', '%22')  # str.replace returns a new string, so reassign it
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            async with aiofiles.open('book/' + title, mode="w", encoding="utf-8") as f:
                await f.write(dic['data']['novel']['content'])  # write the chapter out

async def getCatalog(url):
    resp = requests.get(url, headers=headers, verify=False)
    # print(resp.text)
    dic = resp.json()
    tasks = []
    for item in dic['data']['novel']['items']:
        title = item['title']
        cid = item['cid']
        # print(title, cid)
        tasks.append(asyncio.create_task(aiodownload(cid, b_id, title)))
    resp.close()

    await asyncio.wait(tasks)

if __name__ == '__main__':
    b_id = "4306063500"
    url = 'https://dushu.baidu.com/api/pc/getCatalog?data={%22book_id%22:%22'+b_id+'%22}'

    asyncio.run(getCatalog(url))
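aiodownload() above opens a fresh ClientSession per chapter. A gentler pattern is one shared session plus a semaphore that caps how many chapters are in flight; a minimal sketch under that assumption (names like fetch_chapter and download_all are mine):

sem = asyncio.Semaphore(10)  # at most 10 chapters in flight at once

async def fetch_chapter(session, b_id, cid, title):
    data = json.dumps({"book_id": b_id, "cid": f"{b_id}|{cid}", "need_bookinfo": 1})
    url = f'https://dushu.baidu.com/api/pc/getChapterContent?data={data}'.replace('"', '%22')
    async with sem:  # limit concurrency
        async with session.get(url) as resp:
            dic = await resp.json()
    async with aiofiles.open('book/' + title, mode='w', encoding='utf-8') as f:
        await f.write(dic['data']['novel']['content'])

async def download_all(chapters, b_id):
    async with aiohttp.ClientSession() as session:  # one session for everything
        await asyncio.gather(*(fetch_chapter(session, b_id, cid, title)
                               for cid, title in chapters))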

Scraping videos from 91

The m3u8 file

"""
<video src="xx.mp4"></video>
一般的视频网站是怎么做的?
用户上传->转码(不同清晰度,把视频做处理)->切片处理(单个文件进行拆分)
用户在进行拉动进度条的时候

需要一个文件记录:1.视频播放顺序,2.视频存放的路径
M3U txt json => 文本

想要抓取一个视频:
1. 找到m3u8(各种手段)
2. 通过m3u8下载到ts文件
3. 可以通过各种手段(不仅仅是编程手段)把ts文件合并为一个mp4文件

复杂版本:

思路:
1.拿到主页面的页面源代码,找到iframe
2.从iframe的页面源代码中拿到m3u8文件
3.下载第一层m3u8文件->下载第二层(视频存放路径)
4.下载视频
5.下载秘钥,进行解密操作
6.合并所有ts文件为一个mp4文件
"""

Since 91 is blocked, another video site stands in for it here.
This part covers downloading the .ts files and the merging step.

import os
import time

import requests

# url = 'https://v2.tlkqc.com/wjv2/202308/22/zuKAbRKZTT2/video/1000k_0X720_64k_25/hls/index.m3u8'
#
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
    "Referer": "https://www.91kmj.cc/play/13669-1-1/",
}
#
# resp = requests.get(url, headers=headers, verify=False)
#
# with open('index.m3u8', mode='wb') as f:
#     f.write(resp.content)

# parse the m3u8 file
import asyncio
import aiohttp
import aiofiles
from yarl import URL
async def download_ts(url, name, session):
    # async with session.get(URL(url), headers=headers) as resp:
    async with session.request('GET', url, headers=headers) as resp:
        assert resp.status == 200
        async with aiofiles.open(f'video/{name}.ts', mode="wb") as f:
            await f.write(await resp.content.read())
            # print(resp.headers)
            # print(resp.status)

async def aio_download():
    tasks = []
    n = 0
    cookies = {'cookie' : ''}
    conn = aiohttp.TCPConnector(limit=10, ssl=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        # tip from experience: create the session up front and pass it into every task
        async with aiofiles.open('index.m3u8', mode='r', encoding='utf-8') as f:
            async for line in f:
                if line.startswith('#'):
                    continue
                line = line.strip()
                ts_url = line
                # print(ts_url)
                task = asyncio.create_task(download_ts(ts_url, n, session))
                n += 1
                # if n == 2:
                #     break
                tasks.append(task)
                await asyncio.sleep(1)  # time.sleep() here would block the whole event loop
            await asyncio.wait(tasks)

# with open('index.m3u8', mode='r', encoding='utf-8') as f:
#     n = 0
#     for line in f:
#         line = line.strip()
#         if line.startswith('#'):
#             continue
#
#         # download this video segment
#         resp3 = requests.get(line, headers=headers, verify=False)
#         f = open(f'video/{n}.ts',mode='wb')
#         f.write(resp3.content)
#         f.close()
#         n += 1
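The downloader above assumes unencrypted segments. When the playlist carries a line like #EXT-X-KEY:METHOD=AES-128,URI="...", each .ts must be decrypted (step 5 of the plan) before merging. A minimal sketch with pycryptodome, assuming the 16-byte key has already been downloaded from that URI and the playlist specifies no IV (so a zero IV is used):

from Crypto.Cipher import AES  # pip install pycryptodome

def dec_ts(key: bytes, src: str, dst: str):
    # AES-128-CBC, as declared by METHOD=AES-128; the zero IV is an assumption here
    aes = AES.new(key, AES.MODE_CBC, iv=b"\x00" * 16)
    with open(src, 'rb') as f1, open(dst, 'wb') as f2:
        f2.write(aes.decrypt(f1.read()))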
from tqdm import tqdm

def get_video(path):
    files = os.listdir(path)
    files = [file for file in files if file.endswith('.ts')]
    # create (or truncate) the output file before appending to it
    with open(os.path.join(path, 'merge.mp4'), 'wb') as f2:
        pass
    # listdir() returns bare names like '13.ts', so sort numerically by the stem
    files = sorted(files, key=lambda file: int(file.split('.')[0]))
    for file in tqdm(files, desc="merging"):
        cur = os.path.join(path, file)
        if os.path.exists(cur):
            with open(cur, mode='rb') as f1:
                with open(os.path.join(path, 'merge.mp4'), 'ab') as f2:
                    f2.write(f1.read())
        else:
            print(f'missing segment: {cur}')
    print('finished')


if __name__ == '__main__':
    # asyncio.run(aio_download())
    get_video(r'D:\ProgramFiles\PythonProject\spider\video')
    pass
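The docstring earlier notes that merging does not have to be done in code. If ffmpeg is installed, handing it the local playlist remuxes the segments into a clean mp4 without re-encoding; a sketch, assuming the .ts paths in index.m3u8 resolve locally:

import subprocess

def merge_with_ffmpeg(m3u8_path: str, out: str = 'merge.mp4'):
    # -c copy remuxes the streams without re-encoding
    subprocess.run(['ffmpeg', '-i', m3u8_path, '-c', 'copy', out], check=True)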