多线程
进程是资源单位 ,每个进程至少要有一个线程
线程是执行单位
创建线程比创建进程花销小,故使用前者
我们可以使用线程池:一次性开辟一些线程,我们用户直接给线程池子提交任务,线程任务的调度交给线程池来完成
from threading import Thread
from multiprocessing import Process
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
class MyThread(Thread):
    """Thread subclass demo: put the thread's work inside run()."""

    def run(self):
        # run() is the fixed entry point: it is what executes when the
        # thread is started.
        print(1)
def func(name):
    """Demo task for threads/pools: just echo *name* to stdout."""
    print(name)
if __name__ == '__main__':
    # Two ways to start a single thread: subclassing Thread ...
    t = MyThread()
    t.start()

    # ... or passing a target callable.
    t2 = Thread(target=func, args=('wahaha',))
    t2.start()

    # Thread pool: open 50 worker threads once, submit tasks to the pool,
    # and let the pool schedule them.
    with ThreadPoolExecutor(50) as pool:
        for i in range(100):
            pool.submit(func, name=f"线程{i}")
    # Leaving the with-block waits for every submitted task to finish.
多线程爬取网页数据(原例为北京新发地,下方代码以糗事百科页面代替)
"""
1. 如何提取单个页面的数据
2. 上线程池,多个页面同时抓取
"""
import requests
from lxml import etree
import csv
# Shared CSV output for all worker threads.
# Fix: the csv module requires the underlying file to be opened with
# newline='' — otherwise extra blank lines appear on Windows.
# NOTE(review): this handle is module-global and never explicitly closed;
# it is only flushed/closed at interpreter exit.
f = open('data.csv', mode='w', encoding='utf-8', newline='')
csvwriter = csv.writer(f)
def download_one_page(url):
    """Fetch one listing page and append each table row to the shared CSV writer.

    :param url: page URL to scrape
    """
    resp = requests.get(url)
    try:
        html = etree.HTML(resp.text)
        # Bug fix: xpath() returns a *list* of elements; the original called
        # .xpath() on that list, which raises AttributeError. Take the first
        # match (and bail out if the page has no body).
        bodies = html.xpath("/html/body")
        if not bodies:
            return
        table = bodies[0]
        # Skip the header row.
        for tr in table.xpath("./tr[position()>1]"):
            cells = tr.xpath("./td/text()")
            # Strip stray slashes/backslashes from each cell before writing.
            cleaned = (item.replace("\\", "").replace("/", "") for item in cells)
            csvwriter.writerow(cleaned)
    finally:
        # Release the connection back to the pool.
        resp.close()
from concurrent.futures import ThreadPoolExecutor
if __name__ == '__main__':
    # Fan pages 1-9 out across a 50-thread pool; each task scrapes one page.
    with ThreadPoolExecutor(50) as pool:
        for page in range(1, 10):
            page_url = f"https://www.qiushibaike.com/8hr/page/{page}/"
            pool.submit(download_one_page, page_url)
    print('全部下载完毕!')
协程
import time
"""
input()程序也是处于阻塞状态
requests.get()在网络请求返回数据之前,程序也是处于阻塞状态
一般情况下,程序处于IO操作时候,线程都会处于阻塞状态
协程:当程序遇见IO操作的时候,可以选择性地切换到其他任务上
在微观上是一个任务一个任务进行切换,切换条件一般就是IO操作
在宏观上,能看到的是多个任务一起执行
多任务异步操作
上方所讲的多任务异步操作,都可以通过下面的 asyncio 模块来实现
"""
import asyncio
async def func1():
    """Coroutine demo: print, yield to the event loop for 3s, print again."""
    print("我爱黎明")
    # time.sleep(3) would be a synchronous call and break the async flow;
    # asyncio.sleep is the awaitable equivalent.
    await asyncio.sleep(3)
    print("我爱番茄")
async def func2():
    """Coroutine demo: print, yield to the event loop for 2s, print again."""
    print("我爱诺手")
    # A blocking time.sleep(2) here would stall every other coroutine.
    await asyncio.sleep(2)
    print("我爱嘻嘻")
async def func3():
    """Coroutine demo: print, yield to the event loop for 4s, print again."""
    print("我爱周杰伦")
    # awaitable sleep instead of the blocking time.sleep(4)
    await asyncio.sleep(4)
    print("我爱鸡蛋")
async def main():
    """Run func1/func2/func3 concurrently and wait for all of them.

    `await` suspends on the awaitable placed after it.
    """
    # Wrap each coroutine in a Task so the event loop can schedule them all.
    jobs = [asyncio.create_task(coro()) for coro in (func1, func2, func3)]
    await asyncio.wait(jobs)
async def download(url):
    """Placeholder download task: only announces that it started."""
    print('开始下载')
async def create():
    """Schedule one download task per URL and wait for all of them."""
    targets = [
        'www.baidu.com',
        'www.qq.com',
        'www.163.com',
    ]
    pending = [asyncio.create_task(download(t)) for t in targets]
    await asyncio.wait(pending)
if __name__ == '__main__':
    # asyncio.run() creates the event loop, drives the coroutine to
    # completion, and closes the loop.
    asyncio.run(create())
异步http请求aiohttp
import asyncio
import aiohttp
# Image URLs to download.
# Fix: the original literal was corrupted — it had a stray `urls =["`
# fragment pasted inside the string and no closing quote content.
urls = [
    "http://kr.shanghai-jiuxin.com/file/2020/1031/191468637cab2f0206f7d1d9b175ac81.jpg",
]
async def aiodownload(url):
    """Download one image and save it under images/ using its URL basename.

    aiohttp.ClientSession() plays the role requests plays in sync code:
    session.get()/session.post() <=> requests.get()/requests.post().
    Flow: send the request, read the image bytes, write them to disk.

    :param url: direct image URL
    """
    target = 'images/' + url.split('/')[-1]
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Reading the response body is asynchronous too.
            # (For async file writes, see the aiofiles module.)
            payload = await resp.content.read()
    with open(target, mode='wb') as f:
        f.write(payload)
async def main():
    """Fan out one aiodownload task per URL and wait for all of them."""
    tasks = []
    for url in urls:
        # Bug fix: passing bare coroutines to asyncio.wait() is deprecated
        # since 3.8 and raises TypeError from Python 3.11 — wrap each one
        # in a Task (consistent with the earlier examples in this file).
        tasks.append(asyncio.create_task(aiodownload(url)))
    await asyncio.wait(tasks)


if __name__ == '__main__':
    asyncio.run(main())
使用aiohttp扒光百度小说
import json
# Reference endpoints for the Baidu Dushu API (book_id 4306063500):
urls = [
    'https://dushu.baidu.com/api/pc/getCatalog?data={book_id:4306063500}',  # all chapters: (title, cid)
    # content of a single chapter
    'https://dushu.baidu.com/api/pc/getChapterContent?data={book_id:4306063500,cid:4306063500|1569782244,need_bookinfo:1}',
]
import requests
import asyncio
import aiohttp
import aiofiles
"""
1. 同步操作: 访问getCatalog 拿到所有章节的cid和名称
2. 异步操作:访问getChapterContent 拿到每个章节的内容
"""
# Shared request headers for every call to the API.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
    # NOTE(review): cookie intentionally left empty; fill in if the API needs auth.
    "cookie" : '',
}
async def aiodownload(cid, b_id, title):
    """Fetch one chapter's content and write it to book/<title>.

    :param cid: chapter id
    :param b_id: book id
    :param title: chapter title, used as the output file name
    """
    data = json.dumps({
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    })
    url = f'https://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    # Bug fix: str.replace() returns a NEW string (strings are immutable);
    # the original discarded the result, so the quotes were never escaped.
    url = url.replace('"', '%22')
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
    # Write the chapter text asynchronously.
    async with aiofiles.open('book/' + title, mode="w", encoding="utf-8") as f:
        await f.write(dic['data']['novel']['content'])
async def getCatalog(url):
    """Fetch the chapter catalog synchronously, then download every chapter concurrently.

    :param url: getCatalog endpoint URL
    """
    resp = requests.get(url, headers=headers, verify=False)
    try:
        dic = resp.json()
    finally:
        resp.close()
    jobs = []
    for item in dic['data']['novel']['items']:
        # One async download task per chapter (title, cid).
        jobs.append(asyncio.create_task(aiodownload(item['cid'], b_id, item['title'])))
    await asyncio.wait(jobs)
if __name__ == '__main__':
    # b_id stays module-global: getCatalog/aiodownload read it.
    b_id = "4306063500"
    # %22 is the URL-encoded double quote required by the API's JSON parameter.
    catalog_url = 'https://dushu.baidu.com/api/pc/getCatalog?data={%22book_id%22:%22'+b_id+'%22}'
    asyncio.run(getCatalog(catalog_url))
抓取91视频
m3u8文件
"""
<video src="xx.mp4"></video>
一般的视频网站是怎么做的?
用户上传->转码(不同清晰度,把视频做处理)->切片处理(单个文件进行拆分)
用户在进行拉动进度条的时候
需要一个文件记录:1.视频播放顺序,2.视频存放的路径
M3U txt json => 文本
想要抓取一个视频:
1. 找到m3u8(各种手段)
2. 通过m3u8下载到ts文件
3. 可以通过各种手段(不仅仅是编程手段)把ts文件合并为一个mp4文件
复杂版本:
思路:
1.拿到主页面的页面源代码,找到iframe
2.从iframe的页面源代码中拿到m3u8文件
3.下载第一层m3u8文件->下载第二层(视频存放路径)
4.下载视频
5.下载秘钥,进行解密操作
6.合并所有ts文件为一个mp4文件
"""
因为91被封了这里用另一个视频网站代替吧
包含了下载.ts文件和合并的过程
import os
import time
import requests
# Step 0 (already done once, kept for reference): fetch the m3u8 playlist.
# url = 'https://v2.tlkqc.com/wjv2/202308/22/zuKAbRKZTT2/video/1000k_0X720_64k_25/hls/index.m3u8'
#
# Shared headers; the Referer is required or the CDN rejects the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
    "Referer": "https://www.91kmj.cc/play/13669-1-1/",
}
#
# resp = requests.get(url, headers=headers, verify=False)
#
# with open('index.m3u8', mode='wb') as f:
#     f.write(resp.content)
# Parse the saved m3u8 file and download the segments asynchronously.
import asyncio
import aiohttp
import aiofiles
from yarl import URL
async def download_ts(url, name, session):
    """Download a single .ts segment and save it as video/<name>.ts.

    :param url: segment URL
    :param name: output file stem (segment index)
    :param session: shared aiohttp ClientSession
    """
    async with session.request('GET', url, headers=headers) as resp:
        # Fix: `assert` is stripped under `python -O`; raise explicitly on
        # HTTP errors instead of asserting status == 200.
        resp.raise_for_status()
        async with aiofiles.open(f'video/{name}.ts', mode="wb") as f:
            await f.write(await resp.content.read())
async def aio_download():
    """Read index.m3u8 and download every segment concurrently.

    Connections are capped at 10 via the TCPConnector; certificate
    verification is disabled (ssl=False) to match the reference fetch above.
    """
    tasks = []
    n = 0
    conn = aiohttp.TCPConnector(limit=10, ssl=False)
    # Rule of thumb: create the session once up front and share it with
    # every download task.
    async with aiohttp.ClientSession(connector=conn) as session:
        async with aiofiles.open('index.m3u8', mode='r', encoding='utf-8') as f:
            async for line in f:
                # Skip m3u8 directives; everything else is a segment URL.
                if line.startswith('#'):
                    continue
                ts_url = line.strip()
                tasks.append(asyncio.create_task(download_ts(ts_url, n, session)))
                n += 1
                # Bug fix: time.sleep(1) blocks the whole event loop, so no
                # scheduled task could run until the loop finished; use the
                # awaitable asyncio.sleep() to throttle while staying async.
                await asyncio.sleep(1)
        # asyncio.wait() raises on an empty set — guard against a playlist
        # with no segment lines.
        if tasks:
            await asyncio.wait(tasks)
from tqdm import tqdm
def get_video(path):
    """Concatenate every numbered .ts segment in *path* into path/merge.mp4.

    Segments are ordered numerically by file stem (0.ts, 1.ts, ..., 10.ts),
    not lexicographically, so playback order is preserved.

    :param path: directory containing the .ts segments
    """
    # os.listdir returns bare file names, so the original split('\\') was a
    # no-op; use splitext to get the numeric stem directly.
    segments = [name for name in os.listdir(path) if name.endswith('.ts')]
    segments.sort(key=lambda name: int(os.path.splitext(name)[0]))
    out_path = os.path.join(path, 'merge.mp4')
    # Fix: open the output once in 'wb' (which truncates any previous merge)
    # instead of creating it empty and reopening it in 'ab' per segment.
    with open(out_path, 'wb') as out:
        for name in tqdm(segments, desc="正在转换"):
            with open(os.path.join(path, name), mode='rb') as seg:
                out.write(seg.read())
    print('finish')
if __name__ == '__main__':
    # Step 1 (run once, then comment out): download all .ts segments.
    # asyncio.run(aio_download())
    # Step 2: merge the downloaded segments into one mp4.
    get_video(r'D:\ProgramFiles\PythonProject\spider\video')
    pass