导包
import asyncio
import os
import re
import time

import aiohttp
import requests
from lxml import etree
获取最大页数
def get_maxpage():
    """Fetch the first video listing page and return the max page number.

    Returns:
        str: the highest page number shown in the pagination bar.
    """
    url = "https://www.qiushibaike.com/video/page"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
    }
    # Timeout so a stalled connection cannot hang the whole crawl;
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    html = etree.HTML(response.text)
    # The last pagination <span> is the "next page" arrow; the one before it
    # holds the highest page number.
    maxpage = html.xpath('//ul[@class = "pagination"]//span/text()')[-2].strip()
    print(maxpage)
    return maxpage
依次抓取每个页面上的视频详情链接,并调用函数
def get_page_url(maxpage):
    """Crawl listing pages 1..maxpage and download every video found on each.

    Args:
        maxpage: page count as returned by get_maxpage() (str or int).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
    }

    async def _download_all(urls):
        # asyncio.wait() no longer accepts bare coroutines (deprecated in 3.8,
        # removed in 3.11); asyncio.gather() is the supported way to fan out.
        await asyncio.gather(*(get_down_data(u) for u in urls))

    for i in range(1, int(maxpage) + 1):
        page_url = f"https://www.qiushibaike.com/video/page/{i}/"
        response = requests.get(page_url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        # De-duplicate the relative hrefs, then make them absolute.
        hrefs = set(html.xpath('//div[@class="col1 old-style-col1"]/div/a/@href'))
        detail_urls = ["https://www.qiushibaike.com" + h for h in hrefs]
        # Skip empty pages: waiting on an empty task set raises ValueError.
        if detail_urls:
            asyncio.run(_download_all(detail_urls))
依次抓取每个视频详情页上的视频名字及下载链接【协程函数】
async def get_down_data(url2):
    """Open one video detail page, extract title and video src, download it.

    Args:
        url2: absolute URL of a video detail page.
    """
    print(f"打开{url2}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
    }
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager; no extra `await`.
        async with session.get(url=url2, headers=headers) as response:
            res = await response.text()
    html = etree.HTML(res)
    names = html.xpath('//div[@class="video"]/div/text()')
    srcs = html.xpath('//*[@id="article-video"]/source/@src')
    if not names or not srcs:
        # Post has no video or the page layout changed — skip instead of
        # crashing with IndexError on [0].
        print(f"no video found, skipping {url2}")
        return
    down_url1 = "https:" + srcs[0]
    download_date(names[0], down_url1)
打开下载链接,并保存数据
def download_date(name, down_url1):
    """Download the video at down_url1 and save it as <name>.<extension>.

    Args:
        name: video title scraped from the page (used as the file name).
        down_url1: absolute https URL of the video file.
    """
    print(name, down_url1)
    houzhui = down_url1.split('.')[-1]
    # Scraped titles may contain characters that are illegal in Windows file
    # names (\ / : * ? " < > |) or stray whitespace — replace them so open()
    # cannot fail on a bad path.
    safe_name = re.sub(r'[\\/:*?"<>|\s]+', '_', name).strip('_') or 'video'
    save_dir = 'e:/糗事百科视频'
    os.makedirs(save_dir, exist_ok=True)
    res = requests.get(down_url1, timeout=30).content
    with open(f'{save_dir}/{safe_name}.{houzhui}', 'wb') as f:
        f.write(res)
    print(name, "下载完成!")
if __name__ == "__main__":
    # Entry point: discover the page count, then crawl every listing page.
    get_page_url(get_maxpage())
完整代码
import asyncio
import aiohttp
from lxml import etree
import requests
import time
#获取最大页数
def get_maxpage():
    """Fetch the first video listing page and return the max page number.

    Returns:
        str: the highest page number shown in the pagination bar.
    """
    url = "https://www.qiushibaike.com/video/page"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
    }
    # Timeout so a stalled connection cannot hang the whole crawl;
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    html = etree.HTML(response.text)
    # The last pagination <span> is the "next page" arrow; the one before it
    # holds the highest page number.
    maxpage = html.xpath('//ul[@class = "pagination"]//span/text()')[-2].strip()
    print(maxpage)
    return maxpage
# def get_page_url(maxpage):
# url = []
# for i in range(1,int(maxpage)+1):
# url1 = f"https://www.qiushibaike.com/video/page/{i}/"
#
# url.append(url1)
# return url
def get_page_url(maxpage):
    """Crawl listing pages 1..maxpage and download every video found on each.

    Args:
        maxpage: page count as returned by get_maxpage() (str or int).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
    }

    async def _download_all(urls):
        # asyncio.wait() no longer accepts bare coroutines (deprecated in 3.8,
        # removed in 3.11); asyncio.gather() is the supported way to fan out.
        await asyncio.gather(*(get_down_data(u) for u in urls))

    for i in range(1, int(maxpage) + 1):
        page_url = f"https://www.qiushibaike.com/video/page/{i}/"
        response = requests.get(page_url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        # De-duplicate the relative hrefs, then make them absolute.
        hrefs = set(html.xpath('//div[@class="col1 old-style-col1"]/div/a/@href'))
        detail_urls = ["https://www.qiushibaike.com" + h for h in hrefs]
        # Skip empty pages: waiting on an empty task set raises ValueError.
        if detail_urls:
            asyncio.run(_download_all(detail_urls))
# async def open_url(url):
# async with aiohttp.ClientSession() as session:
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
# }
# async with await session.get(url=url,headers=headers) as response:
# resp = await response.text()
# html = etree.HTML(resp)
# url = html.xpath('//div[@class="col1 old-style-col1"]/div/a/@href')
# url = list(set(url))
# url = ["https://www.qiushibaike.com"+ j for j in url]
# print(url)
# task2 = [get_down_data(url2) for url2 in url]
# asyncio.run(asyncio.wait(task2))
async def get_down_data(url2):
    """Open one video detail page, extract title and video src, download it.

    Args:
        url2: absolute URL of a video detail page.
    """
    print(f"打开{url2}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
    }
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager; no extra `await`.
        async with session.get(url=url2, headers=headers) as response:
            res = await response.text()
    html = etree.HTML(res)
    names = html.xpath('//div[@class="video"]/div/text()')
    srcs = html.xpath('//*[@id="article-video"]/source/@src')
    if not names or not srcs:
        # Post has no video or the page layout changed — skip instead of
        # crashing with IndexError on [0].
        print(f"no video found, skipping {url2}")
        return
    down_url1 = "https:" + srcs[0]
    download_date(names[0], down_url1)
def download_date(name, down_url1):
    """Download the video at down_url1 and save it as <name>.<extension>.

    Args:
        name: video title scraped from the page (used as the file name).
        down_url1: absolute https URL of the video file.
    """
    print(name, down_url1)
    houzhui = down_url1.split('.')[-1]
    # Scraped titles may contain characters that are illegal in Windows file
    # names (\ / : * ? " < > |) or stray whitespace — replace them so open()
    # cannot fail on a bad path.
    safe_name = re.sub(r'[\\/:*?"<>|\s]+', '_', name).strip('_') or 'video'
    save_dir = 'e:/糗事百科视频'
    os.makedirs(save_dir, exist_ok=True)
    res = requests.get(down_url1, timeout=30).content
    with open(f'{save_dir}/{safe_name}.{houzhui}', 'wb') as f:
        f.write(res)
    print(name, "下载完成!")
if __name__ == "__main__":
    # NOTE(review): the commented-out open_url() variant appears to fail
    # because it calls asyncio.run() again from code already started by an
    # outer asyncio.run() — only one running event loop is allowed per thread.
    # Entry point: discover the page count, then crawl every listing page.
    get_page_url(get_maxpage())
备注:
注释掉的部分是想让每个列表页的抓取也采用协程,但运行时会报错——在由 asyncio.run 启动的协程内部再次调用 asyncio.run 会抛出 RuntimeError(同一线程只能有一个正在运行的事件循环)。如果谁有解决方式或者更好的方法,欢迎留言。