import aiohttp
import asyncio
import random
import requests
from lxml import etree
import re
# Site root; prefixed onto the relative paths scraped from the list pages.
base_url = 'https://www.pearvideo.com/'
def request_list(p):
    """Fetch page *p* of the category video list (an HTML fragment).

    p is the start offset (a multiple of 24, one list page per 24 items).
    Returns the response body on HTTP 200, otherwise None (including when
    the request times out).
    """
    url = base_url + 'category_loading.jsp'
    params = {
        'reqType': 5,
        'categoryId': 130,
        'start': p,
        'mrd': random.random(),
        'filterIds': '1763625,1763630,1763676,1758075,1758078,1758077,1758079,1758081,1758080,1758083,1758085,1758084,1758086,1758090,1758088'
    }
    try:
        print('请求列表页面中:第%d页' % p)
        # BUG FIX: without a timeout= argument requests waits indefinitely and
        # requests.exceptions.Timeout could never be raised.
        response = requests.get(url, params=params, timeout=10)
    except requests.exceptions.Timeout as e:
        # BUG FIX: the original wrote `except requests.exceptions.Timeout()`,
        # which calls the class; an except clause needs the class itself —
        # the old form raised TypeError the moment a timeout actually occurred.
        print('链接超时:', e)
        return None  # response would otherwise be unbound below
    if response.status_code == 200:
        return response.text
    return None
def parse_list(text):
    """Extract detail-page URLs from a list-page HTML fragment.

    Returns absolute URLs (relative hrefs joined onto base_url); the list is
    empty when no video links are found.
    """
    pattern = re.compile(r'<a href="(.*?)" class="vervideo-lilink actplay">')
    relative_paths = pattern.findall(text)
    if relative_paths:
        print('列表页解析成功:', relative_paths)
    return [base_url + path for path in relative_paths]
async def a_request(url):
    """Asynchronously fetch *url*.

    Returns (url, body_text) on HTTP 200, otherwise None. Intended to be
    wrapped in a Task whose done-callback is parse_detail.
    """
    # The ClientSession context manager closes the session automatically.
    async with aiohttp.ClientSession() as session:
        print('获取详情页中:%s' % url)
        # FIX: the original used `async with await session.get(url)`, entering
        # the request twice — session.get() already returns an async context
        # manager, so the extra await was redundant.
        async with session.get(url) as response:
            if response.status == 200:
                # await yields control back to the event loop while the body
                # downloads.
                text = await response.text()
                return url, text
    return None
def parse_detail(task):
    """Done-callback for detail-page tasks: parse the HTML, print the star count.

    task.result() is the (url, html) tuple returned by a_request, or None when
    the request did not return HTTP 200.
    """
    result = task.result()
    if not result:
        # FIX: a_request returns None on non-200 responses; the original
        # indexed result[1] unconditionally and crashed with TypeError here.
        return
    url, html = result
    tree = etree.HTML(html)
    # The like/"star" count sits in the first child div of the brief box.
    star = tree.xpath('//div[@class="brief-box"]/div[1]/text()')
    if star:
        print('解析详情页中成功:%s [%s] ' % (url, star[0]))
def run(start_page, end_page):
    """Crawl list pages [start_page, end_page) and fetch each video's detail page.

    Each list page holds 24 entries, so the request offset advances in steps
    of 24. Detail pages for one list page are fetched concurrently via asyncio.
    """
    # FIX: asyncio.get_event_loop() is deprecated when no loop is running;
    # create a dedicated loop and close it when done.
    loop = asyncio.new_event_loop()
    try:
        for offset in range(start_page * 24, end_page * 24, 24):
            text = request_list(offset)
            if not text:
                # FIX: request_list returns None on failure/timeout; the
                # original passed None straight into parse_list and crashed.
                continue
            urls = parse_list(text)
            if not urls:
                # FIX: asyncio.wait([]) raises ValueError on an empty set.
                continue
            tasks = []
            for url in urls:
                # Wrap the coroutine in a Task so a done-callback can be
                # attached; the callback fires when the fetch completes.
                task = loop.create_task(a_request(url))
                task.add_done_callback(parse_detail)
                tasks.append(task)
            # wait() lets the tasks be suspended/resumed by the event loop.
            loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()
if __name__ == '__main__':
    # Crawl the first 5 list pages (offsets 0, 24, ..., 96).
    run(0, 5)
# 【Python协程爬虫】 (Python coroutine crawler)
# Article footer (pasted from the source page): latest recommended post published 2024-01-23 20:08:43