5 Crawler: async coroutines and Pear Video (梨视频) downloads

# How do we improve the efficiency of crawling data with the requests module?
- Multiprocessing or multithreading (not recommended): consumes too many resources
- Thread pools or process pools (use in moderation; see the sketch right after this list)
- Single thread + async coroutines (recommended)
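# For reference only (not in the original post): a minimal thread-pool sketch using the
# standard-library concurrent.futures module, an alternative to the multiprocessing.dummy.Pool
# used below. The URLs here are placeholders, not from the original post.
from concurrent.futures import ThreadPoolExecutor
import requests

urls = ['https://www.example.com', 'https://www.example.org']  # placeholder URLs

def fetch(url):
    # each call blocks on network I/O, but the pool runs several of them at once
    return requests.get(url, timeout=10).status_code

with ThreadPoolExecutor(max_workers=4) as executor:
    for url, status in zip(urls, executor.map(fetch, urls)):
        print(url, status)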
# Thread pool usage example
# Pear Video (梨视频) download assignment
import random
from lxml import etree
from multiprocessing.dummy import Pool  # Pool backed by threads (a thread pool)
import requests
import re

url = 'https://www.pearvideo.com/category_3'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)
all_video = tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href')
mp4_list = []
for video in all_video:
    video_url = 'https://www.pearvideo.com/%s' % video
    page_video = requests.get(url=video_url, headers=headers).text
    tree = etree.HTML(page_video)
    name1 = tree.xpath('//*[@id="detailsbd"]/div[1]/div[2]/div/div[1]/h1/text()')[0]  # video title (unused in this version)
    mp4_url = re.findall('srcUrl="(.*?)",vdoUrl', page_video, re.S)[0]
    mp4_list.append(mp4_url)

pool = Pool(4)  # instantiate a thread pool object to handle the time-consuming work asynchronously

# fetch the binary video stream
def mp4_request(url):
    return requests.get(url=url, headers=headers).content

# persist the data to disk
def mp4_save(mp4_data):
    name = str(random.randint(0, 9999)) + '.mp4'  # generate a random file name
    with open('./%s' % name, 'wb') as f:
        f.write(mp4_data)
        print(name, ',download ok')

mp4_data_list = pool.map(mp4_request, mp4_list)  # fetch the binary streams
pool.map(mp4_save, mp4_data_list)  # persist the data

print('Task is OK!')  # signal that the job is done
pool.close()  # close the thread pool
pool.join()   # wait for the worker threads to exit
 
  
# Below is a version that uses the real video titles as file names
from lxml import etree
from multiprocessing.dummy import Pool
import requests
import re

url = 'https://www.pearvideo.com/category_3'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)
all_video = tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href')
mp4_list = []
for video in all_video:
    video_url = 'https://www.pearvideo.com/%s' % video
    page_video = requests.get(url=video_url, headers=headers).text
    tree = etree.HTML(page_video)
    name = tree.xpath('//*[@id="detailsbd"]/div[1]/div[2]/div/div[1]/h1/text()')[0]
    mp4_url = re.findall('srcUrl="(.*?)",vdoUrl', page_video, re.S)[0]
    mp4_list.append({name:mp4_url})

mp4_list = mp4_list[2:]  # drop the first two entries
print(mp4_list)
pool = Pool(4)  # instantiate a thread pool object to handle the time-consuming work asynchronously

# fetch the binary video stream, keyed by the video title
def mp4_request(item):
    # item is a {title: url} dict; return a {title: binary data} dict
    name, video_url = list(item.items())[0]
    return {name: requests.get(url=video_url, headers=headers).content}

# persist the data to disk
def mp4_save(mp4_data):
    name, data = list(mp4_data.items())[0]
    with open('./%s.mp4' % name, 'wb') as f:
        f.write(data)
        print(name, ',download ok')

mp4_data_list = pool.map(mp4_request, mp4_list)  # fetch the binary streams
pool.map(mp4_save, mp4_data_list)  # persist the data

print('Task is OK!')
pool.close()  # close the thread pool
pool.join()
# --  Everything below is about async coroutines  --
# Basic usage
# Running a coroutine on the event loop
import asyncio

async def hello(name):
    print('hello to:', name)

c = hello('zc')  # calling the async function returns a coroutine object: <coroutine object hello at 0x0000000005EDDE08>
# create an event loop object
loop = asyncio.get_event_loop()
# register the coroutine object with the event loop, then start the loop
loop.run_until_complete(c)  # prints: hello to: zc
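# Note (not in the original post): on Python 3.7+ the same thing can be written
# without creating the loop by hand, using asyncio.run; a minimal sketch:
import asyncio

async def hello(name):
    print('hello to:', name)

# asyncio.run creates an event loop, runs the coroutine to completion, and closes the loop
asyncio.run(hello('zc'))  # prints: hello to: zc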
# Using task: a single-task coroutine
import asyncio

async def hello(name):
    print('hello to:', name)

c = hello('zc')
# create an event loop object
loop = asyncio.get_event_loop()
# wrap the coroutine one step further by packaging it into a Task object
task = loop.create_task(c)
print(task)  # pending
loop.run_until_complete(task)
print(task)  # finished
 
  
# Using future (asyncio.ensure_future)
import asyncio

async def hello(name):
    print('hello to:', name)

c = hello('zc')
loop = asyncio.get_event_loop()
task = asyncio.ensure_future(c)
print(task)  # pending
loop.run_until_complete(task)
print(task)  # finished
# Binding a callback to the future
import asyncio

def callback(task):  # callback function
    print('I am callback', task.result())

async def hello(name):
    print('hello to:', name)
    return name

c = hello('zc')
loop = asyncio.get_event_loop()  # create the loop instance
task = asyncio.ensure_future(c)
# print(task)
task.add_done_callback(callback)  # register the callback to run when the task finishes
loop.run_until_complete(task)  # run the task to completion
print(task)
# Applying multi-task async operations in a crawler
# aiohttp is a module that supports async network requests: pip install aiohttp
import asyncio
import aiohttp
import time

async def get_page(url):
    async with aiohttp.ClientSession() as session:
        async with await session.get(url=url) as response:
            page_text = await response.text()  # read() for binary response data, json() for JSON
            print('response data:', page_text)
        # print('ok %s' % url)

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]
tasks = []  # task list holding the task objects
loop = asyncio.get_event_loop()
for url in urls:
    c = get_page(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
# register the list of task objects with the event loop
loop.run_until_complete(asyncio.wait(tasks))
print('total time', time.time() - start)

# -- Output below --
# downloading http://127.0.0.1:5000/bobo
# downloading http://127.0.0.1:5000/jay
# downloading http://127.0.0.1:5000/tom
# download ok http://127.0.0.1:5000/bobo
# download ok http://127.0.0.1:5000/jay
# download ok http://127.0.0.1:5000/tom
# total time 2.0021142959594727
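# Alternative (not in the original post): asyncio.gather can replace asyncio.wait
# when you also want the coroutines' return values back in the same order as the URLs.
# A minimal sketch, assuming the same three local test endpoints:
import asyncio
import aiohttp

async def get_page(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main():
    urls = ['http://127.0.0.1:5000/bobo', 'http://127.0.0.1:5000/jay', 'http://127.0.0.1:5000/tom']
    async with aiohttp.ClientSession() as session:
        # run the coroutines concurrently; results come back in URL order
        results = await asyncio.gather(*(get_page(session, url) for url in urls))
    for text in results:
        print(text)

asyncio.run(main())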
# A simple web server built with the Python Flask framework, used as the test target for the code above:
from flask import Flask
import time

app = Flask(__name__)

@app.route('/bobo')
def index_bobo():
    time.sleep(2)
    return 'Hello bobo'

@app.route('/jay')
def index_jay():
    time.sleep(2)
    return 'Hello jay'

@app.route('/tom')
def index_tom():
    time.sleep(2)
    return 'Hello tom'

if __name__ == '__main__':
    app.run(threaded=True)
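# For comparison (not in the original post): fetching the same three routes sequentially
# with requests, assuming the Flask server above is running on its default port 5000.
# Each route sleeps 2 seconds, so this takes about 6 seconds versus about 2 seconds
# for the async version.
import time
import requests

urls = ['http://127.0.0.1:5000/bobo', 'http://127.0.0.1:5000/jay', 'http://127.0.0.1:5000/tom']

start = time.time()
for url in urls:
    print(requests.get(url).text)  # each request blocks for ~2 seconds
print('total time', time.time() - start)  # roughly 6 seconds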
# High-performance async IO against real websites
import asyncio
import aiohttp
import time

async def get_page(url):
    async with aiohttp.ClientSession() as session:
        async with await session.get(url=url) as response:
            page_text = await response.text()  # read() for binary response data, json() for JSON
            # print('response data:', page_text)
            print('ok %s' % url)
start = time.time()
urls = [
    'https://baidu.com',
    'https://y.qq.com',
    'https://www.taobao.com',
]
tasks = []  # task list holding the task objects
loop = asyncio.get_event_loop()
for url in urls:
    c = get_page(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
# register the list of task objects with the event loop
loop.run_until_complete(asyncio.wait(tasks))
print('total time', time.time() - start)
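# When hitting real sites it is usually wise to cap concurrency. A minimal sketch
# (not from the original post) using asyncio.Semaphore to limit simultaneous requests,
# with asyncio.run replacing the manual event-loop handling:
import asyncio
import aiohttp

async def get_page(session, semaphore, url):
    async with semaphore:  # wait here if the concurrency limit is already reached
        async with session.get(url) as response:
            await response.text()
            print('ok %s' % url)

async def main():
    urls = ['https://baidu.com', 'https://y.qq.com', 'https://www.taobao.com']
    semaphore = asyncio.Semaphore(2)  # at most 2 requests in flight at once (illustrative value)
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(get_page(session, semaphore, url) for url in urls))

asyncio.run(main())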

 

A side note on Python's short-circuit operators: and / or return one of their operands, not a bool.
0 and False  => 0      (and returns its first falsy operand)
0 or False   => False  (or returns its last operand when every operand is falsy)

 

Reposted from: https://www.cnblogs.com/zhangchen-sx/p/10818591.html

