Scrapy Learning 03 -- Async aiohttp Practice

# Learning aiohttp

# Step 1: installation
#pip install aiohttp   -i https://pypi.douban.com/simple
#pip install cchardet   -i https://pypi.douban.com/simple


# Client
import aiohttp
import asyncio

async def fetch(session,url):
    async with session.get(url) as response:
        return await response.text()

# Basic usage
# async def main():
#     async with aiohttp.ClientSession() as session:
#         html=await fetch(session,"http://httpbin.org/headers")
#         print(html)

'''
{
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "Python/3.9 aiohttp/3.7.3", 
    "X-Amzn-Trace-Id": "Root=1-602b6a92-01f81ee520af7312137b8421"
  }
}
'''

# Read the page content
# async def main():
#     async with aiohttp.ClientSession() as session:
#         async with session.get('http://httpbin.org/get') as resp:
#             print(resp.status) # status code
#             print(await resp.text(encoding='utf-8'))

'''
200
{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "Python/3.9 aiohttp/3.7.3", 
    "X-Amzn-Trace-Id": "Root=1-602b6c17-66a4d09013f941f320068d7f"
  }, 
  "origin": "171.44.106.55", 
  "url": "http://httpbin.org/get"
}
'''

# For non-text (binary) content, just replace the text() method with read()
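# A minimal sketch (my own addition, not from the original post): the same pattern as fetch(),
# but read() returns the raw response body as bytes, e.g. for images or file downloads
async def fetch_bytes(session, url):
    async with session.get(url) as response:
        return await response.read()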

# Customizing requests

# Custom headers
# async def main():
#     async with aiohttp.ClientSession() as session:
#         url="http://httpbin.org/"
#         headers = {
#             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
#                           "AppleWebKit/537.36 (KHTML, like Gecko)"
#                           " Chrome/78.0.3904.108 Safari/537.36"
#         }
#         await session.post(url, headers=headers)


# Custom cookies
async def main():
    url = 'http://httpbin.org/cookies'
    cookies = {'cookies_are': 'working'}
    async with aiohttp.ClientSession(cookies=cookies) as session:
        async with session.get(url) as resp:
            assert await resp.json() == {
              "cookies": {"cookies_are": "working"}}


asyncio.run(main())

# One function issues the requests, another downloads the pages (a small sketch follows)
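# A minimal sketch of that split (my own example, not from the original post): crawl() issues the
# requests, while fetch() (defined above) downloads each page
async def crawl(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        # gather runs all downloads concurrently and returns the page bodies in order
        return await asyncio.gather(*tasks)

# Example usage (commented out so this file does not fire extra requests when run):
# pages = asyncio.run(crawl(["http://httpbin.org/get", "http://httpbin.org/headers"]))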


# Synchronous crawler
# Scrape Douban movies
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from lxml import etree


# Request headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit"
                         "/537.36 (KHTML, like Gecko) "
                         "Chrome/72.0.3626.121 Safari/537.36"}
def get_movie_url():
    req_url='https://movie.douban.com/chart'
    response=requests.get(url=req_url,headers=headers)
    html=etree.HTML(response.text)
    movies_url=html.xpath('//*[@id="content"]/div/div[1]/div/div/table/tr/td/a/@href')
    return movies_url

def get_movie_content(movie_url):
    response=requests.get(movie_url,headers=headers)
    result=etree.HTML(response.text)
    movie=dict()
    name=result.xpath('//*[@id="content"]/h1/span[1]/text()')
    author =result.xpath('//*[@id="info"]/span[1]/span[2]//text()')
    movie["name"] = name
    movie["author"] = author
    return movie


if __name__ == '__main__':
    start = datetime.now()
    movie_url_list = get_movie_url()
    movies = dict()
    for url in movie_url_list:
        movies[url] = get_movie_content(url)
    print(movies)
    print("同步用时为:{}".format(datetime.now() - start))

'''
{'https://movie.douban.com/subject/30458949/': {'name': ['无依之地 Nomadland'], 'author': ['赵婷']}, 'https://movie.douban.com/subject/30257787/': {'name': ['一秒钟'], 'author': ['张艺谋']}, 'https://movie.douban.com/subject/30443686/': {'name': ['穷途鼠的奶酪梦 窮鼠はチーズの夢を見る'], 'author': ['行定勋']}, 'https://movie.douban.com/subject/34869387/': {'name': ['女人的碎片 Pieces of a Woman'], 'author': ['凯内尔·穆德卢佐']}, 'https://movie.douban.com/subject/33408026/': {'name': ['刻在你心底的名字'], 'author': ['柳广辉']}, 'https://movie.douban.com/subject/34894753/': {'name': ['沐浴之王'], 'author': ['易小星']}, 'https://movie.douban.com/subject/35211578/': {'name': ['逃避虽可耻但有用 新春特别篇 逃げるは恥だが役に立つ ガンバレ人類! 新春スペシャル!!'], 'author': ['金子文纪']}, 'https://movie.douban.com/subject/30450313/': {'name': ['前程似锦的女孩 Promising Young Woman'], 'author': ['埃默拉尔德·芬内尔']}, 'https://movie.douban.com/subject/35275115/': {'name': ['2020去死 Death to 2020'], 'author': ['阿尔·坎贝尔', ' / ', 'Alice Mathias']}, 'https://movie.douban.com/subject/34982759/': {'name': ["玫瑰岛的不可思议的历史 L'incredibile storia dell'isola delle rose"], 'author': ['希德尼·希比利亚']}}
Synchronous run time: 0:00:08.478348
'''
# Convert the synchronous code to async and compare the speed
import asyncio
from datetime import datetime

import aiohttp
from lxml import etree

# Request headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit"
                         "/537.36 (KHTML, like Gecko) "
                         "Chrome/72.0.3626.121 Safari/537.36"}

async def get_movie_url():
    req_url='https://movie.douban.com/chart'
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=req_url,headers=headers) as response:
            result=await response.text()
            result=etree.HTML(result)

    return result.xpath('//*[@id="content"]/div/div[1]/div/div/table/tr/td/a/@href')

async def get_movie_content(movie_url):
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=movie_url,headers=headers) as response:
            result=await response.text()
            result=etree.HTML(result)
        movie = dict()
        name = result.xpath('//*[@id="content"]/h1/span[1]//text()')
        author = result.xpath('//*[@id="info"]/span[1]/span[2]//text()')
        movie["name"] = name
        movie["author"] = author
    return movie

if __name__ == '__main__':
    start = datetime.now()
    loop = asyncio.get_event_loop()
    movie_url_list = loop.run_until_complete(get_movie_url())  # run_until_complete drives the loop until the future finishes, then returns
    tasks = [get_movie_content(url) for url in movie_url_list]
    movies = loop.run_until_complete(asyncio.gather(*tasks))
    print(movies)
    print("异步用时为:{}".format(datetime.now()- start))

'''
[{'name': ['无依之地 Nomadland'], 'author': ['赵婷']}, {'name': ['一秒钟'], 'author': ['张艺谋']}, {'name': ['穷途鼠的奶酪梦 窮鼠はチーズの夢を見る'], 'author': ['行定勋']}, {'name': ['女人的碎片 Pieces of a Woman'], 'author': ['凯内尔·穆德卢佐']}, {'name': ['刻在你心底的名字'], 'author': ['柳广辉']}, {'name': ['沐浴之王'], 'author': ['易小星']}, {'name': ['逃避虽可耻但有用 新春特别篇 逃げるは恥だが役に立つ ガンバレ人類! 新春スペシャル!!'], 'author': ['金子文纪']}, {'name': ['前程似锦的女孩 Promising Young Woman'], 'author': ['埃默拉尔德·芬内尔']}, {'name': ['2020去死 Death to 2020'], 'author': ['阿尔·坎贝尔', ' / ', 'Alice Mathias']}, {'name': ["玫瑰岛的不可思议的历史 L'incredibile storia dell'isola delle rose"], 'author': ['希德尼·希比利亚']}]
Asynchronous run time: 0:00:01.431173
'''
# Understanding async
'''
Reference blog: https://www.cnblogs.com/xinghun85/p/9937741.html
'''
import asyncio
from datetime import datetime

import requests



# Example

async def test2(i):
    r = await other_test(i)
    print("1", i, r)

async def other_test(i):
    r = requests.get(i)
    print("2", i)
    await asyncio.sleep(4)
    print("3", datetime.now() - start)
    return r

url = ["https://segmentfault.com/p/1210000013564725",
        "https://www.jianshu.com/p/83badc8028bd",
        "https://www.baidu.com/"]
loop = asyncio.get_event_loop()
task = [asyncio.ensure_future(test2(i)) for i in url]
start =datetime.now()
loop.run_until_complete(asyncio.wait(task))
endtime = datetime.now()-start
print(endtime)
loop.close()
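# Note (my own addition): requests.get() above is synchronous and blocks the event loop while each
# page downloads; the concurrency in this example comes from the awaited asyncio.sleep(4).
# A rough non-blocking variant of other_test using aiohttp might look like this:
# async def other_test(i):
#     async with aiohttp.ClientSession() as session:
#         async with session.get(i) as resp:
#             r = await resp.text()
#     print("2", i)
#     await asyncio.sleep(4)
#     print("3", datetime.now() - start)
#     return r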
# aiohttp crawler in practice
# Reference blog: https://blog.csdn.net/qq_36772866/article/details/105355445?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522161346654316780271553327%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fblog.%2522%257D&request_id=161346654316780271553327&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~blog~first_rank_v1~rank_blog_v1-3-105355445.pc_v1_rank_blog_v1&utm_term=%E7%88%AC%E8%99%AB&spm=1018.2226.3001.4450

#  Step 1: analyze the page and find the div that holds the novel info

# Imports
import asyncio
import csv
import aiohttp
from bs4 import BeautifulSoup

# Fetch the site content
async def getData(url,headers):
    # Create a session object
    async with aiohttp.ClientSession() as session:
        # Send a GET request with the custom headers
        async with session.get(url,headers=headers) as response:
            # Return the response body
            return await response.text()

# Save the data
def saveData(result):
    # Open the csv file once so every row is written to the same file instead of overwriting it
    with open(r'C:\Users\zxy\Desktop\data.csv', 'w', encoding='utf-8-sig', newline='') as csvFile:
        writer = csv.writer(csvFile)
        for i in result:
            soup = BeautifulSoup(i, 'lxml')
            find_div = soup.find_all('div', class_='book-mid-info')
            for d in find_div:
                # Novel title
                name = d.find('h4').getText()
                # Author
                author = d.find('a', class_='name').getText()
                # Last update time
                update = d.find('p', class_='update').getText()
                # Write one row to the csv
                writer.writerow([name, author, update])

# Create the async tasks and save the data
def run():
    for i in range(25):
        # Build each page url, pass it to getData, and let asyncio run it
        task=asyncio.ensure_future(getData(url.format(i+1),headers))
        # Add every request to the tasks list
        tasks.append(task)
    # When all the requests have finished, gather their responses
    result = loop.run_until_complete(asyncio.gather(*tasks))
    saveData(result)
    print(len(result))

if __name__=='__main__':
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
    }
    # Task list
    tasks=[]
    # url
    url='https://www.qidian.com/rank/hotsales?page={}'
    # Event loop object
    loop=asyncio.get_event_loop()
    # Call run()
    run()

How to use aiohttp inside Scrapy

This requires modifying the downloader middleware.

import requests
# First, create a middleware that uses requests to fetch a URL that takes 5 seconds to respond:
# class TestAiohttp:
#     def get_ip(self):
#         requests.get('http://httpbin.org/delay/5').json()
#
#     def process_request(self,request,spider):
#         print("Requesting a URL that takes 5 seconds to respond")
#         self.get_ip()

# Replace requests with aiohttp

import asyncio
import aiohttp


class TestAiohttp:
    async def get_ip(self):
        async with aiohttp.ClientSession() as client:
            resp = await client.get('http://httpbin.org/delay/5')
            result = await resp.json()
            print(result)

    async def process_request(self, request, spider):
        print('Start requesting a URL with a 5-second delay')
        await asyncio.create_task(self.get_ip())

The spider code

from datetime import datetime

import scrapy


class ExerciseSpider(scrapy.Spider):
    name = 'exercise'
    #allowed_domains = ['v.qq.com']
    # starting url
    start_urls = ['http://exercise.com']


    def start_requests(self):
        for page in range(1, 10):
            url=f'http://exercise.kingname.info/exercise_middleware_ip/{page}'
            yield scrapy.Request(url)
    def parse(self, response):
        now=datetime.now()
        print('The current time is:', now, response.text)

'''
Without the downloader middleware, each request takes only about one second.
In the downloader middleware we request a URL that takes five seconds to respond.
With plain requests, every request is blocked for five seconds while that URL responds.
After switching to aiohttp, five requests start concurrently (the maximum concurrency here is set to 5), so "Start requesting a URL with a 5-second delay" is printed five times at once. Then, after about five seconds, those five requests finish almost simultaneously and their responses are printed together. From that point on, subsequent requests complete at roughly one per second.
'''

The settings (settings.py)

# Enable the custom downloader middleware
DOWNLOADER_MIDDLEWARES = {
   'aioTest.middlewares.TestAiohttp': 543
}
# Enable the asyncio reactor
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
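The maximum concurrency of 5 mentioned above is presumably controlled by Scrapy's standard CONCURRENT_REQUESTS setting; assuming so, it would be set like this:

# Assumed setting (not shown in the original post): cap Scrapy at 5 concurrent requests
CONCURRENT_REQUESTS = 5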

The launcher script

import asyncio

from scrapy import cmdline

import sys



if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
# Without this line I get an error here on Windows

cmdline.execute('scrapy crawl exercise'.split())