aiohttp 高并发抓取

建立一个 session 会话对象

首先建立一个 session 会话对象,利用会话对象 session 去访问网页

下面的示例访问 Python 官网。async 和 await 关键字用于把函数声明为异步操作(协程),这是 aiohttp 的基本使用方式。

import aiohttp
import asyncio


async def hello(URL):
    """Fetch ``URL`` with a GET request and print the response body.

    A new ``aiohttp.ClientSession`` is opened for this single request and
    closed again when the coroutine finishes.

    Args:
        URL: Address of the page to fetch.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as response:
            # Print the decoded body text, not the response object itself.
            body = await response.text()
            print(body)


if __name__ == '__main__':
    # Obtain the event loop, then drive hello() on it until it completes.
    loop = asyncio.get_event_loop()
    URl = 'http://python.org'
    loop.run_until_complete(hello(URl))

 

 

请求头,超时,cookies,代理

在上一段代码的基础上修改

from aiohttp import ClientSession
import aiohttp
import asyncio


# Request headers passed to the GET call in hello()
headers = {'content-type': "application/json"}


async def hello(URL):
    """GET ``URL`` with the module-level ``headers`` and print the body text.

    Args:
        URL: Address to request.
    """
    async with ClientSession() as client:
        async with client.get(URL, headers=headers) as resp:
            print(await resp.text())


if __name__ == '__main__':
    # Obtain the event loop, then run hello() on it until it completes.
    loop = asyncio.get_event_loop()
    URl = 'http://python.org'
    loop.run_until_complete(hello(URl))
# Timeout configured on the session: it is passed to ClientSession below
timeout = aiohttp.ClientTimeout(total=60)


async def hello(URL):
    """GET ``URL`` through a session created with ``timeout`` and print the body.

    Args:
        URL: Address to request.
    """
    async with ClientSession(timeout=timeout) as client:
        async with client.get(URL) as resp:
            text = await resp.text()
            print(text)
# Timeout configured on the request: it is passed to session.get below
timeout = aiohttp.ClientTimeout(total=60)


async def hello(URL):
    """GET ``URL`` with a per-request ``timeout`` and print the body text.

    Args:
        URL: Address to request.
    """
    async with ClientSession() as client:
        async with client.get(URL, timeout=timeout) as resp:
            print(await resp.text())
# Cookies handed to the session when it is created
cookies = {'cookies': 'working'}


async def hello(URL):
    """GET ``URL`` through a session created with ``cookies`` and print the body.

    Args:
        URL: Address to request.
    """
    async with ClientSession(cookies=cookies) as client:
        async with client.get(URL) as resp:
            text = await resp.text()
            print(text)
# Proxy address passed to the GET call in hello()
proxy = 'http://117.191.11.72:8080'


async def hello(URL):
    """GET ``URL`` with ``proxy`` passed to the request and print the body.

    Args:
        URL: Address to request.
    """
    async with ClientSession() as client:
        async with client.get(URL, proxy=proxy) as resp:
            print(await resp.text())
# Proxy with authentication
async def hello(URL):
    """GET ``URL`` through an authenticated proxy and print the body text.

    Args:
        URL: Address to request. It is passed to ``session.get`` so the
            caller's target is honoured rather than a hard-coded address.
    """
    # Credentials for the proxy itself, passed as proxy_auth below.
    proxy_auth = aiohttp.BasicAuth('user', 'pass')
    async with ClientSession() as session:
        async with session.get(URL,
                               proxy='http://proxy.com',
                               proxy_auth=proxy_auth) as response:
            body = await response.text()
            print(body)

 

 

get 请求方法

get 请求分为两种:不带参数和带参数。

# Without parameters
async def hello(URL):
    """GET ``URL`` with no extra arguments and print the body text.

    Args:
        URL: Address to request.
    """
    async with ClientSession() as client:
        async with client.get(URL) as resp:
            text = await resp.text()
            print(text)
# With parameters
# Parameters embedded directly in the URL's query string
async def hello(URL='http://httpbin.org/get?key=python'):
    """GET a URL that already carries its query string and print the body.

    Args:
        URL: Address to request, including any ``?key=value`` query part.
            Defaults to the httpbin demo endpoint. The default replaces a
            misspelled local (``URl``) that was assigned but never used.
    """
    async with ClientSession() as session:
        async with session.get(URL) as response:
            body = await response.text()
            print(body)
# Query parameters supplied through the ``params`` argument
async def hello(URL='http://httpbin.org/get'):
    """GET ``URL`` with query parameters passed via ``params`` and print the body.

    Args:
        URL: Address to request, without a query string. Defaults to the
            httpbin demo endpoint. The default replaces a misspelled local
            (``URl``) that was assigned but never used.
    """
    params = {'wd': 'python'}
    async with ClientSession() as session:
        async with session.get(URL, params=params) as response:
            body = await response.text()
            print(body)

 

 

post 请求

# Payload given as a dict
async def hello(URL='http://httpbin.org/post'):
    """POST a dict payload to ``URL`` via ``data=`` and print the body text.

    Args:
        URL: Address to post to. Defaults to the httpbin demo endpoint. The
            default replaces a misspelled local (``URl``) that was assigned
            but never used.
    """
    data = {'wd': 'python'}
    async with ClientSession() as session:
        # This is the POST example, so issue a POST rather than a GET.
        async with session.post(URL, data=data) as response:
            body = await response.text()
            print(body)
# Payload given as JSON
async def hello(URL='http://httpbin.org/post'):
    """POST a dict to ``URL`` via ``json=`` and print the body text.

    Args:
        URL: Address to post to. Defaults to the httpbin demo endpoint. The
            default replaces a misspelled local (``URl``) that was assigned
            but never used.
    """
    data = {'wd': 'python'}
    async with ClientSession() as session:
        # This is the POST example, so issue a POST rather than a GET.
        async with session.post(URL, json=data) as response:
            body = await response.text()
            print(body)
# Payload given as a string
async def hello(URL='http://httpbin.org/post'):
    """POST a string payload to ``URL`` via ``data=`` and print the body text.

    Args:
        URL: Address to post to. Defaults to the httpbin demo endpoint. The
            default replaces a misspelled local (``URl``) that was assigned
            but never used.
    """
    data = 'python'
    async with ClientSession() as session:
        # This is the POST example, so issue a POST rather than a GET.
        async with session.post(URL, data=data) as response:
            body = await response.text()
            print(body)
# Payload given as a byte stream (file upload)
async def hello(URL='http://httpbin.org/post'):
    """POST a bytes payload to ``URL`` via ``data=`` and print the body text.

    Args:
        URL: Address to post to. Defaults to the httpbin demo endpoint. The
            default replaces a misspelled local (``URl``) that was assigned
            but never used.
    """
    # A bytes literal, so the payload really is a byte stream as labelled.
    data = b'python'
    async with ClientSession() as session:
        # This is the POST example, so issue a POST rather than a GET.
        async with session.post(URL, data=data) as response:
            body = await response.text()
            print(body)

 

获取响应内容方法

# 设置编码格式
response = await response.text(encoding='utf-8')
# 以字节流格式返回
response = await response.read()
# 以 json 格式返回
response = await response.json()
# 获取响应状态码
response = await response.status
# 获取响应的请求头
response = await response.headers
# 获取 url 地址
url = response.url

 

 

异步爬取小说排行榜

import asyncio
import csv
from aiohttp import ClientSession
from bs4 import BeautifulSoup


# Page fetcher: requests a URL and hands back the page content
async def getData(url, headers):
    """GET ``url`` with the given request headers and return the body text.

    Args:
        url: Address of the page to fetch.
        headers: Request headers passed to the GET call.

    Returns:
        The result of awaiting ``response.text()`` for the request.
    """
    # One session per call; it is closed when the block exits.
    async with ClientSession() as client:
        async with client.get(url, headers=headers) as resp:
            html = await resp.text()
            return html


def savaData(result):
    """Parse fetched pages and append one CSV row per book to ``data.csv``.

    Every ``div`` with class ``book-mid-info`` in a page contributes a row
    ``[name, author, update]``, taken from its ``h4``, ``a.name`` and
    ``p.update`` elements.

    Args:
        result: Iterable of HTML documents, one per fetched page.

    Note:
        The file is opened once in append mode for the whole call, so it is
        created even when no rows are found, and it is closed even if
        parsing or writing raises.
    """
    with open('data.csv', 'a', encoding='utf8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for page in result:
            soup = BeautifulSoup(page, 'lxml')
            for book in soup.find_all('div', class_='book-mid-info'):
                # Book title
                name = book.find('h4').getText()
                # Author
                author = book.find('a', class_='name').getText()
                # Last update time
                update = book.find('p', class_='update').getText()
                writer.writerow([name, author, update])


def run():
    """Schedule fetches for pages 1-25, wait for all of them, then save.

    Reads the module-level ``url``, ``headers``, ``tasks`` and ``loop``.
    The new tasks are appended to ``tasks``; the gathered responses are
    passed to ``savaData`` and their count is printed.
    """
    # One future per page number, collected in the shared tasks list.
    tasks.extend(
        asyncio.ensure_future(getData(url.format(page), headers))
        for page in range(1, 26)
    )
    # Block until every request has finished; gather returns all responses.
    pages = loop.run_until_complete(asyncio.gather(*tasks))
    savaData(pages)
    print(len(pages))


if __name__ == '__main__':
    import time

    # Time the whole crawl, setup included.
    start = time.time()

    # Module-level state read by run(): target URL template, request
    # headers, the shared task list and the event loop.
    url = 'https://www.qidian.com/rank/hotsales?page={}'
    tasks = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
    }
    # Obtain the event loop object
    loop = asyncio.get_event_loop()

    # Run the crawl
    run()

    end = time.time()
    print(end - start)

 

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值