【转载】爬虫篇——httpx的基础知识(总结)

注意:httpx库的使用方法,跟requests库的使用方法非常相似,学会requests库,再学httpx库非常容易。

  • 查看请求所用协议是HTTP/1.1还是HTTP/2.0的方法(在浏览器的开发者工具中查看):
    请求所用的协议是HTTP/1.1,还是HTTP/2.0

一、快速入门

urllib和requests只支持HTTP/1.1,不支持HTTP/2.0
httpx跟requests很相似,不过httpx既支持HTTP/1.1,也支持HTTP/2.0
httpx默认使用HTTP/1.1;要使用HTTP/2.0,需要先安装httpx[http2]
安装之后,还必须在创建客户端时传入http2=True,即httpx.Client(http2=True),才能启用HTTP/2.0

使用 pip 安装:
$ pip install httpx
或者,要包括可选的 HTTP/2 支持,请使用:
$ pip install httpx[http2]
要包括可选的 brotli 解码器支持,请使用:
$ pip install httpx[brotli]

1、get请求

# Python版本:3.6
# -*- coding:utf-8 -*-

import httpx

r = httpx.get('https://www.example.org/')
print(r.status_code) # 200
print(r.headers['content-type']) # text/html; charset=UTF-8
print(r.text)

2、post请求

import httpx

r = httpx.post('https://httpbin.org/post',data={'key1':'value1'})
print(r.text)

3、put、delete、head、options请求

import httpx

httpx.put('https://httpbin.org/put', data={'key': 'value'})
httpx.delete('https://httpbin.org/delete')
httpx.head('https://httpbin.org/get')
httpx.options('https://httpbin.org/get')

4、在url链接中传递参数

A、使用params关键字传递参数
import httpx

url = 'https://httpbin.org/get'
params = {'key1':'value1','key2':'value2'}
r = httpx.get('https://httpbin.org/get',params=params)
print(r.status_code) # 200
print(r.url) # https://httpbin.org/get?key1=value1&key2=value2
B、列表数据类型
import httpx

params = {'key1':'value1','key2':['value2','value3']}
r = httpx.get('https://httpbin.org/get',params=params)
print(r.status_code) # 200
print(r.url) # https://httpbin.org/get?key1=value1&key2=value2&key2=value3

5、响应文本内容

import httpx

r = httpx.get('https://www.example.org/')
print(r.text)

6、查看或设置网页的编码

A、查看网页的编码
import httpx

r = httpx.get('https://httpbin.org/get')
# 查看网页的编码
print(r.encoding) # ascii
B、设置编码方式,一旦设置就会覆盖原来的编码
import httpx

r = httpx.get('https://httpbin.org/get')
r.encoding = 'utf-8'
print(r.encoding) # utf-8
print(r.text)

7、二进制响应内容

import httpx

r = httpx.get('https://httpbin.org/get')
print(r.content)

8、要从请求返回的二进制数据创建图像

# Build a PIL image from the binary body of an HTTP response.
import httpx
from PIL import Image
from io import BytesIO

r = httpx.get('https://pic.ntimg.cn/file/20220402/19727910_161258533101_2.jpg')
print(r.content)
# Wrap the raw bytes in a file-like object so Image.open can read them
i = Image.open(BytesIO(r.content))

# Open and display the image
i.show()

# Save the image to the current directory
i.save('biadu_logo_BytesIO.PNG')

9、JSON响应内容

import httpx

r = httpx.get('https://api.github.com/events')
print(r.json())

10、自定义headers

import httpx

url = 'https://httpbin.org/headers'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'}
r = httpx.get(url, headers=headers)
print(r.status_code) # 200
print(r.headers)
"""
Headers({'date': 'Thu, 21 Apr 2022 11:40:57 GMT', 'content-type': 'application/json', 'content-length': '319', 'connection': 'keep-alive', 'server': 'gunicorn/19.9.0', 'access-control-allow-origin': '*', 'access-control-allow-credentials': 'true'})
"""

11、发送表单数据

import httpx

data = {
    'key1':'value1',
    'key2':'value2'
}
r = httpx.post('https://httpbin.org/post',data=data)
print(r.text)
"""
...
"form": {
    "key1": "value1", 
    "key2": "value2"
  }, 
...
"""
  • 同一键,包含多个值
import httpx

data = {
    'key1':['value1','value2']
}
r = httpx.post('https://httpbin.org/post',data=data)
print(r.text)
"""
...
  "form": {
    "key1": [
      "value1", 
      "value2"
    ]
  },
...
"""

12、上传文件

import httpx

# Upload a file as multipart form data. The file is opened in a context
# manager so the handle is closed as soon as the request has been sent,
# instead of staying open for the rest of the program.
with open('uploading.txt', 'rb') as upload_file:
    files = {'upload-file': upload_file}
    r = httpx.post('https://httpbin.org/post', files=files)
print(r.text)
"""
....
"files": {
    "upload-file": "abcdef\r\nghig\r\nkega"
  }, 
...
"""
  • 显式设置文件名和内容类型
import httpx

# Explicitly set the file name and content type by passing a
# (filename, file object, content type) tuple for the field.
# MIME type reference: https://tool.oschina.net/commons/
# Plain-text variant of the tuple:
#   ('uploading.txt', <file object>, 'text/plain')
#
# The file is opened in a context manager so the handle is closed once
# the upload has been sent.
with open('uploading.xls', 'rb') as upload_file:
    files = {'upload-file': ('uploading.xls', upload_file, 'application/vnd.ms-excel')}
    r = httpx.post('https://httpbin.org/post', files=files)
print(r.text)
  • 如果需要在表单中包含非文件数据字段,请使用data=…参数
import httpx

# Non-file form fields go in `data`; file fields go in `files`.
data = {'message': 'Hello world!'}
# Open the file in a context manager so the handle is closed once the
# upload has been sent.
with open('uploading.txt', 'rb') as upload_file:
    files = {'file': upload_file}
    r = httpx.post('https://httpbin.org/post', data=data, files=files)
print(r.text)
"""
....
"files": {
    "file": "abcdef\r\nghig\r\nkega"
  }, 
  "form": {
    "message": "Hello world!"
  }, 
....
"""

13、发送JSON编码的数据

import httpx

data = {'iteger':123,'boolean':True,'list':['a','b','c']}
r = httpx.post('https://httpbin.org/post',json=data)
print(r.text)
"""
...
"json": {
    "boolean": true, 
    "iteger": 123, 
    "list": [
      "a", 
      "b", 
      "c"
    ]
  },
... 
"""

14、发送二进制请求数据

import httpx

content = b'Hello wolrd'
r = httpx.post('https://httpbin.org/post',content=content)
print(r.text) #  ...."data": "Hello wolrd", ...

15、状态码

import httpx

r = httpx.get('https://httpbin.org/get')
print(r.status_code) # 200

print(r.status_code == httpx.codes.OK) # True
  • 其它状态码:比如404
import httpx

# Request an endpoint that answers with a 404 status.
not_found = httpx.get('https://httpbin.org/status/404')
print(not_found.status_code) # 404
# Uncomment to raise an exception for this error status
# (httpx.HTTPStatusError, per the httpx documentation) - this call raises, it does not catch
# print(not_found.raise_for_status())

16、headers文件头

import httpx

r = httpx.get('https://www.baidu.com')
print(r.headers)
"""
Headers([('accept-ranges', 'bytes'), ('cache-control', 'no-cache'), ('connection', 'keep-alive'), ('content-length', '227'), ('content-type', 'text/html'), ('date', 'Thu, 21 Apr 2022 13:01:43 GMT'), ('p3p', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('p3p', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('pragma', 'no-cache'), ('server', 'BWS/1.1'), ('set-cookie', 'BD_NOT_HTTPS=1; path=/; Max-Age=300'), ('set-cookie', 'BIDUPSID=AC1E015F16AEB5710A9851F42E6A5A7E; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('set-cookie', 'PSTM=1650546103; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('set-cookie', 'BAIDUID=AC1E015F16AEB57186EF629BB294F2FC:FG=1; max-age=31536000; expires=Fri, 21-Apr-23 13:01:43 GMT; domain=.baidu.com; path=/; version=1; comment=bd'), ('strict-transport-security', 'max-age=0'), ('traceid', '165054610306639106669606882951048680134'), ('x-frame-options', 'sameorigin'), ('x-ua-compatible', 'IE=Edge,chrome=1')])
"""
# 获取headers中的指定键值
print('connection:',r.headers['connection']) # connection: keep-alive
print('cache-control:',r.headers.get('cache-control')) # cache-control: no-cache

17、response响应流

对于较大的响应,可以使用流式响应(stream),这样不会一次性把整个响应体加载到内存中。

  • 二进制内容(bytes数据类型)
import httpx

with httpx.stream('GET','https://httpbin.org/get') as r:
    for data in r.iter_bytes():
        print(data)
  • 文本内容
import httpx

with httpx.stream('GET','https://httpbin.org/get') as r:
    for text in r.iter_text():
        print(text)
  • 逐行传输文本
import httpx

with httpx.stream('GET','https://httpbin.org/get') as r:
    for line in r.iter_lines():
        print(line)
  • 原始字节
import httpx

with httpx.stream('GET','https://httpbin.org/get') as r:
    for chunk in r.iter_raw():
        print(chunk)

18、访问指定的Cookie

import httpx

r = httpx.get('https://httpbin.org/cookies/set?chocolate=chip')
print(r.cookies['chocolate']) # chip

20、Cookies参数

import httpx

cookies = {'key':'value'}
# 发出请求时,传入cookies参数
res = httpx.get('https://httpbin.org/cookies',cookies=cookies)
print(res.json())

21、设置Cookies

import httpx

# Create a Cookies object
cookies = httpx.Cookies()
# Set cookies, each one bound to a specific domain
cookies.set('cookie_on_domain', 'hello, there!', domain='httpbin.org')
cookies.set('cookie_off_domain', 'nope.', domain='example.org')
# Send the request with the cookies attached
r = httpx.get('http://httpbin.org/cookies', cookies=cookies)
print(r.json())

22、重定向

  • GitHub 将所有 HTTP 请求重定向到 HTTPS
import httpx

r = httpx.get('http://github.com/')
print(r.status_code) # 301
print(r.history) # []
print(r.next_request) # <Request('GET', 'https://github.com/')>
  • 启用重定向
import httpx

r = httpx.get('http://github.com',follow_redirects=True)
print(r.url) # https://github.com/
print(r.status_code) # 200
print(r.history) # [<Response [301 Moved Permanently]>]
  • 禁用重定向
import httpx

r = httpx.get('http://github.com',follow_redirects=False)
print(r.url) # http://github.com
print(r.status_code) # 301
print(r.next_request) # <Request('GET', 'https://github.com/')>

23、超时设置

  • timeout参数值太小时,请求会超时并抛出异常(httpx.TimeoutException的子类,例如httpx.ConnectTimeout)
import httpx

r = httpx.get('https://github.com/',timeout=0.001) # 报错
  • 完全禁用超时行为
import httpx

r = httpx.get('http://github.com',timeout=None)
print(r.url) # http://github.com
print(r.status_code) # 301

24、HTTP身份验证

  • 没有进行http身份验证时
import httpx

r = httpx.get('https://ssr3.scrape.center/')
print(r.url) # https://ssr3.scrape.center/
print(r.status_code) # 401
  • 进行http身份验证
import httpx

r = httpx.get('https://ssr3.scrape.center/',auth=("admin", "admin"))
print(r.url) # https://ssr3.scrape.center/
print(r.status_code) # 200

二、高级用法

1、Client方法

import httpx

with httpx.Client(http2=True) as client:
    r = client.get('https://www.qq.com')
    print(r) # <Response [200 OK]>

另一种写法:

import httpx
# Without a `with` block, the client has to be closed explicitly.
client = httpx.Client()
try:
    response = client.get('https://httpbin.org/get')
finally:
    # Runs whether or not the request raised, so the client is always closed.
    client.close()

2、添加headers

  • 在get请求中,加入headers
import httpx

with httpx.Client() as client:
    headers = {'X-Custom':'value'}
    r = client.get('https://www.qq.com',headers=headers)
    print(r.request.headers['X-Custom'])
  • client方法中,添加headers
import httpx

url = 'https://httpbin.org/headers'
headers = {'user-agent':'my-app/0.0.1'}
with httpx.Client(headers=headers) as client:
    r = client.get(url)
    print(r.json())

3、异步请求

# Python版本:3.6
# -*- coding:utf-8 -*-

import httpx
import asyncio


async def fetch(url):
    """Request *url* with an async client created with http2=True and print the body.

    Args:
        url: Address to send the GET request to.
    """
    async with httpx.AsyncClient(http2=True) as client:
        response = await client.get(url)
        print(response.text)


if __name__ == '__main__':
    # Create the event loop explicitly: asyncio.get_event_loop() is
    # deprecated when no loop is running and fails on the newest Python
    # releases, whereas new_event_loop() also works on Python 3.6.
    # (On Python 3.7+ the shorter equivalent is asyncio.run(fetch(...)).)
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(fetch('https://www.httpbin.org/get'))
    finally:
        # Release the loop's resources even if the request raised.
        loop.close()

程序运行结果为:

{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "www.httpbin.org", 
    "User-Agent": "python-httpx/0.22.0", 
    "X-Amzn-Trace-Id": "Root=1-626176f1-3bcfc83347c0102047408b78"
  }, 
  "origin": "120.239.165.180", 
  "url": "https://www.httpbin.org/get"
}

4、配置合并:Client级别与请求级别的headers、params、cookies会合并在一起

# Python版本:3.6
# -*- coding:utf-8 -*-
# Show how client-level and request-level headers/params are combined.
import httpx

headers = {'X-Auth': 'from-client'}  # client-level header
params = {'client_id': 'client1'}  # client-level query parameter, visible in the URL
with httpx.Client(headers=headers, params=params) as client:
    headers = {'X-Custom': 'from-request'}  # request-level header
    params = {'request_id': 'request1'}  # request-level query parameter, visible in the URL
    r = client.get('https://example.com', headers=headers, params=params)
    print(r.request.url) # https://example.com?client_id=client1&request_id=request1
    print(r.request.headers) # Headers({'host': 'example.com', 'accept': '*/*', 'accept-encoding': 'gzip, deflate', 'connection': 'keep-alive', 'user-agent': 'python-httpx/0.22.0', 'x-auth': 'from-client', 'x-custom': 'from-request'})
    print(r.request.headers['X-Auth']) # from-client
    print(r.request.headers['X-Custom']) # from-request

5、在base_url参数值的基础上添加路径

import httpx

# Paths passed to the client's request methods are resolved against base_url.
with httpx.Client(base_url='http://httpbin.org') as client:
    r = client.get('/headers') # the request actually goes to base_url + '/headers'
    print(r.url) # http://httpbin.org/headers
    print(r.request.url) # http://httpbin.org/headers
    print(r.json())

6、Request请求

import httpx

request = httpx.Request('GET', 'https://www.qq.com')

with httpx.Client() as client:
    response = client.send(request)
    print(response)  # <Response [200 OK]>
    print(response.url)  # https://www.qq.com
    print(response.text)

7、监控下载进度

# Used to create a temporary file for the download
import tempfile
import httpx
# Progress-bar library
from tqdm import tqdm

# Stream a large download into a temporary file while showing progress.
with tempfile.NamedTemporaryFile() as download_file:
    url = "https://speed.hetzner.de/100MB.bin"
    with httpx.stream('GET', url) as response:
        # Content-Length may be absent (e.g. chunked responses). tqdm
        # accepts total=None and then shows a running byte count instead
        # of a percentage bar, so the download no longer fails with KeyError.
        content_length = response.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None

        with tqdm(total=total, unit_scale=True, unit_divisor=1024, unit='B') as progress:
            num_bytes_downloaded = response.num_bytes_downloaded
            for chunk in response.iter_bytes():
                download_file.write(chunk)
                # Advance the bar by the change in the response's own
                # downloaded-byte counter rather than by len(chunk).
                progress.update(response.num_bytes_downloaded - num_bytes_downloaded)
                num_bytes_downloaded = response.num_bytes_downloaded

程序运行后,在run控制台中显示:
使用tqdm库,监控下载进度

  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值