Python3网络爬虫:Requests库

1.综述

requests库比urllib库使用简单,功能也强大很多。

import requests

# Fetch the page once, then inspect the returned object piece by piece.
resp = requests.get('http://www.nwu.edu.cn')
print(type(resp))          # class of the object returned by get()
print(resp.status_code)    # numeric HTTP status
body = resp.text
print(type(body))          # the decoded body is a str
print(body)
print(resp.cookies)        # cookies the server set on this response

2.基本用法

2.1 GET方法

urllib库中的urlopen()方法在不传data参数时是以GET方式请求网页的,而requests中对应的方法就是get()方法。

获取网页

import requests

# Passed via params=, so requests encodes these into the URL query string.
query = dict(name='zijeak', age=20)
resp = requests.get("http://httpbin.org/get", params=query)
print(type(resp))        # get() returns a Response object
raw_body = resp.text
print(raw_body)          # .text is the body as a str (JSON-formatted here)
print(type(raw_body))
parsed = resp.json()
print(parsed)            # .json() decodes the JSON body into Python objects
print(type(parsed))

在这里插入图片描述

获取多媒体文件

# Download a media file (a video here) and save it to disk.
import requests

VIDEO_URL = "https://ad.us.sinaimg.cn/0017idmblx07A91X0Tf201041200irLW0E010.mp4?label=mp4_720p&template=1280x720.25.0&trans_finger=1f0da16358befad33323e3a1b7f95fc9&Expires=1579156781&ssig=4OFI%2BAUhCY&KID=unistore,video&media_id=1034:4460590916108313&tp=YTkl0eM8:YTkl0eM8&us=2Svijz&ori=0&ot=h&ps=4pdsh0&ab=1326-g0,540-g1,1410-g1,966-g1,1055-g0,878-g1,1493-g0,1277-g1,1192-g0,1091-g1,1449-g4,1191-g0,1046-g2,1258-g0,495-g0"

# stream=True defers the body download so it can be written chunk by chunk
# instead of holding the whole video in memory at once.
with requests.get(VIDEO_URL, stream=True) as r:
    # Fail loudly on a 4xx/5xx reply rather than saving an error page as
    # test.mp4 (the signed URL above carries an expiry parameter).
    r.raise_for_status()
    with open('test.mp4', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

添加Headers

import requests

# Browser-like User-Agent sent in place of the library's default one.
BROWSER_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'

page = requests.get(
    "https://www.zhihu.com/explore",
    headers={'User-Agent': BROWSER_UA},
)
print(page.text)

添加参数

import requests

# Passed via params=, so requests encodes these into the URL query string.
query = dict(name='germey', age=22)
resp = requests.get("http://httpbin.org/get", params=query)
print(resp.text)

2.2 POST请求

import requests

# Passed via data=, so the fields travel form-encoded in the request body.
form_fields = {
    'name': 'zijeak',
    'age': '20',
}
resp = requests.post("http://httpbin.org/post", data=form_fields)
print(resp.text)

2.3 响应

import requests

resp = requests.get('http://www.jianshu.com')
# Show the type and value of each commonly used Response attribute in turn.
for attr_name in ('status_code', 'headers', 'cookies', 'url', 'history'):
    value = getattr(resp, attr_name)
    print(type(value), value)

2.4 判断请求状态

import requests

resp = requests.get('http://www.jianshu.com')
# requests.codes.ok is the library's named constant for a successful status.
if resp.status_code == requests.codes.ok:
    print('Request Successfully')
else:
    exit()

3.高级用法

3.1 文件上传

import requests

# Open the upload inside a with-block so the file handle is closed as soon as
# the request finishes, even if post() raises.
with open('test.mp4', 'rb') as upload:
    files = {'file': upload}
    r = requests.post("http://httpbin.org/post", files=files)
print(r.text)

3.2 Cookie

import os

import requests

# A logged-in session cookie is a credential, so it must not be pasted into
# source. Log in with a browser, copy the Cookie request header from the
# developer tools, and export it as WEIBO_COOKIE before running this snippet
# (a missing variable raises KeyError here rather than sending a bad request).
headers = {
    'Cookie': os.environ['WEIBO_COOKIE'],
    'Host': 'weibo.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
}
r = requests.get('https://weibo.com/5200046545/profile', headers=headers)
print(r.text)

3.3 会话维持:Session

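利用Session,可以做到模拟同一个会话而不用担心Cookies的问题。它通常用于模拟登录成功之后再进行下一步的操作。例如:先用 s = requests.Session() 创建会话对象,之后都通过 s.get() / s.post() 发起请求,同一个Session对象会在多次请求之间自动保存并携带Cookies。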

3.4 SSL证书验证

指定本地证书用作客户端证书

import requests

# cert= takes a (certificate file, private key file) pair for the client side.
client_cert = ('/path/server.crt', '/path/key')
resp = requests.get('https://www.12306.cn', cert=client_cert)
print(resp.status_code)

关闭证书验证(verify=False 会跳过对服务器证书的校验,连接不再能防范中间人攻击,仅建议在测试环境中使用)

# verify=False skips TLS certificate verification for this one request.
unverified = requests.get('https://www.12306.cn', verify=False)
print(unverified.status_code)

忽略证书警告(直接忽略)

import requests
import urllib3

# Import urllib3 directly rather than through the requests.packages
# compatibility alias, and silence only the warning that verify=False
# triggers so other urllib3 warnings stay visible.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)

忽略证书警告(警告到日志)

import logging
import requests

# Redirect warnings issued through the warnings module into the logging
# system, so the verify=False warning is logged instead of printed.
logging.captureWarnings(True)
resp = requests.get('https://www.12306.cn', verify=False)
print(resp.status_code)

3.5 代理设置

import requests

# Proxy URL to use for each target-URL scheme.
proxy_by_scheme = dict(
    http="http://10.10.1.10:3128",
    https="http://10.10.1.10:1080",
)

requests.get("https://www.taobao.com", proxies=proxy_by_scheme)

3.6 超时设置


import requests

# timeout is given in seconds; if the server has not responded in time,
# requests raises requests.exceptions.Timeout instead of waiting indefinitely.
r = requests.get("https://www.taobao.com", timeout=1)
print(r.status_code)

3.7 身份认证

import requests
from requests.auth import HTTPBasicAuth

# Send HTTP Basic credentials with the request; requests also accepts the
# plain tuple auth=('username', 'password') as shorthand for HTTPBasicAuth.
r = requests.get('http://localhost:5000', auth=HTTPBasicAuth('username', 'password'))
print(r.status_code)
发布了37 篇原创文章 · 获赞 11 · 访问量 5929
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 技术黑板 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览