发送POST请求
哪些地方我们会用到POST请求:
1 登录注册(POST的参数放在请求体中,不会出现在URL里,比GET相对安全)
2 需要传输大文本的时候(POST请求对数据长度没有要求)
爬虫也需要在这两个地方模拟浏览器发送post请求
百度翻译案例
import requests
# Editor tip: a regex find/replace (Ctrl+R) from  (.*?):(.*)  to  '$1':'$2',
# turns form data copied from the browser into dict entries.

# Endpoint used by the Baidu Translate demo.
post_url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

# Browser-like User-Agent sent with the request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}

# Form body of the POST request.
# NOTE(review): 'sign' and 'token' look like values captured from a browser
# session -- presumably they are tied to the query text; confirm before reuse.
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'hello',
    'simple_means_flag': '3',
    'sign': '54706.276099',
    'token': 'da0282fa359cc60f6dcbf589525ccc4a',
    'domain': 'common',
}

# Send the form and dump the raw response body.
r = requests.post(url=post_url, data=data, headers=header)
print(r.text)
使用代理
问题:为什么爬虫需要使用代理
1 让服务器以为不是同一个客户端在请求
2 防止我们的真实地址被泄露,防止被追究
使用代理IP
准备一堆的IP地址,组成IP池,随机选择一个IP来用
检查IP的可用性
1.可以使用requests检查
2.在线代理IP质量检查的网站
import requests
url = 'https://baidu.com'

# Always check that a proxy IP is usable before relying on it.
#
# requests picks the proxy by the lower-case scheme of the target URL, so the
# keys must be 'http' / 'https'.  Upper-case keys are never matched and the
# request would silently go out directly, without any proxy.  The proxy URLs
# also carry an explicit scheme, as the requests documentation requires.
proxies = {
    'https': 'http://223.199.21.229:9999',
    'http': 'http://163.204.241.204:9999',
}

# Browser-like User-Agent sent with the request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}

# requests.get can be used to check a proxy: print(r) showing a 200 response
# means the proxy is usable.
r = requests.get(url, proxies=proxies, headers=header)
print(r.text)
cookie和session的区别
- cookie数据存放在客户端的浏览器上,session数据放在服务器上
- cookie不是很安全,别人可以分析存放在本地的cookie并进行cookie欺骗
- session会在一定时间内保存在服务器上。当访问增多,会比较占用你服务器的性能
- 单个cookie保存的数据不能超过4K,很多浏览器都限制一个站点最多保存20个cookie
爬虫处理cookie和session
带上cookie、session的好处:能够请求到登录之后的页面
带上cookie、session的弊端:一套cookie和session往往和一个用户对应,请求太快、请求次数太多,容易被服务器识别为爬虫
不需要cookie的时候尽量不去使用cookie
但是为了获取登录之后的页面,我们必须发送带有cookies的请求
# requests provides a Session class that keeps the conversation between the
# client and the server alive across requests.
# Usage:
#   1. create a session object
#   2. send GET or POST requests through that session
session = requests.session()
# Session.get accepts only the URL positionally, so the headers must be passed
# by keyword.  `url` and `header` are the names defined earlier in this file
# (the original referenced an undefined `headers`).
response = session.get(url, headers=header)
请求登录之后的网站的思路:
- 实例化session
- 先使用session发送请求,登录对应网站,把cookie保存在session中
- 再使用session请求登录之后才能访问的网站,session能够自动携带登录成功时保存在其中的cookie,进行请求
模拟登录人人网案例1
import requests
# The login password can be kept out of the source in a pwd.ini file shaped
# like this:
"""
[password]
rr_pwd:123
"""
from configparser import ConfigParser

# Load pwd.ini, then read the rr_pwd key of the [password] section.
cfg = ConfigParser()
r = cfg.read('pwd.ini')
pwd = cfg.get('password', 'rr_pwd')

# Browser-like User-Agent sent with every request below.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}

# Login endpoint and form; 'email' is a placeholder for the account name and
# 'password' is the value read from pwd.ini.
post_url = 'http://www.renren.com/PLogin.do'
post_data = dict(email='人人网的账号', password=pwd)

# Logging in through the session stores the resulting cookies inside it.
session = requests.session()
session.post(url=post_url, data=post_data, headers=header)

# Request a page that is only available after login and save it to disk.
response = session.get(url='http://www.renren.com/973726172/profile', headers=header)
with open('renren.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
模拟登录人人网案例2
"""
这种方法虽然不需要账号和密码但是要登录后的页面的cookie
"""
import requests
# Send the browser's Cookie header verbatim together with the User-Agent.
# NOTE(review): the cookie below is a hard-coded login session copied from a
# browser; consider loading it from a local config file instead.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}
header['Cookie'] = 'anonymid=k6nb2as430muat; depovince=GW; _r01_=1; JSESSIONID=abchMjdqFrLY99AH61hbx; ick_login=cc97167c-80d2-4657-a2cf-21b34c514633; taihe_bi_sdk_uid=035e016eb4a371e8d5807f3b93c10b94; taihe_bi_sdk_session=3e74039b34e1dacc79e8b116242c235c; ick=40c33bff-ffa8-4761-a605-5c2d64f9e823; __utma=151146938.1439976612.1581754800.1581754800.1581754800.1; __utmc=151146938; __utmz=151146938.1581754800.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; jebe_key=a88e4348-3fd2-403c-a858-968b836f060b%7Cf6890de6c1c46fb480cf7053219383b6%7C1581754903394%7C1%7C1581754903528; jebe_key=a88e4348-3fd2-403c-a858-968b836f060b%7Cf6890de6c1c46fb480cf7053219383b6%7C1581754903394%7C1%7C1581754903549; first_login_flag=1; ln_uact=18703885836; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; _de=A075AAAE0253B1591487AB07C343E92E; jebecookies=1fc49534-58d7-4baf-96af-d9e32f5c337f|||||; p=4ea3acb1740d99ced7cd3b411cba56642; t=ababa7c75b2118b2b3e6f20719cd9bcf2; societyguester=ababa7c75b2118b2b3e6f20719cd9bcf2; id=973726172; xnsid=b7db52a3; loginfrom=syshome; wp_fold=0'

# Fetch the profile page and save the HTML to disk.
url = 'http://www.renren.com/973726172/profile'
r = requests.get(url=url, headers=header)
with open('renren2.html', 'w', encoding='utf-8') as f:
    f.write(r.text)
模拟登录人人网案例3
"""
把cookie加到requests.get里
这种方法虽然不需要账号和密码但是要登录后的页面的cookie
"""
import requests
# Browser-like User-Agent sent with the request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}

# Raw Cookie header value copied from a logged-in browser session.
cookie = 'anonymid=k6nb2as430muat; depovince=GW; _r01_=1; JSESSIONID=abchMjdqFrLY99AH61hbx; ick_login=cc97167c-80d2-4657-a2cf-21b34c514633; taihe_bi_sdk_uid=035e016eb4a371e8d5807f3b93c10b94; taihe_bi_sdk_session=3e74039b34e1dacc79e8b116242c235c; ick=40c33bff-ffa8-4761-a605-5c2d64f9e823; __utma=151146938.1439976612.1581754800.1581754800.1581754800.1; __utmc=151146938; __utmz=151146938.1581754800.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; jebe_key=a88e4348-3fd2-403c-a858-968b836f060b%7Cf6890de6c1c46fb480cf7053219383b6%7C1581754903394%7C1%7C1581754903528; jebe_key=a88e4348-3fd2-403c-a858-968b836f060b%7Cf6890de6c1c46fb480cf7053219383b6%7C1581754903394%7C1%7C1581754903549; first_login_flag=1; ln_uact=18703885836; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; _de=A075AAAE0253B1591487AB07C343E92E; jebecookies=1fc49534-58d7-4baf-96af-d9e32f5c337f|||||; p=4ea3acb1740d99ced7cd3b411cba56642; t=ababa7c75b2118b2b3e6f20719cd9bcf2; societyguester=ababa7c75b2118b2b3e6f20719cd9bcf2; id=973726172; xnsid=b7db52a3; loginfrom=syshome; wp_fold=0'

# Instead of putting the Cookie into the headers it can be passed to
# requests.get, but then it has to be a dict.
# Split each "name=value" pair on the FIRST '=' only: values such as __utmz
# contain '=' themselves and were truncated by a plain split('=')[1].
# A name that appears twice (jebe_key) keeps its last value, as before.
cookies = {
    name: value
    for name, _, value in (pair.partition('=') for pair in cookie.split('; '))
}
print(cookies)

# Fetch the profile page with the cookie dict and save the HTML to disk.
url = 'http://www.renren.com/973726172/profile'
r = requests.get(url, headers=header, cookies=cookies)
with open('renren3.html', 'w', encoding='utf-8') as f:
    f.write(r.text)
requests小技巧
import requests
# Fetch a page just to get a cookie jar to inspect.
response = requests.get(url='https://baidu.com')
# The jar object itself.
print(response.cookies)
# Jar -> plain dict, e.g. {'BDORZ': '27315'}
print(requests.utils.dict_from_cookiejar(response.cookies))
# Plain dict -> jar object.
print(requests.utils.cookiejar_from_dict(dict(BDORZ='27315')))
import requests
# SSL certificate verification (HTTPS = HTTP + SSL).
# Passing verify=False skips the certificate check for this request.
# NOTE(review): skipping verification is shown for demonstration only.
r = requests.get(url='https://www.12306.cn/index/', verify=False)
print(r)
import requests
# Set a timeout: fail with an error instead of waiting longer than 3 seconds.
r = requests.get(url='https://www.google.com/', timeout=3)
# assert: execution continues when the condition is true and an error is
# raised when it is false.
assert r.status_code == 200
print(r)
import requests
# Percent-encoding and decoding of URL text
# Decode a percent-encoded string
print(requests.utils.unquote('%E8%B0%B7%E6%AD%8C'))
# Percent-encode a string
print(requests.utils.quote('谷歌'))
"""
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=02003390_24_hao_pg&wd=%E8%B0%B7%E6%AD%8C&oq=%25E8%25B0%25B7%25E6%25AD%258C&rsv_pq=e75f01b500258083&rsv_t=55efZGQltHmvtVvC0IFC4WyQdEtoCv8j7S0wPO024VgI8v3nyl7ieLSnI2AN4Y28tWXiq9S4NJ33&rqlang=cn&rsv_enter=0&rsv_dl=tb
"""
import requests
"""
小测试
"""
# Browser-like User-Agent sent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}


# The actual fetching logic
def _parse_url(url):
    """Fetch *url* and return the response body decoded with the default codec.

    Args:
        url: Address to request with a GET.

    Returns:
        The response body as a str (``bytes.decode()`` default, i.e. UTF-8).

    Raises:
        AssertionError: If the status code is not 200.
        Exception: Whatever ``requests.get`` raises (for example when the
            3 second timeout expires) or decoding raises is propagated.
    """
    # Give up instead of waiting longer than 3 seconds.
    response = requests.get(url, headers=header, timeout=3)
    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would let non-200 responses through as successes.
    # The exception type is kept so existing handlers behave the same.
    if response.status_code != 200:
        raise AssertionError('unexpected status code: %s' % response.status_code)
    return response.content.decode()
# Entry point meant to be called by users
def parse_url(url):
    """Return the decoded page for *url*, or None when fetching it fails.

    Args:
        url: Address to fetch.

    Returns:
        The page text returned by ``_parse_url``, or None if it raised.
    """
    try:
        html_str = _parse_url(url)
    # `except Exception` rather than a bare `except:` so that Ctrl+C
    # (KeyboardInterrupt) and SystemExit still stop the program.
    except Exception:
        html_str = None
    return html_str
# Script entry: fetch one page and print the result (None on failure).
if __name__ == '__main__':
    url = 'https://www.baidu.com/'
    print(parse_url(url))
retrying
超时重发案例
import requests
from retrying import retry
# Browser-like User-Agent sent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}


# For reference, the default quoted from the retrying source:
# self._stop_max_attempt_number = 5 if stop_max_attempt_number is None else stop_max_attempt_number
# Decorator: adds retry behaviour to the function below, at most 3 attempts.
@retry(stop_max_attempt_number=3)
# The actual fetching logic
def _parse_url(url):
    """Fetch *url* and return the response body decoded with the default codec.

    Args:
        url: Address to request with a GET.

    Returns:
        The response body as a str (``bytes.decode()`` default, i.e. UTF-8).

    Raises:
        AssertionError: If the status code is not 200.
        Exception: Whatever ``requests.get`` raises (for example when the
            3 second timeout expires) or decoding raises is propagated.
    """
    # Printed on every call so the effect of the decorator can be observed.
    print("代码执行了几次")
    response = requests.get(url, headers=header, timeout=3)
    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, in which case a non-200 response would neither fail nor
    # trigger a retry.  The exception type is kept unchanged.
    if response.status_code != 200:
        raise AssertionError('unexpected status code: %s' % response.status_code)
    return response.content.decode()
# Entry point meant to be called by users
def parse_url(url):
    """Return the decoded page for *url*, or None when fetching it fails.

    Args:
        url: Address to fetch.

    Returns:
        The page text returned by ``_parse_url``, or None if it raised.
    """
    try:
        html_str = _parse_url(url)
    # `except Exception` rather than a bare `except:` so that Ctrl+C
    # (KeyboardInterrupt) and SystemExit still stop the program.
    except Exception:
        html_str = None
    return html_str
# Script entry: request a deliberately non-existent site to trigger failures.
if __name__ == '__main__':
    url = 'https://www.baidus.csom/'
    print(parse_url(url))