爬虫实例代码讲解
- Requests 继承了urllib的所有特性。
- Requests支持HTTP连接保持和连接池,
- 支持使用cookie保持会话,
- 支持文件上传,
- 支持自动确定响应内容的编码,
- 支持国际化的 URL 和 POST 数据自动编码。
requests 的底层实现其实是 urllib3(第三方库,并非标准库 urllib)。
开源地址:https://github.com/kennethreitz/requests
中文文档 API: http://docs.python-requests.org/zh_CN/latest/index.html
1. 基本Get请求
import requests
import re
# Query-string parameters for the search. `params` accepts a dict (or a
# string); a dict is URL-encoded by requests, so no urlencode() call is needed.
search_params = {'wd': '长城'}
request_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
}
search_response = requests.get(
    "http://www.baidu.com/s?",
    params=search_params,
    headers=request_headers,
)
# Show the HTTP status code of the response.
print(search_response.status_code)
# Decode the raw body (bytes.decode() defaults to UTF-8) and list every
# occurrence of the keyword in the page.
page_text = search_response.content.decode()
print(re.findall("长城", page_text))
2. 基本Post请求
3. proxies请求
import requests
# Fetch a page through a proxy server and report the HTTP status code.
url = "https://www.sina.com.cn"
# requests selects the proxy by the scheme of the *target* URL, so an
# "https" entry is required for the https:// URL above; with only an "http"
# entry the request would bypass the proxy entirely. Proxy URLs should also
# carry an explicit scheme.
# NOTE(review): assumes this proxy accepts HTTPS tunnelling (CONNECT) - confirm.
proxy_address = "http://117.127.0.195:8080"
proxies = {
    "http": proxy_address,
    "https": proxy_address,
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
}
resp = requests.get(url, proxies=proxies, headers=headers)
print(resp.status_code)
4. requests 携带 cookie 登录
- cookie字符串放在headers中
- 把cookie字典交给requests请求方法的cookies
如下 4.1 和 4.2 介绍的方法只适合单次登录操作:一旦账号退出,cookie 信息就会改变,需要根据重新登录后生成的 cookie 重新填写,否则无法获取页面信息。session 方法可解决此问题。
4.1 cookie字符串放在headers中
import requests
import re
# Simulate a logged-in visit by replaying a browser cookie:
#   1. Log in to the site with a normal browser.
#   2. Copy the Cookie request header the browser sends, which looks like
#      "Cookie: anonymid=...; depovince=BJ; ...; loginfrom=null".
#   3. Send that same value in the Cookie header of our own request.

# User profile page shown after login.
url = 'http://www.renren.com/971560537/'
# Request headers, including the cookie string copied from the browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Cookie": "anonymid=jybbbitl-dnnfph; depovince=GW; _r01_=1; JSESSIONID=abcWx3wGSHFJEcyDGPoWw; ick_login=a691050f-490c-4fc5-87e4-7328da25c1db; ick=5ebb83b1-427e-4bf8-aacc-49bd6d8b2313; XNESSESSIONID=e440536e7118; jebe_key=2d9b4de6-dc3d-4d23-9b15-0d4007c80323%7C823ec3625641e470076826989e452127%7C1563614033551%7C1%7C1563614032223; jebe_key=2d9b4de6-dc3d-4d23-9b15-0d4007c80323%7C823ec3625641e470076826989e452127%7C1563614033551%7C1%7C1563614032227; first_login_flag=1; ln_uact=13180977920; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; wp_fold=0; jebecookies=95e39b9e-f125-443f-9e21-f9e61ce847f5|||||; _de=364BCF137409B991E34F9DF2409E43F1; p=9f7d30da0b3ef042b3ad3d553ca320487; t=ca9ae313928529f12350d6cbc25dcbb67; societyguester=ca9ae313928529f12350d6cbc25dcbb67; id=971560537; xnsid=f04448b3; loginfrom=syshome",
}
# Send the request.
response = requests.get(url, headers=headers)
# Decode the body once (UTF-8 by default) and reuse it for both the regex
# check and the saved copy.
html = response.content.decode()
# A non-empty match list means the page contains the expected user name.
print(re.findall('于洋', html))
print(response.status_code)
# Keep a local copy of the page; the write must sit inside the `with` body
# so the file is still open when it runs.
with open("renren.html", "w", encoding="utf-8") as f:
    f.write(html)
4.2 cookie字典
import requests
import re
# Simulate a logged-in visit by replaying a browser cookie:
#   1. Log in to the site with a normal browser.
#   2. Copy the cookie string the browser sends.
#   3. Turn it into a dict and hand it to requests via the `cookies` argument.

# User profile page shown after login.
url = 'http://www.renren.com/971560537/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
}
# Raw cookie string copied from the browser.
coki = 'anonymid=jybbbitl-dnnfph; depovince=GW; _r01_=1; JSESSIONID=abcWx3wGSHFJEcyDGPoWw; ick_login=a691050f-490c-4fc5-87e4-7328da25c1db; ick=5ebb83b1-427e-4bf8-aacc-49bd6d8b2313; XNESSESSIONID=e440536e7118; jebe_key=2d9b4de6-dc3d-4d23-9b15-0d4007c80323%7C823ec3625641e470076826989e452127%7C1563614033551%7C1%7C1563614032223; jebe_key=2d9b4de6-dc3d-4d23-9b15-0d4007c80323%7C823ec3625641e470076826989e452127%7C1563614033551%7C1%7C1563614032227; first_login_flag=1; ln_uact=13180977920; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; wp_fold=0; jebecookies=95e39b9e-f125-443f-9e21-f9e61ce847f5|||||; _de=364BCF137409B991E34F9DF2409E43F1; p=9f7d30da0b3ef042b3ad3d553ca320487; t=ca9ae313928529f12350d6cbc25dcbb67; societyguester=ca9ae313928529f12350d6cbc25dcbb67; id=971560537; ver=7.0; xnsid=f02af0aa; loginfrom=null'
# Build the cookie dict from the "name=value; name=value" string. Split on
# the FIRST '=' only: a cookie value may itself contain '=' (e.g. base64
# padding or an embedded URL query), and taking the last '='-separated piece
# would silently truncate it. When a name repeats, the last occurrence wins.
cookie_dict = {}
for pair in coki.split('; '):
    name, _, value = pair.partition('=')
    cookie_dict[name] = value
# Send the request, passing the cookies through the `cookies` argument.
response = requests.get(url, headers=headers, cookies=cookie_dict)
# Decode the body (UTF-8 by default) and look for the expected user name.
html = response.content.decode()
print(re.findall('于洋', html))
print(response.status_code)
5. 寻找登录接口的方法
- form表单action对应的url地址
- 用户名和密码的 input 标签中,name 的值作为键,用户名和密码作为值,组成字典作为 post data
- 通过抓包,定位url地址
- form data
6. Form data + cookiejar 解决登录问题
import os,re
import urllib.request
import urllib.parse
import http.cookiejar
import time
import http.cookiejar
# Log in with urllib by POSTing the login form, then reuse the cookies the
# server set to fetch another page through the same opener.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
}
# The CookieJar holds cookies in memory; the HTTPCookieProcessor attached to
# the opener stores cookies from responses and sends them on later requests
# made through that opener.
cj = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(cj)
opener = urllib.request.build_opener(cookie_handler)
# Login form fields (placeholders - fill in real credentials).
data = {
    'username': '***',
    'userpwd': '******'
}
# Request bodies must be bytes: URL-encode the form, then encode as UTF-8.
data = urllib.parse.urlencode(data).encode('utf-8')
# NOTE: this echoes the encoded credentials; remove it outside of a demo.
print(data)
# Log in. Supplying `data` makes urllib send a POST request.
get_url = 'http://www.gc-zb.com/login/checklogin.html'
login_request = urllib.request.Request(url=get_url, data=data, headers=headers)
response = opener.open(login_request)
# The login body is not used; close the response so the connection is
# released instead of leaking until garbage collection.
response.close()
time.sleep(1)
# Fetch another page through the same opener so the login cookies are sent.
url1 = 'http://www.gc-zb.com/read-131398224.html'
page_request = urllib.request.Request(url1, headers=headers)
# The context manager closes the response once the body has been read.
with opener.open(page_request) as response1:
    print(response1.read().decode())
7. Session
在 requests 里,session对象是一个非常常用的对象,这个对象代表一次用户会话:从客户端浏览器连接服务器开始,到客户端浏览器与服务器断开。
会话能让我们在跨请求时候保持某些参数,比如在同一个 Session 实例发出的所有请求之间保持 cookie 。
import requests
import re
# NOTE: this is the endpoint the login form submits to, found by capturing
# the traffic when clicking "log in" - it is not the URL of the login page.
url = 'http://www.renren.com/PLogin.do'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}
# 1. Create the session object; it keeps cookies across requests.
session = requests.session()
# Install the headers on the session so that every request made through it
# (the login POST and the follow-up GET) carries the same User-Agent, rather
# than only the POST.
session.headers.update(headers)
# 2. Build the form data for the login POST (placeholders - fill in real
#    credentials).
data = {
    'email': '*****',
    'password': '*****'
}
# 3. Send the login POST; cookies set by the server are stored on the session.
session.post(url, data=data)
# 4. Check the login by requesting a page through the same session.
response = session.get('http://www.renren.com/971560537/')
# Decode the body once (UTF-8 by default) and reuse it below.
html = response.content.decode()
print(re.findall('于洋', html))
print(response.status_code)
# Keep a local copy of the page; the write must sit inside the `with` body
# so the file is still open when it runs.
with open("login_yy.html", "w", encoding="utf-8") as f:
    f.write(html)