requests 功能和 urllib一样
将cookies 和 http变成了参数
请求方式
re = requests.get('http://www.baidu.com')
re = requests.post('http://baidu.com')
响应
- text返回文本内容,会乱码,requests转码时基于http头部推测转码
- content返回二进制内容
- json方法 只会对json数据进行解码
print(re.text) # 会乱码,requests转码时基于http头部推测转码
print(re.encoding) # 一般都转为ISO-8859-1
手动转码
re.encoding = 'utf-8'
print(re.text)
print(re.json())
print(re.content.decode())
requests参数
def request(self, method, url,
params=None, data=None, headers=None, cookies=None, files=None,
auth=None, timeout=None, allow_redirects=True, proxies=None,
hooks=None, stream=None, verify=None, cert=None, json=None):
- params:get的请求参数
- data:post的请求参数
获取headers、cookies
'''表单中,多个元素使用同一key的时候,可用元组传参data = (('key1', 'value1'), ('key1', 'value2'))'''
data = (('key1', 'value1'), ('key1', 'value2'))
print(requests.post('http://httpbin.org/post', data=data).headers)
print(requests.post('http://httpbin.org/post', data=data).cookies)
重定向(301、302):allow_redirects
参数:allow_redirects=False 取消自动重定向
"""重定向"""
r = requests.get('http://github.com', allow_redirects=False) # 禁止重定向
print(r.headers)
print(r.url)
print(r.history) # 历史记录,从哪个网站跳转过来的
流下载:stream=True
url = 'http://httpbin.org/bytes/102400000'
r = requests.get(url, stream=True)
for chunk in r.iter_content(chunk_size=1024): # 流下载,迭代下载每次1024
print(chunk)
session会话对象
"""
session 会话对象,所有会话对象发出的请求 会自动保持状态
同一主机发送多个请求,会重用tcp连接
使用socket时会先connect连接网页,也就是先建立tcp连接,因此session重用会快很多
"""
import time
import urllib.request
s = requests.Session() # session所有api和requests相同
start_time = time.time()
for i in range(50):
# r = urllib.request.urlopen('https://www.baidu.com')
# r = requests.get('https://www.baidu.com')
r = s.get('https://www.baidu.com') # 明显快于前两种,因为会重用tcp链接
print('耗时{}s'.format(time.time()-start_time))
cookies和代理
cookies
"""手动添加cookies"""
以蔡xx的微博为例
headers = {
'Cookie': 'Ugrow-G0=9ec894e3c5cc0435786b4ee8ec8a55cc; login_sid_t=e51289257991fb89895050ec904235a9; cross_origin_proto=SSL; YF-V5-G0=4e19e5a0c5563f06026c6591dbc8029f; WBStorage=384d9091c43a87a5|undefined; wb_view_log=1920*10801; _s_tentry=passport.weibo.com; UOR=passport.weibo.com,weibo.com,www.baidu.com; Apache=6786804717320.616.1571319204822; SINAGLOBAL=6786804717320.616.1571319204822; ULV=1571319204826:1:1:1:6786804717320.616.1571319204822:; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWzDZA2m47xOpFH2jGMB4jp5JpX5K2hUgL.FoqXSo27Sh-peoB2dJLoIpjLxKqL1KqL1-eLxK-L1h-L1h.LxKBLBonLBKnt; ALF=1602855233; SSOLoginState=1571319234; SCF=AjdOv7Rm7mFLsHRu2Tghbym8Hxz0j2_BXQVBWOjkep9nlJp9yVFgnv3Ms2iv9580DGwDjwroI4vycbasA9Zc5pg.; SUB=_2A25wrB2TDeRhGeBK7VMR9CvNyTiIHXVT2AhbrDV8PUNbmtBeLUjekW9NR7AVSx1PwhH4vQMad2gYAUMH7Ms2Qb29; SUHB=0rr7DKl01ed5ZY; un=17398891960; wvr=6; wb_view_log_6461045124=1920*10801; wb_timefeed_6461045124=1; YF-Page-G0=96c3bfa80dc53c34a567607076bb434e|1571319257|1571319239; webim_unReadCount=%7B%22time%22%3A1571319313680%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22allcountNum%22%3A25%2C%22msgbox%22%3A0%7D',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
r = requests.get('https://weibo.com/caizicaixukun', headers=headers)
r.encoding = 'utf-8'
print(r.text)
代理
参数
""":param proxies: (optional) Dictionary mapping protocol or protocol and
hostname to the URL of the proxy."""
加代理的方式
proxy = {
'http': 'ip:port',
'https': 'ip:port',
}
print(requests.get('https://www.baidu.com', proxies=proxy))
百度贴吧小案例
"""
需求
1.请求首页地址,匹配每一个帖子的详情页url
2.分页请求
"""
import requests
import re
import threading
def parse(word, pn):
    """Fetch one listing page of the given Tieba forum and extract
    (detail-url, title) pairs for every thread on it.

    :param word: forum (贴吧) name to look up
    :param pn: page offset used by Tieba's pagination (0, 50, 100, ...)
    :return: list of (relative_url, title) tuples
    """
    # BUG FIX: the pagination parameter must be named `pn=`; the original
    # format string '...&{}' produced e.g. '...&50', which the server
    # ignores, so every call silently returned the first page.
    url = 'https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'.format(word, pn)
    r = requests.get(url).content.decode()
    # Each thread link in the listing looks like:
    # <a rel="noreferrer" href="/p/123456" title="..." target=
    article_urls = re.findall(r'<a rel="noreferrer" href="(/p/\d+)" title="(.*?)" target=', r, re.S)
    return article_urls
def parse_detail(article_urls):
    """Request each thread's detail page and print the author, title and
    creation time of its first post.

    :param article_urls: iterable of (relative_url, title) tuples,
                         as produced by parse()
    """
    for article_url in article_urls:
        article_req = requests.get('https://tieba.baidu.com' + article_url[0]).text
        # The page embeds the author in an inline JS snippet: author: "name"
        author = re.findall(r'author: "(.*?)"', article_req, re.S)
        create_time = re.findall(r'>1楼</span><span class="tail-info">(.*?)</span>', article_req, re.S)
        # Skip pages where either pattern failed to match (deleted/changed page).
        if author and create_time:
            # BUG FIX: print the matched strings, not the one-element lists
            # (the original printed e.g. 作者:['name'] instead of 作者:name).
            print('作者:{},标题:{},创建时间:{}'.format(author[0], article_url[1], create_time[0]))
if __name__ == '__main__':
    word = input("请输入贴吧名字:")
    workers = []
    # One listing request per page offset: 0, 50, 100, 150, 200.
    for offset in range(0, 201, 50):
        # Fetch the detail-page URLs for this listing page.
        articles = parse(word, offset)
        # One worker thread per listing page handles its detail requests.
        worker = threading.Thread(target=parse_detail, args=(articles,))
        # Collect the threads first and start them together afterwards,
        # instead of starting each one as soon as it is created.
        workers.append(worker)
    for worker in workers:
        worker.start()
    # Block until every worker has finished.
    for worker in workers:
        worker.join()