requests 功能和 urllib一样
将cookies 和 http变成了参数
请求方式
re = requests.get('http://www.baidu.com')
re = requests.post('http://baidu.com')
响应
- text返回文本内容,会乱码,requests转码时基于http头部推测转码
- content返回二进制内容
- json方法 只会对json数据进行解码
print(re.text) # 会乱码,requests转码时基于http头部推测转码
print(re.encoding) # 一般都转为ISO-8859-1
手动转码
re.encoding = 'utf-8'
print(re.text)
print(re.json())
print(re.content.decode())
requests参数
def request(self, method, url,
params=None, data=None, headers=None, cookies=None, files=None,
auth=None, timeout=None, allow_redirects=True, proxies=None,
hooks=None, stream=None, verify=None, cert=None, json=None):
- params:get的请求参数
- data:post的请求参数
获取headers、cookies
'''表单中,多个元素使用同一key的时候,可用元组传参data = (('key1', 'value1'), ('key1', 'value2'))'''
data = (('key1', 'value1'), ('key1', 'value2'))
print(requests.post('http://httpbin.org/post', data=data).headers)
print(requests.post('http://httpbin.org/post', data=data).cookies)
重定向(301、302):allow_redirects
参数:allow_redirects=False 取消自动重定向
"""重定向"""
r = requests.get('http://github.com', allow_redirects=False) # 禁止重定向
print(r.headers)
print(r.url)
print(r.history) # 历史记录,从哪个网站跳转过来的
流下载:stream=True
url = 'http://httpbin.org/bytes/102400000'
r = requests.get(url, stream=True)
for chunk in r.iter_content(chunk_size=1024): # 流下载,迭代下载每次1024
print(chunk)
session会话对象
"""
session 会话对象,所有会话对象发出的请求 会自动保持状态
同一主机发送多个请求,会重用tcp连接
使用socket时会先connect连接网页,也就是先建立tcp连接,因此session重用会快很多
"""
import time
import urllib.request
s = requests.Session() # session所有api和requests相同
start_time = time.time()
for i in range(50):
# r = urllib.request.urlopen('https://www.baidu.com')
# r = requests.get('https://www.baidu.com')
r = s.get('https://www.baidu.com') # 明显快于前两种,因为会重用tcp链接
print('耗时{}s'.format(time.time()-start_time))
cookies和代理
cookies
"""手动添加cookies"""
以蔡xx的微博为例
headers = {
'Cookie': 'Ugrow-G0=9ec894e3c5cc0435786b4ee8ec8a55cc; login_sid_t=e51289257991fb89895050ec904235a9; cross_origin_proto=SSL; YF-V5-G0=4e19e5a0c5563f06026c6591dbc8029f; WBStorage=384d9091c43a87a5|undefined; wb_view_log=1920*10801; _s_tentry=passport.weibo.com; UOR=passport.weibo.com,weibo.com,www.baidu.com; Apache=6786804717320.616.1571319204822; SINAGLOBAL=6786804717320.616.1571319204822; ULV=1571319204826:1:1:1:6786804717320.616.1571319204822:; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWzDZA2m47xOpFH2jGMB4jp5JpX5K2hUgL.FoqXSo27Sh-peoB2dJLoIpjLxKqL1KqL1-eLxK-L1h-L1h.LxKBLBonLBKnt; ALF=1602855233; SSOLoginState=1571319234; SCF=AjdOv7Rm7mFLsHRu2Tghbym8Hxz0j2_BXQVBWOjkep9nlJp9yVFgnv3Ms2iv9580DGwDjwroI4vycbasA9Zc5pg.; SUB=_2A25wrB2TDeRhGeBK7VMR9CvNyTiIHXVT2AhbrDV8PUNbmtBeLUjekW9NR7AVSx1PwhH4vQMad2gYAUMH7Ms2Qb29; SUHB=0rr7DKl01ed5ZY; un=17398891960; wvr=6; wb_view_log_6461045124=1920*10801; wb_timefeed_6461045124=1; YF-Page-G0=96c3bfa80dc53c34a567607076bb434e|1571319257|1571319239; webim_unReadCount=%7B%22time%22%3A1571319313680%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22allcountNum%22%3A25%2C%22msgbox%22%3A0%7D',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
r = requests.get('https://weibo.com/caizicaixukun', headers=headers)
r.encoding = 'utf-8'
print(r.text)
代理
参数
""":param proxies: (optional) Dictionary mapping protocol or protocol and
hostname to the URL of the proxy."""
加代理的方式
proxy = {
'http': 'ip:port',
'https': 'ip:port',
}
print(requests.get('https://www.baidu.com', proxies=proxy))
百度贴吧小案例
"""
需求
1.请求首页地址,匹配每一个帖子的详情页url
2.分页请求
"""
import requests
import re
import threading
def parse(word, pn):
    """Fetch one listing page of the given Tieba forum and extract
    (detail-url, title) pairs for every thread on it.

    :param word: forum (贴吧) name to look up
    :param pn: page offset used by Tieba's pagination (0, 50, 100, ...)
    :return: list of (relative_url, title) tuples
    """
    # BUG FIX: the pagination parameter must be named `pn=`; the original
    # format string '...&{}' produced e.g. '...&50', which the server
    # ignores, so every call silently returned the first page.
    url = 'https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'.format(word, pn)
    r = requests.get(url).content.decode()
    # Each thread link in the listing looks like:
    # <a rel="noreferrer" href="/p/123456" title="..." target=
    article_urls = re.findall(r'<a rel="noreferrer" href="(/p/\d+)" title="(.*?)" target=', r, re.S)
    return article_urls
def parse_detail(article_urls):
    """Request each thread's detail page and print the author, title and
    creation time of its first post.

    :param article_urls: iterable of (relative_url, title) tuples,
                         as produced by parse()
    """
    for article_url in article_urls:
        article_req = requests.get('https://tieba.baidu.com' + article_url[0]).text
        # The page embeds the author in an inline JS snippet: author: "name"
        author = re.findall(r'author: "(.*?)"', article_req, re.S)
        create_time = re.findall(r'>1楼</span><span class="tail-info">(.*?)</span>', article_req, re.S)
        # Skip pages where either pattern failed to match (deleted/changed page).
        if author and create_time:
            # BUG FIX: print the matched strings, not the one-element lists
            # (the original printed e.g. 作者:['name'] instead of 作者:name).
            print('作者:{},标题:{},创建时间:{}'.format(author[0], article_url[1], create_time[0]))
if __name__ == '__main__':
    word = input("请输入贴吧名字:")
    workers = []
    # One listing request per page offset: 0, 50, 100, 150, 200.
    for offset in range(0, 201, 50):
        # Fetch the detail-page URLs for this listing page.
        articles = parse(word, offset)
        # One worker thread per listing page handles its detail requests.
        worker = threading.Thread(target=parse_detail, args=(articles,))
        # Collect the threads first and start them together afterwards,
        # instead of starting each one as soon as it is created.
        workers.append(worker)
    for worker in workers:
        worker.start()
    # Block until every worker has finished.
    for worker in workers:
        worker.join()