Web scraping data acquisition: the requests library

General usage of requests

Code notes and simple examples

import requests

# Advantages: simple to use; URL parameters do not need to be encoded by hand;
# the same API works on both Python 2 and Python 3
# Get the response
response = requests.get(url, headers=headers)

'Common attributes and methods of the response object'
def day4_requests_get():
    """Demo of requests.get; requests.post is used the same way."""
    url = 'http://www.baidu.com/'
    params = {
        'wd': '美女'
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
    }
    # Send the request and get the response; query parameters are URL-encoded automatically
    response = requests.get(url=url, headers=headers, params=params)
    # Decoding the response body
    data = response.content.decode('utf-8')  # content -> bytes, decoded manually
    data = response.text                     # text -> str, decoded by requests
    # data = response.json()                 # json() -> dict, only when the body is JSON (baidu.com returns HTML)
    # Commonly used attributes
    # Headers that were sent with the request
    request_headers = response.request.headers
    # Headers returned in the response
    response_header = response.headers
    # Response status code
    code = response.status_code
    # Cookies sent with the request (private attribute)
    request_cookie = response.request._cookies
    # Cookies set by the response
    response_cookie = response.cookies
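As a quick check that requests really encodes the query string for you, the sketch below (a minimal example of my own, using the standard requests.Request/prepare API) builds the request without sending it and prints the final URL, where the non-ASCII value of wd appears percent-encoded:

import requests

def demo_params_encoding():
    """Show that params are percent-encoded automatically (no network call needed)."""
    req = requests.Request('GET', 'http://www.baidu.com/s', params={'wd': '美女'})
    prepared = req.prepare()
    # The prepared URL already contains the encoded query string,
    # e.g. http://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
    print(prepared.url)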

"""当响应为json时数据解析"""
def day4_requests_json():
    url = 'https://fanyi.baidu.com/langdetect'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
    }
    data = {
        'query': "encode"
    }
    response = requests.post(url, headers=headers, data=data)

    # 1. Handle the JSON response via content
    # data = response.content.decode()
    # Convert the str into a dict
    # data_dict = json.loads(data)
    # print(data_dict['lan'])

    # 2. response.json() parses the JSON body and returns a dict directly
    data = response.json()
    print(data)
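If the endpoint ever returns something that is not JSON (an error page, for example), response.json() raises a decoding error. A minimal defensive wrapper is sketched below; the helper name post_json is my own, not part of requests:

import requests

def post_json(url, headers=None, data=None):
    """POST a form and return the parsed JSON body, or None if the body is not JSON."""
    response = requests.post(url, headers=headers, data=data)
    try:
        return response.json()
    except ValueError:  # covers json.JSONDecodeError across requests versions
        print('Non-JSON response, status:', response.status_code)
        return None

# Usage:
# result = post_json('https://fanyi.baidu.com/langdetect', data={'query': 'encode'})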

requests with proxies

def day5_requests_proxy():
    url = 'https://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
    }
    # Add a free proxy; newer requests versions expect the scheme in the proxy address.
    # Note: an 'http' entry is only used for http:// URLs; add an 'https' key for https targets.
    free_proxy = {
        'http': 'http://120.77.249.46:8080'
    }
    response = requests.get(url, headers=headers, proxies=free_proxy)
    print(response.status_code)
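Free proxies are often dead, so in practice it helps to set a timeout and catch proxy failures. A small sketch of my own (the proxy address is just a placeholder):

import requests

def check_proxy(proxy_addr='http://120.77.249.46:8080'):
    """Try a request through the given proxy and report whether it works."""
    proxies = {
        'http': proxy_addr,
        'https': proxy_addr,
    }
    try:
        response = requests.get('http://www.baidu.com/', proxies=proxies, timeout=5)
        print('proxy ok, status:', response.status_code)
        return True
    except (requests.exceptions.ProxyError,
            requests.exceptions.ConnectTimeout,
            requests.exceptions.ReadTimeout,
            requests.exceptions.ConnectionError) as e:
        print('proxy failed:', e)
        return False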

requests and SSL

def day5_requests_ssl():
    url = 'http://www.icbc.com.cn/icbc/'
    # HTTPS normally requires a certificate issued by a trusted third-party CA (SSL)
    # Some sites (12306 was the classic example) historically served their own self-signed certificate
    # Workaround: tell requests to skip certificate verification and access the site directly
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
    }
    # verify=False skips certificate verification
    response = requests.get(url, headers=headers, verify=False)
    print(response.status_code)
    with open('icbc.html', 'w', encoding='utf-8') as f:
        f.write(response.content.decode())
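verify=False makes urllib3 print an InsecureRequestWarning on every request. If skipping verification is really intended, the warning can be silenced with urllib3's own API; a minimal sketch (the 12306 URL is only used here because the post mentions that site):

import requests
import urllib3

# Silence the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

response = requests.get('https://www.12306.cn/', verify=False, timeout=10)
print(response.status_code)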

requests and Cookies

Converting a cookie string into a dict

A cookie copied from the browser is a single string, but the cookies parameter of requests expects a dict, so the string has to be converted first.

cookies = 'BAIDUID=9E7C2D060E38585B188986F0EF0CA99E:SL=0:NR=10:FG=1; BIDUPSID=314F6F32BDBB7DC43ED777384251421C'

Method 1: find-and-replace by hand

In a text editor, use find-and-replace (or a regex) to turn '; ' into '\n' so that each key=value pair sits on its own line, then wrap the pairs into dict syntax manually.
[Screenshot of the editor's find-and-replace dialog omitted]

Method 2: build the dict with a loop

cookie_dict = {}
cookie_list = cookies.split('; ')
for cookie in cookie_list:
	cookie_dict[cookie.split('=')[0]] = cookie.split('=')[1]

Method 3: dict comprehension

cookie_dict = {cookie.split('=')[0]: cookie.split('=')[1] for cookie in cookies.split('; ')}
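Both versions above break if a cookie value itself contains '='; splitting each pair on the first '=' only is safer. A small sketch of that variant (the helper name is my own):

def cookie_str_to_dict(cookies):
    """Convert a raw Cookie header string into a dict, splitting each pair on the first '=' only."""
    cookie_dict = {}
    for item in cookies.split(';'):
        item = item.strip()
        if not item:
            continue  # skip the empty piece left by a trailing ';'
        key, _, value = item.partition('=')
        cookie_dict[key] = value
    return cookie_dict

# Usage:
# cookie_dict = cookie_str_to_dict(cookies)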

Using Cookies to access pages that require login

def day5_requests_cookies():
    url = 'https://www.baidu.com/my/index'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
    }
    # Method 1: pass the cookies parameter, which expects a dict
    # cookies = 'BAIDUID=9E7C2D060E38585B188986F0EF0CA99E:SL=0:NR=10:FG=1; BIDUPSID=314F6F32BDBB7DC43ED777384251421C; PSTM=1631362136'
    # cookie_dict = {cookie.split('=')[0]: cookie.split('=')[1] for cookie in cookies.split('; ')}
    # print(cookie_dict)
    # response = requests.get(url, headers=headers, cookies=cookie_dict)
    # with open('baidu.html', 'w', encoding='utf-8') as f:
    #     f.write(response.content.decode())

    # Method 2: put the Cookie string directly into the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Cookie': 'BAIDUID=9E7C2D060E38585B188986F0EF0CA99E:SL=0:NR=10:FG=1; BIDUPSID=314F6F32BDBB7DC43ED777384251421C; PSTM=1631362136;'
    }
    response = requests.get(url, headers=headers)
    with open('baidu.html', 'w', encoding='utf-8') as f:
        f.write(response.content.decode())
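requests also ships helpers for converting between a dict and a CookieJar, which can replace the manual string handling once a response is already at hand. A brief sketch using requests.utils:

import requests

response = requests.get('http://www.baidu.com/',
                        headers={'User-Agent': 'Mozilla/5.0'})
# CookieJar -> dict
cookie_dict = requests.utils.dict_from_cookiejar(response.cookies)
print(cookie_dict)
# dict -> CookieJar, which can be passed back via the cookies parameter
cookie_jar = requests.utils.cookiejar_from_dict(cookie_dict)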

Logging in automatically to obtain Cookies

def day5_auto_login():
    # A Session keeps a cookie jar (much like a CookieJar) and stores cookies automatically
    session = requests.Session()
    url = 'https://www.zcbbe.com/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1'
    login_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
        # 'Host': 'bpoyg.zdzhiheng.com.cn:8000',
        # 'Origin': 'http://bpoyg.zdzhiheng.com.cn:8000',
        # 'Referer': 'http://bpoyg.zdzhiheng.com.cn:8000/login',
    }
    login_form_data = {
        "username": "jml",
        "password": "j*******9",
        "quickforward": 'yes',
        "handlekey": 'ls',
    }
    # Log in; the session stores the cookies returned by the server
    response = session.post(url=url, headers=login_header, data=login_form_data)
    print(response.content)

    # Request a members-only page directly; the stored cookies are sent automatically
    center_url = 'https://www.zcbbe.com/plugin.php?id=dc_vip&moblie=no&mobile=no'
    response = session.get(center_url, headers=login_header)
    with open('vip.html', 'w', encoding='utf-8') as f:
        f.write(response.content.decode())
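To avoid logging in on every run, the session's cookies can be dumped to a file and loaded back later. A minimal sketch (the helper names and the cookies.json file name are my own), assuming the login above has already succeeded:

import json
import requests

def save_cookies(session, path='cookies.json'):
    """Persist the session's cookies to a JSON file."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(requests.utils.dict_from_cookiejar(session.cookies), f)

def load_cookies(session, path='cookies.json'):
    """Load previously saved cookies back into the session."""
    with open(path, 'r', encoding='utf-8') as f:
        session.cookies = requests.utils.cookiejar_from_dict(json.load(f))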
