Python爬虫-requests库

 记录一下学习过的爬虫知识,方便以后回顾查找~
import requests
import re

一、Get请求

# Issue a plain GET request to Baidu and inspect the returned response object.
r = requests.get('http://baidu.com/')
print(type(r))          # class of the response object
print(r.status_code)    # HTTP status code
print(type(r.text))     # type of the response body
print(r.text)           # the response body itself
print(r.cookies)        # cookies attached to the response

在这里插入图片描述

1.1 添加params参数

# Query-string parameters are supplied as a dict through `params=`;
# the requested URL becomes http://httpbin.org/get?name=germey&age=22
data = {'name': 'germey', 'age': 22}
r = requests.get('http://httpbin.org/get?', params=data)
print(r.text)

在这里插入图片描述

  • json 格式转化为字典
# Compare the raw text body with the parsed JSON body of the same response.
r = requests.get('http://httpbin.org/get')
print(type(r.text))      # type of the raw body
print("\n")
print(r.json())          # JSON string parsed into a Python dict
print(type(r.json()))    # type of the parsed result

在这里插入图片描述

1.2 抓取网页

# Fetch the Zhihu "explore" page with a browser-like User-Agent header and
# pull out every match of the pattern from the returned HTML.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.142 Mobile Safari/537.36'
    ),
}
r = requests.get('https://www.zhihu.com/explore', headers=headers)
pattern = re.compile('explore-feed.*')  # regular expression to search for
title = pattern.findall(r.text)
# Bare expression: echoes the list in a notebook/REPL, has no effect in a script.
title

在这里插入图片描述

1.3 获取二进制数据

# Download a binary resource (an icon) and print it both ways.
r = requests.get('https://github.com/favicon.ico')
print(r.text)       # body interpreted as text
print(r.content)    # body as raw bytes

r.text 打印出来是乱码(图标是二进制数据,按文本解码没有意义),r.content 则以 bytes 形式输出原始内容。
在这里插入图片描述

  • 获取Github图标
# Download the GitHub icon and save the raw bytes to a local file.
r = requests.get('https://github.com/favicon.ico')
with open('favicon.ico', 'wb') as f:    # 'wb': the payload is binary
    f.write(r.content)

运行后,当前目录下会生成 GitHub 图标文件 favicon.ico。
在这里插入图片描述

二、post请求

# POST a dict through `data=`; unlike `params=`, it is submitted as the
# request's form data rather than as part of the URL.
data = {
    'name': 'germey',
    'age': 22,
}
r = requests.post('http://httpbin.org/post', data=data)
print(r.text)  # the "form" field of the echoed result is the submitted data

在这里插入图片描述

三、响应

# Request a page and print the type and value of several response attributes.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36(KHTML, like Gecko) '
        'Chrome/75.0.3770.142 Mobile Safari/537.36'
    ),
}
r = requests.get('http://www.jianshu.com', headers=headers)
print(type(r.status_code), r.status_code)   # status code
print(type(r.headers), r.headers)           # response headers
print(type(r.cookies), r.cookies)           # cookies
print(type(r.url), r.url)                   # URL of the response
print(type(r.history), r.history)           # request history

四、高级用法

4.1 cookie

# Show the cookies of a response: the jar itself, its (name, value) pairs,
# and then each pair formatted as "name=value".
r = requests.get("https://www.baidu.com")
print(r.cookies)
print(r.cookies.items())
for key, value in r.cookies.items():
    print("=".join((key, value)))

在这里插入图片描述

  • 使用自己的 Cookie 访问知乎
# Request the Zhihu home page with a hand-written Cookie header (plus Host and
# User-Agent) and print the returned page.
# NOTE(review): the Cookie value looks like it was copied from a real logged-in
# browser session, i.e. it is a credential. Such values should not be published
# or committed; presumably these have long expired -- confirm, and prefer
# loading the cookie from an environment variable or an untracked local file.
headers={'Cookie': '_zap=698024e1-91ad-4ef1-b497-8faa3456af84; _xsrf=Nk1tdLqXQKJcprow5wl84Uf9hF8eRJQs; d_c0="AOAvjwIetA-PTgv6gt-7-7DXekoQvE6Llbs=|1562565500"; __utmz=51854390.1578618753.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.110--|2=registration_date=20160502=1^3=entry_date=20160502=1; _ga=GA1.2.1987250188.1578618753; capsion_ticket="2|1:0|10:1583373076|14:capsion_ticket|44:NDBmNGYxNjJiYzFiNGNjNDljZmU5ZDNlN2M2M2E0OGI=|ef52da7f694e2d959b574ccf8770642b11a2c3cc15a995c4900eb6f90d85060a"; z_c0="2|1:0|10:1583373097|4:z_c0|92:Mi4xYVVqM0FnQUFBQUFBNEMtUEFoNjBEeWNBQUFDRUFsVk5LZWlIWGdEQl9VaWItMWstSm1VRG9ZejV0Q3F4cjBJVDln|ce6d3024eb77f9fb8e03cf68d8f2094ce84f905569bdabcc365892fac8ef0319"; tst=r; _gid=GA1.2.143501728.1584538440; q_c1=886762f2bae04daca68b9ae2af9a4e38|1584538854000|1564363133000; __utma=51854390.1987250188.1578618753.1578618753.1584538698.2; __utmb=51854390.0.10.1584538698; __utmc=51854390; _gat_gtag_UA_149949619_1=1; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1583761483,1584325469,1584538441,1584540435; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1584540435; KLBRSID=ed2ad9934af8a1f80db52dcb08d13344|1584540595|1584538597',
        'Host':'www.zhihu.com',
        'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36'
        }
r=requests.get('https://www.zhihu.com',headers=headers)
print(r.text)
cookies='_zap=698024e1-91ad-4ef1-b497-8faa3456af84; _xsrf=Nk1tdLqXQKJcprow5wl84Uf9hF8eRJQs; d_c0="AOAvjwIetA-PTgv6gt-7-7DXekoQvE6Llbs=|1562565500"; __utmz=51854390.1578618753.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.110--|2=registration_date=20160502=1^3=entry_date=20160502=1; _ga=GA1.2.1987250188.1578618753; capsion_ticket="2|1:0|10:1583373076|14:capsion_ticket|44:NDBmNGYxNjJiYzFiNGNjNDljZmU5ZDNlN2M2M2E0OGI=|ef52da7f694e2d959b574ccf8770642b11a2c3cc15a995c4900eb6f90d85060a"; z_c0="2|1:0|10:1583373097|4:z_c0|92:Mi4xYVVqM0FnQUFBQUFBNEMtUEFoNjBEeWNBQUFDRUFsVk5LZWlIWGdEQl9VaWItMWstSm1VRG9ZejV0Q3F4cjBJVDln|ce6d3024eb77f9fb8e03cf68d8f2094ce84f905569bdabcc365892fac8ef0319"; tst=r; _gid=GA1.2.143501728.1584538440; q_c1=886762f2bae04daca68b9ae2af9a4e38|1584538854000|1564363133000; __utma=51854390.1987250188.1578618753.1578618753.1584538698.2; __utmc=51854390; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1584325469,1584538441,1584540435,1584540857; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1584540857; KLBRSID=ed2ad9934af8a1f80db52dcb08d13344|1584541061|1584538597'
# Build a cookie jar from the raw "name=value; name=value; ..." string held in
# `cookies` and send it through the `cookies=` argument instead of writing a
# Cookie header by hand.
# The jar class is `RequestsCookieJar` (note the "s"); `RequestCookieJar`
# does not exist and raises AttributeError.
jar = requests.cookies.RequestsCookieJar()
headers = {
    'Host': 'www.zhihu.com',
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.142 Mobile Safari/537.36'
    ),
}
for cookie in cookies.split(';'):
    if '=' not in cookie:
        # Skip empty or malformed fragments (e.g. after a trailing ';').
        continue
    # maxsplit=1: cut only at the first '=', because values may contain '='.
    key, value = cookie.split('=', 1)
    # Pairs are separated by "; ", so strip the whitespace that would
    # otherwise become part of the cookie name.
    jar.set(key.strip(), value.strip())
# The keyword argument is `cookies`; `cookie=` is rejected with a TypeError.
r = requests.get('https://www.zhihu.com', cookies=jar, headers=headers)
print(r.text)

4.2 会话维持(Session)

  • 没有设置会话维持
# Two independent requests: the first asks httpbin to set a cookie, the second
# asks which cookies it received. No session object links the two calls.
requests.get('http://httpbin.org/cookies/set/number/123456789')
r = requests.get('http://httpbin.org/cookies')
print(r.text)

在这里插入图片描述

  • 设置会话
# Issue both requests through one Session object so that they share state:
# the first call asks httpbin to set a cookie, the second asks which cookies
# it received.
# The session is used as a context manager so it is closed when the block
# exits, even if one of the requests raises.
with requests.Session() as s:
    s.get('http://httpbin.org/cookies/set/number/123456789')
    r = s.get('http://httpbin.org/cookies')
    print(r.text)

在这里插入图片描述

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值