get案例
- 需求 爬取贴吧的数据
- 1 输入爬取贴吧的主题(例如 海贼王)
- 2 输入起始页和终止页(例如 3 - 5)
- 3 把每一页的数据保存到本地(例如 第一页.html、第二页.html)
思路
https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=0 第一页
https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=50 第二页
https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=100 第三页
https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=150 第四页
规律:pn = (page - 1) * 50
流程:发起请求 —> 获取数据 —> 保存数据
方法论 解决方式 解决方案 a b c 跨越式
学习方法
学习反应慢一些 思维逻辑 跳过去 你就理解了 1 3 5 碎片化的时间来学习 生活工作 6 3 分析一下 2 - 3个视频
2个视频的内容都搞定了 都复习了 4 5个 目标 2 - 3个月
代码实现
import urllib.request
import urllib.parse
"""
类属性 实例属性 类方法 实例方法 (静态方法)
"""
#https://tieba.baidu.com/f?kw=%E4%BA%BA%E7%94%9F&ie=utf-8&pn=50
class BaiduSpider:
    """Download Baidu Tieba list pages for one forum and save each as an HTML file.

    Attributes set in ``__init__``:
        headers: request headers (a desktop-browser User-Agent) sent with
            every request.
        base_url: the Tieba list endpoint; the query string is appended to it.
    """

    def __init__(self):
        # Values that stay the same for every request live on the instance.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57"
        }
        self.base_url = "https://tieba.baidu.com/f?"

    def _page_url(self, query, page):
        """Return the list-page URL for the 1-based ``page`` number.

        ``query`` is the already URL-encoded ``kw=...`` string. The ``pn``
        offset grows by 50 per page: ``pn = (page - 1) * 50``.
        """
        pn = (page - 1) * 50
        return self.base_url + query + "&ie=utf-8&pn=" + str(pn)

    def readPage(self, url):
        """Request ``url`` with the instance headers and return the body decoded as UTF-8."""
        req = urllib.request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if read/decode raises.
        with urllib.request.urlopen(req) as res:
            return res.read().decode("utf-8")

    def writePage(self, filename, html):
        """Write ``html`` to ``filename`` as UTF-8 text and print a confirmation."""
        with open(filename, 'w', encoding="utf-8") as f:
            f.write(html)
        print("写入成功")

    @property
    def main(self):
        """Prompt for a forum name and a page range, then save every page in that range.

        Declared as a property (not a plain method) because the script entry
        point triggers the crawl with a bare ``spider.main`` attribute access;
        turning it into a method would make that access do nothing.
        """
        name = input("请输入贴吧名称:")
        begin = int(input("请输入起始页:"))
        end = int(input("请输入结束页:"))
        # The encoded keyword is identical for every page, so build it once.
        query = urllib.parse.urlencode({"kw": name})
        for page in range(begin, end + 1):
            filename = name + "第" + str(page) + "页.html"
            html = self.readPage(self._page_url(query, page))
            self.writePage(filename, html)
if __name__ == "__main__":
    # `main` is declared as a property on BaiduSpider, so a bare attribute
    # access (no call parentheses) is what starts the crawl.
    BaiduSpider().main
请输入贴吧名称:自行车
请输入起始页:1
请输入结束页:2
写入成功
写入成功
post案例
- 有道翻译
请求的url地址 携带数据
import urllib.request
import urllib.parse
import json
# Text to translate, read from the console.
content = input("请输入你要翻译的内容:")

# Form fields sent as the POST body. Everything except "i" is a fixed,
# hard-coded value (presumably copied from a captured browser request -- confirm).
data = {
    "i": content,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16077402310287",
    "sign": "98a46c8c591fa9d2cb3c38a44c5e1e36",
    "lts": "1607740231028",
    "bv": "8269b35cc1594b7635631cdd3a301112",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME"
}
# urlopen() needs the POST body as bytes: form-encode the dict, then encode as UTF-8.
data = urllib.parse.urlencode(data)
data = data.encode("utf-8")

# Original note: the captured address was
# http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule
# and the server only answered once "_o" was removed from the path.
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57"
}

# Passing data= makes this a POST request.
req = urllib.request.Request(url, headers=headers, data=data)
# The context manager closes the HTTP response even if read/decode raises.
with urllib.request.urlopen(req) as res:
    html = res.read().decode("utf-8")

# Decode the JSON text into Python objects (JSON arrays -> lists, objects -> dicts).
r_dict = json.loads(html)
# Shape of the value: [[{"src": "你好", "tgt": "hello"}]]
r = r_dict["translateResult"]
# Take "tgt" (the translated text) from the first entry of the first inner list.
result = r[0][0]['tgt']
print(result)
#
"""
{"type":"ZH_CN2EN","errorCode":0,"elapsedTime":0,"translateResult":[[{"src":"你好","tgt":"hello"}]]}
"""
此代码中data的由来(就是post请求的发送数据)
data = {
"i": content,
"from": "AUTO",
"to": "AUTO",
"smartresult": "dict",
"client": "fanyideskweb",
"salt": "16077402310287",
"sign": "98a46c8c591fa9d2cb3c38a44c5e1e36",
"lts": "1607740231028",
"bv": "8269b35cc1594b7635631cdd3a301112",
"doctype": "json",
"version": "2.1",
"keyfrom": "fanyi.web",
"action": "FY_BY_REALTlME"
}
请输入你要翻译的内容:他们在那里啊
They were there
Requests
- 安装
- 学习源码(为什么?)
。了解底层实现原理
。了解优秀的框架/代码是如何编写出来
。练习英文
。提供思路 提供解决方案
。程序员最基本的素养 企业开发的要求
- response=requests.get('https://qq.yh31.com/love/',headers=headers)
- print(response.status_code)#状态码
- print(response.url)#主链接url
response.encoding="utf-8"#指定编码,将字节流按utf-8转化为字符串
print(response.text)#打印出源码
print(response.content.decode("utf-8"))#打印出源码(手动将字节流解码)
#import requests
"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57"
}
#发起响应请求
response=requests.get('https://qq.yh31.com/love/',headers=headers)
print(response)#<Response [200]> 对象
#print(response.content)#字符串的数据 乱码的
#print(response.text)#字符串数据 乱码的
#print(response.content.decode("utf-8"),type(response.content.decode("utf-8")))#字符串类型
response.encoding="utf-8"
#print(response.text)
print(response.status_code)#状态码
print(response.url)#主链接url
"""
import requests
# Ask the user for the text to translate.
content = input("请输入你要翻译的内容:")

# Start the form with the user's text, then add the fixed, hard-coded fields
# in the same order as before.
data = {"i": content}
data.update(
    {
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "16077402310287",
        "sign": "98a46c8c591fa9d2cb3c38a44c5e1e36",
        "lts": "1607740231028",
        "bv": "8269b35cc1594b7635631cdd3a301112",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_REALTlME",
    }
)

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57"
}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

# requests ships its own POST helper; the dict is passed as the form body.
res = requests.post(url, headers=headers, data=data)
# Print the response body as text.
html = res.text
print(html)