03-爬虫请求模块

最新推荐文章于 2024-04-03 09:31:23 发布

无痕的雨

最新推荐文章于 2024-04-03 09:31:23 发布

阅读量497

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/qq_45451647/article/details/111060862

版权

爬虫专栏收录该内容

24 篇文章 0 订阅

订阅专栏

get案例

需求爬取贴吧的数据

1 输入爬取贴吧的主题(例如海贼王)
2 输入起始页和终止页(例如 3 - 5)
3 把每一页的数据保存到本地(例如 第一页.html、第二页.html)

思路

https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=0 第一页

https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=50 第二页

https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=100 第三页

https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=150 第四页
pn = (page - 1) * 50 发起请求 —> 数据保存数据方法论解决方式解决方案 a b c 跨越式

学习方法

学习反应慢一些思维逻辑跳过去你就理解了 1 3 5 碎片化的时间来学习生活工作 6 3 分析一下 2 - 3个视频
2个视频的内容都搞定了都复习了 4 5个目标 2 - 3个月

代码实现

import urllib.request
import urllib.parse

"""
类属性 实例属性 类方法 实例方法 （静态方法）
"""
#https://tieba.baidu.com/f?kw=%E4%BA%BA%E7%94%9F&ie=utf-8&pn=50
class BaiduSpider:
    """Download a range of Baidu Tieba listing pages and save each as a local HTML file.

    Attributes:
        headers: Request headers sent with every fetch (a desktop browser User-Agent).
        base_url: Listing endpoint; the urlencoded query string is appended to it.
    """

    def __init__(self):
        # Values that stay the same for every request are set up once here.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57"
        }
        self.base_url = "https://tieba.baidu.com/f?"

    def readPage(self, url):
        """Fetch ``url`` using the spider's headers and return the body decoded as UTF-8.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            The response body as ``str``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        req = urllib.request.Request(url, headers=self.headers)
        # The context manager closes the response even when read()/decode() raises.
        with urllib.request.urlopen(req) as res:
            return res.read().decode("utf-8")

    def writePage(self, filename, html):
        """Write ``html`` to ``filename`` as UTF-8 (overwriting it) and print a confirmation.

        Args:
            filename: Path of the file to create or overwrite.
            html: Text to store.
        """
        with open(filename, 'w', encoding="utf-8") as f:
            f.write(html)
        print("写入成功")

    # NOTE: kept as a property for backward compatibility -- existing callers start
    # the crawl by evaluating ``spider.main`` without parentheses.
    @property
    def main(self):
        """Prompt for a forum name and a page range, then download and save every page.

        Each page ``i`` in ``[begin, end]`` is written to ``<name>第<i>页.html`` in the
        current working directory. Nothing is downloaded when ``end < begin``.

        Raises:
            ValueError: If either page number entered is not an integer.
        """
        name = input("请输入贴吧名称：")
        begin = int(input("请输入起始页："))
        end = int(input("请输入结束页："))
        # urlencode percent-escapes the (typically non-ASCII) forum name.
        query = urllib.parse.urlencode({"kw": name})

        for page in range(begin, end + 1):
            # ``pn`` is the listing offset: page 1 -> 0, page 2 -> 50, page 3 -> 100, ...
            pn = (page - 1) * 50
            filename = "{}第{}页.html".format(name, page)
            url = "{}{}&ie=utf-8&pn={}".format(self.base_url, query, pn)
            html = self.readPage(url)
            self.writePage(filename, html)

if __name__ == '__main__':
    # ``main`` is declared as a property, so evaluating the attribute is what
    # runs the interactive crawl -- no call parentheses are needed.
    BaiduSpider().main

请输入贴吧名称：自行车
请输入起始页：1
请输入结束页：2
写入成功
写入成功

在这里插入图片描述

post案例

有道翻译
请求的url地址携带数据

import urllib.request
import urllib.parse
import json

# Translate a phrase typed by the user by POSTing a form to Youdao's web
# translate endpoint with urllib, then print the translated text.
content = input("请输入你要翻译的内容:")

# Form fields sent in the request body. Everything except "i" is a fixed,
# hard-coded value (presumably copied from a captured browser request).
data = {
    "i": content,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16077402310287",
    "sign": "98a46c8c591fa9d2cb3c38a44c5e1e36",
    "lts": "1607740231028",
    "bv": "8269b35cc1594b7635631cdd3a301112",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME"
}

# urlopen() requires the POST body as bytes, so urlencode the dict into a
# query string and then encode that string as UTF-8.
data = urllib.parse.urlencode(data).encode("utf-8")

# The browser-facing endpoint is .../translate_o?smartresult=dict&smartresult=rule;
# per the author's note, the server only answers once "_o" is removed from the path.
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57"
}

# Supplying ``data`` makes urllib issue a POST instead of a GET.
req = urllib.request.Request(url, headers=headers, data=data)
# The context manager closes the response even if reading or decoding fails.
with urllib.request.urlopen(req) as res:
    html = res.read().decode("utf-8")

# Sample response body:
# {"type":"ZH_CN2EN","errorCode":0,"elapsedTime":0,"translateResult":[[{"src":"你好","tgt":"hello"}]]}
# json.loads turns the JSON text into Python objects (arrays -> lists, objects -> dicts).
r_dict = json.loads(html)
r = r_dict["translateResult"]  # e.g. [[{"src": "你好", "tgt": "hello"}]]
result = r[0][0]['tgt']  # translated text of the first segment
print(result)

此代码中data的由来（就是post请求的发送数据）
# Form payload sent as the body of the POST request; ``content`` is the text
# the user typed. Typographic quotes were replaced with ASCII quotes so the
# snippet is valid Python again.
data = {
    "i": content,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16077402310287",
    "sign": "98a46c8c591fa9d2cb3c38a44c5e1e36",
    "lts": "1607740231028",
    "bv": "8269b35cc1594b7635631cdd3a301112",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME"
}

在这里插入图片描述

请输入你要翻译的内容:他们在那里啊
They were there

Requests

安装
学习源码(为什么？)
。了解底层实现原理
。了解优秀的框架/代码是如何编写出来
。练习英文
。提供思路提供解决方案
。程序员最基本的素养企业开发的要求

# Quick reference for the requests response API.
# Assumes ``import requests`` has run and ``headers`` is a dict of request headers.
response = requests.get('https://qq.yh31.com/love/', headers=headers)
print(response.status_code)  # HTTP status code
print(response.url)  # URL of the response
response.encoding = "utf-8"  # codec used when response.text decodes the body
print(response.text)  # page source as str, decoded with response.encoding
# A Response object has no decode() method; decode the raw bytes in .content instead.
print(response.content.decode("utf-8"))  # page source decoded manually

# Disabled walkthrough of a basic requests GET. The import below is commented
# out and the example itself sits inside a bare string literal, so none of it
# executes; it is kept for reference only.
#import requests

"""
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57"
}

#发起响应请求
response=requests.get('https://qq.yh31.com/love/',headers=headers)
print(response)#<Response [200]> 对象
#print(response.content)#字符串的数据 乱码的
#print(response.text)#字符串数据 乱码的
#print(response.content.decode("utf-8"),type(response.content.decode("utf-8")))#字符串类型
response.encoding="utf-8"
#print(response.text)

print(response.status_code)#状态码
print(response.url)#主链接url
"""

import requests

# Same translation request as the urllib version, sent through requests:
# read a phrase from the user, POST the form, and print the raw response body.
content = input("请输入你要翻译的内容：")

# Request target and the browser-like header sent with it.
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57"
}

# The user's text goes first; every remaining form field is a fixed value.
data = {"i": content}
data.update({
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16077402310287",
    "sign": "98a46c8c591fa9d2cb3c38a44c5e1e36",
    "lts": "1607740231028",
    "bv": "8269b35cc1594b7635631cdd3a301112",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME",
})

# requests.post accepts the dict directly and form-encodes it itself.
res = requests.post(url, headers=headers, data=data)
html = res.text
print(html)

无痕的雨

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
03-爬虫请求模块

get案例需求爬取贴吧的数据1 输入爬取贴吧的主题(例如海贼王)2 输入起始页和终止页(例如 3 - 5)3 把每一页的数据保存到本地(例如第一页.html第二页.html) 思路https://tieba.baidu.com/f？kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=0 第一页https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=50 第二页https://
复制链接

扫一扫