# 爬取百度贴吧王者荣耀吧的页面文件 (crawl and save page files from the "Honor of Kings" Baidu Tieba forum)
# 面向对象思想的实现 (object-oriented implementation):
import requests
class Stick_Spider():
    """Crawl the listing pages of a Baidu Tieba forum and save each as an HTML file."""

    def __init__(self, html_name):
        """Store the forum name, the page-URL template and the HTTP headers.

        :param html_name: name of the Tieba forum to crawl (e.g. '王者荣耀').
        """
        self.html_name = html_name
        # BUGFIX: the original stored the URL template in an attribute called
        # `headers` and the header dict in `params`, and the template lacked
        # the `pn=` paging parameter (it produced URLs like "...kw=xxx=50").
        # Tieba paginates via `pn` in steps of 50 posts per page.
        self.url_template = 'https://tieba.baidu.com/f?kw=' + html_name + '&pn={}'
        self.headers = {"User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}

    def spider_list(self, pages=50):
        """Build the list of listing-page URLs.

        :param pages: number of pages to crawl (default 50, as in the original).
        :return: list of fully-formatted page URLs.
        """
        return [self.url_template.format(i * 50) for i in range(pages)]

    def Send_request(self, url):
        """Send a GET request and return the decoded response body.

        Name kept as-is (non-PEP8) for backward compatibility with callers.

        :param url: page URL to fetch.
        :return: response body decoded as text (UTF-8 by default).
        """
        # BUGFIX: the original passed the header dict as the second positional
        # argument of requests.get — i.e. as query `params` — so the
        # User-Agent was never actually sent. Pass it via `headers=`.
        data = requests.get(url, headers=self.headers)
        return data.content.decode()

    def file(self, html_str, html_number):
        """Save one page of HTML to disk as '<forum>第<n>页.html'.

        :param html_str: page HTML text to write.
        :param html_number: 1-based page number used in the file name.
        """
        # `.html` extension added so the saved page files open as HTML.
        html_name = "{}第{}页.html".format(self.html_name, html_number)
        print(html_name)
        with open(html_name, "w", encoding="utf-8") as f:
            f.write(html_str)

    def main(self):
        """Main logic: fetch every listing page and write each to its own file."""
        # Build the URL list once.
        url_list = self.spider_list()
        # enumerate replaces the original O(n) `url_list.index(url)` lookup
        # per iteration, and is also correct if the list ever holds duplicates.
        for html_number, url in enumerate(url_list, start=1):
            html_str = self.Send_request(url)
            self.file(html_str, html_number)
if __name__ == '__main__':
    # Script entry point: crawl the "Honor of Kings" Tieba forum.
    spider = Stick_Spider('王者荣耀')
    spider.main()