from urllib import request, parse, error
import json
import time
import os
class post_bar():
    """Minimal crawler that saves Baidu Tieba forum listing pages to disk."""

    def creeper(self, name, page, path):
        """Download the first `page` listing pages of forum `name` as HTML files.

        Args:
            name: forum name; may contain non-ASCII characters (it is
                percent-encoded before being placed in the URL).
            page: number of listing pages to fetch; Tieba paginates
                50 posts per page, so page k starts at offset k*50.
            path: directory under which a sub-directory named `name`
                is created to hold the downloaded files.

        Network/HTTP failures are caught and reported to stdout rather
        than raised, matching the original best-effort behavior.
        """
        base_url = 'https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }
        try:
            save_dir = os.path.join(path, name)
            # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the original.
            os.makedirs(save_dir, exist_ok=True)
            # Percent-encode the (possibly Chinese) forum name once, outside the loop.
            quoted_name = parse.quote(name)
            for page_index in range(page):
                offset = page_index * 50  # Tieba shows 50 posts per listing page
                url = base_url.format(quoted_name, offset)
                req = request.Request(url, headers=head)
                # `with` closes the HTTP response deterministically (the
                # original leaked it).
                with request.urlopen(req) as response:
                    html = response.read().decode("utf-8")
                print(save_dir)
                # os.path.join replaces the original's fragile `"//"` concatenation.
                out_file = os.path.join(save_dir, name + str(offset) + '.html')
                with open(out_file, 'w', encoding="utf-8") as e:
                    e.write(html)  # persist the fetched page
        except error.HTTPError as e:
            print(e.code)  # HTTP status code of the failure
        except error.URLError as e:
            print(e.reason)  # transport-level failure reason
if __name__ == "__main__":
    # The original passed undefined placeholder identifiers (名字, 页数, 路径),
    # which raised NameError on import; bind concrete example values instead.
    forum_name = 'python'      # forum to crawl — edit as needed
    page_count = 1             # number of listing pages to fetch
    # NOTE(review): the original computed os.getcwd()[0:-5], blindly chopping
    # the last 5 characters off the cwd — presumably to strip a subfolder
    # name. Using the cwd itself is the safe default; adjust if a parent
    # directory was intended.
    save_path = os.getcwd()
    tieba = post_bar()                              # instantiate the crawler
    tieba.creeper(forum_name, page_count, save_path)  # run the download