# Crawl any Baidu Tieba forum: the user chooses the forum name and the number of
# pages; each result page is saved to its own HTML file. Built on
# urllib.request (Request/urlopen) and urllib.parse.quote.
from urllib.request import Request,urlopen
from urllib.parse import quote
def get_html(url, timeout=10):
    """Fetch *url* and return the response body decoded as text.

    Args:
        url: Fully-qualified URL to request.
        timeout: Socket timeout in seconds (default 10) — added so a
            stalled connection cannot hang the crawler indefinitely.

    Returns:
        str: The response body decoded with the default codec (UTF-8).

    Raises:
        urllib.error.URLError: On network failure or timeout.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        # Present a real browser user agent; Tieba rejects urllib's default.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69"
    }
    request = Request(url, headers=headers)
    # Context manager guarantees the connection is closed even on error
    # (the original leaked the response object).
    with urlopen(request, timeout=timeout) as response:
        return response.read().decode()
def save_html(html, filename):
    """Write *html* to *filename* as UTF-8 text, replacing any existing file."""
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.write(html)
def main():
    """Interactively crawl a Baidu Tieba forum and save each result page.

    Prompts for the forum name and a page count, then fetches the forum's
    listing pages (Tieba paginates with pn = 0, 50, 100, ...) and writes
    each one to a numbered HTML file.
    """
    forum_name = input("请输入要获取哪个贴吧:")
    page_count = int(input("请输入要爬取多少页:"))
    for page in range(1, page_count + 1):
        # Each listing page is 50 posts further along, so pn steps by 50.
        url = "https://tieba.baidu.com/f?kw=" + quote(forum_name) + "&ie=utf-8&pn={}".format((page - 1) * 50)
        print(url)
        save_html(get_html(url), "第" + str(page) + "页.html")


if __name__ == '__main__':
    main()