"""
Baidu Tieba (贴吧) spider: downloads forum listing pages and saves each one as an HTML file.
"""
import requests
import re
from urllib import parse
import time
import random
class BaiduSpider:
    """Download Baidu Tieba listing pages and save each one as an HTML file.

    The spider asks for a forum name and a page range on stdin, fetches every
    page in that range and writes the raw HTML to the current directory.
    """

    def __init__(self, timeout=10):
        """Set up the URL template, request headers and network timeout.

        Args:
            timeout: Seconds to wait for the server before giving up on a
                request. Passed straight to ``requests.get``.
        """
        # Placeholders: URL-quoted forum name, then the ``pn`` offset.
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'}
        self.timeout = timeout

    def get_html(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        Raises:
            requests.exceptions.RequestException: on network failure or when
                the server does not answer within ``self.timeout`` seconds.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(
            url=url, headers=self.headers, timeout=self.timeout
        )
        return response.content.decode('utf-8')

    def parse_html(self):
        """Placeholder for page parsing; currently does nothing."""
        return None

    def save_html(self, filename, html):
        """Write ``html`` to ``filename`` as UTF-8 text, overwriting any file."""
        # Explicit encoding: the platform default (e.g. GBK on Windows) may
        # fail on characters in the page or contradict its declared charset.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def _page_url(self, name, page):
        """Return the listing URL for 1-based ``page`` of forum ``name``."""
        # The ``pn`` offset advances by 50 for each page.
        offset = (page - 1) * 50
        return self.url.format(parse.quote(name), offset)

    def run(self):
        """Prompt for a forum name and page range, then download each page.

        Raises:
            ValueError: if a page number entered on stdin is not an integer.
        """
        name = input('请输入贴吧名')
        start = int(input('开始页'))
        end = int(input('结束页'))
        for page in range(start, end + 1):
            html = self.get_html(self._page_url(name, page))
            filename = '{}第{}页.html'.format(name, page)
            self.save_html(filename, html)
            print('完成')
            # Random pause between requests; skipped after the last page
            # because there is nothing left to fetch.
            if page < end:
                time.sleep(random.randint(1, 8))
if __name__ == '__main__':
    # Script entry point: build a spider and start the interactive crawl.
    BaiduSpider().run()
# Baidu Tieba crawler: fetching web pages
# (Source page footer: latest recommended article published on 2024-09-09 13:39:05)