# 百度贴吧爬虫小程序 (Baidu Tieba crawler mini-program)
import os
import urllib.request
import urllib.parse
class BaiduTiebaSpider(object):
    """Download the search-result pages of a Baidu Tieba forum as raw HTML files."""

    def __init__(self, tieba_name, page):
        """
        Initialize required parameters and basic settings.

        :param tieba_name: name of the Tieba forum to crawl
        :param page: number of result pages to fetch (1-based count)
        """
        self.base_url = 'https://tieba.baidu.com/f?'
        self.tieba_name = tieba_name
        self.page = page
        # Desktop browser UA so the server returns the normal HTML page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
        }

    def make_url(self):
        """
        Build one URL per requested page.

        :return: list of fully query-encoded page URLs
        """
        url_list = []
        for p in range(1, self.page + 1):
            pn = (p - 1) * 50  # Tieba paginates in steps of 50 posts
            data = {
                'kw': self.tieba_name,
                'ie': 'utf-8',
                'pn': pn,
            }
            query_string = urllib.parse.urlencode(data)
            # Reuse the base URL configured in __init__ instead of a local copy.
            url_list.append(self.base_url + query_string)
        return url_list

    def request(self, url):
        """
        Send a GET request and return the raw response body.

        :param url: URL to fetch
        :return: response body as bytes
        """
        req = urllib.request.Request(url=url, headers=self.headers)
        # Context manager guarantees the connection is closed even on error
        # (the original leaked the response object).
        with urllib.request.urlopen(req) as response:
            return response.read()

    def keep(self, text, page_num):
        """
        Save one page of raw HTML under a directory named after the forum.

        :param text: response body (bytes)
        :param page_num: 1-based page number, used in the file name
        """
        # exist_ok avoids the exists()/mkdir() race of the original.
        os.makedirs(self.tieba_name, exist_ok=True)
        filename = '%s_第%s页.html' % (self.tieba_name, page_num)
        filepath = os.path.join(self.tieba_name, filename)
        with open(filepath, 'wb') as f:
            f.write(text)

    def run(self):
        """
        Entry point: build the URLs, download each page, and save it to disk.
        """
        # enumerate replaces the original's O(n) url_list.index(url) lookup.
        for page_num, url in enumerate(self.make_url(), start=1):
            self.keep(self.request(url), page_num)
if __name__ == '__main__':
    # Crawl the first 5 result pages of the "lol" forum and save them to disk.
    xiaodu = BaiduTiebaSpider('lol', 5)
    xiaodu.run()