# -*- coding:utf-8 -*-
import re
from urllib import request
# 引入自定义的工作类 在此博文后会给出
from tools import Tools
from fake_useragent import UserAgent
# Module-level fake_useragent instance; BDTBSpider.__init__ reads
# ``agent.random`` from it to fill in the User-Agent request header.
agent = UserAgent()
import xlwt
"""
https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
kw 贴吧名称
tab 帖子类型
pn 数据页码
https://tieba.baidu.com/p/5328438222?pn=2
/p/5381402933 帖子详情地址
pn 页码
"""
class BDTBSpider(object):
    """Scrape a Baidu Tieba thread list and export each thread to an .xls file.

    ``parse_link`` and ``parse_detail`` both work on the page most recently
    stored in ``self.html`` by ``get_html``. URL shapes involved:

    * ``https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0`` -- thread list
      (``kw`` forum name, ``tab`` thread type, ``pn`` page offset).
    * ``https://tieba.baidu.com/p/5328438222?pn=2`` -- thread detail
      (``pn`` page number).
    """

    # Header row written to every exported sheet, in column order.
    _SHEET_HEADERS = ('用户昵称', '用户头衔', '用户等级', '发表内容', '客户端', '楼层', '发布日期')

    # Captures (relative thread href, thread title) from a list page.
    _THREAD_LINK_RE = re.compile(
        '<div class="threadlist_title pull_left j_th_tit.*?".*?<a rel="noreferrer".*?href="('
        '.*?)".*?title="(.*?)"', re.S)

    # Captures (nickname, badge title, badge level, content, tail html) per post.
    _POST_RE = re.compile('<li class="d_name".*?>(.*?)</li>.*?<div class="d_badge_title ">(.*?)</div>.*?<div class="d_badge_lv">(.*?)</div>.*?<cc>(.*?)</cc>.*?<div class="post-tail-wrap"(.*?)</div>', re.S)

    # Captures the target of an anchor tag.
    _HREF_RE = re.compile('<a href="(.*?)"')

    def __init__(self):
        self.url = 'https://tieba.baidu.com'
        self.html = ''
        self.headers = {'User-Agent': agent.random}
        # Next sheet row to write; initialised here so parse_detail can be
        # called on its own. parse_link resets it to 1 for every thread.
        self.count = 1

    def get_html(self, url):
        """Download ``url`` and store the decoded page in ``self.html``.

        :param url: complete URL to fetch
        :return: None
        """
        req = request.Request(url, headers=self.headers)
        # Context manager guarantees the HTTP response is closed.
        with request.urlopen(req) as response:
            self.html = response.read().decode('utf-8', 'ignore')

    @classmethod
    def _link_before(cls, html, marker, window):
        """Return the first ``<a href>`` target found in the ``window``
        characters preceding ``marker`` in ``html``, or None if absent."""
        index = html.find(marker)
        if index == -1:
            return None
        # Clamp at 0: a negative start would slice from the end of the page.
        match = cls._HREF_RE.search(html[max(0, index - window):index])
        return match.group(1) if match else None

    def parse_link(self):
        """Export every thread linked from the list page in ``self.html``,
        then follow the list's "next page" links until there are none.

        Each thread is saved as ``<title>.xls`` in the working directory.
        """
        while True:
            # get_html() overwrites self.html with detail pages inside the
            # loop, so keep the list page to search for its next-page link.
            list_html = self.html
            for href, title in self._THREAD_LINK_RE.findall(list_html):
                print("正在爬取{},请稍后.....".format(title))
                # Thread hrefs are site-relative; prepend the host.
                self.get_html(self.url + href)
                workbook = xlwt.Workbook(encoding='utf-8')
                sheet = workbook.add_sheet('data')
                for column, header in enumerate(self._SHEET_HEADERS):
                    sheet.write(0, column, header)
                # Row 0 holds the headers; data starts at row 1.
                self.count = 1
                self.parse_detail(sheet)
                # NOTE(review): the title is used verbatim as the file name;
                # titles containing path separators may fail to save.
                workbook.save(title + '.xls')
            # The next-page anchor sits just before this class attribute.
            next_link = self._link_before(
                list_html, 'class="next pagination-item', 80)
            if next_link is None:
                print('没有下一页')
                return
            link = 'http:' + next_link
            print(link)
            self.get_html(link)

    def parse_detail(self, sheet):
        """Write every post of the thread in ``self.html`` to ``sheet``,
        following the thread's "next page" links until there are none.

        :param sheet: worksheet object exposing ``write(row, col, value)``
        :return: None; ``self.count`` is advanced by one per written post
        """
        while True:
            print('正在爬取下一页,请稍后')
            for info in self._POST_RE.findall(self.html):
                nickname = Tools.strip_char(info[0])
                content = Tools.strip_char(info[3])
                # msg is a (client, floor, date) tuple.
                msg = Tools.get_client_floor_data(info[4])
                row = (nickname, info[1], info[2], content,
                       msg[0], msg[1], msg[2])
                for column, value in enumerate(row):
                    sheet.write(self.count, column, value)
                self.count += 1
            next_link = self._link_before(self.html, '下一页', 50)
            if next_link is None:
                print('没有下一页')
                return
            # Detail-page links are site-relative.
            self.get_html(self.url + next_link)
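# ---- tools.py: separate module that defines the Tools helper class (imported above via `from tools import Tools`) ----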
# -*- coding:utf-8 -*-
import re
import sqlite3
class Tools(object):
    """Class-level helpers for cleaning scraped HTML fragments."""

    # Newlines, tabs, spaces and any HTML tag.
    _STRIP_RE = re.compile('\n|\t| |<.*?>', re.S)

    # Post tail that names a client: captures (client, floor, date).
    _CLIENT_TAIL_RE = re.compile('<span class="tail-info".*?<a rel="noopener.*?>(.*?)</a>.*?class="tail-info">(.*?)</span.*?class="tail-info">(.*?)</span>')

    # Post tail without a client: captures (floor, date).
    _WEB_TAIL_RE = re.compile('<span class="tail-info">(.*?)</span.*?class="tail-info">(.*?)</span>')

    @classmethod
    def strip_char(cls, string):
        """Remove newlines, tabs, spaces and HTML tags from ``string``.

        :param string: text to clean
        :return: the cleaned text
        """
        return cls._STRIP_RE.sub('', string)

    @classmethod
    def get_client_floor_data(cls, string):
        """Extract client, floor and publish date from a post-tail fragment.

        :param string: HTML fragment containing the client, floor and date
        :return: tuple ``(client, floor, date)``; the client is reported as
            '来自Web客户端' when the fragment does not contain '来自', and all
            three fields are empty strings when the expected markup is missing
        """
        if '来自' in string:
            match = cls._CLIENT_TAIL_RE.search(string)
            if match:
                return '来自' + match.group(1), match.group(2), match.group(3)
        else:
            match = cls._WEB_TAIL_RE.search(string)
            if match:
                return '来自Web客户端', match.group(1), match.group(2)
        # Unrecognised markup: return empty fields instead of raising
        # AttributeError on a None match.
        return '', '', ''
# Database manager class.
class DBManger(object):
    """Class-level wrapper around one shared sqlite3 connection and cursor."""

    # Shared state: set by connect_db(), cleared by close_db().
    connect = None
    cursor = None

    @classmethod
    def connect_db(cls, db_path='qsbk.db'):
        """Open the database and create the shared cursor.

        :param db_path: database file to open; defaults to 'qsbk.db'
        """
        cls.connect = sqlite3.connect(db_path)
        cls.cursor = cls.connect.cursor()

    @classmethod
    def close_db(cls):
        """Close the shared cursor and connection; a no-op if not connected."""
        if cls.cursor is not None:
            cls.cursor.close()
            cls.cursor = None
        if cls.connect is not None:
            cls.connect.close()
            cls.connect = None

    @classmethod
    def insert_data(cls, dz_tuple):
        """Insert one row into the ``qsbk`` table and commit.

        :param dz_tuple: sequence ``(name, age, content, vote, comments)``
        :raises IndexError: if ``dz_tuple`` has fewer than five items
        """
        # Placeholders let sqlite3 bind the values, so quotes in scraped text
        # can neither break the statement nor inject SQL.
        # NOTE(review): values are now bound with their Python types instead
        # of being spliced in as literals; confirm the column types of qsbk
        # if age/vote/comments arrive as strings.
        sql = 'INSERT INTO qsbk(name,age,content,vote,comments) VALUES (?,?,?,?,?)'
        params = (dz_tuple[0], dz_tuple[1], dz_tuple[2], dz_tuple[3], dz_tuple[4])
        cls.cursor.execute(sql, params)
        cls.connect.commit()
# Module-level side effect: runs when this module is loaded and opens the
# shared DBManger connection (sqlite3 database file 'qsbk.db').
DBManger.connect_db()
运行结果展示: