1.代码中需要引入的包
import re
from urllib import request
from fake_useragent import UserAgent
agent = UserAgent()
from tool import Tools
import xlwt
2.解析url
爬取-----精品贴
https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
kw 贴吧名称
tab 帖子类型
pn 数据页码
https://tieba.baidu.com/p/5328438222?pn=2
/p/5381402933 帖子详情地址
pn 页码
import re
from urllib import request
from fake_useragent import UserAgent
agent = UserAgent()
from tool import Tools
import xlwt
2.解析url
爬取-----精品贴
https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
kw 贴吧名称
tab 帖子类型
pn 数据页码
https://tieba.baidu.com/p/5328438222?pn=2
/p/5381402933 帖子详情地址
pn 页码
3. 代码出现编码格式错误
改正方法如下:
get_html函数中把self.html = response.read().decode('utf-8')改为:
self.html = response.read().decode('utf-8', 'ignore')
5.获取下一页链接位置的简单方式（在 parse_link 中用 find 定位“下一页”按钮所在的 HTML）:
index = self.html.find('class="next pagination-item')
print(index)
完整代码:
# -*- coding:utf-8 -*-
import re
from urllib import request
from fake_useragent import UserAgent
agent = UserAgent()
from tool import Tools
import xlwt
'''
爬取-----精品贴
https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
kw 贴吧名称
tab 帖子类型
pn 数据页码
https://tieba.baidu.com/p/5328438222?pn=2
/p/5381402933 帖子详情地址
pn 页码
'''
class BDTBSpider(object):
    """Crawler for the "good" (featured) threads of a Baidu Tieba forum.

    Fetches the featured-thread list of one forum, follows every thread,
    and writes each post's author / content / meta data into an ``.xls``
    workbook named after the thread title.
    """

    def __init__(self):
        # Base host used to build absolute thread URLs.
        self.url = 'https://tieba.baidu.com'
        # Raw HTML of the most recently fetched page.
        self.html = ''
        # Random User-Agent to reduce the chance of being blocked.
        self.headers = {'User-Agent': agent.random}
        # Next spreadsheet row to write; row 0 holds the header row.
        self.count = 1

    def get_html(self, url):
        """Download *url* and store its decoded HTML in ``self.html``."""
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        # 'ignore' skips bytes that are not valid UTF-8 instead of raising.
        self.html = response.read().decode('utf-8', 'ignore')

    def parse_link(self):
        """Extract (href, title) pairs from the thread-list page and crawl each thread."""
        pattern = re.compile('<div class="threadlist_title.*?<a rel="noreferrer".*?href="(.*?)".*?title="(.*?)"', re.S)
        res = re.findall(pattern, self.html)
        for info in res:
            print("正在爬取{},请稍后.....".format(info[1]))
            # Build the absolute thread-detail URL.
            url = self.url + info[0]
            print('帖子链接:{}'.format(url))
            self.get_html(url)
            # One workbook per thread, with a header row.
            workbook = xlwt.Workbook(encoding='utf-8')
            sheet = workbook.add_sheet('data')
            headers = ('用户昵称', '用户头衔', '用户等级', '发表内容', '客户端', '楼层', '发布日期')
            for col, title in enumerate(headers):
                sheet.write(0, col, title)
            # Restart row numbering for every thread's workbook.
            self.count = 1
            self.parse_detail(sheet)
            # Save under the thread title.
            workbook.save(info[1] + '.xls')
        # NOTE(review): following the forum-level "next page" link is
        # intentionally disabled — only the first list page is crawled.
        # The locator below is kept as a debugging aid.
        index = self.html.find('class="next pagination-item')
        print(index)

    def parse_detail(self, sheet):
        """Parse one thread-detail page and append each post as a row of *sheet*.

        Recurses through the thread's own "next page" links until exhausted.
        """
        pattern = re.compile('<li class="d_name".*?>(.*?)</li>.*?class="d_badge_title ">(.*?)</div>.*?class="d_badge_lv">(.*?)</div>.*?<cc>(.*?)</cc>.*?<div class="post-tail-wrap">(.*?)</div>', re.S)
        res = re.findall(pattern, self.html)
        for info in res:
            nickname = Tools.strip_char(info[0])
            content = Tools.strip_char(info[3])
            # msg is a (client, floor, date) tuple extracted from the post tail.
            msg = Tools.get_client_floor_date(info[4])
            print(msg)
            row = (nickname, info[1], info[2], content) + tuple(msg)
            for col, value in enumerate(row):
                sheet.write(self.count, col, value)
            self.count += 1
        # The next-page anchor sits just before the literal text '下一页'.
        index = self.html.find('下一页')
        if index != -1:
            next_html = self.html[index-40:index]
            match = re.search(re.compile('<a href="(.*?)"'), next_html)
            if match is None:
                # Anchor text present but no href nearby — stop instead of crashing.
                print('没有下一页')
                return
            next_href = match.group(1)
            print(next_href)
            # e.g. https://tieba.baidu.com/p/5328438222?pn=2
            url = self.url + next_href
            self.get_html(url)
            self.parse_detail(sheet)
        else:
            print('没有下一页')

    def start(self, name=None):
        """Entry point: crawl the featured threads of forum *name*.

        Falls back to the module-level ``name`` global when called without an
        argument, so the original ``spider.start()`` call style keeps working.
        """
        from urllib.parse import quote
        if name is None:
            name = globals()['name']
        # Percent-encode the forum name so non-ASCII names form a valid URL.
        self.get_html('https://tieba.baidu.com/f?kw=%s&tab=good&cid=&pn=0' % quote(name))
        self.parse_link()
if __name__ == '__main__':
    # Build the spider first, then ask which forum to crawl and start it.
    bdtb = BDTBSpider()
    name = input('请输入贴吧名称:')
    bdtb.start()
引入的工具类的完整代码:
# -*- coding:utf-8 -*-
import re
import sqlite3
class Tools(object):
    """Static text-cleaning helpers shared by the scrapers."""

    # Matches newlines, tabs, spaces and any HTML tag; compiled once
    # instead of on every strip_char call.
    _STRIP_PATTERN = re.compile('\n|\t| |<.*?>', re.S)

    @classmethod
    def strip_char(cls, string):
        """Remove whitespace control characters and HTML tags from *string*.

        :param string: raw HTML fragment to clean
        :return: the cleaned plain text
        """
        return re.sub(cls._STRIP_PATTERN, '', string)

    @classmethod
    def get_client_floor_date(cls, string):
        """Split a Tieba post-tail HTML fragment into its three fields.

        :param string: HTML containing the client / floor / post-date spans
        :return: a (client, floor, date) tuple of strings
        """
        if '来自' in string:
            # Mobile posts: the client name sits inside an <a> tag,
            # followed by the floor span and the date span.
            pattern = re.compile('<span class="tail-info".*?<a rel="noopener.*?>(.*?)</a>.*?class="tail-info">(.*?)</span.*?class="tail-info">(.*?)</span>')
            res = re.search(pattern, string)
            client = '来自' + res.group(1)
            floor = res.group(2)
            date = res.group(3)
        else:
            # Web posts carry no client span; only floor and date remain.
            pattern = re.compile('<span class="tail-info">(.*?)</span.*?class="tail-info">(.*?)</span>')
            res = re.search(pattern, string)
            client = '来自Web客户端'
            floor = res.group(1)
            date = res.group(2)
        return client, floor, date
# 数据库管理
class DBManager(object):
    """Minimal sqlite3 helper holding one shared connection and cursor."""

    # Shared class-level handles; populated by connect_db().
    connect = None
    cursor = None

    @classmethod
    def connect_db(cls):
        """Open (or create) qsbk.db and cache the connection and cursor."""
        cls.connect = sqlite3.connect('qsbk.db')
        cls.cursor = cls.connect.cursor()

    @classmethod
    def close_db(cls):
        """Release the cursor and close the connection."""
        cls.cursor.close()
        cls.connect.close()

    @classmethod
    def insert_data(cls, dz_tuple):
        """Insert one (name, age, content, vote, comments) row and commit.

        Uses a parameterized query: the previous %-formatted SQL broke on
        quote characters in scraped text and was open to SQL injection.

        :param dz_tuple: sequence whose first five items are the row values
        """
        sql = "insert into qsbk(name,age,content,vote,comments) values(?, ?, ?, ?, ?)"
        cls.cursor.execute(sql, (dz_tuple[0], dz_tuple[1], dz_tuple[2], dz_tuple[3], dz_tuple[4]))
        cls.connect.commit()
if __name__ == '__main__':
    # Smoke test: verify that the sqlite database can be opened.
    DBManager.connect_db()
运行结果: