'''
@Description: Scrape a given Tieba post's text and images (stickers included) into a Word
document, and download its voice clips and videos.
@Example post id: 5706691546
@Running the script creates a folder in the current directory, named after the post id,
containing subfolders for four kinds of content: documents, images, voice clips, and videos.
@Author: 吴越南蛮
@Completed: 2019.01.28
@Voice download mostly done
@Added video download
'''
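# Third-party dependencies used below; a likely install line (assuming current PyPI names):
#   pip install requests beautifulsoup4 lxml python-docx retrying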
import requests
import re
import os
import json
from time import perf_counter  # timing (time.clock was removed in Python 3.8)
from docx.image.exceptions import UnrecognizedImageError  # raised by docx.image for some hotlinked images
from docx import Document  # docx document object
from docx.oxml.ns import qn  # for setting East Asian fonts
from docx.shared import Inches  # for sizing images in the docx
from bs4 import BeautifulSoup
from multiprocessing import Pool  # process pool
from functools import partial  # partial application
from retrying import retry  # retry decorator
def Make_dir(tid):
    '''Create the output folders and return a tuple of all folder paths'''
    allpath = os.path.join(os.getcwd(), tid) + os.sep  # root output folder, named after the post id
    path1 = allpath + 'floor_headimg' + os.sep  # avatars of floor posters
    path2 = allpath + 'floor_img' + os.sep  # images in floor posts
    path3 = allpath + 'comment_headimg' + os.sep  # avatars of commenters
    path4 = allpath + 'comment_img' + os.sep  # images in comments
    path5 = allpath + 'document' + os.sep  # Word documents
    path6 = allpath + 'voice' + os.sep  # voice clips
    path7 = allpath + 'video' + os.sep  # videos
    pathlist = (allpath, path1, path2, path3, path4, path5, path6, path7)
    for i in pathlist:
        if not os.path.exists(i):
            os.mkdir(i)  # create each folder only if it does not exist yet
    return pathlist
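# Sketch of the resulting on-disk layout (using the example id from the header):
#   5706691546/
#       floor_headimg/  floor_img/  comment_headimg/  comment_img/
#       document/  voice/  video/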
@retry(wait_fixed=1000, stop_max_attempt_number=5)  # retrying's wait_fixed is in milliseconds
def Check_url(tid):
    '''Check whether tid is valid; return the page data if so, otherwise None'''
    url = 'https://tieba.baidu.com/p/' + tid + '?pn=1'
    response = requests.get(url, allow_redirects=False)  # request the first page; forbid redirects
    if response.status_code == 200:  # request succeeded
        string = response.text
        pn = re.compile('<title>(.*?)</title>')
        title = re.search(pn, string).group(1)  # extract the page title
        if title == '贴吧404':
            print('The post is hidden or does not exist, please try again')
            return None
        elif title == '百度贴吧':
            print('The forum has been merged, please try again')
            return None
        else:
            return string
    elif response.status_code == 301:
        r = requests.get(url)
        if r.status_code == 200:
            return r.text
        else:
            stacode = r.status_code
            print('Error code ' + str(stacode) + ', please try again')
    else:
        stacode = response.status_code
        print('Error code ' + str(stacode) + ', please try again')  # non-200: report the status code
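# Expected behaviour, roughly: Check_url('5706691546') returns the first page's HTML on
# success and None (after printing a message) when the post is hidden, merged, or gone.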
def Initdoc():
    '''Initialize and return a Document object'''
    doc = Document()  # create a Document instance
    doc.styles['Normal'].font.name = 'Helvetica Neue'  # set the body font
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), 'Helvetica Neue')  # and its East Asian variant
    return doc
def Init():
    '''Collect the post's basic info and return a subset of it'''
    global pic_dic
    pic_dic = {}  # global dict mapping image urls to their local paths
    while True:
        tid = input('Enter the post id: ')
        string = Check_url(tid)  # validate the tid
        if string:
            break  # valid tid: leave the loop
    mode0 = input('Only show the OP? Enter 1 for yes, 0 for no\n')
    if mode0 != '0' and mode0 != '1':
        exit()
    mode1 = input('Also scrape comments? Enter 1 for yes, 0 for no\n')
    if mode1 != '0' and mode1 != '1':
        exit()
    modelist = (mode0, mode1)
    print('Initializing...')
    if mode0 == '1':
        url = 'https://tieba.baidu.com/p/' + tid + '?see_lz=1&pn=1'  # OP-only view: re-fetch to get its page count
        string = requests.get(url).text
    doc = Initdoc()
    pathlist = Make_dir(tid)
    pn = re.compile(r"fid:'(\d+)'")
    fid = re.search(pn, string).group(1)  # fid sits in inline JSON, so BeautifulSoup cannot reach it
    soup = BeautifulSoup(string, features='lxml')  # extract the remaining info with BeautifulSoup
    title = soup.title.string
    page = soup.find('li', class_="l_reply_num").contents[2].string
    result = soup.find(lambda tag: tag.has_attr('username'))
    author = result['username']
    lzpic = result['src']
    grade = soup.find('div', class_="d_badge_lv").string
    url = 'url: https://tieba.baidu.com/p/' + tid
    run = doc.add_heading('', level=2).add_run(title)  # write the title
    run.font.name = '宋体'  # set the heading font
    run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
    doc.add_paragraph('OP: ' + author)  # write the OP's username
    doc.add_paragraph('Tieba level: ' + grade)  # write the OP's level
    picpath = Download_pic(lzpic, pathlist[1])  # download the OP's avatar
    doc.add_picture(picpath, width=Inches(1))  # insert the avatar at 1 inch wide
    doc.add_paragraph(url)  # write the post url
    doc.add_paragraph('Pages: ' + page)  # write the page count
    docpath = pathlist[5] + title + '.docx'
    doc.save(docpath)  # save the doc into the document folder
    print('Initialization complete, ' + page + ' pages in total')
    return (int(page), tid, fid, pathlist, modelist)
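# Init() returns (page_count: int, tid, fid, pathlist, modelist), which matches the
# parameter list of Main() below; pathlist indices follow the order set in Make_dir().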
@retry(wait_fixed=5000)  # retrying's wait_fixed is in milliseconds
def Get_floor_datalist(tid, page, mode):
    '''Fetch and return the floor data list for one page'''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) C'
                      'hrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
    }
    if mode == '1':  # the OP-only url differs slightly
        url = 'https://tieba.baidu.com/p/' + tid + '?see_lz=1&pn=' + page
    else:
        url = 'https://tieba.baidu.com/p/' + tid + '?pn=' + page
    response = requests.get(url, headers=headers)
    string = response.text
    pn1 = re.compile('<img username="(.*?)" class="" src="(.*?)"(?: data-tb-lazyload="(.*?)")?/>')
    pn2 = re.compile(r'id="post_content_(\d+)".*?display:;">(.*?)</div><br>', re.S)
    pn3_1 = re.compile(r'<span class="tail-info">(\d+)楼</span>.*?"tail-info">(.*?)</span>', re.S)
    pn3_2 = re.compile('&quot;content&quot;:({.*?})')
    temp_list = re.findall(pn1, string)
    content_list = re.findall(pn2, string)
    foot_list = re.findall(pn3_1, string)
    user_list = []
    for i in temp_list:
        j = list(i)  # re.findall returns tuples; convert to a list so it can be edited
        if j[2] == '':  # no lazyload attribute: src already holds the real avatar
            del j[2]  # drop the empty slot
        else:  # src is a default placeholder; the real avatar sits in the lazyload attribute
            del j[1]
        user_list.append(j)
    if not foot_list:  # some pages embed the tail info as escaped JSON instead
        for i in re.findall(pn3_2, string):
            temp = i.replace('&quot;', '"')  # unescape the JSON before parsing
            tail_list = json.loads(temp)
            flrnum = tail_list['post_no']
            time = tail_list['date']
            foot_list.append((str(flrnum), time))
    return list(zip(user_list, content_list, foot_list))  # one entry per floor
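# Shape of each entry in the returned floor list, as consumed by Main():
#   ((username, avatar_src), (pid, content_html), (floor_no, post_time))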
@retry(wait_fixed=1000)  # retrying's wait_fixed is in milliseconds
def Get_comment_datalist(tid, pid, page):
    '''Fetch and return the comment data list for one floor page'''
    url = 'https://tieba.baidu.com/p/comment?tid=' + tid + '&pid=' + pid + '&pn=' + page
    response = requests.get(url)
    pn1 = re.compile(r'{"spid":"(\d+)"')
    spid = re.findall(pn1, response.text)
    pn = re.compile('username="(.*?)"><img src="(.*?)"/>.*?"lzl_content_main">(.*?)</span>.*?"lzl_time">(.*?)</span>')
    comlist = re.findall(pn, response.text)
    return tuple(zip(spid, comlist))
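# Shape of each entry in the returned tuple, as consumed by Parse_comment_data():
#   (spid, (username, avatar_src, content_html, comment_time))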
def Parse_comment_data(tid, pid, doc, pathlist):
    '''Parse the comments under one floor'''
    page = 0  # comment page counter
    while True:
        page += 1  # advance to the next comment page
        comlist = Get_comment_datalist(tid, pid, str(page))  # fetch that page's comments
        if len(comlist):  # an empty list means we ran past the last page
            for i in comlist:
                doc.add_paragraph(i[1][0])  # write the commenter's username
                picpath = Download_pic(i[1][1], pathlist[3])  # download the commenter's avatar
                try:
                    doc.add_picture(picpath, width=Inches(1))  # insert the avatar at 1 inch wide
                except UnrecognizedImageError:
                    pass
                Parse_content(i[1][2], tid, i[0], doc, pathlist[4], pathlist[6], pathlist[7])
                doc.add_paragraph(i[1][3])  # write the comment time
        else:
            break  # empty page: all comment pages have been read
def Parse_content(content, tid, pid, doc, pic_path, voice_path, video_path):
    '''Parse the body of a floor or comment'''
    temp = content.replace('<br><br>', '<br>')  # collapse double line breaks
    temp = temp.replace('<br>', '\n')  # <br> to newline
    temp = temp.replace('<br/>', '\n')  # <br/> to newline
    temp = temp.replace('&gt;', '>')  # decode HTML entities
    temp = temp.replace('&quot;', '"')
    temp = temp.replace('&nbsp;', ' ')
    temp = re.sub('<a href=".*?".*?class="at">', '@', temp)  # turn @-mention links into a plain '@'
    pn = re.compile('<div class="voice_player.*?<span class="speaker speaker_animate">', re.S)
    result = re.findall(pn, temp)
    if result:
        Download_voice(tid, pid, voice_path)
        temp = re.sub(pn, 'pid:' + pid + ' ', temp)  # replace the voice-player markup with a pid note
    rs = re.sub('<img class="j_voice_ad_gif".*?/>', '', temp)  # drop the voice-hint gif
    conlist = re.split('(<img.*?>)', rs)  # the capture group keeps <img> tags at odd indices
    for i, j in enumerate(conlist):
        if i % 2 == 0:  # text chunk (may contain embedded videos)
            tp = re.split('<embed.*?data-video="(.*?)".*?>', j)  # video urls land at odd indices
            for k, l in enumerate(tp):
                if k % 2 == 0:
                    text = re.sub('<.*?>', '', l).strip()  # drop remaining tags and surrounding whitespace
                    result = re.split('&#x(.*?);', text)  # split out numeric entities (of the form &#x0283;)
                    for n, m in enumerate(result):  # a separate index: reusing k here broke the check below
                        if n % 2 != 0:
                            result[n] = chr(int(m, 16))  # parse the hex code point into its character
                    text = ''.join(result)
                    if k == 0:
                        paragraph = doc.add_paragraph(text)  # the first text segment starts a new paragraph
                    else:
                        paragraph.add_run(text)  # later segments are appended to it
                else:
                    Download_video(l, video_path)
                    doc.add_paragraph(l)  # keep the video url in the document
        else:  # <img> tag
            src = re.search('src="(.*?)"', j).group(1)  # the img's src attribute
            width = re.search('width="(.*?)"', j)  # the img's width attribute
            if width:  # width may be absent
                width = int(width.group(1)) * 10000  # docx wants the page's width value * 10000 (EMU)
            try:
                picpath = Download_pic(src, pic_path)  # download the image
                run = paragraph.add_run()
                run.add_picture(picpath, width=width)  # append the image at the given width
            except Exception:  # some hotlinked images raise here
                pass
@retry(wait_fixed=1000, stop_max_attempt_number=5)  # retrying's wait_fixed is in milliseconds
def Download_pic(url, path):
    '''Download an image into the given folder and return its local path'''
    global pic_dic  # global url-to-path cache
    if url in pic_dic:
        filepath = pic_dic[url]  # already downloaded: skip (avoids fetching the same image twice)
    else:
        if not url.startswith('http'):
            url = 'http:' + url  # some image urls are protocol-relative; prepend http
        filename = url.split('/')[-1]  # last path segment of the url
        filename = filename.split('?')[0]  # strip any query parameters
        filepath = path + filename  # local target path
        temp = os.path.splitext(filepath)  # split off the extension
        print('Downloading image ' + filename + '...')
        response = requests.get(url)
        content = response.content
        if temp[1] == '':  # some image urls carry no extension
            ext = response.headers['Content-Type'].split('/')[-1]  # derive it from the Content-Type header
            filepath = filepath + '.' + ext  # append the extension
        with open(filepath, 'wb') as file:
            file.write(content)  # write the image to disk
        pic_dic[url] = filepath
    return filepath  # final local path of the image
def Download_voice(tid, pid, path):
    '''Download a floor's voice clip as <pid>.wav (the hard-coded Cookie below is session-specific and may expire)'''
headers = {
'Cookie':'BAIDUID=C60BF37451CEA27DDC645FF2B66A171A:FG=1; BIDUPSID=C60BF37451CEA27DDC645FF2B66A17'
'1A; PSTM=1546781299; TIEBA_USERTYPE=87db289fa99dbc157b31dd26; bdshare_firstime=15468761'
'16968; TIEBAUID=7b00a4229d95323b1b9270f6; IS_NEW_USER=c45bea2255d884ab5865d09b; CLIENTW'
'IDTH=921; CLIENTHEIGHT=1920; 754897887_FRSVideoUploadTip=1; LONGID=754897887; Hm_lvt_28'
'7705c8d9e2073d13275b18dbd746dc=1548485695,1548515348,1548515536,1548516349; Hm_lpvt_287'
'705c8d9e2073d13275b18dbd746dc=1548516349; wise_device=0; Hm_lvt_98b9d8c2fd6608d564bf2ac'
'2ae642948=1548515347,1548515535,1548516348,1548516516; Hm_lpvt_98b9d8c2fd6608d564bf2ac2'
'ae642948=1548516516; delPer=0; PSINO=1; H_PS_PSSID=1466_21087_28328_28413_22158; BDORZ=F'
'FFB88E999055A3F8A630C64834BD6D0; ZD_ENTRY=baidu',
'User-Agent':'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/5'
'5.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
}
    url = 'https://tieba.baidu.com/voice/index?tid=' + tid + '&pid=' + pid
    print('Downloading voice clip ' + pid + '...')
    r = requests.get(url, headers=headers)
    filepath = path + pid + '.wav'
    with open(filepath, 'wb') as file:
        file.write(r.content)
def Download_video(url, path):
    '''Download a video into the given folder'''
    filename = url.split('/')[-1]
    print('Downloading video ' + filename + '...')
    filepath = os.path.join(path, filename)
    r = requests.get(url)
    with open(filepath, 'wb') as file:
        file.write(r.content)
def Main(page, tid, fid, pathlist, modelist):
    '''Parse all data of one page of the post'''
    global pic_dic
    pic_dic = {}  # each worker process keeps its own url-to-path cache
    page = str(page)
    doc = Initdoc()
    flrlist = Get_floor_datalist(tid, page, modelist[0])  # fetch the page's floor data
    for i in flrlist:
        doc.add_paragraph('Floor ' + i[2][0] + ':')  # write the floor number
        if modelist[0] == '0':
            doc.add_paragraph(i[0][0])  # poster's username
            picpath = Download_pic(i[0][1], pathlist[1])
            doc.add_picture(picpath, width=Inches(1))  # poster's avatar
        Parse_content(i[1][1], tid, i[1][0], doc, pathlist[2], pathlist[6], pathlist[7])  # write the floor body
        doc.add_paragraph(i[2][1])  # write the floor time
        if modelist[1] == '1':
            Parse_comment_data(tid, i[1][0], doc, pathlist)
        doc.add_paragraph('\n')
    docpath = pathlist[5] + page + '.docx'
    doc.save(docpath)
if __name__ == '__main__':
    data = Init()
    start = perf_counter()
    pool = Pool()
    pool.map(partial(Main, tid=data[1], fid=data[2], pathlist=data[3], modelist=data[4]),
             range(1, data[0] + 1))
    pool.close()
    pool.join()
    end = perf_counter()
    print('Scrape complete, total running time {:.2f} s'.format(end - start))
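# Rough usage: run this file directly, enter a post id (e.g. 5706691546) at the prompt,
# then answer the two 1/0 questions for OP-only mode and comment scraping.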