睿思是个很好的平台,给西电er在科研之余带来了很多的乐趣,在水睿思的时候就想能不能写个程序自动回复帖子,于是就有了玩转睿思系列博客。睿思自动回帖共包括三部分:
玩转睿思一:模拟浏览器回帖
玩转睿思二:生成回帖信息
玩转睿思三:自动有选择地回帖
本文介绍如何从睿思自动获得最新回复和最新发表帖子的id,将帖子信息保存到数据库,然后对数据库中保存的帖子有选择性地回复,并将回复信息保存入数据库,更新数据库
一.获取最新回复帖子id,将帖子信息存入数据库
<1>获取帖子id
睿思http://rs.xidian.edu.cn/forum.php?mod=guide&view=newthread 页面为最新发表的帖子列表,http://rs.xidian.edu.cn/forum.php?mod=guide&view=new 页面为最新回复的帖子列表,两个页面的布局相似,帖子id位于target="_blank", class="xst"的a标签中。
<a href="forum.php?mod=viewthread&tid=982061&extra=" target="_blank" class="xst" >好桑心</a>
def get_tiezi_id(url,cookie,userAgent):
    """Collect the thread ids (tid) linked from a forum listing page.

    The page is fetched with ``get_html`` and every
    ``<a class="xst" target="_blank">`` anchor is inspected; the digits that
    follow ``tid=`` in its ``href`` are taken as the thread id.

    Args:
        url: Address of the listing page to scan.
        cookie: Passed through unchanged to ``get_html``.
        userAgent: Passed through unchanged to ``get_html``.

    Returns:
        A list of thread ids as strings, in page order, or ``None`` when the
        page contains no matching anchor at all.
    """
    htmlText = get_html(url,cookie,userAgent)
    soup = BeautifulSoup(htmlText,features="html5lib")
    links = soup.find_all('a',attrs={'class': 'xst','target':'_blank'})
    if not links:
        print('get tid error')
        return None
    tidPattern = re.compile(r'tid=([0-9]+)')
    tieziIdList = []
    for link in links:
        # An anchor without an href, or whose href carries no tid, is skipped
        # instead of aborting the whole scan with an exception.
        matched = tidPattern.search(link.get('href') or '')
        if matched:
            tieziIdList.append(matched.group(1))
    return tieziIdList
<2>.将帖子信息存入数据库
SQLite 是一个软件库,实现了自给自足的、无服务器的、零配置的、事务性的 SQL 数据库引擎,其数据库就是一个文件。Python标准库中自带sqlite3模块。使用下面的函数建立一个数据库,并在其中创建表用于保存帖子信息:tid(帖子id),fid(版块id),pbt(发表时间),title(帖子标题,注意表中的列名写作TITLT),ret(回复时间),remes(回复内容),isrep(是否已经回复过的标志)。
def create_table(dbPath,tableName):
    """Create the thread table in the SQLite database file ``dbPath``.

    The database file is created by ``sqlite3.connect`` when it does not
    exist yet. The call is idempotent: an already existing table is left
    untouched, so it is safe to run on every start-up.

    Columns: TID, FID, PBT, TITLT, RET, REMES, ISREP. The spelling ``TITLT``
    is kept on purpose because other statements in this file write to a
    column with exactly that name.

    Args:
        dbPath: Path of the SQLite database file.
        tableName: Name of the table; it is interpolated into the statement
            as an identifier, so it must come from trusted configuration.
    """
    conn = sqlite3.connect(dbPath)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS {tableName}
            (
            TID INT NOT NULL,
            FID INT,
            PBT TEXT,
            TITLT TEXT,
            RET TEXT,
            REMES TEXT,
            ISREP INT
            );'''.format(tableName = tableName))
        conn.commit()
    finally:
        # Release the file handle even when the statement fails.
        conn.close()
将获取的帖子id保存到数据库dbPath的表tableName中:表中已有的id不会重复插入,新插入的记录ISREP为0(表示尚未回复)
def insert_tid(tid_list,dbPath,tableName):
    """Store new thread ids in ``tableName`` with ISREP set to 0.

    Ids that are already present in the table are left alone, so the
    function can be fed overlapping lists repeatedly.

    Args:
        tid_list: Iterable of thread ids (digit strings or ints). An empty
            list or ``None`` is reported and ignored.
        dbPath: Path of the SQLite database file.
        tableName: Name of the target table; interpolated as an identifier,
            so it must come from trusted configuration.

    Raises:
        ValueError: If an id cannot be converted to an integer.
    """
    # Check the input first so that no connection is opened (and leaked)
    # when there is nothing to do.
    if not tid_list:
        print('tid is erroe')
        return
    selectSql = 'SELECT TID from {tableName} WHERE TID = ?'.format(tableName = tableName)
    insertSql = 'INSERT INTO {tableName} (TID,ISREP) VALUES(?,0)'.format(tableName = tableName)
    conn = sqlite3.connect(dbPath)
    try:
        cur = conn.cursor()
        for tid in tid_list:
            # The id is bound as a parameter rather than pasted into the SQL.
            tidValue = int(tid)
            if cur.execute(selectSql,(tidValue,)).fetchall():
                continue
            cur.execute(insertSql,(tidValue,))
            # Commit per row so earlier inserts survive a later failure.
            conn.commit()
            print('INSERT INTO {tableName} (TID,ISREP) VALUES({tid},0) OK'.format(tableName = tableName,tid = tid))
    finally:
        conn.close()
<3>.实验
import AutoReply
import time
import random
# Polling script: periodically reads the "latest replies" listing and records
# every thread id that is not yet in the database.
cookie = ''  # left empty in the article; presumably filled with the reader's own session cookie
userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'
url = 'http://rs.xidian.edu.cn/forum.php?mod=guide&view=new'
db = 'ruisi.db'
table = 'ruisi'
# On the very first run the table has to be created once:
#AutoReply.create_table(db,table)
while True:
    try:
        tid_list = AutoReply.get_tiezi_id(url,cookie,userAgent)
    except Exception as err:
        # A single failed poll must not stop the collector. Catching
        # Exception (not BaseException) keeps Ctrl-C and SystemExit working.
        print('get tid list failed: {err}'.format(err = err))
        tid_list = []
    AutoReply.insert_tid(tid_list,db,table)
    # Wait a random 1 to 10 minutes (in 6-second steps) before polling again.
    timeGap = 60*(random.randint(10,100)/10)
    time.sleep(timeGap)
二.选择性的回复帖子并保存回复信息
在回复帖子的时候要符合论坛规定,发表时间过长的帖子不要回复,在页面中发表时间有两种形式:
<em id="authorposton25139294">发表于 <span title="2018-12-24 22:25:37">5 天前</span></em>
<em id="authorposton25116227">发表于 2018-12-19 16:46:32</em>
使用下面函数获取帖子发表时间
def get_tiezi_time(htmlText):
    """Extract the publish time from a thread page.

    The first element whose id looks like ``authorposton<digits>`` is used.
    The page shows the time in one of two shapes:

    * relative: ``发表于 <span title="2018-12-24 22:25:37">5 天前</span>``,
      where the absolute time is the span's ``title`` attribute;
    * absolute: ``发表于 2018-12-19 16:46:32``, where the time is the text
      after the ``发表于`` prefix.

    Args:
        htmlText: HTML source of the thread page; ``None`` or an empty string
            is treated as "time not found".

    Returns:
        The publish time as written in the page, or ``None`` (after printing
        an error) when it cannot be located.
    """
    if not htmlText:
        print('get tieze publish time error')
        return None
    anchorIds = re.findall(r'authorposton[0-9]+',htmlText)
    if not anchorIds:
        print('get tieze publish time error')
        return None
    soup = BeautifulSoup(htmlText,features="html5lib")
    # Only the first id found in the page source is looked up.
    timeTag = soup.find(attrs={'id': anchorIds[0]})
    if timeTag is None:
        print('get tieze publish time error')
        return None
    span = timeTag.span
    if span is not None:
        # Relative form: the absolute time lives in the title attribute.
        if span.get('title'):
            return span['title']
    elif timeTag.contents and isinstance(timeTag.contents[0],str):
        # Absolute form: strip the leading "发表于" label.
        return timeTag.contents[0][len('发表于'):].strip()
    print('get tieze publish time error')
    return None
从数据库中读取帖子id,并有选择地进行回复,将回复信息存入数据库
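数据库中ISREP字段记录帖子的处理状态:ISREP为0表示该帖子还没有被自动回复过;ISREP为1表示该帖子已被自动回复过;ISREP>1时,其值为该帖子的fid,表示帖子不在灌水区、没有回复;ISREP为-1表示帖子发表时间过长、不再回复。函数参数reType用于选择回复方式:0使用reply_random,1使用reply_random_similar,2使用reply_tuling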
def reply(dbPath,tableName,cookie,userAgent,reType=1):
    """Reply to the not-yet-handled threads recorded in the database.

    Every row with ``ISREP = 0`` is processed in turn:

    * threads published more than one calendar day ago are marked
      ``ISREP = -1`` and skipped;
    * threads outside forum 72 (the only forum replied to) are marked with
      ``ISREP = <fid>`` and skipped;
    * threads with fewer than 5 collected messages are skipped without being
      marked, so they are looked at again on the next call;
    * otherwise a reply is posted and the row is updated with the forum id,
      publish time, title, reply time, reply text and ``ISREP = 1``.

    Args:
        dbPath: Path of the SQLite database file.
        tableName: Name of the thread table; interpolated as an identifier,
            so it must come from trusted configuration.
        cookie: Passed through to the fetching/posting helpers.
        userAgent: Passed through to the fetching/posting helpers.
        reType: Reply strategy: 0 calls ``reply_random``, 1 calls
            ``reply_random_similar``, 2 calls ``reply_tuling``. Any other
            value is reported and the thread is skipped.
    """
    import datetime  # local import: needed for real calendar-day arithmetic

    # Only this forum is replied to. The article also lists forum ids that
    # could serve as a blacklist instead (trading, resources, recruitment,
    # site management, invitation, lost-and-found, study areas); that
    # alternative check was disabled in the original code.
    waterZoneFid = 72

    # Values are bound as parameters: titles and reply texts are scraped from
    # the forum and may contain quotes that would break a formatted statement.
    markSkippedSql = 'UPDATE {tableName} SET FID=?,ISREP=? WHERE TID = ?'.format(tableName=tableName)
    markRepliedSql = 'UPDATE {tableName} SET FID=?,PBT=?,TITLT=?,RET=?,REMES=?,ISREP=? WHERE TID = ?'.format(tableName=tableName)

    conn = sqlite3.connect(dbPath)
    try:
        cur = conn.cursor()
        pending = cur.execute('SELECT TID from {tableName} WHERE ISREP = 0'.format(tableName = tableName)).fetchall()
        for row in pending:
            tid = row[0]
            print('\n********************************************************{tid}********************************************************'.format(tid=tid))
            url = 'http://rs.xidian.edu.cn/forum.php?mod=viewthread&tid='+str(tid)
            htmlText = get_html(url,cookie,userAgent)

            title = get_title(htmlText)
            if not title:
                print('get title error,replay {tid} error'.format(tid=tid))
                continue

            pubtime = get_tiezi_time(htmlText)
            if not pubtime:
                print('get pubtime error,replay {tid} error'.format(tid=tid))
                continue
            try:
                pubDate = datetime.datetime.strptime(pubtime,"%Y-%m-%d %H:%M:%S").date()
            except ValueError:
                # An unexpected time format skips this thread only.
                print('get pubtime error,replay {tid} error'.format(tid=tid))
                continue

            # Looked up before the age check so the "old thread" update below
            # has a value to store (it used to reference fid before it was set).
            fid = get_info('fid',htmlText)

            # Do not reply to threads published before yesterday. Full dates
            # are compared, so month and year boundaries are handled.
            if (datetime.date.today() - pubDate).days > 1:
                print('tid is old:{pbTime}'.format(tid =tid,pbTime = pubtime))
                cur.execute(markSkippedSql,(fid if fid else None,-1,tid))
                conn.commit()
                continue

            if not fid:
                print('get fid error,replay {tid} error'.format(tid=tid))
                continue
            fidNum = int(fid)
            if fidNum != waterZoneFid:
                # Remember the forum id in ISREP so the row is not retried.
                cur.execute(markSkippedSql,(fidNum,fidNum,tid))
                conn.commit()
                print('is not waterzone')
                continue

            messageList = get_replyed_message(htmlText)
            if not messageList:
                print('get messageList error,replay {tid} error'.format(tid=tid))
                continue
            if len(messageList) < 5:
                print('louceng taishao ')
                continue

            formhash = get_info('formhash',htmlText)
            if not formhash:
                print('get formhash error,replay {tid} error'.format(tid=tid))
                continue

            if reType == 0:
                mesTime = reply_random(messageList[1:],formhash,fid,tid,cookie,userAgent)
            elif reType == 1:
                mesTime = reply_random_similar(messageList[1:],formhash,fid,tid,cookie,userAgent)
            elif reType == 2:
                mesTime = reply_tuling(title,formhash,fid,tid,cookie,userAgent)
            else:
                print('replyType is error')
                continue
            if not mesTime:
                print('relpy error')
                continue

            # mesTime is indexed as (reply text, epoch seconds of the reply).
            reMes = mesTime[0]
            retime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(mesTime[1])))
            print('{retime} 回复 {tid} : {message}'.format(retime = retime,tid = tid,message = reMes))
            cur.execute(markRepliedSql,(fidNum,pubtime,title,retime,reMes,1,tid))
            conn.commit()
            # Pause between two posted replies.
            time.sleep(random.randint(15,55))
    finally:
        # Every update above is committed right away; just release the file.
        conn.close()
import AutoReply
import sqlite3
import time
import random
# Reply script: repeatedly works through the threads stored in the database.
cookie = ''  # left empty in the article; presumably filled with the reader's own session cookie
userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'
db = 'ruisi.db'
table = 'ruisi'
while True:
    try:
        AutoReply.reply(db,table,cookie,userAgent,reType=1)
    except Exception as err:
        # One failed round must not end the long-running loop.
        # Ctrl-C and SystemExit still propagate.
        print('reply round failed: {err}'.format(err = err))
    # Wait a random 5 to 30 minutes (in 6-second steps) before the next round.
    timeGap = 60*(random.randint(50,300)/10)
    time.sleep(timeGap)