玩转睿思三:自动有选择地回帖

睿思是个很好的平台,给西电er在科研之余带来了很多的乐趣,在水睿思的时候就想能不能写个程序自动回复帖子,于是就有了玩转睿思系列博客。睿思自动回帖共包括三部分:
玩转睿思一:模拟浏览器回帖
玩转睿思二:生成回帖信息
玩转睿思三:自动有选择地回帖

本文介绍如何从睿思自动获得最新回复和最新发表帖子的id,将帖子信息保存到数据库,然后对数据库中保存的帖子有选择性地回复,并将回复信息保存入数据库,更新数据库


一.获取最新回复帖子id,将帖子信息存入数据库

<1>获取帖子id

睿思http://rs.xidian.edu.cn/forum.php?mod=guide&view=newthread 页面为最新发表的帖子列表,http://rs.xidian.edu.cn/forum.php?mod=guide&view=new 页面为最新回复的帖子列表,两个页面的布局相似,帖子id位于target="_blank", class="xst"的a标签中。

 <a href="forum.php?mod=viewthread&amp;tid=982061&amp;extra=" target="_blank" class="xst" >好桑心</a>
def get_tiezi_id(url,cookie,userAgent):
    """Collect the thread ids (tid) linked from a forum listing page.

    Downloads ``url`` through ``get_html`` and scans every
    ``<a class="xst" target="_blank">`` anchor for a ``tid=<digits>``
    parameter in its ``href``.

    Args:
        url: Address of the listing page to download.
        cookie: Passed through unchanged to ``get_html``.
        userAgent: Passed through unchanged to ``get_html``.

    Returns:
        A list of tid strings in page order, or ``None`` (after printing
        ``'get tid error'``) when the page has no matching anchor at all.
    """
    htmlText = get_html(url,cookie,userAgent)

    soup = BeautifulSoup(htmlText,features="html5lib")
    # Each matching anchor is the title link of one thread in the listing.
    anchors = soup.find_all('a',attrs={'class': 'xst','target':'_blank'})

    if not anchors:
        print('get tid error')
        return None

    tid_pattern = re.compile(r'tid=([0-9]+)')
    tieziIdList = []
    for anchor in anchors:
        # An anchor without an href, or whose href carries no tid, is skipped
        # instead of crashing on a failed regex match.
        match = tid_pattern.search(anchor.get('href') or '')
        if match:
            tieziIdList.append(match.group(1))

    return tieziIdList
<2>.将帖子信息存入数据库

SQLite 是一个软件库,实现了自给自足的、无服务器的、零配置的、事务性的 SQL 数据库引擎,其数据库就是一个文件。python中集成了数据库sqlite3。使用下面函数建立一个数据库,并在其中创建表用于保存帖子信息:tid,fid,pbt(发表时间),title(表中列名写作TITLT),ret(回复时间),remes(回复内容),isrep(是否已经回复过标志)

def create_table(dbPath,tableName):
    """Create the thread table in the SQLite database at ``dbPath``.

    The database file is created by ``sqlite3.connect`` if it does not exist.
    The call is idempotent: an already existing table is left untouched, so
    it is safe to call on every start-up.

    Columns: TID (thread id), FID, PBT, TITLT, RET, REMES, ISREP. The column
    name ``TITLT`` is kept as spelled because ``reply`` updates the column
    under that name.

    Args:
        dbPath: Path of the SQLite database file.
        tableName: Name of the table to create. It is interpolated into the
            statement (identifiers cannot be bound as parameters), so it must
            come from trusted configuration.
    """
    conn = sqlite3.connect(dbPath)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS {tableName}
    (
       TID INT NOT NULL,
       FID INT,
       PBT TEXT,
       TITLT TEXT,
       RET TEXT,
       REMES TEXT,
       ISREP INT
    );'''.format(tableName = tableName))
        conn.commit()
    finally:
        # Release the connection even when the statement fails.
        conn.close()

将获取的帖子id保存入dbPath中的tableName

def insert_tid(tid_list,dbPath,tableName):
    """Insert thread ids that are not yet stored, flagged as not replied.

    Every tid from ``tid_list`` that has no row in ``tableName`` is inserted
    with ``ISREP = 0``; tids already present are left untouched.

    Args:
        tid_list: Iterable of thread ids. When it is empty or ``None``,
            ``'tid is erroe'`` is printed and nothing else happens.
        dbPath: Path of the SQLite database file.
        tableName: Target table. It is interpolated into the SQL text
            (identifiers cannot be bound), so it must be trusted.
    """
    # Check the input before connecting so the early return leaks nothing.
    if not tid_list:
        print('tid is erroe')
        return

    select_sql = 'SELECT TID from {tableName} WHERE TID = ?'.format(tableName = tableName)
    insert_sql = 'INSERT INTO {tableName} (TID,ISREP) VALUES(?,0)'.format(tableName = tableName)

    conn = sqlite3.connect(dbPath)
    try:
        cur = conn.cursor()
        for tid in tid_list:
            # The tid is bound as a parameter, never pasted into the SQL text.
            if cur.execute(select_sql,(tid,)).fetchone() is None:
                cur.execute(insert_sql,(tid,))
                conn.commit()
                print('INSERT INTO {tableName} (TID,ISREP) VALUES({tid},0) OK'.format(tableName = tableName,tid = tid))

        conn.commit()
    finally:
        conn.close()
<3>.实验
import AutoReply 
import time
import random

# Polling script: periodically reads the "latest replies" listing and stores
# any thread id that is not in the database yet.
cookie = ''
userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'

url = 'http://rs.xidian.edu.cn/forum.php?mod=guide&view=new'
db = 'ruisi.db'
table = 'ruisi'
while True:
    try:
        tid_list = AutoReply.get_tiezi_id(url,cookie,userAgent)
    except Exception as err:
        # Catch Exception, not BaseException, so Ctrl-C (KeyboardInterrupt)
        # and SystemExit can still stop the loop. A failed fetch is reported
        # and retried on the next round.
        print('get tid list failed: {err}'.format(err = err))
        tid_list = []

    #AutoReply.create_table(db,table)   # enable on the first run to create the table

    AutoReply.insert_tid(tid_list,db,table)
    # Wait a random 1 to 10 minutes (in 6-second steps) before polling again.
    timeGap = 60*(random.randint(10,100)/10)
    time.sleep(timeGap)
   

二.选择性的回复帖子并保存回复信息

在回复帖子的时候要符合论坛规定,发表时间过长的帖子不要回复,在页面中发表时间有两种形式:

<em id="authorposton25139294">发表于 <span title="2018-12-24 22:25:37">5&nbsp;天前</span></em>
<em id="authorposton25116227">发表于 2018-12-19 16:46:32</em>

使用下面函数获取帖子发表时间

def get_tiezi_time(htmlText):
    """Extract the publish time shown for the first post of a thread page.

    The first ``authorposton<digits>`` id found in the raw HTML selects the
    element to inspect. If that element contains a ``<span>``, the span's
    ``title`` attribute is returned; otherwise the element's first child is
    returned with the leading '发表于' prefix cut off and whitespace stripped.

    Args:
        htmlText: HTML source of the thread page.

    Returns:
        The publish time string, or ``None`` (after printing an error) when
        no such id or element can be found.
    """
    post_ids = re.findall('authorposton[0-9]+',htmlText)
    if not post_ids:
        print('get tieze publish time error')
        return None

    # Only the first id found in the page is looked up.
    first_post_id = post_ids[0]
    document = BeautifulSoup(htmlText,features="html5lib")
    candidates = document.find_all(attrs={'id': first_post_id})
    if not candidates:
        print('get tieze publish time error')
        return None

    time_tag = candidates[0]
    if time_tag.span:
        # The element wraps a span; the time is read from its title attribute.
        return time_tag.span['title']

    # Plain form: drop the '发表于' prefix from the element's first child.
    prefix_length = len('发表于')
    return time_tag.contents[0][prefix_length:].strip()

从数据库中读取帖子id,并有选择地进行回复,将回复信息存入数据库
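表中ISREP字段的含义:ISREP为0表示该帖子还没有被自动回复过,ISREP为1表示该帖子已被自动回复过,ISREP为-1表示该帖子发表时间过长而不回复,ISREP>1时,其值为该帖子的fid,表示该帖子不在灌水区且没有被回复过。reply函数的reType参数用于选择回帖方式:0调用reply_random,1调用reply_random_similar,2调用reply_tuling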

def _days_since_published(pubtime):
    """Return the number of calendar days between ``pubtime`` and today.

    ``pubtime`` must look like ``"2018-12-24 22:25:37"``; ``ValueError`` is
    raised otherwise. Whole dates are compared, so month and year boundaries
    are handled correctly.
    """
    import datetime

    published = datetime.datetime.strptime(pubtime,"%Y-%m-%d %H:%M:%S").date()
    return (datetime.date.today() - published).days


def reply(dbPath,tableName,cookie,userAgent,reType=1):
    """Reply to every stored thread that has not been handled yet.

    For each row with ``ISREP = 0`` the thread page is downloaded and checked:

    * threads published more than one calendar day ago are marked
      ``ISREP = -1`` and skipped;
    * threads outside board 72 are marked ``ISREP = <fid>`` and skipped;
    * threads with fewer than 5 extracted messages are skipped and left
      unmarked, so they are looked at again on the next call.

    For the remaining threads a reply is posted with the strategy chosen by
    ``reType``, and the row is updated with FID, PBT, TITLT, RET, REMES and
    ``ISREP = 1``. After each successful reply the function sleeps a random
    15 to 55 seconds.

    Args:
        dbPath: Path of the SQLite database file.
        tableName: Table holding the thread rows. It is interpolated into the
            SQL text (identifiers cannot be bound), so it must be trusted.
        cookie: Passed through to the HTTP helpers.
        userAgent: Passed through to the HTTP helpers.
        reType: Reply strategy: 0 calls ``reply_random``, 1 calls
            ``reply_random_similar``, 2 calls ``reply_tuling``. Any other
            value only prints an error for each thread.
    """
    mark_sql = 'UPDATE {tableName} SET FID=?,ISREP=? WHERE TID = ?'.format(tableName = tableName)
    replied_sql = 'UPDATE {tableName} SET FID=?,PBT=?,TITLT=?,RET=?,REMES=?,ISREP=? WHERE TID = ?'.format(tableName = tableName)

    # NOTE: not used by the active policy (only board 72 is answered). Kept
    # as the board list for the alternative "reply everywhere except these
    # boards" policy, i.e. the test ``fid_num in prohibitFid``.
    prohibitFid = [106,546,568,110,565] # trading boards
    prohibitFid.extend([165,13,14,15,16,17,18,19,20,21,214,22,534,557,559]) # resource boards
    prohibitFid.extend([553,554,555]) # job and recruitment boards
    prohibitFid.extend([566,119,2,141,137,94]) # site administration boards
    prohibitFid.extend([563]) # invitation board
    prohibitFid.extend([142]) # lost and found board
    prohibitFid.extend([134,560,548,145,144,91,152]) # study boards

    conn = sqlite3.connect(dbPath)
    try:
        cur = conn.cursor()

        pending = cur.execute('SELECT TID from {tableName} WHERE ISREP = 0'.format(tableName = tableName)).fetchall()
        for row in pending:
            tid = row[0]
            print('\n********************************************************{tid}********************************************************'.format(tid=tid))
            url = 'http://rs.xidian.edu.cn/forum.php?mod=viewthread&tid='+str(tid)
            htmlText = get_html(url,cookie,userAgent)

            title = get_title(htmlText)
            if not title:
                print('get title error,replay {tid} error'.format(tid=tid))
                continue

            pubtime = get_tiezi_time(htmlText)
            if not pubtime:
                print('get  pubtime error,replay {tid} error'.format(tid=tid))
                continue

            # The fid is read before the age check because the "too old"
            # branch below stores it in the row.
            fid = get_info('fid',htmlText)
            if not fid:
                print('get fid error,replay {tid} error'.format(tid=tid))
                continue
            try:
                fid_num = int(fid)
            except (TypeError, ValueError):
                print('get fid error,replay {tid} error'.format(tid=tid))
                continue

            try:
                age_days = _days_since_published(pubtime)
            except ValueError:
                print('get  pubtime error,replay {tid} error'.format(tid=tid))
                continue

            # Do not reply to threads published more than one day ago.
            if age_days > 1:
                print('tid is old:{pbTime}'.format(pbTime = pubtime))
                cur.execute(mark_sql,(fid_num,-1,tid))
                conn.commit()
                continue

            if fid_num != 72:  # only board 72 is answered
                cur.execute(mark_sql,(fid_num,fid_num,tid))
                conn.commit()
                print('is not waterzone')
                continue

            messageList = get_replyed_message(htmlText)
            if not messageList:
                print('get messageList error,replay {tid} error'.format(tid=tid))
                continue
            if len(messageList) < 5:
                print('louceng taishao ')
                continue

            formhash = get_info('formhash',htmlText)
            if not formhash:
                print('get formhash error,replay {tid} error'.format(tid=tid))
                continue

            if reType == 0:
                mesTime = reply_random(messageList[1:],formhash,fid,tid,cookie,userAgent)
            elif reType == 1:
                mesTime = reply_random_similar(messageList[1:],formhash,fid,tid,cookie,userAgent)
            elif reType == 2:
                mesTime = reply_tuling(title,formhash,fid,tid,cookie,userAgent)
            else:
                print('replyType is error')
                continue

            if not mesTime:
                print('relpy error')
                continue

            reMes = mesTime[0]
            retime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(mesTime[1])))
            print('{retime} 回复 {tid} : {message}'.format(retime = retime,tid = tid,message = reMes))
            # Title and reply text come from scraped pages, so they are bound
            # as parameters; str() mirrors the previous string formatting.
            cur.execute(replied_sql,(fid_num,str(pubtime),str(title),retime,str(reMes),1,tid))
            conn.commit()

            time.sleep(random.randint(15,55))

        conn.commit()
    finally:
        # Close the connection even if a helper raises part-way through.
        conn.close()
import AutoReply
import sqlite3
import time
import random

# Driver script: runs one reply pass over the stored threads, then pauses
# for a random 5 to 30 minutes, forever.
cookie = ''
userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'

db, table = 'ruisi.db', 'ruisi'

while True:
    AutoReply.reply(db, table, cookie, userAgent, reType=1)

    # randint yields tenths of a minute; convert to seconds for sleep().
    tenths_of_minute = random.randint(50,300)
    timeGap = 60*(tenths_of_minute/10)
    time.sleep(timeGap)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值