查找北大未名BBS上任意ID的所有發帖

最新推荐文章于 2020-03-14 15:23:38 发布

KalariKong

最新推荐文章于 2020-03-14 15:23:38 发布

阅读量2.7k

点赞数

分类专栏： Python黑科技

本文链接：https://blog.csdn.net/ssy8stephy/article/details/47231797

版权

Python黑科技专栏收录该内容

20 篇文章

订阅专栏

北大未名BBS的全站搜索最多只能返回100條結果，對於大部分ID來說，這100個帖子只是冰山一角。因此人家寫了一個Python腳本，可以查找指定ID的全部帖子（僅限免登錄即可瀏覽的版面）。

嗯，現在很晚了，02:07了，人家要睡覺了。不多說了，直接上代碼吧。代碼中的meow是要查詢的用戶名（ID），使用時請換成目標ID。

#-*-coding=utf-8-*-
#爬BBS上某ID的所有帖子
import requests
import re
from multiprocessing.dummy import Pool
from lxml import etree

def getsth(url, idname='meow', timeout=30):
    """Report how many posts ``idname`` has on each board listed at ``url``.

    The board index page at ``url`` is fetched and split into
    ``<tr><td class=body ...</tr>`` rows; the first link text of each row is
    taken as a board name.  For every board name one ``bbssearch.php`` query
    is issued and the links ending in ``name=<idname>"`` on the result page
    are counted.

    Everything is written to stdout:

    * the raw index page, encoded as gb18030;
    * the number of index rows found;
    * a progress counter after every 50th board;
    * for each board with at least one match, a tab-separated line
      ``<board index>\\t<match count>\\t<board name>\\t<search URL>``.

    Args:
        url: Address of the board index page.
        idname: User ID to search for.  Defaults to ``'meow'``, the ID this
            function used to have hard-coded.
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.exceptions.RequestException: If a request fails or times
            out.
    """
    search_prefix = 'http://bbs.pku.edu.cn/bbs/bbssearch.php?board='

    index_html = requests.get(url, timeout=timeout).text
    print(index_html.encode('gb18030'))

    rows = re.findall('<tr><td class=body.*?</tr>', index_html, re.S)
    boardname = []  # the name of these boards
    for row in rows:
        links = etree.HTML(row).xpath('//td/a/text()')
        # A row without any link carries no board name; skip it instead of
        # crashing with IndexError.
        if links:
            boardname.append(links[0])
    print(len(rows))

    # Escape the ID so that regex metacharacters in it are matched literally.
    author_link = re.compile('name=' + re.escape(idname) + '">(.*?)</a>', re.S)

    for j, each in enumerate(boardname, 1):  # how many posts in each board
        search_url = search_prefix + each + '&go=W&to=' + idname
        page = requests.get(search_url, timeout=timeout).text
        hits = len(author_link.findall(page))
        if j % 50 == 0:
            print(j)
        if hits > 0:
            print(str(j) + '\t' + str(hits) + '\t' + each + '\t' + search_url)


if __name__ == '__main__':
    # Script entry point: scan the boards listed on the BBS index page, then
    # print a closing banner once the scan has completed.
    url = 'http://bbs.pku.edu.cn/bbs/bbsall.php?'
    getsth(url)
    print('===========finish===========')

========2015年9月8日更新：用戶名改由參數idname傳入，不再寫死在代碼裡========

#-*-coding=utf-8-*-
#爬BBS上某ID的所有帖子
import requests
import re
from multiprocessing.dummy import Pool
from lxml import etree

def getsth(url, idname, timeout=30):
    """Report how many posts ``idname`` has on each board listed at ``url``.

    The board index page at ``url`` is fetched and split into
    ``<tr><td class=body ...</tr>`` rows; the first link text of each row is
    taken as a board name.  For every board name one ``bbssearch.php`` query
    is issued and the links ending in ``name=<idname>"`` on the result page
    are counted.

    Everything is written to stdout:

    * the raw index page, encoded as gb18030;
    * the number of index rows found;
    * a progress counter after every 50th board;
    * for each board with at least one match, a tab-separated line
      ``<board index>\\t<match count>\\t<board name>\\t<search URL>``.

    Args:
        url: Address of the board index page.
        idname: User ID to search for.
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.exceptions.RequestException: If a request fails or times
            out.
    """
    search_prefix = 'http://bbs.pku.edu.cn/bbs/bbssearch.php?board='

    index_html = requests.get(url, timeout=timeout).text
    print(index_html.encode('gb18030'))

    rows = re.findall('<tr><td class=body.*?</tr>', index_html, re.S)
    boardname = []  # the name of these boards
    for row in rows:
        links = etree.HTML(row).xpath('//td/a/text()')
        # A row without any link carries no board name; skip it instead of
        # crashing with IndexError.
        if links:
            boardname.append(links[0])
    print(len(rows))

    # Escape the ID so that regex metacharacters in it are matched literally.
    author_link = re.compile('name=' + re.escape(idname) + '">(.*?)</a>', re.S)

    for j, each in enumerate(boardname, 1):  # how many posts in each board
        search_url = search_prefix + each + '&go=W&to=' + idname
        page = requests.get(search_url, timeout=timeout).text
        hits = len(author_link.findall(page))
        if j % 50 == 0:
            print(j)
        if hits > 0:
            print(str(j) + '\t' + str(hits) + '\t' + each + '\t' + search_url)

def test(idname='meow', timeout=30):
    """Debug helper: run the per-board search for ``idname`` on board ``boy``.

    Prints the raw search result page, then the text of every link ending in
    ``name=<idname>"``, and finally the number of such links.

    Args:
        idname: User ID to search for.  Previously this was read from a
            module global that only exists when the file runs as a script;
            the default ``'meow'`` matches the ID the script configures.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    url = 'http://bbs.pku.edu.cn/bbs/bbssearch.php?board=boy&go=W&to=' + idname
    html = requests.get(url, timeout=timeout).text
    print(html)
    # Escape the ID so that regex metacharacters in it are matched literally.
    matches = re.findall('name=' + re.escape(idname) + '">(.*?)</a>', html, re.S)
    for link_text in matches:
        print(link_text)
    print(len(matches))

if __name__ == '__main__':
    # Script entry point: scan the boards listed on the BBS index page for
    # posts by the configured user ID, then print a closing banner.
    idname = 'meow'
    url = 'http://bbs.pku.edu.cn/bbs/bbsall.php?'
    getsth(url, idname)
    print('===========finish===========')