北大未名BBS全站搜索只能返回100條信息,對於大部分ID來說,這100個帖子只是冰山一角。因此人家開發了可以查找指定ID的全部帖子(免登錄能看到的版面的帖子)的PY。
嗯,現在很晚了,02:07了,人家要睡覺了。不多說了,直接上代碼吧。代碼中的 meow 是要查詢的用戶名(ID),使用時請換成你想查找的ID。
#-*-coding=utf-8-*-
#爬BBS上某ID的所有帖子
import requests
import re
from multiprocessing.dummy import Pool
from lxml import etree
def getsth(url, idname='meow', timeout=30):
    """Print, for every board listed at ``url``, how many search hits ``idname`` has.

    The board index page at ``url`` is downloaded and dumped to stdout.  Board
    names are taken from the first ``td/a`` link text of each
    ``<td class=body`` table row, and each board's ``bbssearch.php`` page is
    then queried for ``idname``.  Every board with at least one hit is
    reported as one tab-separated line: running board number, hit count,
    board name and the search URL that was requested.

    Args:
        url: Address of the board index page.
        idname: ID to look up.  Defaults to ``'meow'``, the value this
            function used to hard-code.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            single stalled board cannot hang the whole scan.

    Raises:
        requests.exceptions.RequestException: if a request fails or times out.
    """
    search_page = 'http://bbs.pku.edu.cn/bbs/bbssearch.php'

    html = requests.get(url, timeout=timeout).text
    # Dump the raw index page, encoded as GB18030 before printing
    # (presumably for a Chinese-locale console -- confirm before removing).
    print(html.encode('gb18030'))

    boardname = []  # names of all boards found on the index page
    for row in re.findall('<tr><td class=body.*?</tr>', html, re.S):
        links = etree.HTML(row).xpath('//td/a/text()')
        if links:  # a row without a link used to raise IndexError
            boardname.append(links[0])
    print(len(boardname))

    # Escape the ID so regex metacharacters in it are matched literally, and
    # compile once because the same pattern is applied to every board.
    hit_pattern = re.compile('name=' + re.escape(idname) + '">(.*?)</a>', re.S)

    for j, board in enumerate(boardname, 1):
        # Let requests build the query string so the board name and ID are
        # URL-encoded; a list of pairs keeps the original parameter order.
        response = requests.get(
            search_page,
            params=[('board', board), ('go', 'W'), ('to', idname)],
            timeout=timeout,
        )
        # Each match is a link ending in name=<idname>; presumably one per
        # post by that ID -- verify against the search page markup.
        hits = len(hit_pattern.findall(response.text))
        if j % 50 == 0:
            print(j)  # progress marker every 50 boards
        if hits > 0:
            print(str(j) + '\t' + str(hits) + '\t' + board + '\t' + response.url)
if __name__ == '__main__':
    # Script entry point: scan every board listed on the BBS index page,
    # then print a marker once the scan is over.
    url = 'http://bbs.pku.edu.cn/bbs/bbsall.php?'
    getsth(url=url)
    print('===========finish===========')
========2015年9月8日更新:將要查詢的ID改為參數 idname,並新增單版面調試函數 test========
#-*-coding=utf-8-*-
#爬BBS上某ID的所有帖子
import requests
import re
from multiprocessing.dummy import Pool
from lxml import etree
def getsth(url, idname, timeout=30):
    """Print, for every board listed at ``url``, how many search hits ``idname`` has.

    The board index page at ``url`` is downloaded and dumped to stdout.  Board
    names are taken from the first ``td/a`` link text of each
    ``<td class=body`` table row, and each board's ``bbssearch.php`` page is
    then queried for ``idname``.  Every board with at least one hit is
    reported as one tab-separated line: running board number, hit count,
    board name and the search URL that was requested.

    Args:
        url: Address of the board index page.
        idname: ID to look up.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            single stalled board cannot hang the whole scan.

    Raises:
        requests.exceptions.RequestException: if a request fails or times out.
    """
    search_page = 'http://bbs.pku.edu.cn/bbs/bbssearch.php'

    html = requests.get(url, timeout=timeout).text
    # Dump the raw index page, encoded as GB18030 before printing
    # (presumably for a Chinese-locale console -- confirm before removing).
    print(html.encode('gb18030'))

    boardname = []  # names of all boards found on the index page
    for row in re.findall('<tr><td class=body.*?</tr>', html, re.S):
        links = etree.HTML(row).xpath('//td/a/text()')
        if links:  # a row without a link used to raise IndexError
            boardname.append(links[0])
    print(len(boardname))

    # Escape the ID so regex metacharacters in it are matched literally, and
    # compile once because the same pattern is applied to every board.
    hit_pattern = re.compile('name=' + re.escape(idname) + '">(.*?)</a>', re.S)

    for j, board in enumerate(boardname, 1):
        # Let requests build the query string so the board name and ID are
        # URL-encoded; a list of pairs keeps the original parameter order.
        response = requests.get(
            search_page,
            params=[('board', board), ('go', 'W'), ('to', idname)],
            timeout=timeout,
        )
        # Each match is a link ending in name=<idname>; presumably one per
        # post by that ID -- verify against the search page markup.
        hits = len(hit_pattern.findall(response.text))
        if j % 50 == 0:
            print(j)  # progress marker every 50 boards
        if hits > 0:
            print(str(j) + '\t' + str(hits) + '\t' + board + '\t' + response.url)
def test(user_id=None, timeout=30):
    """Debug helper: run the ID search against the single board ``boy``.

    Prints the raw search page, then the captured link text of every match,
    then the number of matches.

    Args:
        user_id: ID to look up.  When omitted, the module-level ``idname``
            is used, which is what this function always read before; that
            global only exists when the file is run as a script.
        timeout: Seconds to wait on the HTTP request before giving up.

    Raises:
        NameError: if ``user_id`` is omitted and no module-level ``idname``
            has been defined.
        requests.exceptions.RequestException: if the request fails or times out.
    """
    if user_id is None:
        user_id = idname  # backward-compatible fallback to the global

    # Let requests build the query string so the ID is URL-encoded; a list of
    # pairs keeps the original parameter order.
    response = requests.get(
        'http://bbs.pku.edu.cn/bbs/bbssearch.php',
        params=[('board', 'boy'), ('go', 'W'), ('to', user_id)],
        timeout=timeout,
    )
    html = response.text
    print(html)

    # Escape the ID so regex metacharacters in it are matched literally.
    matches = re.findall('name=' + re.escape(user_id) + '">(.*?)</a>', html, re.S)
    for link_text in matches:
        print(link_text)
    print(len(matches))
if __name__ == '__main__':
    # Script entry point.  ``idname`` stays a module-level name because
    # test() reads it as a global.
    idname = 'meow'
    url = 'http://bbs.pku.edu.cn/bbs/bbsall.php?'
    getsth(url=url, idname=idname)
    print('===========finish===========')