"""Scrape the thread links from the first pages of a Baidu Tieba forum.

Every listing page is downloaded, the anchors that point at a thread
(``f?kz=<id>``) are extracted, and one normalised ``<a>`` line per thread is
printed and written to an HTML file.
"""
import re
import string  # noqa: F401 -- unused here; retained from the original script
import urllib.request

# File the collected links are written to when no stream is supplied.
OUTPUT_PATH = "百度贴吧.html"

# The ``pn`` offset appended to the listing URL advances by this much per page.
POSTS_PER_PAGE = 50

# Used to decode a page when the server does not declare a charset.
FALLBACK_ENCODING = "utf-8"

_THREAD_URL_PREFIX = "http://tieba.baidu.com/f?kz="

# NOTE: ``f?`` is an *optional* "f", not a literal "f?", so a match actually
# starts at "kz=".  That is kept on purpose: the parsing below only relies on
# the text following the first "=".
_THREAD_LINK_RE = re.compile(r"f?kz=\d.+?</a>", re.M)


def _fetch_page(url):
    """Download *url* and return its body decoded to text."""
    with urllib.request.urlopen(url) as response:
        charset = response.headers.get_content_charset() or FALLBACK_ENCODING
        raw = response.read()
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a codec Python does not know.
        return raw.decode(FALLBACK_ENCODING, errors="replace")


def _format_links(page):
    """Yield one output ``<a>`` line for each thread link found in *page*."""
    for anchor in _THREAD_LINK_RE.findall(page):
        # Anchors carrying an onclick handler are not thread titles.
        if "onclick" in anchor:
            continue
        pieces = anchor.split('"_blank">')
        if len(pieces) < 2:
            # No target="_blank" attribute: there is no title to extract.
            continue
        title = pieces[1].split("</a>")[0]
        # The thread id sits between the first "=" and the closing quote.
        num = anchor.split("=")[1].split('"')[0]
        target = _THREAD_URL_PREFIX + num
        yield ('<a href="' + target + '" mce_href="' + target + '">'
               + title + "</a><br/>\n")


def _scrape(url, begin, end, out, fetch):
    """Write the links of every page in ``range(begin, end, 50)`` to *out*."""
    written = 0
    for offset in range(begin, end, POSTS_PER_PAGE):
        for link in _format_links(fetch(url + str(offset))):
            print(link)
            out.write(link)
            written += 1
    return written


def baidutieba(url, PostBegin, PostEnd, out=None, fetch=None):
    """Collect the thread links of a range of forum listing pages.

    Args:
        url: Listing URL ending in the offset parameter (e.g. ``...&pn=``);
            the page offset is appended to it.
        PostBegin: First offset to request.
        PostEnd: Offset at which to stop (exclusive); offsets advance in
            steps of ``POSTS_PER_PAGE``.
        out: Optional writable text stream.  When omitted, ``OUTPUT_PATH`` is
            created (truncated) and closed again once scraping finishes.  A
            stream passed in by the caller is left open.
        fetch: Optional callable mapping a URL to the page text.  Defaults to
            an HTTP download.

    Returns:
        The number of links written.
    """
    if fetch is None:
        fetch = _fetch_page
    if out is not None:
        return _scrape(url, PostBegin, PostEnd, out, fetch)
    # Own the file for exactly the duration of the scrape so that it is
    # closed even when a download or parse step raises.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as handle:
        return _scrape(url, PostBegin, PostEnd, handle, fetch)


tiebaurl = 'http://tieba.baidu.com/f?kw=2012&pn='
iPostBegin = 0
iPostEnd = 500

if __name__ == "__main__":
    baidutieba(tiebaurl, iPostBegin, iPostEnd)