# coding: utf-8
"""Download a range of Baidu Tieba listing pages and save each as HTML.

Run as a script: it prompts for the forum name and a first/last page
number, then writes every fetched page to the current directory as
``the<N>page.html``.

The module runs on both Python 2 (``urllib`` / ``urllib2``) and Python 3
(``urllib.parse`` / ``urllib.request``).
"""
import urllib

try:  # Python 2 module layout
    import urllib2
    urlencode = urllib.urlencode
except ImportError:  # Python 3: urllib2 became urllib.request
    import urllib.parse
    import urllib.request as urllib2
    urlencode = urllib.parse.urlencode

# The ``pn`` query parameter advances by this amount for each listing page.
PN_STEP = 50


def _pageUrl(url, page):
    """Return *url* with the ``pn`` offset of 1-based *page* appended."""
    return url + "&pn=" + str((page - 1) * PN_STEP)


def tiebaSpider(url, beginPage, endPage):
    """Fetch pages ``beginPage``..``endPage`` (inclusive) and save each one.

    Args:
        url: Base listing URL that already contains the ``kw`` query string.
        beginPage: First page number to download (1-based).
        endPage: Last page number to download; nothing is fetched when it is
            smaller than ``beginPage``.
    """
    for page in range(beginPage, endPage + 1):
        filename = "the" + str(page) + "page.html"
        # Build each page URL from the untouched base URL. Appending to
        # ``url`` itself would pile up one more "&pn=..." on every pass.
        html = loadPage(_pageUrl(url, page), filename)
        writeFile(html, filename)


def loadPage(url, filename):
    """Request *url* with a browser User-Agent and return the raw body.

    Args:
        url: Full URL of the page to download.
        filename: Name used only in the progress message.

    Returns:
        The response body exactly as read from the connection.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("正在下载" + filename)
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()


def writeFile(html, filename):
    """Write the downloaded page *html* to *filename* and print a separator.

    Args:
        html: Raw page content as returned by ``loadPage``.
        filename: Path of the file to create or overwrite.
    """
    print("正在存储" + filename)
    # Binary mode stores the bytes untouched: no newline translation on
    # Windows, and no str/bytes mismatch on Python 3.
    with open(filename, "wb") as f:
        f.write(html)
    print("-" * 20)


def main():
    """Prompt for the forum name and page range, then run the spider."""
    try:
        prompt = raw_input  # Python 2
    except NameError:
        prompt = input  # Python 3

    kw = prompt("请输入需要爬取的贴吧:")
    beginPage = int(prompt("请输入起始页:"))
    endPage = int(prompt("请输入终止页:"))

    url = "https://tieba.baidu.com/f?" + urlencode({"kw": kw})
    tiebaSpider(url, beginPage, endPage)


if __name__ == "__main__":
    main()
抓取百度贴吧
最新推荐文章于 2021-11-23 20:17:36 发布