Python 3: Scraping Tieba Pages
The script below downloads a range of result pages for a given Baidu Tieba forum and saves each page as a local HTML file.
from urllib import request, parse


def loadPage(url, filename):
    """
    Fetch the page at the given URL and return its contents.
    url: the URL to crawl
    filename: display name used in the progress message
    """
    print("Downloading " + filename)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    req = request.Request(url, headers=headers)
    # Tieba serves its pages as UTF-8, so decode the response with that
    # encoding rather than sys.getfilesystemencoding(), which reflects the
    # local machine and breaks on systems whose default is not UTF-8.
    html = request.urlopen(req).read().decode("utf-8")
    return html


def writePage(html, filename):
    """
    Write the HTML content to a local file.
    html: the response body returned by the server
    filename: path of the file to save
    """
    print("Saving " + filename)
    # Write with an explicit encoding so the file matches the decoded text
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)
    print("*" * 30)


def tiebaSpider(url, beginPage, endPage):
    """
    Crawler scheduler: builds and processes the URL for each page.
    url: the fixed prefix of the Tieba URL
    beginPage: first page to fetch
    endPage: last page to fetch
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50  # Tieba's pn offset advances 50 posts per page
        filename = "page_" + str(page) + ".html"
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        html = loadPage(fullurl, filename)
        writePage(html, filename)


if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    beginPage = int(input("Enter the start page number: "))
    endPage = int(input("Enter the end page number: "))
    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})  # percent-encode the (possibly Chinese) keyword
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
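The URL construction is worth a quick sanity check: parse.urlencode percent-encodes the (often Chinese) forum name, and Tieba's pn query parameter is an offset that advances by 50 posts per page. A minimal sketch of the URLs tiebaSpider would generate for the forum "电影", assuming the same query format as above:

from urllib import parse

# Same prefix the script builds in __main__, with a sample Chinese keyword
base = "http://tieba.baidu.com/f?" + parse.urlencode({"kw": "电影"})
for page in range(1, 4):
    pn = (page - 1) * 50  # page 1 -> pn=0, page 2 -> pn=50, ...
    print(base + "&pn=" + str(pn))

# Output:
# http://tieba.baidu.com/f?kw=%E7%94%B5%E5%BD%B1&pn=0
# http://tieba.baidu.com/f?kw=%E7%94%B5%E5%BD%B1&pn=50
# http://tieba.baidu.com/f?kw=%E7%94%B5%E5%BD%B1&pn=100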