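# [Forum notice, not part of the program] This post was flagged as a suspected violation and collapsed by the system. (Hide post / View post)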
from urllib import request
import urllib
import time
# Request headers sent with every page fetch.
# The key must be spelled "User-Agent" (with a hyphen). With an underscore
# the header is not recognised as a User-Agent, and urllib still adds its
# own default "Python-urllib/x.y" User-Agent to the request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
#url规律 pn=(page-1)*50
def loadpage(fullurl, filename):
    """Download one page and return its raw body.

    Args:
        fullurl: Complete URL to request.
        filename: Name the page will later be saved under; used here only
            for the progress message.

    Returns:
        The response body as bytes.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    print("正在下载:", filename)
    req = request.Request(fullurl, headers=header)
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read.
    with request.urlopen(req) as rep:
        return rep.read()
def writepage(html, filename):
    """Save downloaded page content to a local file.

    Args:
        html: Page content as bytes (the file is opened in binary mode).
        filename: Path of the file to create or overwrite.
    """
    print("正在保存:", filename)
    with open(filename, "wb") as f:
        f.write(html)
    # Visual separator between pages in the console output.
    print("---------------")
#for i in range(1,4):
# print("http://tieba.baidu.com/f?kw=%E6%88%90%E9%83%BD%E7%90%86%E5%B7%A5%E5%A4%A7%E5%AD%A6&ie=utf-8&pn="+str((i-1)*50))
#构造Url
def tiebaSpader(url, startpage, endpage, save_dir="E:/"):
    """Download a range of listing pages and save each one to disk.

    Args:
        url: Base URL that already carries the query string; "&pn=<offset>"
            is appended for every page.
        startpage: First page number to fetch (page 1 maps to pn=0).
        endpage: Last page number to fetch, inclusive. If it is smaller
            than ``startpage`` nothing is downloaded.
        save_dir: Directory the pages are written to. Defaults to "E:/",
            the location that was previously hard-coded.
    """
    import os  # local import so this function does not rely on file-level imports

    for page in range(startpage, endpage + 1):
        # URL rule: the pn offset advances by 50 per page.
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)  # complete URL for this request
        filename = os.path.join(save_dir, "第" + str(page) + "页.html")
        html = loadpage(fullurl, filename)  # fetch the page
        writepage(html, filename)  # store it locally
    print("谢谢使用")
if __name__ == '__main__':
    # Explicit submodule import: a plain "import urllib" does not by itself
    # guarantee that urllib.parse is loaded.
    import urllib.parse

    kw = input("请输入贴吧名:")
    startpage = int(input("请输入起始页:"))
    endpage = int(input("请输入结束页:"))
    url = "http://tieba.baidu.com/f?"
    # urlencode percent-escapes the (typically non-ASCII) forum name.
    key = urllib.parse.urlencode({"kw": kw})
    url = url + key
    # Previously the URL was built but the crawl was never started.
    tiebaSpader(url, startpage, endpage)
    # NOTE(review): presumably keeps the console window open briefly after
    # finishing — confirm whether this pause is still wanted.
    time.sleep(5)