# Crawl Baidu Tieba with Python: download the page source of any forum, any range of pages.
from urllib import request
import urllib
import time
from urllib import parse
# Minimal request headers; the User-Agent mimics desktop Chrome so Baidu
# serves the normal HTML page instead of blocking the scripted client.
header = {'User-Agent':'(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.36 Safari/537.36'}
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100
def loadpage(fullurl, filename):
    """Download *fullurl* and return the raw response body as bytes.

    Args:
        fullurl: Complete URL to fetch (query string already encoded).
        filename: Accepted for call-site symmetry with writepage(); unused here.

    Returns:
        bytes: The undecoded HTTP response body.
    """
    print("正在下载:")
    req = request.Request(fullurl, headers=header)
    # Fix: close the response object after reading (the original leaked it).
    with request.urlopen(req) as res:
        return res.read()
def writepage(html, filename):
    """Persist downloaded page bytes to *filename* on disk.

    Args:
        html: Raw page content as bytes.
        filename: Destination path for the saved page.
    """
    print("正在保存", filename)
    # Binary mode: the content is raw bytes straight from the HTTP response.
    with open(filename, "wb") as out:
        out.write(html)
    print("---------------------------------------")
#构造url
def tiebaspider(url, begin, end):
    """Fetch and save forum pages *begin* through *end* (inclusive).

    Args:
        url: Base listing URL with the kw= query already appended.
        begin: First page number (1-based).
        end: Last page number (inclusive).
    """
    for page in range(begin, end + 1):
        # Tieba paginates in steps of 50: page 1 -> pn=0, page 2 -> pn=50, ...
        pn = (page - 1) * 50
        # Bug fix: the original appended "&pn" without "=", producing an
        # invalid query parameter (e.g. "...&pn50" instead of "...&pn=50").
        fullurl = url + "&pn=" + str(pn)
        filename = "D:/第" + str(page) + "页.html"
        html = loadpage(fullurl, filename)
        writepage(html, filename)
if __name__ == '__main__':
    kw = input("请输入贴吧名字:")
    begin = int(input("请输入起始页:"))
    end = int(input("请输入结束页:"))
    # Bug fix: the forum listing lives under the /f? endpoint
    # (e.g. https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0); the original
    # base "https://tieba.baidu.com/" produced an invalid URL like
    # "https://tieba.baidu.com/kw=python".
    url = "https://tieba.baidu.com/f?"
    # urlencode percent-escapes non-ASCII forum names safely.
    key = urllib.parse.urlencode({"kw": kw})
    url = url + key
    tiebaspider(url, begin, end)
    # Brief pause before exit so console output remains visible.
    time.sleep(10)