import urllib.request
import urllib.parse
import os
# Base endpoint of a Baidu Tieba forum listing; the encoded query is appended.
# NOTE(review): the previous value was a Douban movie-chart URL ending in
# "action=", which produced a malformed query once "kw=...&pn=..." was
# appended. The kw/pn parameters look like Tieba's listing API - confirm
# the intended target.
TIEBA_URL = "https://tieba.baidu.com/f?"

# The 'pn' query parameter advances by this amount per page.
POSTS_PER_PAGE = 50

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# Seconds to wait on the network before giving up on a page.
REQUEST_TIMEOUT = 30


def build_page_url(ba_name, page):
    """Return the listing URL for the 1-based *page* of forum *ba_name*.

    The forum name is sent as ``kw`` and the page as the offset
    ``pn = (page - 1) * POSTS_PER_PAGE``; both are URL-encoded.
    """
    query = urllib.parse.urlencode({
        'kw': ba_name,
        'pn': (page - 1) * POSTS_PER_PAGE,
    })
    return TIEBA_URL + query


def download_page(ba_name, page):
    """Download one listing page and save it under the *ba_name* directory.

    The raw response bytes are written to ``<ba_name>/<ba_name>_<page>.html``.
    The directory must already exist.

    Returns:
        The path of the file that was written.

    Raises:
        urllib.error.URLError: if the request fails or times out.
        OSError: if the file cannot be written.
    """
    request = urllib.request.Request(
        url=build_page_url(ba_name, page), headers=HEADERS
    )
    print("%s页开始下载....." % page)
    # Read the body once and close the connection before touching the disk.
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        body = response.read()
    filepath = os.path.join(ba_name, ba_name + '_' + str(page) + '.html')
    with open(filepath, 'wb') as fp:
        fp.write(body)
    print("%s页结束下载....." % page)
    return filepath


def main():
    """Prompt for a forum name and an inclusive page range, then download it.

    Raises:
        ValueError: if either page number entered is not an integer.
    """
    ba_name = input('吧名')
    start_page = int(input('爬取起始页'))
    end_page = int(input('请输入结束页码'))
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(ba_name, exist_ok=True)
    for page in range(start_page, end_page + 1):
        download_page(ba_name, page)


if __name__ == "__main__":
    main()
# A simple Baidu Tieba crawler.
# (Footer from the source page: latest recommended article published 2024-05-23 21:34:38.)