import urllib.request
import urllib.parse
import os
import ssl
# Disable HTTPS certificate verification for ALL requests made by this process.
# NOTE(review): this is a global security downgrade (man-in-the-middle becomes
# possible); presumably a workaround for missing local CA certificates — confirm
# it is actually needed before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context
# Build the Request object for one result page.
def request_handle(base_url, bar_name, page):
    """Build a urllib Request for one Tieba result page.

    Args:
        base_url: URL prefix ending with '&' so the query string can be appended.
        bar_name: forum (tieba) name, sent as the 'kw' query parameter.
        page: 1-based page number; converted to the 'pn' post offset.

    Returns:
        urllib.request.Request carrying a browser User-Agent header.
    """
    # Tieba paginates by post offset, 50 posts per page: page 1 -> pn=0, page 2 -> pn=50, ...
    pn = (page - 1) * 50
    query = urllib.parse.urlencode({
        'kw': bar_name,
        'pn': pn
    })
    url = base_url + query
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    }
    # Bug fix: the original built `headers` but never passed it to Request,
    # so every request went out with the default 'Python-urllib' User-Agent.
    return urllib.request.Request(url=url, headers=headers)
# Fetch one page and save it to disk.
def download(request, page, bar_name):
    """Fetch one page and save it as '<bar_name>/第<page>页.html'.

    Args:
        request: prepared urllib Request for the page to fetch.
        page: page number, used only to build the output filename.
        bar_name: forum name; also used as the output directory name
            (created if missing).
    """
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists + os.mkdir pair; os.getcwd() was redundant since
    # relative paths already resolve against the working directory.
    os.makedirs(bar_name, exist_ok=True)
    filename = '第' + str(page) + '页.html'
    filepath = os.path.join(bar_name, filename)
    # Context managers close both the HTTP response and the file
    # deterministically instead of relying on garbage collection.
    with urllib.request.urlopen(request) as response, open(filepath, 'wb') as f1:
        f1.write(response.read())
# Entry point: prompt for a forum and a page range, then crawl it.
def main():
    """Interactively crawl a range of Tieba pages, saving each as HTML."""
    bar_name = input('请输入您想爬取的吧名:')
    start_page = int(input('请输入您想爬取的起始页码:'))
    end_page = int(input('请输入您想爬取的结束页面:'))
    base_url = 'http://tieba.baidu.com/f?ie=utf-8&'
    # Walk the requested page range in order, one request per page.
    page = start_page
    while page <= end_page:
        # Build the request for this page, then fetch and save it.
        req = request_handle(base_url, bar_name, page)
        print('开始下载第:' + str(page) + '页')
        download(req, page, bar_name)
        print('结束下载第:' + str(page) + '页')
        page += 1
# 简单爬虫爬取百度贴吧 (simple crawler for Baidu Tieba)
# 最新推荐文章于 2024-04-08 08:34:21 发布 — article footer from the original
# blog post, kept as a comment so the file remains valid Python.