Crawling Baidu Tieba with the urllib Library
The code is as follows:
import urllib.request
import urllib.parse
def tieba_spider(url, begin_page, end_page):
    '''
    Scheduler for the Tieba spider: builds the full URL for each page,
    fetches it, and saves it to a local file.
    url: the first half of the Tieba URL (base URL plus the kw query string)
    begin_page: starting page number
    end_page: ending page number
    '''
    for page in range(begin_page, end_page + 1):
        print("Saving data, please wait!")
        pn = (page - 1) * 50  # Tieba lists 50 posts per page, so pn is the post offset
        file_name = "page_" + str(page) + ".html"
        full_url = url + "&pn=" + str(pn)
        html = load_page(full_url)
        write_page(html, file_name)

def load_page(url):
    '''
    Sends a request to the given URL and returns the server response.
    url: the URL to crawl
    '''
    # Spoof a browser User-Agent so the request is not rejected as an obvious bot
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(request).read()

def write_page(html, file_name):
    '''
    Writes the HTML content to a local file.
    html: raw bytes of the server response
    file_name: name of the file to save to
    '''
    print("Saving " + file_name)
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(html.decode('utf-8'))

if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    begin_page = int(input("Enter the starting page: "))
    end_page = int(input("Enter the ending page: "))
    url = 'http://tieba.baidu.com/f?'
    # urlencode percent-encodes the forum name, e.g. {"kw": "美食"} -> "kw=%E7%BE%8E%E9%A3%9F"
    key = urllib.parse.urlencode({"kw": kw})
    url = url + key
    tieba_spider(url, begin_page, end_page)
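
One caveat: load_page has no error handling, so a single network failure aborts the whole run. Below is a minimal hardened sketch; the function name load_page_safe, the retry count, and the delay value are illustrative assumptions, not part of the original code.

import time
import urllib.error
import urllib.request

def load_page_safe(url, retries=3, delay=2):
    '''
    Sketch of a load_page variant with basic error handling.
    retries and delay are illustrative values, not from the original tutorial.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    for attempt in range(retries):
        try:
            # timeout prevents a stalled connection from hanging the spider
            with urllib.request.urlopen(request, timeout=10) as response:
                return response.read()
        except OSError as e:  # urllib.error.URLError is a subclass of OSError
            print("Request failed (%s), attempt %d of %d" % (e, attempt + 1, retries))
            time.sleep(delay)  # brief pause before retrying, also polite to the server
    raise RuntimeError("Failed to fetch " + url)

To use it, replace the load_page call inside tieba_spider with load_page_safe.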
Result: