Task requirement:
# -*- coding: utf-8 -*-
# @Time : 2020/7/29 6:05 PM
# @Author : livein80
# @Email : 12985594@qq.com
# @File : ssyer.py
# @Software : PyCharm
import urllib.request
import urllib.parse
import ssl


class BaiduTieba():
    def __init__(self):
        self.base_url = 'http://tieba.baidu.com/f?'
        # Skip certificate verification so the request does not fail on SSL errors.
        self.context = ssl._create_unverified_context()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }

    def read_html(self, url):
        # Fetch one page with a browser-like User-Agent and return its text.
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req, context=self.context)
        html = res.read().decode('utf-8')
        return html

    def write_html(self, filename, html):
        # Save the fetched HTML to a local file.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(html)

    def main(self):
        key = input('Enter keyword: ')
        start_page = int(input('Start page: '))
        end_page = int(input('End page: '))
        kw = urllib.parse.urlencode({'kw': key})
        for i in range(start_page, end_page + 1):
            pn = (i - 1) * 50  # Tieba paginates 50 posts per page
            url = self.base_url + kw + '&pn=' + str(pn)
            html = self.read_html(url)
            file_name = 'page_{}.html'.format(i)
            self.write_html(file_name, html)


if __name__ == '__main__':
    spider = BaiduTieba()
    spider.main()
This post shows how to build a simple Baidu Tieba crawler in Python: you enter a keyword and a page range, and the script downloads the matching forum pages for later data analysis or information gathering. The BaiduTieba class handles URL construction, setting the HTTP request headers, and opening the connection with an unverified SSL context.
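To see what the loop above actually requests, the URL construction can be checked on its own. The snippet below is a standalone sketch; the keyword 'python' and the page range 1 to 3 are only example values:

import urllib.parse

base_url = 'http://tieba.baidu.com/f?'
kw = urllib.parse.urlencode({'kw': 'python'})   # -> 'kw=python'
for page in range(1, 4):                        # example pages 1..3
    pn = (page - 1) * 50                        # Tieba offsets results by 50 posts per page
    print(base_url + kw + '&pn=' + str(pn))
# prints:
# http://tieba.baidu.com/f?kw=python&pn=0
# http://tieba.baidu.com/f?kw=python&pn=50
# http://tieba.baidu.com/f?kw=python&pn=100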