实现步骤
- 查看是否为静态页面
右键 - 查看网页源代码 - 搜索数据关键字 - 找URL查询字符串规律
- 获取网页内容
- 提取所需数据
- 保存(本地文件、数据库)
代码
from urllib import request,parse
import time
import random
from fake_useragent import UserAgent
class TiebaSpider(object):
    """Crawler that downloads Baidu Tieba forum pages and saves them as local HTML files.

    Workflow: build the page URL from a forum name and page offset, fetch the
    page with a randomized User-Agent, then write the raw HTML to disk.
    """

    def __init__(self):
        # Bug fix: the original line was `'http'://tieba...'` — a stray quote
        # split the URL literal and made the file a SyntaxError.
        # Placeholders: {0} = URL-quoted forum name (kw), {1} = page offset (pn).
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'

    def get_html(self, url):
        """Fetch *url* with a random User-Agent header and return the decoded HTML text.

        Relies on the module-level `UserAgent` import (fake_useragent) to
        rotate browser identities and reduce the chance of being blocked.
        """
        # Bug fix: original had a stray ')' inside the dict literal.
        headers = {'User-Agent': UserAgent().random}
        req = request.Request(url=url, headers=headers)
        resp = request.urlopen(req)
        html = resp.read().decode()
        return html

    def parse_html(self):
        """Placeholder for data extraction; not implemented yet."""
        pass

    def save_html(self, filename, html):
        """Write *html* to *filename* as UTF-8 text."""
        # Explicit encoding: fetched pages and the Chinese filenames would
        # otherwise depend on the platform default (e.g. GBK on Windows),
        # which can raise UnicodeEncodeError.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def run(self):
        """Prompt for a forum name and page range, then fetch and save each page."""
        name = input('请输入贴吧名:')
        # Bug fix: both int(input(...)) calls were missing their closing paren.
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))
        params = parse.quote(name)
        for page in range(begin, end + 1):
            # Tieba lists 50 threads per page; pn is the 0-based thread offset.
            pn = (page - 1) * 50
            url = self.url.format(params, pn)
            html = self.get_html(url)
            filename = name + '-第%s页.html' % str(page)
            self.save_html(filename, html)
            print('第%d页抓取成功' % page)
            # Random short pause between requests to be polite to the server.
            time.sleep(random.uniform(0, 1))
if __name__ == '__main__':
    # Script entry point: build the spider and start the interactive crawl.
    TiebaSpider().run()