fiddler抓包获取公众号文章页面链接
1,类型地址
https://redu.lopao.com/api/article/cate
2,具体文章链接
https://redu.lopao.com/api/article/lists?category=mil&page=0
问题描述
需要注意的是,单次请求不能获取到某个类别的所有页面,需要按页码循环请求,稍作处理即可
源码如下:
class wechatSpider:
    def __init__(self):
        """Set up request defaults: the category-list endpoint and headers."""
        # Randomised User-Agent lowers the chance of the API blocking us.
        self.headers = {'User-Agent': UserAgent().random}
        # Endpoint that returns the list of article categories.
        self.url = 'https://redu.lopao.com/api/article/cate'
def get_PageRes(self, url):
"""
页面响应
:return:
"""
try:
res = requests.get(url=url, headers=self.headers)
if res.status_code == 200:
return res.text
except Exception as e:
pass
def getHtml(self):
"""
数据解析
:return:
"""
typeData = self.get_PageRes(url=self.url)
typeDataList = json.loads(typeData)
item = {}
for typeData in typeDataList['data']:
# 用户提示
print(typeData['text'], end=' ')
k = typeData['text']
v = typeData['name']
item[k] = v
print('\n')
word = input('请输入分类名称:').strip()
if word in item.keys():
self.getAllPage(item[word])
else:
print('---输入有误,请重新输入---')
获取某个类别的所有页面
def getAllPage(self, word):
"""
获取所有文章链接
:param word:
:return:
"""
count = 0
while True:
url = 'https://redu.lopao.com/api/article/lists?category={}&page={}'.format(word, count)
infoData = self.get_PageRes(url)
dataList = json.loads(infoData)['data']
if dataList['next_item'] != None:
infoUrl = url.format(word, count+1)
data = self.get_PageRes(infoUrl)
self.getInfoData(data)
else:
print('---已获取所有文章链接---')
break
count += 1
数据以字典的格式输出
def getInfoData(self, data):
"""
数据处理
:return:
"""
item = {}
infoData = json.loads(data)['data']
for dataInfo in infoData['data']:
item['title'] = dataInfo['title']
item['src'] = dataInfo['src']
item['desc'] = dataInfo['desc']
item['url'] = dataInfo['url']
print(item)