# 本示例通过 Fiddler 对百度百科 APP 抓包,获取百科的搜索 API,并解析其返回结果;具体实现见下方代码。
import codecs
import datetime
import time
import requests
import json
# HTTP request headers sent with every API call in this script.
# The Cookie, Referer and User-Agent values are fixed strings; the
# User-Agent identifies an Android WebView client of the Baike app.
headers = {
'Accept': 'application/json',
'content-type': 'application/x-www-form-urlencoded',
'Cookie': '_V=2.10.1; _T=dfysfewefi7yd3df; _A=baike_sapp; _C=3F3244A3E7B4019E5918FD5F8A4AED6F|O; BAIDUCUID=3F3244A3E7B4019E5918FD5F8A4AED6F|O; BAIDUID=CDE6CCCF3FBD674BF9116A9262A38B15:FG=1; SCENE=baikeapp',
'Referer': 'https://smartapps.cn/AZQtr4jkpf90T3X9QMWVLF1bkeV4LXxD/2.11.0.1/page-frame.html',
'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; LYA-AL20 Build/HUAWEIVOG-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36 swan/2.21.0 swan-baikeapp/5.1.1 (Baidu; P1 5.1.1)',
'Host': 'baikeapi.baidu.com',
'Connection': 'Keep-Alive',
'Accept-Encoding': 'gzip'
} # Headers that make requests look like they come from a mobile client (the User-Agent above is an Android device, not an iPhone)
# Plain-HTTP variant of the item endpoint, kept for reference (disabled):
# url="http://baikeapi.baidu.com/smartapp/appui/wiki/item?lemmaTitle=[keyword]&page=0"
# Item endpoint template; "[keyword]" is replaced with the lemma title before the request is sent.
url="https://baikeapi.baidu.com/smartapp/appui/wiki/item?lemmaTitle=[keyword]&page=0"
# Sample keyword list (disabled):
# key_word=['蜗居']
# Search endpoint with a hard-coded, percent-encoded query (wd decodes to "安家"); it contains no "[keyword]" placeholder.
url3='https://baikeapi.baidu.com/smartapp/searchui/search/word?wd=%E5%AE%89%E5%AE%B6'
def getlemmaidReq(url3, key_word, timeout=10):
    """Send a GET request to the search endpoint for ``key_word``.

    Args:
        url3: URL template; every ``[keyword]`` placeholder in it is
            replaced by ``key_word``.
        key_word: Search term substituted into the template.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The ``requests.Response`` obtained for the substituted URL.
    """
    itemUrl = url3.replace('[keyword]', key_word)
    # verify=False makes urllib3 emit a warning on every call; silence it.
    requests.packages.urllib3.disable_warnings()
    # Request the URL built above, not the module-level ``url`` template,
    # and hand the response back instead of discarding it.
    return requests.get(
        itemUrl,
        headers=headers,
        allow_redirects=False,
        verify=False,
        timeout=timeout,
    )
def reqItemUrl(url, key_word, timeout=10):
    """Fetch the polysemous-item ("navigation") list for a keyword.

    Args:
        url: URL template; every ``[keyword]`` placeholder in it is
            replaced by ``key_word``.
        key_word: Lemma title to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``data.navigation`` in the JSON response,
        or an empty list when the status code is not 2xx, the body is not
        JSON, or the ``data`` / ``navigation`` entries are missing.
    """
    itemUrl = url.replace('[keyword]', key_word)
    # verify=False makes urllib3 emit a warning on every call; silence it.
    requests.packages.urllib3.disable_warnings()
    resp = requests.get(
        itemUrl,
        headers=headers,
        allow_redirects=False,
        verify=False,
        timeout=timeout,
    )
    if not 200 <= resp.status_code < 300:
        return []
    try:
        data = resp.json()
    except ValueError:
        # Body was not valid JSON (e.g. an HTML error page).
        return []
    navi = data.get("data") if isinstance(data, dict) else None
    if not isinstance(navi, dict):
        return []
    # Callers only test the result for truthiness, so normalise "missing"
    # and "null" to an empty list.
    return navi.get("navigation") or []
def paseitem(lemmas):
    """Reduce raw polysemous-item entries to the three fields used downstream.

    Args:
        lemmas: Iterable of dict-like entries exposing ``.get``.

    Returns:
        A list with one dict per entry holding ``lemmaId`` and ``rank``
        converted with ``str()`` and ``lemmaDesc`` passed through as-is.
    """
    return [
        {
            'lemmaId': str(entry.get('lemmaId')),
            'lemmaDesc': entry.get('lemmaDesc'),
            'rank': str(entry.get('rank')),
        }
        for entry in lemmas
    ]
# Item endpoint template addressed by both title and lemma id;
# "[keyword]" and "[itemid]" are the placeholders for those two values.
url2='https://baikeapi.baidu.com/smartapp/appui/wiki/item?lemmaTitle=[keyword]&lemmaId=[itemid]&page=0'
def reqBasicInfoUrl(url2, timeout=10):
    """GET ``url2`` and return the decoded JSON body.

    Args:
        url2: Fully built request URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The object produced by decoding the response body as JSON.

    Raises:
        ValueError: If the response body is not valid JSON.
        requests.exceptions.RequestException: On connection errors or
            when ``timeout`` expires.
    """
    # verify=False makes urllib3 emit a warning on every call; silence it.
    requests.packages.urllib3.disable_warnings()
    response = requests.get(
        url2,
        headers=headers,
        allow_redirects=False,
        verify=False,
        timeout=timeout,
    )
    return response.json()
def _lemma_field_text(value):
    """Return ``value`` as text, mapping ``None`` to an empty string."""
    return '' if value is None else str(value)


def paseLemmaMsg(data):
    """Flatten a lemma's structured card content into ``name=value&`` pairs.

    Reads ``data['data']['version']['card']['structuredContent']`` and, for
    each entry, joins its ``name`` with the ``text`` of the first fragment
    of its ``value`` (``value[0][0]``).

    Args:
        data: Decoded JSON response for a single lemma.

    Returns:
        A string such as ``'A=x&B=y&'`` (each pair is followed by ``&``).
        An empty string is returned when any level of the card path or the
        ``structuredContent`` list is absent. A missing ``name`` or text
        fragment contributes an empty string instead of raising.
    """
    # Walk the nested path defensively: the response may omit any level.
    card = data
    for key in ('data', 'version', 'card'):
        if not isinstance(card, dict):
            return ''
        card = card.get(key)
    if not isinstance(card, dict):
        return ''

    entries = card.get('structuredContent')
    if not isinstance(entries, (list, tuple)):
        return ''

    pairs = []
    for msg in entries:
        if not isinstance(msg, dict):
            continue
        try:
            text = msg['value'][0][0].get('text')
        except (KeyError, IndexError, TypeError, AttributeError):
            # 'value' is missing, empty, or not shaped as a list of lists of dicts.
            text = None
        pairs.append(
            _lemma_field_text(msg.get('name')) + '=' + _lemma_field_text(text) + '&'
        )
    return ''.join(pairs)
def getTitles(filepath):
    """Read a UTF-8 text file and return its lines.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list with one string per line, in file order. Line terminators
        are kept, so callers must strip them themselves.
    """
    with open(filepath, "r", encoding='UTF-8') as handle:
        return handle.readlines()
if __name__ == '__main__':
    # Script entry point: for every keyword in ./keyword.txt, look up its
    # polysemous items, fetch each item's card, and append pipe-separated
    # rows to two dated files under ./baike.
    import os  # only the entry point needs it

    filepath = './keyword.txt'
    date = datetime.datetime.now().strftime('%Y%m%d')
    keywords = getTitles(filepath)

    # Opening in append mode fails if the directory is missing; create it.
    os.makedirs('./baike', exist_ok=True)

    # Context managers guarantee both files are flushed and closed even if
    # a request or parse step raises part-way through the run.
    with codecs.open('./baike/keywordMsg_' + date + '.txt', "a", 'utf-8') as f1, \
            codecs.open('./baike/lemmaMsg_' + date + '.txt', "a", 'utf-8') as f2:
        for key_word in keywords:
            key_word = key_word.replace("\r", "").replace("\n", "")
            if not key_word:
                # Blank line in the keyword file: nothing to look up.
                continue

            # Fetch the polysemous items; sleep first to throttle requests.
            time.sleep(2)
            itemdata = reqItemUrl(url, key_word)
            if not itemdata:
                # No items found: record the keyword with empty fields.
                basicmsgStr = "百度百科" + "|" + key_word + "|||||" + date + "|" + '\r\n'
                f1.write(basicmsgStr)
                continue

            for lemma in paseitem(itemdata):
                # lemmaDesc is passed through unconverted and may be None.
                lemma_desc = lemma.get('lemmaDesc') or ''
                lemma_id = lemma.get('lemmaId')

                # Fetch and flatten the card of this specific item.
                url2 = 'https://baikeapi.baidu.com/smartapp/appui/wiki/item?lemmaTitle=' + key_word + '&lemmaId=' + lemma_id + '&page=0'
                time.sleep(2)
                lemmaMsg = reqBasicInfoUrl(url2)
                basicmsg = paseLemmaMsg(lemmaMsg)

                # lemmaMsg file: row includes the request URL.
                lemmastr = "百度百科" + "|" + key_word + "|" + lemma_desc + "|" + lemma_id + "|" + basicmsg + "|" + url2 + "|" + date + "|" + '\r\n'
                f2.write(lemmastr)
                # keywordMsg file: same row without the URL column.
                basicmsgStr = "百度百科" + "|" + key_word + "|" + lemma_desc + "|" + lemma_id + "|" + basicmsg + "|" + date + "|" + '\r\n'
                f1.write(basicmsgStr)