import datetime
import json
import os

import numpy as np
import pandas as pd
import requests
在百度指数中搜索目标词条,并选定一段日期范围。
分析请求 URL 中搜索词条和日期区间所在的位置,得到可复用的爬取 URL 模板。
# Sample SearchApi request URL: the `word` query parameter carries the search term
# ("tesla") as URL-encoded JSON, and `startDate` / `endDate` bound the queried range.
word_url = 'https://index.baidu.com/api/SearchApi/index?area=0&word=[[%7B%22name%22:%22tesla%22,%22wordType%22:1%7D]]&startDate=2024-02-14&endDate=2024-03-14'
将抓包得到的标头信息(Cookie、User-Agent、Cipher-Text 等)填入以下请求头结构中。
# Raw value of the "Cookie" request header for a logged-in index.baidu.com session
# (a "name=value; name=value; ..." string copied from the browser).
# SECURITY: this used to be a hard-coded literal containing account session tokens
# (BDUSS and friends). Secrets must not live in source, so the value is now read
# from the BAIDU_INDEX_COOKIES environment variable. An empty string means no
# session cookie is available and requests will be sent unauthenticated.
COOKIES = os.environ.get('BAIDU_INDEX_COOKIES', '')
def get_html(url, timeout=10):
    """Send a GET request with the Baidu Index headers and return the body text.

    Args:
        url: Full request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response body as text (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188",
        "Host":
            "index.baidu.com",
        "Referer":
            "http://index.baidu.com/v2/main/index.html",
        "Cipher-Text":
            "1652425237825_1652501356206_VBpwl9UG8Dvs2fAi91KToRTSAP7sDsQU5phHL97raPDFJdYz3fHf9hBAQrGGCs+qJoP7yb44Uvf91F7vqJLVL0tKnIWE+W3jXAI30xx340rhcwUDQZ162FPAe0a1jsCluJRmMLZtiIplubGMW/QoE/0Pw+2caH39Ok8IsudE4wGLBUdYg1/bKl4MGwLrJZ7H6wbhR0vT5X0OdCX4bMJE7vcwRCSGquRjam03pWDGZ51X15fOlO0qMZ2kqa3BmxwNlfEZ81l3L9nZdrc3/Tl4+mNpaLM7vA5WNEQhTBoDVZs6GBRcJc/FSjd6e4aFGAiCp1Y8MD66chTiykjIN51s7gbJ44JfVS0NjBnsvuF55bs=",
        # COOKIES already is a complete raw Cookie header value. Passing it as
        # cookies={'Cookie': COOKIES} would make requests treat the whole string
        # as the value of one cookie literally named "Cookie", so it is sent as
        # the header itself. strip() guards against a trailing newline, which
        # requests rejects in header values.
        "Cookie": COOKIES.strip(),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
# Fetch the sample URL once to inspect the raw JSON returned by the index API.
# The return value is not stored; it is only visible when this runs as a notebook cell.
get_html(word_url)
'{"status":0,"data":{"userIndexes":[{"word":[{"name":"tesla","wordType":1}],"all":{"startDate":"2024-02-14","endDate":"2024-03-14","data":"eewWKeWWCKeWeeKeWaYKeCUUKeJUWKeJeeKeAwYKeCACKeCwoKeWwwKeWUWKeJwCKeJaAKeCUCKeCJaKeYWoKeACCKeAeYKeJAUKeCAAKeCCwKeJeWKeawWKeweAKeWAaKeaYAKeJeeKeJUoKeJUA"},"pc":{"startDate":"2024-02-14","endDate":"2024-03-14","data":"WCUKwAwKwwUKwoUKJWUKoAaKoAAKaowKJCCKJJaKwJWKwwaKoAAKoaYKJWJKoUwKYWoKAYCKaWAKoaWKJaAKoUaKoACKCJUKAJeKwYaKJAwKoCUKoACKoAA"},"wise":{"startDate":"2024-02-14","endDate":"2024-03-14","data":"oJWKoowKooeKoJYKooUKoaJKoCJKoaCKooUKoCwKoCeKoCJKoYWKoYaKoJYKoJWKeUUUKYJUKoYaKoooKoYUKoaoKoCCKoCWKoAwKoaUKoaeKoaeKoCWKoCU"},"type":"day"}],"generalRatio":[{"word":[{"name":"tesla","wordType":1}],"all":{"avg":1538,"yoy":-15,"qoq":7},"pc":{"avg":660,"yoy":-21,"qoq":13},"wise":{"avg":877,"yoy":-10,"qoq":3}}],"uniqid":"078a9bd01a917ab25d15941660103f32"},"logid":2171604412,"message":0}'
找到获取数据的接口 index,发现其中的 data 字段经过加密;同时响应中的 uniqid 会作为参数去请求另一个接口(ptbk),该接口返回的内容很可能就是解密所需的密钥。
在前端源码中全局搜索 decrypt,找到该函数所在的文件路径,并在源代码对应文件里定位到该函数。
给该函数打上断点后重新搜索,发现传入的两个参数分别与 ptbk 接口的返回值和 data 字段一致,由此确定它就是数据解密函数。
将其改写为 Python 代码。
# decrypt: function(t, e) {
# if (!t)
# return "";
# for (var n = t.split(""), a = e.split(""), i = {}, r = [], o = 0; o < n.length / 2; o++)
# i[n[o]] = n[n.length / 2 + o];
# for (var s = 0; s < e.length; s++)
# r.push(i[a[s]]);
# return r.join("")
# };
# def decrypt(t, e):
# n = list(t)
# i = list(e)
# a = {}
# result = []
# ln = int(len(n) / 2)
# start = n[ln:]
# end = n[:ln]
# for j, k in zip(start, end):
# a.update({k: j})
# for j in i:
# result.append(a.get(j))
# return ''.join(result)
def decrypt(t, e):
    """Decode an obfuscated index string with a substitution key.

    Python port of the JavaScript ``decrypt(t, e)`` routine: the first half of
    the key lists the cipher symbols and the second half lists, position by
    position, the plain symbols they stand for.

    Args:
        t: Key string (cipher alphabet followed by plain alphabet).
        e: Encoded data string.

    Returns:
        The decoded string, or ``""`` when ``t`` is empty.

    Raises:
        KeyError: If ``e`` contains a symbol missing from the first half of ``t``.
    """
    if not t:
        return ""
    half = len(t) // 2
    # zip stops at the shorter side, so an odd-length key ignores its last symbol.
    mapping = dict(zip(t[:half], t[half:]))
    return "".join(mapping[symbol] for symbol in e)
def get_ptbk(uniqid):
    """Fetch the decryption key that belongs to one index query.

    Args:
        uniqid: The ``uniqid`` value taken from the SearchApi index response.

    Returns:
        The ``data`` field of the ptbk response, used as the key for ``decrypt``.
    """
    # Use HTTPS like the SearchApi request does: get_html attaches the login
    # cookies, which must not be sent over a plain-text connection.
    url = 'https://index.baidu.com/Interface/ptbk?uniqid={}'
    resp = get_html(url.format(uniqid))
    return json.loads(resp)['data']
def _parse_date(text):
    """Convert a 'YYYY-MM-DD' string into a ``datetime.date``."""
    year, month, day = (int(part) for part in text.split("-")[:3])
    return datetime.date(year, month, day)


def get_data(keyword, start='2021-06-13', end='2021-08-11',
             output_path=r'test-2023-2024.csv'):
    """Download, decode and save the daily search index of one keyword.

    Each day and its value are printed while the table is built.

    Args:
        keyword: Search term to query.
        start: First day of the range, 'YYYY-MM-DD' (inclusive).
        end: Last day of the range, 'YYYY-MM-DD' (inclusive).
        output_path: Where the CSV is written. Defaults to the file name this
            function has always used.

    Returns:
        A DataFrame with columns '日期' (date) and '指数' (index value, kept as
        the decoded string), indexed by a running number named '序号'.

    Raises:
        RuntimeError: If the API response carries no data object, for example
            because the request was rejected.
        ValueError: If fewer values than days in the range were returned, so the
            values cannot be aligned one-per-day.
    """
    url = "https://index.baidu.com/api/SearchApi/index?area=0&word=[[%7B%22name%22:%22{}%22,%22wordType%22:1%7D]]&startDate={}&endDate={}".format(
        keyword, start, end)
    payload = json.loads(get_html(url))
    body = payload.get('data') if isinstance(payload, dict) else None
    if not isinstance(body, dict):
        # Fail with the server's own status/message instead of an opaque
        # TypeError/KeyError from indexing into a non-dict payload.
        raise RuntimeError(
            "index API returned no data (status={!r}, message={!r})".format(
                payload.get('status') if isinstance(payload, dict) else None,
                payload.get('message') if isinstance(payload, dict) else None))

    uniqid = body['uniqid']
    # 'all' is the combined series; the response also carries 'pc' and 'wise'.
    encoded = body['userIndexes'][0]['all']['data']
    values = decrypt(get_ptbk(uniqid), encoded).split(',')

    first_day = _parse_date(start)
    last_day = _parse_date(end)
    day_count = last_day.toordinal() - first_day.toordinal() + 1  # inclusive range
    if len(values) < day_count:
        raise ValueError(
            "expected {} daily values for {}..{}, got {}".format(
                day_count, start, end, len(values)))

    rows = []
    for offset in range(day_count):
        day = first_day + datetime.timedelta(days=offset)
        value = values[offset] or "0"  # an empty entry is recorded as "0"
        print(day, value)
        rows.append([day, value])

    # Build the frame in one go instead of growing it row by row with .loc.
    frame = pd.DataFrame(rows, columns=['日期', '指数'])
    frame.index.name = '序号'
    frame.to_csv(output_path)
    return frame
# Example run: query one keyword over the date range below.
keyword = "庄达菲"
start_date = "2023-03-02"
end_date = "2024-03-01" # upper bound: 366 days
result = get_data(keyword, start_date, end_date)
# No separate export is needed here: get_data writes the CSV file itself.
# result.to_csv(r'test-2023-2024.csv')
2023-03-02 3078
2023-03-03 3637
2023-03-04 4802
2023-03-05 3999
2023-03-06 2884
2023-03-07 2828
2023-03-08 2472
2023-03-09 2348
2023-03-10 3521
2023-03-11 5437
......
2024-02-21 15390
2024-02-22 23688
2024-02-23 22015
2024-02-24 18546
2024-02-25 14737
2024-02-26 10945
2024-02-27 7836
2024-02-28 7972
2024-02-29 6707
2024-03-01 5913
百度指数 API 限制单次查询的时间跨度最长为一年。
PS:后续可以补充时间区间检查、登录状态检查、关键词是否存在的检查等功能。