利用Python爬取百度指数数据

import datetime
import requests
import json
import pandas as pd
import numpy as np

搜索词条,选定一定范围日期。

1

分析url中搜索词条和日期区间的位置,提供爬取URL模板。

# Sample SearchApi request URL: the keyword sits in the `word` JSON (name=tesla)
# and the queried date range in the startDate/endDate query parameters.
word_url = 'https://index.baidu.com/api/SearchApi/index?area=0&word=[[%7B%22name%22:%22tesla%22,%22wordType%22:1%7D]]&startDate=2024-02-14&endDate=2024-03-14'

2

将标头信息填入以下请求头结构中。

COOKIES = 'BAIDUID=C7995A4C91DACF4BE2F9E87E344FE02E:FG=1; BIDUPSID=C7995A4C91DACF4BE2F9E87E344FE02E; PSTM=1700204189; BDUSS=jMxZmk5ZTczVEp2ekNqaWxOeEJ2ZzdWfjBDMWo5VEZ6MTd6fmptbDdDVU9rNE5sRVFBQUFBJCQAAAAAAAAAAAEAAADAj84M3kTfXkzHp8TqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4GXGUOBlxlN; delPer=0; BDRCVFR[feWj1Vr5u3D]=-_EV5wtlMr0mh-8uz4WUvY; __bid_n=18cbb4b2875230450a1e90; H_WISE_SIDS=40156_39996_40010_40161_40203_39662_40210_40206_40216_40223; H_WISE_SIDS_BFESS=40156_39996_40010_40161_40203_39662_40210_40206_40216_40223; MCITY=-167%3A51%3A; BAIDUID_BFESS=C7995A4C91DACF4BE2F9E87E344FE02E:FG=1; H_PS_PSSID=39661_40207_40212_40217_40224_40294_40291_40287_40317_40079_40365_40352_40367_40374_40338_40416; ZFY=kyS:B52iSXAScBHGhCvBi1V8TzFpPyw53EzHgheTIWXk:C; PSINO=3; BCLID=8449895968974362268; BCLID_BFESS=8449895968974362268; BDSFRCVID=KQLOJexroG3bL6vqNzk1KGMrlQpWxY5TDYrEOwXPsp3LGJLVYfzwEG0Pts1-dEu-S2EwogKKLgOTHULF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=KQLOJexroG3bL6vqNzk1KGMrlQpWxY5TDYrEOwXPsp3LGJLVYfzwEG0Pts1-dEu-S2EwogKKLgOTHULF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC8-fIvDqTrP-trf5DCShUFsBhorB2Q-XPoO3KOD8KQCKfT0MT_UQPcH2pojQ5bk_xbgy4op8P3y0bb2DUA1y4vpX45Eb2TxoUJ2-KDVeh5Gqq-KQJ-ebPRiWPb9QgbP2pQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjKMe5Oy3H; H_BDCLCKID_SF_BFESS=tRAOoC8-fIvDqTrP-trf5DCShUFsBhorB2Q-XPoO3KOD8KQCKfT0MT_UQPcH2pojQ5bk_xbgy4op8P3y0bb2DUA1y4vpX45Eb2TxoUJ2-KDVeh5Gqq-KQJ-ebPRiWPb9QgbP2pQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjKMe5Oy3H; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1710208322; bdindexid=t2mb1875k79o2budqg67p7le44; 
SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04605642077okCG4xQ8MQTUKIcKEOYegu9eAH7GUdbb9H9DhvOlmhvHv542sN3Th8fm4leYSLg4a%2Fd8Hi2erwDz%2BVNBHXItsQKC6VXTg7hdfaiWGKfNhIADmNWWA7x9ogTOvqCACRdSC0HmosrqM%2FOjdhPNNHXGfuNNq8ABcE%2F%2FjzhQo%2BB8hPm1gXpMoIjbYdRr9ijrhZ348imqyjWb%2Fkycz1LtnzdqEErI68TPbVTdO0kT3yn1pepydmmeo6b3oKLOE5%2Bif4dN2elDZCZmBQ1A1SWh1LU2h9PXzRHi%2BxQL2GtMhwU%2Fb9K1fzNuuWvwMLRKUhHTIZkR80785640456665541880016339253281; __cas__rn__=460564207; __cas__st__212=4db08f6a8b666ccb62b0bf4a1df2c0756e63f53445d3423308e00d4f27d4952b6b2476db2173dbde30490198; __cas__id__212=49668465; CPTK_212=331523714; CPID_212=49668465; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1710552568; BDUSS_BFESS=jMxZmk5ZTczVEp2ekNqaWxOeEJ2ZzdWfjBDMWo5VEZ6MTd6fmptbDdDVU9rNE5sRVFBQUFBJCQAAAAAAAAAAAEAAADAj84M3kTfXkzHp8TqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4GXGUOBlxlN; ab_sr=1.0.1_M2JlYTRiYTU4ZTAwMjhmNjlhNDY2ZTI3MTUzMzhhZjQ1Yzg4ODJmZGI4MmRkNDBmMWU2NjJhNzAwMGIyNTY3Y2VjYTZiZTQ0OGUxZWU3YmI4YjZkZjhlOWQxMjk1MmI4ZGIxYWRiMGU0ZWY2YTJhNjBiOWM4MTc5ODhlNjJjNGJiNjM5N2E1YjEwOWZkOTA2Y2QzNGQ4YWI3OTQwNjkzNQ==; RT="z=1&dm=baidu.com&si=0facbdd0-bcd5-4038-901e-108a86a06b90&ss=lttecjxs&sl=c&tt=ikk&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=icor&ul=iwut"'
def get_html(url, timeout=10):
    """Fetch ``url`` with the browser-like headers and the session cookie.

    Args:
        url: Full URL of the endpoint to request.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188",
        "Host":
        "index.baidu.com",
        "Referer":
        "http://index.baidu.com/v2/main/index.html",
        "Cipher-Text":
        "1652425237825_1652501356206_VBpwl9UG8Dvs2fAi91KToRTSAP7sDsQU5phHL97raPDFJdYz3fHf9hBAQrGGCs+qJoP7yb44Uvf91F7vqJLVL0tKnIWE+W3jXAI30xx340rhcwUDQZ162FPAe0a1jsCluJRmMLZtiIplubGMW/QoE/0Pw+2caH39Ok8IsudE4wGLBUdYg1/bKl4MGwLrJZ7H6wbhR0vT5X0OdCX4bMJE7vcwRCSGquRjam03pWDGZ51X15fOlO0qMZ2kqa3BmxwNlfEZ81l3L9nZdrc3/Tl4+mNpaLM7vA5WNEQhTBoDVZs6GBRcJc/FSjd6e4aFGAiCp1Y8MD66chTiykjIN51s7gbJ44JfVS0NjBnsvuF55bs=",
        # COOKIES is already a complete "name=value; name=value" header value.
        # Passing it as cookies={'Cookie': COOKIES} would create one cookie
        # literally named "Cookie" and mangle the first real cookie, so it is
        # sent verbatim as the Cookie header instead.
        "Cookie":
        COOKIES,
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to json.loads.
    response.raise_for_status()
    return response.text
# Request the sample URL; the line that follows in this article is the raw
# JSON text that came back (note the encrypted "data" fields and "uniqid").
get_html(word_url)
'{"status":0,"data":{"userIndexes":[{"word":[{"name":"tesla","wordType":1}],"all":{"startDate":"2024-02-14","endDate":"2024-03-14","data":"eewWKeWWCKeWeeKeWaYKeCUUKeJUWKeJeeKeAwYKeCACKeCwoKeWwwKeWUWKeJwCKeJaAKeCUCKeCJaKeYWoKeACCKeAeYKeJAUKeCAAKeCCwKeJeWKeawWKeweAKeWAaKeaYAKeJeeKeJUoKeJUA"},"pc":{"startDate":"2024-02-14","endDate":"2024-03-14","data":"WCUKwAwKwwUKwoUKJWUKoAaKoAAKaowKJCCKJJaKwJWKwwaKoAAKoaYKJWJKoUwKYWoKAYCKaWAKoaWKJaAKoUaKoACKCJUKAJeKwYaKJAwKoCUKoACKoAA"},"wise":{"startDate":"2024-02-14","endDate":"2024-03-14","data":"oJWKoowKooeKoJYKooUKoaJKoCJKoaCKooUKoCwKoCeKoCJKoYWKoYaKoJYKoJWKeUUUKYJUKoYaKoooKoYUKoaoKoCCKoCWKoAwKoaUKoaeKoaeKoCWKoCU"},"type":"day"}],"generalRatio":[{"word":[{"name":"tesla","wordType":1}],"all":{"avg":1538,"yoy":-15,"qoq":7},"pc":{"avg":660,"yoy":-21,"qoq":13},"wise":{"avg":877,"yoy":-10,"qoq":3}}],"uniqid":"078a9bd01a917ab25d15941660103f32"},"logid":2171604412,"message":0}'

3

找到获取数据的接口index,发现返回的data字段被加密;同时返回结果中的uniqid是另一个接口(ptbk)的请求参数,该接口返回的内容很可能就是解密所需的密钥。

4

全局搜索decrypt,找到函数所在文件路径,在源代码对应文件里找到该函数。

5
6

给该函数打上断点后重新搜索,发现传入的两个参数分别与ptbk接口的返回值和index接口的data字段一致,由此确定它就是数据解密函数。
将其改写为python代码。

# decrypt: function(t, e) {
#     if (!t)
#         return "";
#     for (var n = t.split(""), a = e.split(""), i = {}, r = [], o = 0; o < n.length / 2; o++)
#         i[n[o]] = n[n.length / 2 + o];
#     for (var s = 0; s < e.length; s++)
#         r.push(i[a[s]]);
#     return r.join("")
# };
# def decrypt(t, e):
#     n = list(t)
#     i = list(e)
#     a = {}
#     result = []
#     ln = int(len(n) / 2)
#     start = n[ln:]
#     end = n[:ln]
#     for j, k in zip(start, end):
#         a.update({k: j})
#     for j in i:
#         result.append(a.get(j))
#     return ''.join(result)
def decrypt(t, e):
    if not t:
        return ""
    n = list(t)
    a = list(e)
    i = {}
    r = []
    for o in range(len(n) // 2):
        i[n[o]] = n[len(n) // 2 + o]
    for s in range(len(e)):
        r.append(i[a[s]])
    return ''.join(r)

7

def get_ptbk(uniqid):
    """Return the ``data`` field of the ptbk interface response for ``uniqid``.

    Args:
        uniqid: The ``uniqid`` value taken from a SearchApi response.

    Returns:
        Whatever the JSON response stores under ``'data'``; the caller in
        this file passes it to ``decrypt`` as the key string.
    """
    endpoint = 'http://index.baidu.com/Interface/ptbk?uniqid={}'.format(uniqid)
    payload = json.loads(get_html(endpoint))
    return payload['data']
def _parse_date(text):
    """Turn a ``YYYY-MM-DD`` string into a ``datetime.date``."""
    year, month, day = (int(part) for part in text.split("-")[:3])
    return datetime.date(year, month, day)


def get_data(keyword, start='2021-06-13', end='2021-08-11',
             csv_path=r'test-2023-2024.csv'):
    """Download the daily index of ``keyword`` and save it as a CSV file.

    The function requests the SearchApi endpoint, decrypts the ``all``
    series with the key obtained through ``get_ptbk``, pairs the values with
    every calendar day from ``start`` to ``end`` (inclusive), prints each
    pair and writes the table to ``csv_path``.

    Args:
        keyword: Search term to look up.
        start: First day of the range, ``YYYY-MM-DD``.
        end: Last day of the range, ``YYYY-MM-DD``.
        csv_path: Destination of the CSV file; pass ``None`` to skip writing.
            The default keeps the file name the original script used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'日期'`` (``datetime.date``)
        and ``'指数'`` (value as a string, ``"0"`` for empty entries), indexed
        from 0 with the index named ``'序号'``.

    Raises:
        RuntimeError: If the response carries no ``data`` object.
        ValueError: If fewer values than requested days come back, in which
            case values and dates cannot be paired one to one.
    """
    url = "https://index.baidu.com/api/SearchApi/index?area=0&word=[[%7B%22name%22:%22{}%22,%22wordType%22:1%7D]]&startDate={}&endDate={}".format(
        keyword, start, end)
    payload = json.loads(get_html(url))
    body = payload.get('data') if isinstance(payload, dict) else None
    if not isinstance(body, dict):
        # Without this guard the lookups below fail with an opaque TypeError.
        # NOTE(review): presumably this happens when the cookie has expired
        # or the keyword is unknown -- confirm against the live API.
        raise RuntimeError(
            "unexpected SearchApi response (no 'data' object): {!r}".format(
                payload))

    encrypted = body['userIndexes'][0]['all']['data']
    ptbk = get_ptbk(body['uniqid'])
    # Decrypted series is comma separated; an empty slot means "no value".
    values = [item or "0" for item in decrypt(ptbk, encrypted).split(',')]

    first_day = _parse_date(start)
    last_day = _parse_date(end)
    dates = [
        datetime.date.fromordinal(ordinal)
        for ordinal in range(first_day.toordinal(), last_day.toordinal() + 1)
    ]
    if len(values) < len(dates):
        raise ValueError(
            "received {} values for {} days between {} and {}".format(
                len(values), len(dates), start, end))

    # Surplus values, if any, are ignored exactly as the row loop used to do.
    values = values[:len(dates)]
    for date, value in zip(dates, values):
        print(date, value)

    # Build the frame in one go instead of growing it row by row via .loc.
    y = pd.DataFrame({'日期': dates, '指数': values}, columns=['日期', '指数'])
    y.index.name = '序号'
    if csv_path is not None:
        y.to_csv(csv_path)
    return y
# Example run: fetch one keyword over a one-year window.
keyword = "庄达菲"
start_date = "2023-03-02"
end_date = "2024-03-01"  # upper limit: 366 days
result = get_data(keyword, start_date, end_date)
# result.to_csv(r'test-2023-2024.csv')
2023-03-02 3078
2023-03-03 3637
2023-03-04 4802
2023-03-05 3999
2023-03-06 2884
2023-03-07 2828
2023-03-08 2472
2023-03-09 2348
2023-03-10 3521
2023-03-11 5437
...... 
2024-02-21 15390
2024-02-22 23688
2024-02-23 22015
2024-02-24 18546
2024-02-25 14737
2024-02-26 10945
2024-02-27 7836
2024-02-28 7972
2024-02-29 6707
2024-03-01 5913

百度限制API查询时间跨度为一年。
PS:还可以补充时间区间检查、登录状态检查、关键词是否存在的检查等功能。

  • 9
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

春风锤呀锤

碗在这 ,光光的T.T

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值