Scraping Baidu Index with Python

import datetime
import json

import requests
import numpy as np
import pandas as pd


def get_html(url):
    """Fetch a Baidu Index API endpoint using a logged-in session.

    The Cipher-Text header and the Cookie string below are tied to one
    browser session and expire; replace them with values copied from your
    own logged-in session before running."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Host": "index.baidu.com",
        "Referer": "http://index.baidu.com/v2/main/index.html",
        "Cipher-Text": "1652425237825_1652501356206_VBpwl9UG8Dvs2fAi91KToRTSAP7sDsQU5phHL97raPDFJdYz3fHf9hBAQrGGCs+qJoP7yb44Uvf91F7vqJLVL0tKnIWE+W3jXAI30xx340rhcwUDQZ162FPAe0a1jsCluJRmMLZtiIplubGMW/QoE/0Pw+2caH39Ok8IsudE4wGLBUdYg1/bKl4MGwLrJZ7H6wbhR0vT5X0OdCX4bMJE7vcwRCSGquRjam03pWDGZ51X15fOlO0qMZ2kqa3BmxwNlfEZ81l3L9nZdrc3/Tl4+mNpaLM7vA5WNEQhTBoDVZs6GBRcJc/FSjd6e4aFGAiCp1Y8MD66chTiykjIN51s7gbJ44JfVS0NjBnsvuF55bs=",
        # Send the raw cookie string as a header. Passing it to the
        # `cookies=` argument as {'Cookie': ...} would create a single
        # bogus cookie literally named "Cookie" instead of the real ones.
        "Cookie": "__bid_n=18379203a6589298324207; BIDUPSID=2E02EE98DCCE13B3871E90D13B293B90; PSTM=1667567321; FEID=v10-907dcd0a5cc6fa5de7eb6ddc03204ab967433e10; BDUSS=UNWM2dOeDZhU215U09IaUVISHp1UjdsNUEzTVJHYmlRdkFVWFRwSGpRN1JjY0pqSVFBQUFBJCQAAAAAAAAAAAEAAAAEyXrqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANHkmmPR5JpjM; BAIDU_WISE_UID=wapp_1671101300284_101; Hm_up_d101ea4d2a5c67dab98251f0b5de24dc=%7B%22uid_%22%3A%7B%22value%22%3A%223933915396%22%2C%22scope%22%3A1%7D%7D; BAIDUID=16C58FBA50EF47EF0A64399091E5A874:FG=1; BAIDUID_BFESS=16C58FBA50EF47EF0A64399091E5A874:FG=1; __xaf_fpstarttimer__=1672804972789; __xaf_thstime__=1672804973003; FPTOKEN=Ld+1YJ0/2IOF8n5w0o9hNT9x6Dgh37ZoNq6WLF+HyHg0O78kbrB2WuS6INjPP+1FBoMs/grBXUsbMe94Tcye08EvzsRFC2lD8+it5o50bMEnjRMzvEHqSO4EXjIJxCUdY8IvYeYgOK9IQVcGLklv7nEupfxOC2ncG7YSHLLczZPlFutqxJfGbs58AFqien3endohJmslnornN5poweWkL0TgmE1al+fqpEhxenk3vhGXJBxfdnvJX7cdtVJnt+N2p+CNnKvE7PtC8jDMs/BSVA01hw/vqjNYtVdLc7J+x7A0vpx4w/ZszYgPOA+ciMXDrz+kKFb353Zmv4Eof09tccc5n9qg/bdakV3Q7G3vzPu7ABZYQB+Y+J+qQZQvrDiAwe5Y/JzFlNCb/jAWKZm07w==|+zObTsWM9XyZ2tS/c7u+nKD0/ryhdm0WbmNhXuttlXM=|10|99a32f4aa90fa36396e141f8e300f55a; __xaf_fptokentimer__=1672804973023; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1672218950,1672832223; bdindexid=taqk72m8ot9a3r7inhsu68pvj7; ZD_ENTRY=bing; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04228953144t9K20c2FoG8ggPbsWRxmrGL9e3rTAF6eV4RJpk2phsJdVN9JSOlIBDGmvpI073W%2FlQOooVmfokGXaOUpF8Pul3BX2s4CiZjN%2FS036h7uD9dHHWwAmZQJYrlk2deNccShmF3ydoajbR612EEhqFINPyny0KROmA6zCGO1tObUWfjlPZ7SqlBHS6zqsOW47Xl7G%2FcoNrunrO1HT2M6dxm3uvmmtiHAPnVPB1e0%2B5gHL2rEy4C02%2FM88MFy87n%2F7lG%2BJZjwO8ITbgTbQgM3G7hZnxEIGuhnVOu%2Buu%2BaZaMDbFQ%3D18035950298843985659026702870995; __cas__rn__=422895314; __cas__st__212=187a74c8ca54618d46bf34074a2d06fa782febc8b31a4f0e29ffc36fc72e2394448db3087327d0a15f86363e; __cas__id__212=44855536; CPID_212=44855536; CPTK_212=266828396; RT=z=1&dm=baidu.com&si=a0738dc7-5b13-4a2c-9aa4-9be92b5bdc68&ss=lcifbk18&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1672891956; BDUSS_BFESS=UNWM2dOeDZhU215U09IaUVISHp1UjdsNUEzTVJHYmlRdkFVWFRwSGpRN1JjY0pqSVFBQUFBJCQAAAAAAAAAAAEAAAAEyXrqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANHkmmPR5JpjM; ab_sr=1.0.1_OTViZTdmNjAwZjljZTQwN2Q4OWQ0NWM0ODgwNzFlNDRjMThhNjlkOGY3MjFiMDQ5ZGZjMGVmNmQ4ZDEzYWJmMjI5ZThiM2NjZDI4N2E0MDcyMTkwNDYxZTVkMjU2Mjc5YzU4MTk4ZmQ2NDhhZWYxZDdmNDY2ZDFjNjU1YzhlOGFhYjQzOTRjYTNhNGMxZTFhN2Y5MTNiMGNhNWUzMzg1Mg=="
    }
    response = requests.get(url, headers=headers)
    return response.text
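
# Hard-coded session values like these expire quickly. A minimal sketch for
# keeping them out of the source, assuming a plain-text file or an
# environment variable (both names here are invented for illustration);
# the returned string can then replace the "Cookie" literal above:
import os

def load_cookie(path="baidu_cookie.txt"):
    # Prefer the BAIDU_INDEX_COOKIE environment variable (name assumed);
    # otherwise read the raw Cookie header value from a local file.
    env = os.environ.get("BAIDU_INDEX_COOKIE")
    if env:
        return env
    with open(path, encoding="utf-8") as f:
        return f.read().strip()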


def decrypt(t, e):
    """Decode the encrypted index series `e` with the ptbk key `t`.

    The first half of `t` maps character-for-character onto the second
    half; each character of `e` is substituted through that mapping."""
    n = list(t)
    ln = len(n) // 2
    mapping = dict(zip(n[:ln], n[ln:]))  # plain char -> decoded char
    return ''.join(mapping[ch] for ch in e)
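
# Toy sanity check with a made-up key (real keys come from the ptbk
# endpoint below): the first half "abc-" maps onto the second half "123,".
assert decrypt("abc-123,", "ab-c") == "12,3"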


def get_ptbk(uniqid):
    """Fetch the decryption key (ptbk) that pairs with a given uniqid."""
    url = 'http://index.baidu.com/Interface/ptbk?uniqid={}'
    resp = get_html(url.format(uniqid))
    return json.loads(resp)['data']
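
# When the session cookie has expired, these endpoints answer with an error
# payload instead of data. A hedged variant that fails loudly (the `status`
# field name is taken from observed responses and may change):
def get_ptbk_checked(uniqid):
    resp = json.loads(get_html('http://index.baidu.com/Interface/ptbk?uniqid={}'.format(uniqid)))
    if resp.get('status') != 0 or not resp.get('data'):
        raise RuntimeError('ptbk request failed: {}'.format(resp))
    return resp['data']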


def get_data(keyword, start='2011-01-02', end='2022-01-02'):
    """Download, decrypt, and export the daily index for one keyword."""
    url = ("https://index.baidu.com/api/SearchApi/index?area=0"
           "&word=[[%7B%22name%22:%22{}%22,%22wordType%22:1%7D]]"
           "&startDate={}&endDate={}").format(keyword, start, end)
    data = json.loads(get_html(url))
    uniqid = data['data']['uniqid']
    encrypted = data['data']['userIndexes'][0]['all']['data']
    ptbk = get_ptbk(uniqid)
    result = decrypt(ptbk, encrypted).split(',')
    # Walk one day per value. Note that range() stops one day short of the
    # end date, so a value for the end date itself (if returned) is skipped.
    s = start.split("-")
    e = end.split("-")
    a = datetime.date(int(s[0]), int(s[1]), int(s[2]))
    b = datetime.date(int(e[0]), int(e[1]), int(e[2]))
    y = []
    for node, ordinal in enumerate(range(a.toordinal(), b.toordinal())):
        date = datetime.date.fromordinal(ordinal)
        print(date, result[node])
        y.append(result[node])

    y = pd.DataFrame(np.array(y).flatten())
    # Export paths from earlier runs, kept for reference:
    #y.to_csv(r'D:\alldata\pythonfiles\九寨沟\百度指数数据\九寨沟天气-百度20140101-2015.0101.csv')
    #y.to_csv(r'D:\alldata\pythonfiles\九寨沟\百度指数数据\九寨沟天气-百度20150101-2016.0101.csv')
    #y.to_csv(r'D:\alldata\pythonfiles\九寨沟\百度指数数据\九寨沟天气-百度20160101-2016.0601.csv')
    #y.to_csv(r'D:\alldata\pythonfiles\九寨沟\百度指数数据\九寨沟天气-百度20160601-2017.0101.csv')
    y.to_csv(r'D:\alldata\pythonfiles\四姑娘山\百度指数数据\四姑娘山-百度20190101-20200101.csv')
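
# The CSV above stores only the values. If you also want the dates in the
# file, a minimal sketch (assuming Baidu returns one value per calendar day,
# endpoints inclusive; the helper name is invented for illustration):
def to_frame(values, start, end):
    # Align values to a daily index; truncate to the shorter of the two
    # in case the API returned fewer points than calendar days.
    idx = pd.date_range(start=start, end=end, freq="D")
    n = min(len(idx), len(values))
    return pd.DataFrame({"index": list(values)[:n]}, index=idx[:n])
# e.g. to_frame(result, "2019-01-01", "2020-01-01").to_csv("out.csv")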
    



if __name__ == '__main__':
    keyword = "四姑娘山"
    start_date = "2019-01-01"
    end_date = "2020-01-01"
    get_data(keyword, start_date, end_date)



