Python爬虫实战!爬取百度指数并可视化

文章讲述了作者如何使用Python脚本通过代理IP访问百度指数API,解码加密数据,然后提取并可视化主流编程语言的热度数据。
摘要由CSDN通过智能技术生成

海哥发现百度指数的加密方式又变了,修改了一下之前的代码。

完整代码如下。

import json
import requests
import urllib.request
from datetime import datetime
from datetime import timedelta


# Obtain the proxy IP
def get_proxy():
    """Look up the upstream proxy's exit IP and return a ``requests`` proxies dict.

    The function sends one request to ``http://lumtest.com/myip.json`` through
    the configured upstream proxy, reads the ``ip`` field of the JSON reply and
    formats it into a mapping with ``"http"`` and ``"https"`` keys.

    Returns:
        dict: ``{"http": "http://<ip>", "https": "http://<ip>"}``.

    Raises:
        urllib.error.URLError: if the lookup request fails or times out.
        KeyError: if the JSON reply has no ``ip`` field.
    """
    # NOTE(review): the proxy credentials are hard-coded in this URL; consider
    # loading them from configuration or the environment instead.
    upstream = 'http://brd-customer-hl_5dede465-zone-try-country-cn:pdqt396jal8m@brd.superproxy.io:22225'
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http': upstream, 'https': upstream}))

    # Close the response deterministically and never wait forever on a dead proxy.
    with opener.open('http://lumtest.com/myip.json', timeout=30) as response:
        response_str = response.read().decode('utf-8')
    ip = json.loads(response_str)['ip']

    # NOTE(review): the returned URLs contain only the IP (no port, no
    # credentials) — confirm this address is actually usable as a proxy.
    proxies = {
        "http": "http://{}".format(ip),
        "https": "http://{}".format(ip),
    }

    return proxies


# Decoding function
def decrypt(ptbk, index_data):
    """Decode ``index_data`` with the substitution key ``ptbk``.

    The first half of ``ptbk`` holds the cipher characters and the second half
    holds the characters they stand for, position by position. Every character
    of ``index_data`` is replaced by its counterpart.

    Raises:
        KeyError: if ``index_data`` contains a character that is not in the
            first half of ``ptbk``.
    """
    half = len(ptbk) // 2
    cipher_chars = ptbk[:half]
    plain_chars = ptbk[half:]

    mapping = {}
    for cipher_char, plain_char in zip(cipher_chars, plain_chars):
        mapping[cipher_char] = plain_char

    return "".join(mapping[symbol] for symbol in index_data)


def reCode(data, ptbk):
    """Turn a search-API response into ``{"name": ..., "data": [...]}``.

    Args:
        data: Parsed JSON response; ``data['data']['userIndexes']`` is read.
            Each entry supplies ``word[0]['name']`` and ``all['data']`` (the
            encoded series); the first entry also supplies ``all['startDate']``.
        ptbk: Substitution key passed to ``decrypt`` to decode each series.

    Returns:
        dict: ``name`` is the keyword and ``data`` is a list of ints, padded
        with leading zeros up to the number of days in the start year (365 or
        366). If the first series is an empty string the list is all zeros.
        When several entries are present, the last one wins (as before).
    """
    data = data['data']
    first_entry = data['userIndexes'][0]
    li = first_entry['all']['data']
    startDate = first_entry['all']['startDate']

    # Number of days in the year taken from the first four characters of
    # startDate; fall back to 365 when they cannot be parsed as an integer.
    try:
        year = int(startDate[:4])
    except (TypeError, ValueError):
        days_in_year = 365
    else:
        is_leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
        days_in_year = 366 if is_leap else 365

    if li == '':
        # No data for this keyword: emit a full year of zeros.
        return {
            "name": first_entry['word'][0]['name'],
            "data": [0] * days_in_year,
        }

    result = {}
    for user_index in data['userIndexes']:
        name = user_index['word'][0]['name']
        encoded = user_index['all']['data']
        try:
            values = [int(e) for e in decrypt(ptbk, encoded).split(",")]
        except (KeyError, TypeError, ValueError):
            # Undecodable or non-numeric series: treat as missing rather than
            # aborting, but do not swallow KeyboardInterrupt/SystemExit.
            values = []
        # Left-pad in one step instead of repeated O(n) insert(0, 0) calls.
        missing = days_in_year - len(values)
        if missing > 0:
            values = [0] * missing + values
        result["name"] = name
        result["data"] = values
    return result


def get_index_data(word, year, proxies):
    """Fetch one keyword's index series for a calendar year plus its decode key.

    Two GET requests are made through ``proxies``: the search API for the
    encoded series, then the ``ptbk`` interface for the key that belongs to the
    ``uniqid`` of that response.

    Args:
        word: Keyword to query.
        year: Year used to build the ``<year>-01-01`` .. ``<year>-12-31`` range.
        proxies: Proxies mapping handed to ``requests.get``.

    Returns:
        tuple | None: ``(res_json, ptbk)`` on success, where ``res_json`` is the
        parsed search response and ``ptbk`` the ``data`` field of the key
        response. ``None`` (after printing a message) when the API answers
        ``"bad request"``.
    """
    # Serialise with json.dumps so spaces or quotes inside the keyword survive;
    # compact separators keep the same shape the API was sent before.
    words = json.dumps(
        [[{"name": word, "wordType": 1}]],
        separators=(",", ":"),
        ensure_ascii=False,
    )
    startDate = f"{year}-01-01"
    endDate = f"{year}-12-31"
    url = 'http://index.baidu.com/api/SearchApi/index'
    # Let requests URL-encode the query: a raw '#' (e.g. the keyword "C#")
    # would otherwise start a URL fragment and cut off the rest of the query.
    params = {
        "area": 0,
        "word": words,
        "startDate": startDate,
        "endDate": endDate,
    }

    # Request header configuration
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/plain, */*",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Cipher-Text": "1698156005330_1698238860769_ZPrC2QTaXriysBT+5sgXcnbTX3/lW65av4zgu9uR1usPy82bArEg4m9deebXm7/O5g6QWhRxEd9/r/hqHad2WnVFVVWybHPFg3YZUUCKMTIYFeSUIn23C6HdTT1SI8mxsG5mhO4X9nnD6NGI8hF8L5/G+a5cxq+b21PADOpt/XB5eu/pWxNdwfa12krVNuYI1E8uHQ7TFIYjCzLX9MoJzPU6prjkgJtbi3v0X7WGKDJw9hwnd5Op4muW0vWKMuo7pbxUNfEW8wPRmSQjIgW0z5p7GjNpsg98rc3FtHpuhG5JFU0kZ6tHgU8+j6ekZW7+JljdyHUMwEoBOh131bGl+oIHR8vw8Ijtg8UXr0xZqcZbMEagEBzWiiKkEAfibCui59hltAgW5LG8IOtBDqp8RJkbK+IL5GcFkNaXaZfNMpI=",
        "Referer": "https://index.baidu.com/v2/main/index.html",
        "Accept-Language": "zh-CN,zh;q=0.9",
        'Cookie': '你的cookie'}
    # A timeout keeps a stalled proxy from hanging the whole crawl.
    res = requests.get(url, params=params, headers=headers, proxies=proxies, timeout=30)
    res_json = res.json()

    if res_json.get("message") == "bad request":
        print("抓取关键词:"+word+" 失败,请检查cookie或者关键词是否存在")
        return None

    # Fetch the identifier of this result set
    uniqid = res_json['data']["uniqid"]
    res = requests.get(
        'http://index.baidu.com/Interface/ptbk',
        params={"uniqid": uniqid},
        headers=headers,
        proxies=proxies,
        timeout=30,
    )
    # Fetch the decoding key
    ptbk = res.json()['data']

    return res_json, ptbk


def get_date_list(year):
    """
    Return every day of ``year`` as a ``YYYY-MM-DD`` string, in calendar order.
    """
    fmt = "%Y-%m-%d"
    one_day = timedelta(days=1)
    current = datetime.strptime(f"{year}-01-01", fmt)
    last = datetime.strptime(f"{year}-12-31", fmt)

    days = []
    # Walk forward one day at a time until December 31 has been emitted.
    while current <= last:
        days.append(current.strftime(fmt))
        current += one_day
    return days


def get_word():
    """Crawl the index series for a fixed keyword list and append it to CSV.

    For every keyword and every year in the configured range, the series is
    fetched with ``get_index_data``, decoded with ``reCode`` and paired with
    the dates from ``get_date_list``. Each ``(word, value, date)`` triple is
    printed and appended to ``word2.csv`` as ``word,value,date``.

    A keyword/year that fails is reported and skipped so the remaining ones
    are still crawled.
    """
    proxies = get_proxy()
    startyear = 2023
    endyear = 2023
    words = ["Python", "C", "Java", "C#", "JavaScript", "SQL", "Go", "PHP", "MATLAB", "Swift", "Rust"]
    for word in words:
        for year in range(startyear, endyear + 1):
            try:
                fetched = get_index_data(word, year, proxies)
                if fetched is None:
                    # get_index_data already printed why the request failed.
                    continue
                data, ptbk = fetched
                res = reCode(data, ptbk)
            except Exception as exc:
                # Best-effort crawl: report the failure instead of hiding it,
                # and let Ctrl-C / SystemExit propagate.
                print(f"skipped {word} {year}: {exc!r}")
                continue

            dates = get_date_list(year)
            rows = []
            for num, date in zip(res['data'], dates):
                print(word, num, date)
                rows.append(word + ',' + str(num) + ',' + date + '\n')
            # Open the file once per keyword/year instead of once per row.
            with open('word2.csv', 'a', encoding='utf-8') as f:
                f.writelines(rows)


if __name__ == "__main__":
    get_word()

只需将请求头里 'Cookie' 字段的占位值“你的cookie”替换为自己的 cookie 值即可,获取方式如下图。

对于IP代理,可以根据你需要的类型,如无限机房代理、动态住宅代理、机房代理、移动代理,创建代理通道。

进入通道后,可以看到使用指南,以及其它语言的调用案例。

具体代码如下。

import json
import urllib.request

# NOTE(review): the proxy credentials are hard-coded in this URL; consider
# loading them from configuration or the environment instead.
proxy_url = 'http://brd-customer-hl_5dede465-zone-try-country-cn:pdqt396jal8m@brd.superproxy.io:22225'
opener = urllib.request.build_opener(
    urllib.request.ProxyHandler({'http': proxy_url, 'https': proxy_url}))

# Read the body inside a context manager so the connection is always closed
with opener.open('http://lumtest.com/myip.json') as http_response:
    response = http_response.read()
# Convert the response to a string
response_str = response.decode('utf-8')
# Parse the string with the json library
data = json.loads(response_str)['ip']

# Print the IP address; the result differs on every run, example output: 73.110.170.116
print(data)

运行代码,成功获取到3000+条数据。

接下来使用pynimate进行可视化展示。

具体代码如下。

from matplotlib import pyplot as plt
import pandas as pd
import pynimate as nim

# Chinese text display: select the SimHei font for sans-serif text
plt.rcParams['font.sans-serif'] = ['SimHei']  #Windows
plt.rcParams['axes.unicode_minus'] = False


# Update the bar chart
def post_update(ax, i, datafier, bar_attr):
    """Restyle ``ax``: hide all four spines and set the dark background colour.

    ``i``, ``datafier`` and ``bar_attr`` are accepted but not used.
    """
    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)
    ax.set_facecolor("#001219")


# Read the data: header-less CSV with columns name, number, day; index by day
df_data = pd.read_csv('word2.csv', encoding='utf-8', header=None, names=['name', 'number', 'day']).set_index("day")
print(df_data)
# Data processing: pivot to one row per day and one column per name,
# filling gaps with 0, then keep only the first 30 rows
df = pd.pivot_table(df_data, values='number', index=['day'], columns=['name'], fill_value=0).head(30)
print(df)
# Save
# df = pd.read_csv("word.csv").set_index("day")

# Create the canvas
cnv = nim.Canvas(figsize=(12.8, 7.2), facecolor="#001219")
# NOTE(review): "%Y-%m-%d" is presumably the format of the index dates and
# "2h" the interpolation step — confirm against the pynimate documentation
bar = nim.Barplot(
    df, "%Y-%m-%d", "2h", post_update=post_update, rounded_edges=True, grid=False, n_bars=10
)
# Title settings
bar.set_title("主流编程语言热度排行(百度指数)", color="w", weight=600, x=0.15, size=30)
# Time label settings: show the date at index i formatted as YYYY-MM-DD
bar.set_time(
    callback=lambda i, datafier: datafier.data.index[i].strftime("%Y-%m-%d"), color="w", y=0.2, size=20
)

# Text colour settings
bar.set_bar_annots(color="w", size=13)
bar.set_xticks(colors="w", length=0, labelsize=13)
bar.set_yticks(colors="w", labelsize=13)
# Bar border settings
bar.set_bar_border_props(
    edge_color="black", pad=0.1, mutation_aspect=1, radius=0.2, mutation_scale=0.6
)
cnv.add_plot(bar)
cnv.animate()
# Display
# plt.show()
# Save as gif
# NOTE(review): arguments look like (file name, fps, extension) — confirm
cnv.save("code", 24, "gif")

结果如下所示。

万水千山总是情,点个 👍 行不行。

  • 15
    点赞
  • 15
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值