序言:
首先感谢以下两篇文章的作者:
《【项目小结】爬虫学习进阶:获取百度指数历史数据》(CSDN 博客)
《Python 爬虫 | 百度指数爬虫项目(简易版)》(CSDN 博客)
附上代码:
"""Crawl yearly Baidu Index averages for a list of stock names.

Reads stock names from '企业选择.xlsx' (sheet '读取列', column '股票名称'),
queries the Baidu Index search API for every name and every year from
2012 to 2022, and caches each raw JSON response under ./res/.  An
openpyxl workbook is prepared with company names in column A and years
in row 1; a second pass (the commented-out calls at the bottom) reads
the cached files and fills in the yearly averages.
"""
import json
import os
import time

import pandas as pd
import requests
from openpyxl import Workbook

# NOTE(review): the original snippet referenced an undefined name `Cookie`,
# which raised NameError on the first request.  Paste a logged-in Baidu
# account cookie string here before running.
Cookie = ""

# Read the stock names we want data for.
df = pd.read_excel('企业选择.xlsx', sheet_name='读取列')
shape = df.shape  # (rows, cols); shape[0] is the number of companies
qiye_name = [name.strip() for name in df['股票名称']]

# Build the output workbook: column A = company names, row 1 = the years
# 2012..2022 (11 columns starting at column B).
wb = Workbook()
ws = wb.active
for row in range(2, shape[0] + 2):
    ws.cell(row=row, column=1, value=qiye_name[row - 2])
for column in range(11):
    ws.cell(row=1, column=column + 2, value=2012 + column)

# Request timeout in seconds.  The original defined this constant but
# never passed it to requests.get, so a hung connection blocked forever.
timeout = 3000


def get_index_data(keys, year):
    """Fetch the Baidu Index for one keyword over one calendar year.

    The raw JSON response is cached to res/<keys>_<year>.json so that a
    later pass (``data_search``) can extract values without re-requesting.

    :param keys: keyword (company/stock name) to query
    :param year: four-digit year; the requested range is Jan 1 - Dec 31
    :return: path of the cached JSON file
    """
    # The API expects a JSON-like word list with double quotes, no spaces.
    words = [[{"name": keys, "wordType": 1}]]
    words = str(words).replace(" ", "").replace("'", "\"")
    startDate = f"{year}-01-01"
    endDate = f"{year}-12-31"
    url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&startDate={startDate}&endDate={endDate}'
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/plain, */*",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Cipher-Text": "1698156005330_1698238860769_ZPrC2QTaXriysBT+5sgXcnbTX3/lW65av4zgu9uR1usPy82bArEg4m9deebXm7/O5g6QWhRxEd9/r/hqHad2WnVFVVWybHPFg3YZUUCKMTIYFeSUIn23C6HdTT1SI8mxsG5mhO4X9nnD6NGI8hF8L5/G+a5cxq+b21PADOpt/XB5eu/pWxNdwfa12krVNuYI1E8uHQ7TFIYjCzLX9MoJzPU6prjkgJtbi3v0X7WGKDJw9hwnd5Op4muW0vWKMuo7pbxUNfEW8wPRmSQjIgW0z5p7GjNpsg98rc3FtHpuhG5JFU0kZ6tHgU8+j6ekZW7+JljdyHUMwEoBOh131bGl+oIHR8vw8Ijtg8UXr0xZqcZbMEagEBzWiiKkEAfibCui59hltAgW5LG8IOtBDqp8RJkbK+IL5GcFkNaXaZfNMpI=",
        "Referer": "https://index.baidu.com/v2/main/index.html",
        "Accept-Language": "zh-CN,zh;q=0.9",
        'Cookie': Cookie,
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    res_json = res.json()
    # .get avoids a KeyError when the response carries no "message" field.
    message = res_json.get("message")
    if message == "bad request":
        print("抓取关键词:" + keys + " 失败,请检查cookie或者关键词是否存在")
    if message == "request block":
        # The account has been temporarily blocked; abort the whole run.
        os._exit(0)
    # Cache the raw response (even for "bad request", matching the
    # original control flow) so failures can be inspected later.
    # Decoding of daily values (uniqid/ptbk) is intentionally skipped:
    # this script only needs the yearly averages.
    os.makedirs('res', exist_ok=True)
    filename = f"{keys}_{year}.json"
    file_path = os.path.join('res', filename)
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(res_json, json_file, ensure_ascii=False, indent=4)
    return file_path


def data_search(mc, years):
    """Return the yearly average index from one cached keyword/year file.

    :param mc: keyword (company name) previously fetched by get_index_data
    :param years: the year of the cached file
    :return: the 'all' average from the cached API response
    :raises KeyError, TypeError: when the cached response holds no data
        (e.g. the account was blocked and the file contains no payload)
    """
    # os.path.join instead of the original Windows-only "res\\..." literal,
    # so the reader works on any OS and matches how the file was written.
    file_path = os.path.join('res', f"{mc}_{years}.json")
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data['data']['generalRatio'][0]['all']['avg']


# First pass: download and cache every (company, year) combination.
# Sleep 3 s between requests to avoid getting the account blocked.
for i in range(shape[0]):
    for j in range(11):
        get_index_data(qiye_name[i], 2012 + j)
        print(i, j)
        time.sleep(3)
        # Second pass (run once ./res is populated): uncomment to fill
        # the workbook with the cached averages and save it.
        # ws.cell(row=i + 2, column=j + 2, value=data_search(qiye_name[i], 2012 + j))
# wb.save("企业指数.xlsx")
踩过的坑以及注意事项:
首先建议使用代理来爬取数据(稳定的代理不容易出现 HTTP 报错)。其次,在采集过程中发现封 IP 的情况非常少(即使不开启代理也问题不大),但容易把账号暂时封禁一段时间,此时读取到的 JSON 文件为空、取不到数据,所以这里选择每隔 3 秒采集一次。本文展示的案例是采集 700 多家股票近十年的数据:企业名称存在 Excel 中,时间范围为 2012–2022 年,读取出的数据也存储到 Excel 文件中(注意一定要是可以查询到的企业,否则会报错)。如果有条件的话,建议多备几个百度账号,一个被封禁就换另一个;一般一个账号大概能爬取 2000 多条数据。由于本文不需要采集每天的数据,省去了具体的解码步骤;每日数据需要解码,解码过程见上述两个链接。此外,也可以改用 selenium 的方式来爬取数据。