背景
1. 读取期货的历史数据。数据的存储形式为:一个文件夹下有多个 h5 文件,每个 h5 文件中包含多个数据集。
2. 获取期货的所有日线数据,与官方交易日历进行比对,查找缺失的交易日并将其追加记录到日志文件中。
验证数据
import h5py
import numpy as np
import random
def copy_h5_file(src_file, dst_file, dataset_name='OI2007'):
    """Copy one dataset from an HDF5 file into a newly created HDF5 file.

    Args:
        src_file: Path of the HDF5 file to read from.
        dst_file: Path of the HDF5 file to write. It is opened in ``'w'``
            mode, so any existing file at this path is replaced.
        dataset_name: Name of the dataset to copy. Defaults to ``'OI2007'``,
            the contract name the script was originally written for.
    """
    # Load the whole dataset into memory first so the source file is closed
    # before the destination file is opened.
    with h5py.File(src_file, 'r') as src:
        data = src[dataset_name][:]

    with h5py.File(dst_file, 'w') as dst:
        dst[dataset_name] = data
def read_h5_file(file_path, dataset_name='OI2007', n_rows=5):
    """Return the leading rows of a dataset stored in an HDF5 file.

    Args:
        file_path: Path of the HDF5 file to open read-only.
        dataset_name: Name of the dataset to read. Defaults to ``'OI2007'``.
        n_rows: Number of leading rows to return. Defaults to 5.

    Returns:
        The slice ``dataset[:n_rows]``, read into memory before the file is
        closed.
    """
    with h5py.File(file_path, 'r') as f:
        return f[dataset_name][:n_rows]
def delete_row(file_path, dataset_name='OI2007'):
    """Remove one randomly chosen row from a dataset and return that row.

    The dataset is deleted and recreated from the remaining rows with
    default ``create_dataset`` settings.

    Args:
        file_path: Path of the HDF5 file to modify in place (``'r+'`` mode).
        dataset_name: Name of the dataset to modify. Defaults to ``'OI2007'``.

    Returns:
        The row that was removed.

    Raises:
        ValueError: If the dataset has no rows to delete.
    """
    with h5py.File(file_path, 'r+') as f:
        # Read everything once; the dataset is replaced wholesale below.
        rows = f[dataset_name][:]
        if len(rows) == 0:
            raise ValueError(
                f"dataset {dataset_name!r} in {file_path!r} is empty; "
                "no row to delete"
            )

        index = random.randrange(len(rows))
        # Copy so the returned row does not alias the in-memory array.
        deleted_data = rows[index].copy()
        remaining = np.delete(rows, index, axis=0)

        # Replace the dataset with the shortened array.
        del f[dataset_name]
        f.create_dataset(dataset_name, data=remaining)
    return deleted_data
# Test driver: copy the source file, read a few rows from the copy, then
# delete one random row from the copy and print it.
src_file = r'F:\daybar\future_daybar.h5' # path of the source data file
dst_file = 'day_test_copy.h5' # path of the copied file (a file path, not a dataset name)
copy_h5_file(src_file, dst_file)
# The rows read here are not used again in this script.
data = read_h5_file(dst_file)
deleted_data = delete_row(dst_file)
print('已删除数据:', deleted_data)
完整代码
import pandas as pd
import h5py
import rqdatac as rd
import os
import numpy as np
from datetime import datetime
def read_all_h5_datasets(directory_path, dataset_name):
    """Collect the first column of one dataset from every ``.h5`` file.

    Every file in ``directory_path`` whose name ends with ``.h5`` is opened
    read-only. Files that do not contain ``dataset_name`` are skipped.

    Args:
        directory_path: Directory to scan (not recursive).
        dataset_name: Name of the dataset to look up in each file.

    Returns:
        A DataFrame with a single ``'date'`` column holding element 0 of
        every row found, or ``None`` if any error occurred (the error is
        printed). The DataFrame is empty when no file has the dataset.
    """
    date_column = 0  # position of the date field within each row
    dates = []
    try:
        # Sorted so the row order does not depend on the OS listing order.
        for filename in sorted(os.listdir(directory_path)):
            if not filename.endswith(".h5"):
                continue
            h5_path = os.path.join(directory_path, filename)
            with h5py.File(h5_path, mode="r") as h5_file:
                if dataset_name not in h5_file:
                    continue
                # One bulk read instead of one HDF5 read per row.
                rows = h5_file[dataset_name][:]
            dates.extend(row[date_column] for row in rows)
        return pd.DataFrame(dates, columns=['date'])
    except Exception as e:
        # Best-effort boundary: callers treat None as "could not read".
        print(f"Error occurred while reading H5 files: {e}")
        return None
def get_trading_dates(symbol):
    """Fetch the trading dates between a contract's listing and delisting.

    Initialises rqdatac with the placeholder credentials below (replace
    ``'your_license'`` with a real licence), looks the symbol up, and asks
    rqdatac for the trading dates between the instrument's ``listed_date``
    and ``de_listed_date``. A missing attribute is passed on as ``None``.

    Args:
        symbol: Instrument identifier passed to ``rd.instruments``.

    Returns:
        Whatever ``rd.get_trading_dates`` returns, or ``None`` when the
        lookup yields nothing or any error occurs (a message is printed).
    """
    try:
        rd.init('license', 'your_license')
        instrument = rd.instruments(symbol, market='cn')
        if not instrument:
            print(f"Error: No instruments found for {symbol}")
            return None

        listed = getattr(instrument, 'listed_date', None)
        delisted = getattr(instrument, 'de_listed_date', None)
        return rd.get_trading_dates(start_date=listed, end_date=delisted)
    except Exception as e:
        print(f"Error occurred while getting trading dates: {e}")
        return None
def compare_and_log_missing_dates(future_dates, trading_dates, symbol, log_file):
    """Log the dates missing from, or extra in, the stored futures data.

    Args:
        future_dates: Dates present in the stored data. A ``pd.Series`` is
            parsed with the ``%Y%m%d`` format; a ``pd.DatetimeIndex`` is
            reduced to calendar dates; any other iterable is used as-is and
            must already contain ``datetime.date`` objects.
        trading_dates: Reference trading dates; anything list-like that
            ``pd.to_datetime`` accepts.
        symbol: Contract name written at the start of each log line.
        log_file: Path of the log file. Lines are appended as UTF-8.

    One line is appended per date that is in ``trading_dates`` but not in
    ``future_dates`` (missing), then one per date that is in
    ``future_dates`` but not in ``trading_dates`` (extra).
    """
    if isinstance(future_dates, pd.Series):
        future_days = list(pd.to_datetime(future_dates, format='%Y%m%d').dt.date)
    elif isinstance(future_dates, pd.DatetimeIndex):
        future_days = list(future_dates.date)
    else:
        future_days = list(future_dates)
    trading_days = list(pd.to_datetime(trading_dates).date)

    # Sets give O(1) membership tests; the lists keep the output order.
    future_lookup = set(future_days)
    trading_lookup = set(trading_days)
    missing_dates = [day for day in trading_days if day not in future_lookup]
    extra_dates = [day for day in future_days if day not in trading_lookup]

    lines = [
        f"{symbol} {day.year}年{day.month}月{day.day}日缺失数据\n"
        for day in missing_dates
    ]
    lines += [
        f"{symbol} {day.year}年{day.month}月{day.day}日多出数据\n"
        for day in extra_dates
    ]

    # Explicit encoding: the messages contain Chinese text, which the
    # platform default encoding cannot always represent.
    with open(log_file, 'a', encoding='utf-8') as log:
        log.writelines(lines)
    print(f"Comparison and logging completed. Results appended to {log_file}")
def write_to_log(log_file, content, encoding='utf-8'):
    """Append ``content`` to ``log_file``, creating the file if needed.

    Args:
        log_file: Path of the log file, opened in append mode.
        content: Text to write. No newline is added.
        encoding: Text encoding of the file. Defaults to UTF-8 so non-ASCII
            text is written the same way on every platform.
    """
    with open(log_file, 'a', encoding=encoding) as f:
        f.write(content)
def main(h5_directory=r"F:\daybar", dataset_name="OI2007"):
    """Compare one contract's stored daily bars against the trading calendar.

    Reads the dates stored for ``dataset_name`` under ``h5_directory``,
    fetches the trading dates for the same symbol, and appends any
    differences to ``<dataset_name>_missing_dates.log``.

    Args:
        h5_directory: Directory containing the ``.h5`` files.
        dataset_name: Dataset (contract) name; also used as the symbol for
            the trading-date lookup and in the log file name.
    """
    # Read the dates stored for this contract.
    future_data = read_all_h5_datasets(h5_directory, dataset_name)
    if future_data is None:
        print("Error: Unable to read future data.")
        return

    # Fetch the reference trading dates for the same symbol.
    trading_dates = get_trading_dates(symbol=dataset_name)
    if trading_dates is None:
        print("Error: Unable to get trading dates.")
        return

    # Compare the two and append the differences to the log file.
    log_file = f"{dataset_name}_missing_dates.log"
    compare_and_log_missing_dates(future_data['date'], trading_dates, dataset_name, log_file)


if __name__ == "__main__":
    main()
如需具体的 "license",请私信或联系米筐官方获取。
———————————————————————————————
Nothing comes from nothing.