02历史数据缺失值检验

背景

1. 读取期货的历史数据,保存路径形式为一个文件夹下有多个h5文件,每个h5文件下有多个数据集。
2. 获取期货的所有日线数据,与官方数据进行比对,查找缺失值并将其追加记录到日志文件中。

验证数据

import h5py
import numpy as np
import random

# Copy one dataset from an HDF5 file into a new HDF5 file.
def copy_h5_file(src_file, dst_file, dataset_name='OI2007'):
    """Copy a single dataset from ``src_file`` into a fresh ``dst_file``.

    Args:
        src_file: Path of the HDF5 file to read from (opened read-only).
        dst_file: Path of the HDF5 file to create. It is opened with mode
            ``'w'``, so an existing file at this path is overwritten.
        dataset_name: Name of the dataset to copy. Defaults to ``'OI2007'``,
            the contract used throughout this script.

    Only the dataset's values are copied; the values are read fully into
    memory before the destination file is written.
    """
    # Read the whole dataset into memory so the source can be closed first.
    with h5py.File(src_file, 'r') as src:
        values = src[dataset_name][:]

    # Write the values under the same dataset name in the new file.
    with h5py.File(dst_file, 'w') as dst:
        dst[dataset_name] = values


# Read the leading rows of a dataset in an HDF5 file.
def read_h5_file(file_path, dataset_name='OI2007', n_rows=5):
    """Return the first ``n_rows`` rows of a dataset.

    Args:
        file_path: Path of the HDF5 file (opened read-only).
        dataset_name: Name of the dataset to read. Defaults to ``'OI2007'``.
        n_rows: Number of leading rows to return. Defaults to 5; ``None``
            returns every row because it is used as the slice stop.

    Returns:
        The result of slicing the dataset with ``[:n_rows]``.
    """
    with h5py.File(file_path, 'r') as f:
        return f[dataset_name][:n_rows]

# Delete one randomly chosen row from a dataset and report which row it was.
def delete_row(file_path, dataset_name='OI2007'):
    """Remove one random row from a dataset and return the removed row.

    The dataset is read into memory, one row is dropped with ``np.delete``,
    and the dataset is deleted and re-created from the remaining rows.

    Args:
        file_path: Path of the HDF5 file (opened with mode ``'r+'``).
        dataset_name: Name of the dataset to modify. Defaults to ``'OI2007'``.

    Returns:
        A copy of the row that was removed.

    Raises:
        ValueError: If the dataset has no rows. The file is left untouched.
    """
    with h5py.File(file_path, 'r+') as f:
        # Read once into memory instead of working on the live Dataset handle.
        rows = f[dataset_name][:]
        if rows.shape[0] == 0:
            raise ValueError(
                f"Dataset {dataset_name!r} in {file_path!r} is empty; nothing to delete"
            )

        # Pick the victim row and keep a copy of it for the caller.
        index = random.randrange(rows.shape[0])
        deleted_data = rows[index].copy()

        remaining = np.delete(rows, index, axis=0)

        # The dataset is replaced wholesale: drop the old one, write the new one.
        del f[dataset_name]
        f.create_dataset(dataset_name, data=remaining)

    return deleted_data

# Test code: copy the source file, read a preview of the copy, then delete one
# random row from the copy and print the deleted row.
src_file = r'F:\daybar\future_daybar.h5'  # path of the source HDF5 data file
dst_file = 'day_test_copy.h5'      # path of the scratch copy that gets modified (a file path, not a dataset name)
copy_h5_file(src_file, dst_file)
data = read_h5_file(dst_file)  # preview rows; not used again in this snippet
deleted_data = delete_row(dst_file)   
print('已删除数据:', deleted_data)  

完整代码

import pandas as pd
import h5py
import rqdatac as rd
import os
import numpy as np
from datetime import datetime

def read_all_h5_datasets(directory_path, dataset_name):
    """Collect the first field of every row of one dataset across a folder.

    Every ``*.h5`` file directly inside ``directory_path`` is opened; files
    that contain ``dataset_name`` contribute element 0 of each of that
    dataset's rows, which this script treats as the date.

    Args:
        directory_path: Folder containing the HDF5 files.
        dataset_name: Name of the dataset to look for in each file.

    Returns:
        A ``DataFrame`` with a single ``date`` column (empty when nothing
        matched), or ``None`` if any error occurred; the error is printed.
    """
    date_column_index = 0
    data_list = []

    try:
        # Sorted so the row order does not depend on the OS directory order.
        for filename in sorted(os.listdir(directory_path)):
            if not filename.endswith(".h5"):
                continue
            h5_path = os.path.join(directory_path, filename)
            with h5py.File(h5_path, mode="r") as h5_file:
                if dataset_name not in h5_file:
                    continue
                # One bulk read; iterating the Dataset itself would issue a
                # separate HDF5 read for every row.
                rows = h5_file[dataset_name][:]
                data_list.extend(row[date_column_index] for row in rows)
        return pd.DataFrame(data_list, columns=['date'])
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print(f"Error occurred while reading H5 files: {e}")
        return None

def get_trading_dates(symbol):
    """Fetch the trading dates between a contract's listing and delisting.

    Initialises rqdatac, looks up ``symbol`` on the ``cn`` market and asks
    for the trading dates between the instrument's ``listed_date`` and
    ``de_listed_date`` attributes; ``None`` is passed for an attribute the
    instrument does not have.

    Returns:
        The value returned by ``rd.get_trading_dates``, or ``None`` when no
        instrument is found or any exception is raised (a message is printed
        in both cases).
    """
    try:
        # Replace 'your_license' with a real RiceQuant (rqdatac) licence.
        rd.init('license', 'your_license')
        contract = rd.instruments(symbol, market='cn')

        if not contract:
            print(f"Error: No instruments found for {symbol}")
            return None

        first_day = getattr(contract, 'listed_date', None)
        last_day = getattr(contract, 'de_listed_date', None)
        return rd.get_trading_dates(start_date=first_day, end_date=last_day)
    except Exception as exc:
        print(f"Error occurred while getting trading dates: {exc}")
        return None

def compare_and_log_missing_dates(future_dates, trading_dates, symbol, log_file):
    """Append the differences between local and official dates to a log file.

    Args:
        future_dates: Dates present in the local data. A ``pd.Series`` is
            parsed with the ``%Y%m%d`` format; any other input (for example
            a ``pd.DatetimeIndex``) is passed to ``pd.to_datetime`` as is.
        trading_dates: Official trading dates, anything ``pd.to_datetime``
            accepts.
        symbol: Contract name written at the start of every log line.
        log_file: Path of the log file. It is opened in append mode with
            UTF-8 encoding and is created even when there is nothing to log.

    One line is written per official date missing from the local data, then
    one line per local date absent from the official calendar. Dates are
    written zero-padded as YYYYMMDD so that e.g. 2020-01-11 and 2020-11-01
    cannot be confused.
    """
    if isinstance(future_dates, pd.Series):
        local_dates = list(pd.to_datetime(future_dates, format='%Y%m%d').dt.date)
    else:
        local_dates = list(pd.to_datetime(future_dates).date)

    official_dates = list(pd.to_datetime(trading_dates).date)

    # Sets give O(1) membership tests; the lists keep the original ordering.
    local_lookup = set(local_dates)
    official_lookup = set(official_dates)

    missing_dates = [day for day in official_dates if day not in local_lookup]
    extra_dates = [day for day in local_dates if day not in official_lookup]

    # Explicit encoding: the log lines contain Chinese text.
    with open(log_file, 'a', encoding='utf-8') as log:
        log.writelines(f"{symbol} {day:%Y%m%d}日缺失数据\n" for day in missing_dates)
        log.writelines(f"{symbol} {day:%Y%m%d}日多出数据\n" for day in extra_dates)

    print(f"Comparison and logging completed. Results appended to {log_file}")

def write_to_log(log_file, content):
    """Append ``content`` to ``log_file`` as UTF-8 text.

    Args:
        log_file: Path of the log file; created if it does not exist.
        content: Text to append. No newline is added.
    """
    # Explicit encoding so non-ASCII log text does not depend on the locale.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(content)
        
def main():
    """Check one contract's local daily data against the official calendar.

    Reads the dates stored for ``dataset_name`` in the HDF5 folder, fetches
    the official trading dates for the same symbol, and appends any missing
    or extra dates to ``<dataset_name>_missing_dates.log``. If either input
    cannot be obtained, an error is printed and nothing is logged.
    """
    h5_directory = r"F:\daybar"  # folder that holds the HDF5 files
    dataset_name = "OI2007"   # dataset (contract) to check

    # Load the dates present in the local futures data.
    future_data = read_all_h5_datasets(h5_directory, dataset_name)
    if future_data is None:
        print("Error: Unable to read future data.")
        return

    # Fetch the official trading dates for the same contract.
    trading_dates = get_trading_dates(symbol=dataset_name)
    if trading_dates is None:
        print("Error: Unable to get trading dates.")
        return

    # Compare both sides and record the differences.
    log_file = f"{dataset_name}_missing_dates.log"    # log file name; adjust as needed
    compare_and_log_missing_dates(future_data['date'], trading_dates, dataset_name, log_file)

# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

如需具体的 'license'(米筐授权码),请私信作者或联系米筐官方获取。
———————————————————————————————
Nothing comes from nothing.

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值