将 pd.read_hdf 与 columns 参数一起使用。参见以下示例:

import numpy as np
import pandas as pd
from contexttimer import Timer
def create_sample_df():
    """Build a random 100000 x 5000 DataFrame and save it to ``file.h5``.

    The frame is stored under the key ``'df'`` with ``format='table'``.
    The elapsed time for building and writing the frame is printed.
    """
    with Timer() as t:
        df = pd.DataFrame(np.random.rand(100000, 5000))
        # Pass ``key`` by keyword: positional arguments other than the path
        # are deprecated in pandas 2.1+ and keyword-only from pandas 3.0.
        df.to_hdf('file.h5', key='df', format='table')
    print('create_sample_df: %.2fs' % t.elapsed)
def read_full_df():
    """Time loading the whole DataFrame stored in ``file.h5``.

    Every row and column is read; the original note warns that the data
    is too large for this to be practical.
    """
    with Timer() as timer:
        frame = pd.read_hdf('file.h5')
    print('read_full_df: %.2fs' % timer.elapsed)
def read_df_with_start_stop():
    """Time reading only rows 0-4 of ``file.h5`` (all columns).

    Useful for a quick look at every column without loading the full data.
    """
    with Timer() as timer:
        preview = pd.read_hdf('file.h5', start=0, stop=5)
    print('read_df_with_start_stop: %.2fs' % timer.elapsed)
def read_df_with_columns():
    """Time reading only the needed columns (0 through 3) of ``file.h5``."""
    wanted_columns = [0, 1, 2, 3]
    with Timer() as timer:
        subset = pd.read_hdf('file.h5', columns=wanted_columns)
    print('read_df_with_columns: %.2fs' % timer.elapsed)
if __name__ == '__main__':
    # Run the benchmarks in order: the sample file must be written before
    # any of the read benchmarks can open it.
    benchmarks = (
        create_sample_df,
        read_full_df,
        read_df_with_start_stop,
        read_df_with_columns,
    )
    for run_benchmark in benchmarks:
        run_benchmark()
# outputs:
# create_sample_df: 51.25s
# read_full_df: 5.21s
# read_df_with_start_stop: 0.03s
# read_df_with_columns: 4.44s
read_df_with_columns 只降低了内存(空间)开销,但不一定能提高读取速度。以上结论的前提是 HDF5 文件以 table 格式保存(否则无法使用 columns 参数)。