辅助函数：从一张大型数据表中按索引列分组创建子表，并快速检查所选各列在每个分组内的取值是否唯一。
# Column used as the grouping key when building the players sub-table.
player_index = 'playerShort'

# Columns carried into the players sub-table, one entry per line for easy diffing.
player_cols = [
    'birthday',
    'height',
    'weight',
    'position',
    'photoID',
    'rater1',
    'rater2',
]
def get_subgroup(dataframe, g_index, g_columns):
    """Build a one-row-per-group sub-table and warn when a column varies within a group.

    Args:
        dataframe: pandas DataFrame to group.
        g_index: Grouping key, passed straight to ``DataFrame.groupby``.
        g_columns: Column names to carry into the sub-table.

    Returns:
        DataFrame indexed by ``g_index`` holding the per-group ``max`` of every
        column in ``g_columns``.
    """
    grouped = dataframe.groupby(g_index)
    distinct_counts = grouped.agg({col: 'nunique' for col in g_columns})

    # Warn as soon as ANY column has more than one distinct value in ANY group.
    # The previous `g[g > 1].dropna()` test only fired when a single group was
    # non-unique in EVERY column at once (dropna defaults to how='any'), so a
    # lone inconsistent column slipped through unnoticed.
    if (distinct_counts > 1).any().any():
        print("Warning: you probably assumed this had all unique values but it doesn't.")

    return grouped.agg({col: 'max' for col in g_columns})
# Collapse the full table to one row per player, keeping only the player-level columns.
players = get_subgroup(
    dataframe=df,
    g_index=player_index,
    g_columns=player_cols,
)
# Preview the first few rows of the resulting players table.
players.head()
将子表保存为 gzip 压缩的 CSV 文件，然后重新读入，检查读回的数据与原子表是否一致：
def save_subgroup(dataframe, g_index, subgroup_name, prefix = 'raw_'):
    """Write a sub-table to a gzipped CSV and verify that it reloads unchanged.

    The file is named ``<prefix><subgroup_name>.csv.gz``. After writing, the
    file is read back with ``g_index`` as its index and compared against the
    input with ``DataFrame.equals``; the outcome is printed, not returned.

    Args:
        dataframe: pandas DataFrame to persist.
        g_index: Index column name(s) to restore when reading the file back.
        subgroup_name: Middle part of the output file name.
        prefix: Leading part of the output file name.
    """
    target = prefix + subgroup_name + '.csv.gz'
    dataframe.to_csv(target, compression='gzip', encoding='UTF-8')

    # Round-trip: reload exactly what was written so the comparison below
    # reflects what a later reader of the file would get.
    reloaded = pd.read_csv(
        target,
        compression='gzip',
        index_col=g_index,
        encoding='UTF-8',
    )

    if dataframe.equals(reloaded):
        verdict = 'Test-passed: we recover the equivalent subgroup dataframe.'
    else:
        verdict = 'Warning -- equivalence test!!! Double-check.'
    print(verdict)
# Persist the players table (default 'raw_' prefix) and run the round-trip check.
save_subgroup(
    dataframe=players,
    g_index=player_index,
    subgroup_name='players',
)
Test-passed: we recover the equivalent subgroup dataframe.