关注我的公众号YueTan进行交流探讨
欢迎关注数据比赛方案仓库 https://github.com/hongyingyue/Competition-solutions
base
def reader(f):
    """Load one accelerometer CSV and attach engineered features.

    Steps: read the raw series (Time index, 3 acc channels, 3 targets),
    derive the series Id from the filename, left-merge the task-level and
    subject-level kmeans cluster labels (missing -> -1), compute tsflex
    window features, and forward-fill the remaining NaNs.

    Depends on module-level globals: pd, np, tasks, subjects, fc.
    Returns the enriched DataFrame, or None if the file cannot be processed.
    """
    try:
        df = pd.read_csv(f, index_col="Time", usecols=['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn' , 'Walking'])
        # Series id is the filename stem (".../<Id>.csv")
        df['Id'] = f.split('/')[-1].split('.')[0]
        # Cluster labels per task and per subject; -1 marks "no cluster"
        df = pd.merge(df, tasks[['Id','t_kmeans']], how='left', on='Id').fillna(-1)
        df = pd.merge(df, subjects[['Id','s_kmeans']], how='left', on='Id').fillna(-1)
        # Rolling-window time-series features (tsflex FeatureCollection)
        df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
        df = df.merge(df_feats, how="left", left_index=True, right_index=True)
        # ffill(): fillna(method="ffill") is deprecated since pandas 2.1
        df.ffill(inplace=True)
        return df
    except Exception:
        # Best-effort: skip unreadable/malformed files (caller concat
        # tolerates the implicit None). Narrowed from a bare `except:`
        # so Ctrl-C / SystemExit are no longer swallowed.
        return None
# Build the training frame: run reader() over every file, stack the
# results, and zero-fill whatever NaNs remain after the forward fill.
train = pd.concat([reader(f) for f in tqdm(train)]).fillna(0)
print(train.shape)

# Feature columns = everything except identifiers, targets and
# bookkeeping columns.
_non_features = {'Id', 'Time', 'StartHesitation', 'Turn', 'Walking', 'Valid', 'Task', 'Event'}
cols = [c for c in train.columns if c not in _non_features]
pcols = ['StartHesitation', 'Turn', 'Walking']        # prediction targets
scols = ['Id', 'StartHesitation', 'Turn', 'Walking']  # submission columns
- 从tasks中增加kmeans类别
- 从subject中增加kmeans类别
- 用tsflex增加时序特征【basic_feats, emg_feats】
- ensemble.ExtraTreesRegressor同时预测三个目标
base
GroupKfold Cross-Validation tsflex-0.246
- GroupKfold Cross Validation
def reader(f):
    """Load one accelerometer CSV and attach engineered features
    (metadata variant).

    Like the base version, but also records the source subfolder as
    'Module' and replaces the standalone subject-kmeans merge with a
    merge against metadata_complex (Subject, Visit, Test, Medication,
    s_kmeans), so subject-level metadata rides along with the cluster id.

    Depends on module-level globals: pd, np, pathlib, tasks,
    metadata_complex, fc.
    Returns the enriched DataFrame, or None if the file cannot be processed.
    """
    try:
        df = pd.read_csv(f, index_col="Time", usecols=['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn' , 'Walking'])
        # Series id is the filename stem; Module is the parent folder name
        # (distinguishes the tdcsfog / defog sources)
        df['Id'] = f.split('/')[-1].split('.')[0]
        df['Module'] = pathlib.Path(f).parts[-2]
        df = pd.merge(df, tasks[['Id','t_kmeans']], how='left', on='Id').fillna(-1)
        # Subject-level info comes from metadata_complex instead of the
        # plain subjects kmeans table used by the base version.
        df = pd.merge(df, metadata_complex[['Id','Subject']+['Visit','Test','Medication','s_kmeans']], how='left', on='Id').fillna(-1)
        df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
        df = df.merge(df_feats, how="left", left_index=True, right_index=True)
        # ffill(): fillna(method="ffill") is deprecated since pandas 2.1
        df.ffill(inplace=True)
        return df
    except Exception:
        # Best-effort skip of unreadable files; narrowed from bare `except:`
        return None
- 不再单独采用subject聚类特征,而是采用了meta_data中关于subject的特征
base
Simple EDA on Time for targets-0.306
import pathlib
def reader(f):
    """Load one accelerometer CSV and attach engineered features
    (metadata + Time_frac variant).

    Adds 'Time_frac' — the row's position as a fraction of the series
    length (the index is the Time step) — on top of the Module column,
    task kmeans, and metadata_complex subject features.

    Depends on module-level globals: pd, np, pathlib, tasks,
    metadata_complex, fc.
    Returns the enriched DataFrame, or None if the file cannot be processed.
    """
    try:
        df = pd.read_csv(f, index_col="Time", usecols=['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn' , 'Walking'])
        df['Id'] = f.split('/')[-1].split('.')[0]
        df['Module'] = pathlib.Path(f).parts[-2]
        # Normalized position in the recording; the index holds "Time"
        df['Time_frac'] = (df.index / df.index.max()).values
        df = pd.merge(df, tasks[['Id','t_kmeans']], how='left', on='Id').fillna(-1)
        # Subject-level info comes from metadata_complex (replaces the
        # earlier subjects s_kmeans-only merge).
        df = pd.merge(df, metadata_complex[['Id','Subject']+['Visit','Test','Medication','s_kmeans']], how='left', on='Id').fillna(-1)
        df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
        df = df.merge(df_feats, how="left", left_index=True, right_index=True)
        # ffill(): fillna(method="ffill") is deprecated since pandas 2.1
        df.ffill(inplace=True)
        return df
    except Exception:
        # Best-effort skip of unreadable files; narrowed from bare `except:`
        return None
# Assemble the training frame and zero-fill residual NaNs.
train = pd.concat([reader(f) for f in tqdm(train)]).fillna(0)
print(train.shape)

# Everything that is not an id / target / bookkeeping column is a feature.
_excluded = {'Id', 'Subject', 'Module', 'Time', 'StartHesitation', 'Turn', 'Walking', 'Valid', 'Task', 'Event'}
cols = [c for c in train.columns if c not in _excluded]
pcols = ['StartHesitation', 'Turn', 'Walking']        # prediction targets
scols = ['Id', 'StartHesitation', 'Turn', 'Walking']  # submission columns
测试时
# Test-time preprocessing excerpt: mirrors the training reader() —
# same Time_frac, task-kmeans and metadata_complex merges, and tsflex
# features — except the feature frame is not downcast to float32 here.
# Assumes `df` (indexed by Time) and the globals tasks, metadata_complex,
# fc are already in scope.
df['Time_frac']=(df.index/df.index.max()).values#currently the index of data is actually "Time"
df = pd.merge(df, tasks[['Id','t_kmeans']], how='left', on='Id').fillna(-1)
# df = pd.merge(df, subjects[['Id','s_kmeans']], how='left', on='Id').fillna(-1)
df = pd.merge(df, metadata_complex[['Id','Subject']+['Visit','Test','Medication','s_kmeans']], how='left', on='Id').fillna(-1)
df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin")
- 新特征
- 模型LGB
base
base-nn-torch
PyTorch FOG End-to-End Baseline [LB 0.254]
base-nn-tf
Parkinson FoG Pred Conv1D Separate TF Model-0.296
- 针对两种数据集分开建立模型训练,主要采用序列数据
# Excerpt from the dataset's per-file _read path (enclosing class not
# shown here).
# Source flag: True when the file's parent directory starts with 'tdcs'
# (tdcsfog vs defog recordings).
_is_tdcs = basename(dirname(path)).startswith('tdcs')
df = pd.read_csv(path)  # load this sequence's data
_cols = [*self.cfg.feature_list, *self.cfg.label_list, 'Valid', 'Task']  # 3 acceleration channels, the labels, plus Valid/Task
self.valid_position = self.cfg.n_features + self.cfg.n_labels
self.task_position = self.valid_position + 1
# Select the needed columns and pad past/future rows (edge-replicated)
# so any row can later be indexed with a full window.
# NOTE(review): `_cols` is built above but `cols` is used below — looks
# like a name mismatch; verify `cols` is defined elsewhere or rename.
_values = df[cols].values.astype(np.float16)
return np.pad(_values, ((self.past_pad, self.future_pad),(0,0)), 'edge')
# Above: _read handles a single file. Below: read every file and build a
# flat index over the concatenated (padded) rows.
_values = [self._read(f) for f in df_paths]
self.mapping = []
_length = 0
for _value in _values:
    _shape = _value.shape[0]
    # Valid sample centers exclude the replicated pad rows at both ends
    # of each file's span, so every index has a full past/future window.
    self.mapping.extend(range(_length+self.past_pad, _length+_shape-self.future_pad))
    _length += _shape
对于这个数据集,如何构建三维的关键在这里
- 对于y,根据batch的index选择即可
- 对于x, idx到过去的pad, idx到未来的pad, 中间还有间隔。也就是此baseline的关键
def _get_X_y(self, indices):
_X = np.empty((len(indices), self.cfg.window_size, self.cfg.n_features), dtype=np.float16)
for i, idx in enumerate(indices):
_X[i] = self.values[idx-self.past_pad: idx+self.future_pad+1:self.cfg.wx, :self.cfg.n_features]
return _X, self.values[indices, self.cfg.n_features:self.cfg.n_features+self.cfg.n_labels]