比赛官网
https://physionet.org/challenge/2019/
代码
原始数据整合
首先,手头数据的长相是这样式儿的
一个psv文件中存储着一个患者随时间变化的数据,一行代表一小时的数据
- 整合并去掉缺失比例大于0.48的列
def get_data():
    """Load every file in ``trainingA`` and merge them into one table.

    Each file is read as a pipe-separated table and the tables are stacked
    row-wise.  Columns whose share of missing values over the merged table
    is greater than 0.48 are removed.

    Returns:
        pandas.DataFrame: the merged table without the sparse columns, with
        a fresh 0..n-1 row index.  An empty frame is returned when the
        directory contains no files.
    """
    path = "trainingA"
    # Sorted so the row order does not depend on the directory listing order.
    frames = [
        pd.read_csv(os.path.join(path, filename), sep='|')
        for filename in sorted(os.listdir(path))
    ]
    if not frames:
        return pd.DataFrame()
    # One concat over the whole list instead of growing a frame in a loop.
    result = pd.concat(frames, ignore_index=True)
    missing_ratio = result.isna().mean()
    sparse_columns = missing_ratio[missing_ratio > 0.48].index
    # drop() without inplace returns the new frame; with inplace=True it
    # returns None, which is what the caller used to receive.
    return result.drop(columns=sparse_columns)
填充缺失值
# Load the merged training table.
values = get_data()
# Earlier approach, kept for reference: forward-fill first, then fill what is
# left with the column means.
# values = values.fillna(method='pad')
# values = values.fillna(values.mean())
# Fill gaps by linear interpolation, in place; limit_direction='both' lets the
# fill run in both directions so leading and trailing gaps are covered too.
# NOTE(review): get_data() stacks all files into one frame, so interpolation
# presumably runs across the boundary between two patients - confirm.
values.interpolate(method='linear', inplace=True, limit_direction='both')
更新label值
由于比赛要求对发病做出提前六小时的预警,因此对于每个人,如果某一小时发病即label变成了1,我们把他前六小时的观测结果也改成1。
更改label
def change_label(value, horizon=6):
    """Mark the rows leading up to each sepsis onset as positive, in place.

    An onset is a row whose ``SepsisLabel`` is 1 while the row before it is
    not 1 (or the very first row).  For every onset, the ``horizon`` rows
    directly before it are set to 1.  Rows are addressed by position, so the
    frame's index may be non-default or contain repeated values.

    Args:
        value: DataFrame with a ``SepsisLabel`` column; modified in place.
        horizon: number of rows before each onset to relabel (default 6).

    Returns:
        None.
    """
    labels = value['SepsisLabel'].to_numpy(copy=True)
    positive = labels == 1
    previous_positive = np.concatenate(([False], positive[:-1]))
    # Onsets are located on the original labels, before any back-filling.
    onsets = np.flatnonzero(positive & ~previous_positive)
    for onset in onsets:
        labels[max(0, onset - horizon):onset] = 1
    # NOTE(review): when several patients are stacked in one frame, an onset
    # in a patient's first rows also relabels the tail of the previous
    # patient - confirm whether callers should pass one patient at a time.
    # Assign the whole column back; writing through value['SepsisLabel'][a:b]
    # is a chained assignment and is not guaranteed to reach the frame.
    value['SepsisLabel'] = labels
# Run the label adjustment on the merged training table; the return value is
# not used.
change_label(values)
新增累积列
考虑到时间上的累积效应,对每一个变量计算前七小时(即前七行)的累积加权值,并获得一列新的变量,其实就是attention机制,只不过是固定的alpha值。
# Build the multiplication coefficients
def get_coefficient(num):
    """Return the weights 1, 2, ..., num scaled so that they sum to 1.

    The last weight is the largest, so when the weights are applied to a
    time-ordered window the most recent entry counts the most.
    """
    ranks = np.arange(1, num + 1)
    total = ranks.sum()
    return ranks / total
# Add the accumulated columns
def compute_accu(value, window=7):
    """Append a weighted-history column ``Accu_<name>`` for every feature.

    For each column other than ``SepsisLabel`` the new column holds, per row:

    * row 0: the column's own value at row 0;
    * rows 1 .. window-1: the weighted sum of all earlier rows;
    * later rows: the weighted sum of the ``window`` rows directly before it.

    The current row itself is never part of its own sum (row 0 excepted).
    Weights come from ``get_coefficient`` and grow linearly towards the most
    recent row.  Rows are addressed by position, so the frame's index may be
    non-default or contain repeated values.

    Args:
        value: DataFrame of numeric columns; modified in place.
        window: number of preceding rows in a full window (default 7).

    Returns:
        None.
    """
    n_rows = len(value)
    full_weights = get_coefficient(window)
    # Partial windows for the first rows: k earlier rows -> k weights.
    partial_weights = {k: get_coefficient(k) for k in range(1, min(n_rows, window))}
    # Skip the label by name, not by position: input without a SepsisLabel
    # column must still get an Accu_ column for its last feature.
    # Names are snapshotted first because columns are appended while looping.
    source_columns = [name for name in value.columns if name != 'SepsisLabel']
    for name in source_columns:
        series = value[name].to_numpy(dtype=float)
        accu = np.empty(n_rows, dtype=float)
        if n_rows:
            accu[0] = series[0]
        for j, weights in partial_weights.items():
            accu[j] = np.dot(series[:j], weights)
        if n_rows > window:
            # rolling[k] == sum(series[k:k + window] * full_weights); the
            # kernel is reversed because convolve flips it.  The last entry
            # would belong to a row past the end of the table.
            rolling = np.convolve(series, full_weights[::-1], mode='valid')
            accu[window:] = rolling[:-1]
        value["Accu_" + name] = accu
xgboost预测法
# Separate the target column from the feature columns.
# NOTE(review): `value` is not assigned at module level in the visible code
# (the table loaded earlier is named `values`) - confirm where it comes from.
Y = value['SepsisLabel']
X = value.drop(['SepsisLabel'], axis = 1)
# Hold out 20% of the rows for evaluation.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)
model = XGBClassifier()
# The held-out split is also the early-stopping validation set, so the
# accuracy printed below is measured on data that steered training.
eval_set = [(X_test, y_test)]
# NOTE(review): newer XGBoost releases presumably expect early_stopping_rounds
# and eval_metric on the constructor rather than on fit() - verify against the
# installed version.
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)
# Plot the feature importances of the fitted model.
plot_importance(model)
pyplot.show()
y_pred = model.predict(X_test)
# Round each prediction to the nearest integer before scoring.
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
逻辑回归预测法
- 对数据进行归一化
# Min-max normalisation
def normallization(data):
    """Rescale each column of ``data`` to the range [0, 1], in place.

    A column is left untouched when it already spans exactly 0..1 or when
    all of its values are equal (rescaling would divide by zero).  Nothing
    is returned.
    """
    for name in list(data.columns):
        column = data[name]
        high = column.max()
        low = column.min()
        already_unit_range = (high == 1 and low == 0)
        if already_unit_range or high == low:
            continue
        span = high - low
        data[name] = (column - low) / span
- 评估模型准确率
# Rescale every column of the table in place before fitting.
# NOTE(review): `value` is not assigned at module level in the visible code
# (the table loaded earlier is named `values`) - confirm where it comes from.
normallization(value)
# Separate the target column from the feature columns.
Y = value['SepsisLabel']
X = value.drop(['SepsisLabel'], axis = 1)
# Hold out 20% of the rows for evaluation.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)
# A large C means weak regularisation in scikit-learn's LogisticRegression.
lr = LogisticRegression(C=1e5)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
# Round each prediction to the nearest integer before scoring.
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
预测函数
由于比赛要求提交可执行文件,因此重新写了一个预测函数
- 尝试获取逻辑回归的参数并重新计算
# Pull the fitted coefficients out of the logistic-regression model.
weight = lr.coef_
# Transpose so that a feature row can be multiplied by `weight` directly.
weight = weight.transpose()
intercept = lr.intercept_[0]
# Logistic function; the result is converted to a plain Python float.
sigmod = lambda x: float(1.0/(1.0+np.exp(-x)))
# Prediction function
def predict(F, weight, intercept):
    """Score every row of ``F`` with a logistic-regression model.

    Args:
        F: 2-D feature table, one row per sample.  Anything ``np.asarray``
            can convert is accepted (``np.matrix``, ndarray, DataFrame), so
            the result no longer depends on ``np.matrix`` ``*`` semantics.
        weight: model coefficients, one per feature column of ``F``; a
            column vector of shape ``(n_features, 1)`` or a flat sequence.
        intercept: scalar bias term.

    Returns:
        tuple: ``(scores, result)`` where ``scores`` is a list of floats,
        the logistic function of ``row . weight + intercept``, and
        ``result`` is a list of ints, 1 where ``row . weight >= -intercept``
        and 0 otherwise.
    """
    features = np.asarray(F, dtype=float)
    coefficients = np.asarray(weight, dtype=float).reshape(-1)
    linear = features @ coefficients
    scores = (1.0 / (1.0 + np.exp(-(linear + intercept)))).tolist()
    # Same comparison as the score crossing 0.5, without the exponential.
    result = (linear >= -intercept).astype(int).tolist()
    return scores, result
# Re-score the held-out split with the hand-written predict().
# predict() returns (scores, labels); only the labels are compared with the
# truth.  np.asmatrix replaces np.mat, which NumPy 2.0 removed.
scores, predictions = predict(np.asmatrix(X_test), weight, intercept)
accuracy1 = accuracy_score(y_test, predictions)
# Report the accuracy just computed, not the one left over from lr.predict().
print("Accuracy: %.2f%%" % (accuracy1 * 100.0))
- 将参数融入到代码当中去
# File-level prediction entry point
def evaluate(filename):
    """Score one patient file and write the predictions next to it.

    Reads ``<filename>.psv`` (pipe-separated), drops the hard-coded list of
    sparse columns, interpolates gaps linearly, appends the ``Accu_*``
    columns via ``compute_accu``, removes ``SepsisLabel`` if present, fills
    any remaining gaps from ``M`` and scores every row with the embedded
    logistic-regression weights.  The output file ``<filename>.psv.out``
    gets one ``score|label`` line per input row.

    Args:
        filename: path of the input file without its ``.psv`` extension.

    Raises:
        ValueError: if the prepared table does not have one column per
            embedded weight.
    """
    filename = filename+'.psv'
    nan_columns = ['Temp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets']
    value = pd.read_csv(filename, sep='|')
    value.drop(nan_columns, inplace=True, axis=1)
    # Per-column fallback values for cells that are still missing after
    # interpolation (a column that is empty for the whole file).
    # NOTE(review): these look like column means of the min-max-normalised
    # training table, while this function never normalises its input -
    # confirm the weights and fallbacks match the scale of the raw file.
    M = {'HR': 0.24972140809778023, 'O2Sat': 0.9651197941386043, 'SBP': 0.3822739233740868, 'MAP': 0.21045135092077905, 'DBP': 0.1469540894675809, 'Resp': 0.2613221129135702, 'Age': 0.6334712914523257, 'Gender': 0.5777212530766943, 'Unit1': 0.5017673613352216, 'Unit2': 0.49823263866477846, 'HospAdmTime': 0.9796459436158841, 'ICULOS': 0.07820453171621204, 'SepsisLabel': 0.032193770049923125, 'Accu_HR': 0.3961986226205406, 'Accu_O2Sat': 0.9660205798556376, 'Accu_SBP': 0.3878985082015177, 'Accu_MAP': 0.26354211253459175, 'Accu_DBP': 0.1470331983530533, 'Accu_Resp': 0.2595029445367898, 'Accu_Age': 0.6334712914523257, 'Accu_Gender': 0.5777212530766943, 'Accu_Unit1': 0.5017673613352216, 'Accu_Unit2': 0.49823263866477846, 'Accu_HospAdmTime': 0.9796459436158841, 'Accu_ICULOS': 0.07065019821885823}
    value.interpolate(method='linear', inplace=True, limit_direction='both')
    compute_accu(value)
    if 'SepsisLabel' in value.columns:
        value.drop(['SepsisLabel'], axis=1, inplace=True)
    value.fillna(M, inplace=True)
    # One coefficient per remaining column, in column order, as a column
    # vector.
    weight = np.array([
        3.10101382,
        0.9402448,
        -0.22685776,
        -0.80695164,
        -1.28859308,
        0.61754885,
        0.01568997,
        0.08981491,
        -1.48981012,
        -1.65665569,
        -0.61166079,
        6.84160107,
        0.33597397,
        1.07752734,
        0.38562541,
        -1.62192069,
        1.55039952,
        3.02623835,
        0.01568997,
        0.08981491,
        -1.48981012,
        -1.65665569,
        -0.61166079,
        -1.71220229,
    ]).reshape(-1, 1)
    intercept = -3.146465793867411
    if value.shape[1] != weight.shape[0]:
        raise ValueError(
            "expected %d feature columns after preprocessing, got %d"
            % (weight.shape[0], value.shape[1])
        )
    # np.asmatrix replaces np.mat, which NumPy 2.0 removed.
    scores, labels = predict(np.asmatrix(value), weight, intercept)
    with open(filename+'.out', 'w') as f:
        f.writelines('%g|%d\n' % (s, l) for s, l in zip(scores, labels))
收获
了解了一些医学常识,提升了自己的数据分析和处理能力,通过在群里和大家交流收获了不少有趣的想法,对于机器学习类比赛有了更深入的了解。取得了进入复赛的好成绩,但由于不满足后续参加会议的要求不得不终止比赛。