任务
根据小分子的属性特征,预测小分子在人体内清除率指标(即数据中的Label字段)。
训练数据
>> df.head
<bound method NDFrame.head of ID Molecule_max_phase Molecular weight RO5_violations AlogP \
0 1003 0 0.206754 0.0 0.608040
1 1819 0 0.130056 0.0 0.591848
2 6090 0 0.162482 0.0 0.592965
3 3916 0 0.112266 0.0 0.652150
4 8480 0 0.161722 0.0 0.741485
... ... ... ... ... ...
6919 6612 4 0.101742 0.0 0.621999
6920 1316 0 0.227768 1.0 0.790620
6921 54 0 0.331437 2.0 0.811837
6922 3577 0 0.250387 1.0 0.753769
6923 5955 4 0.158947 0.0 0.613065
Features Label
0 [0, 0, 0, 0, 0, 0, 2, 0, 10, 0, 0, 3, 2, 0, 0,... 3.190476
1 [0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 1, 0, 0, ... 9.740969
2 [0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 7, 0, 0, 0, ... 10.545341
3 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 9, 0, 0, 0, ... 3.206803
特征有:
字段名 | 类型 | 说明 |
ID | 整型 | 样本编号 |
Molecule_max_phase | 整型 | 分子的最长位相 |
Molecular weight | 浮点型 | 分子量 |
RO5_violations | 整型 | 违反新药5规则(RO5)的数量 |
AlogP | 浮点型 | 由ACD软件计算化合物的脂分配系数(该数据来自ChemBL) |
Features | 向量 | 小分子的矢量化表示 |
Label | 枚举/浮点型 | 单位时间内单位机体能将多少容积体液中的药物清除 |
这里也看不懂,就用线性回归处理下吧
# Load the training data and inspect dtypes / non-null counts
# (RO5_violations and AlogP turn out to have missing values).
import pandas as pd
df = pd.read_csv('train.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6924 entries, 0 to 6923
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 6924 non-null int64
1 Molecule_max_phase 6924 non-null int64
2 Molecular weight 6924 non-null float64
3 RO5_violations 6853 non-null float64
4 AlogP 6853 non-null float64
5 Features 6924 non-null object
6 Label 6924 non-null float64
dtypes: float64(4), int64(2), object(1)
memory usage: 378.8+ KB
把features 里的特征拿出来和外面4个特征放在一起
import numpy as np

# Parse the serialized Features vectors ("[v0, v1, ...]") into a numeric
# matrix: strip the brackets, split on commas, convert to float64.
newfeature = np.array([
    [float(tok) for tok in raw[1:-1].split(',')]
    for raw in df.Features.values
])

# Stack the four scalar columns first, then append the parsed vectors,
# so every sample becomes one flat feature row.
scalar_cols = np.column_stack((
    np.array(df.Molecule_max_phase),
    np.array(df['Molecular weight']),
    np.array(df['RO5_violations']),
    np.array(df['AlogP']),
))
newfeatureall = np.concatenate((scalar_cols, newfeature), axis=1)
newfeatureall.shape
过滤掉基本不变(最小值等于最大值)的特征
# Keep only the columns that actually vary across samples
# (a column whose max equals its min carries no information).
col_max = newfeatureall.max(axis=0)
col_min = newfeatureall.min(axis=0)
selectcolumn = np.array(col_max != col_min)
newfeatureselect = newfeatureall[:, selectcolumn]
特征标准化
from sklearn.preprocessing import StandardScaler

# Fit a z-score scaler on the training features and transform them;
# the fitted scaler is reused later for the test set.
stdScale = StandardScaler()
stdScale.fit(newfeatureselect)
newfeaturenorm = stdScale.transform(newfeatureselect)
用岭回归做拟合
newlabel = df.Label
# Zero out NaNs left over from the incomplete RO5_violations / AlogP columns.
nan_mask = np.isnan(newfeaturenorm)
newfeaturenorm[nan_mask] = 0

from sklearn.linear_model import Ridge

# Ridge regression with the default regularization strength.
# (A Lasso variant, e.g. Lasso(alpha=0.01, positive=True), is another option.)
clf = Ridge(alpha=1.0)
clf.fit(newfeaturenorm, newlabel)
读取测试数据,也做特征过滤和标准化
# Load the test set and apply the SAME preprocessing pipeline that was
# fitted on the training data: parse Features, stack the scalar columns,
# apply the train-derived column mask, then the train-fitted scaler.
testdf = pd.read_csv('test_noLabel.csv')
testfeature = np.array(testdf.Features.values)
testfeature.shape
testfeaturearr = []
for line in testfeature:
    # Features is serialized as "[v0, v1, ...]": strip brackets, split on commas.
    arra = np.array(line[1:-1].split(',')).astype(np.float64)
    testfeaturearr.append(arra)
testfeaturearr = np.array(testfeaturearr)
# Scalar columns first, in the same order as for training.
testfeatureall = np.column_stack((
    np.array(testdf.Molecule_max_phase),
    np.array(testdf['Molecular weight']),
    np.array(testdf['RO5_violations']),
    np.array(testdf['AlogP']),
))
testfeatureall = np.concatenate((testfeatureall, testfeaturearr), axis=1)
testfeatureselect = testfeatureall[:, selectcolumn]
testfeaturenorm = stdScale.transform(testfeatureselect)
# BUG FIX: the original re-zeroed NaNs in the TRAINING matrix
# (newfeaturenorm) on this line; it is the TEST matrix that needs it.
testfeaturenorm[np.isnan(testfeaturenorm)] = 0
对测试数据进行预测并保存
# BUG FIX: the original predicted on the TRAINING matrix (newfeaturenorm),
# so the written "predictions" were train-set values that do not line up
# with the test IDs (and could even overrun the test-ID count).
result = clf.predict(testfeaturenorm)
result
testid = np.array(testdf.ID.values)
# Clearance cannot be negative, so clamp predictions at zero.
result = np.maximum(result, 0)
# 'with' guarantees the file is closed even if a write fails.
with open('result.csv', 'w') as resultfile:
    resultfile.write("ID,Label\n")
    for sample_id, pred in zip(testid, result):
        resultfile.write(str(sample_id) + "," + str(pred) + "\n")
评分采用RMSE,结果跑出来6.多,不太理想,再想想哪里可以优化吧。
- 缺失值处理
- 岭回归不同参数,例如正则化系数 lambda(代码中的 alpha)
- 试下其他回归例如lasso
- 画图来进一步判断哪些特征有用,排除不太相关的特征
20200313 更新 ,内容:
缺失值采用均值、众数、拉格朗日插值处理;调整岭回归alpha参数,排除ID列,去掉带有nan值的一条记录,rmse来到了3.0多;把可调参数放到代码最前面方便调参
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import lagrange  # Lagrange polynomial interpolation

# Load train/test, remember the test IDs for the submission file,
# and drop the uninformative ID column from both frames.
df = pd.read_csv('train_0312.csv')
testdf = pd.read_csv('test_noLabel_0312.csv')
testid = np.array(testdf.ID.values)
df = df.drop('ID', axis=1)
testdf = testdf.drop('ID', axis=1)
newlabel = df.Label
missdealtype = 3  # missing-value strategy: 1 = mode, 2 = mean, 3 = Lagrange interpolation
normaltype = 2    # feature scaling: 1 = standardization, 2 = divide by train max
modeltype = 1     # model type: 1 = ridge regression
alphacoef = 2.5   # ridge regularization strength
lagelangk = 2     # neighbors used on EACH side of a hole for interpolation

def ployinterp_column(s, n, k=lagelangk):
    """Estimate the missing value s[n] with a Lagrange polynomial.

    Fits the polynomial on up to k non-null neighbors on each side of
    position n and evaluates it at n.

    BUG FIX: the original selected range(n+1, n+1-k), which is EMPTY for
    k > 0, so only the k values BEFORE the hole were ever used; the
    comment promised "前后两个数值" (values before AND after).
    """
    # Candidate neighbor positions: k before the hole and k after it.
    y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]
    y = y[y.notnull()]  # only known points can enter the fit
    return lagrange(y.index, list(y))(n)
stdScale = 0  # placeholder; a fitted scaler is assigned later when normaltype == 1
if missdealtype == 1:
    # Fill with the training-set mode. Use [0]: mode() can return SEVERAL
    # values on ties, and the original assigned the whole array. fillna()
    # also avoids chained assignment (df.col[mask] = ...), which can
    # silently fail to write back under pandas copy-on-write.
    ro5mode = df.RO5_violations.dropna().mode()[0]
    alogmode = df.AlogP.dropna().mode()[0]
    df['RO5_violations'] = df['RO5_violations'].fillna(ro5mode)
    df['AlogP'] = df['AlogP'].fillna(alogmode)
    testdf['RO5_violations'] = testdf['RO5_violations'].fillna(ro5mode)
    testdf['AlogP'] = testdf['AlogP'].fillna(alogmode)
elif missdealtype == 2:  # mean of the TRAIN column, applied to both frames
    ro5mode = df['RO5_violations'].mean()
    alogmode = df['AlogP'].mean()
    df['RO5_violations'] = df['RO5_violations'].fillna(ro5mode)
    df['AlogP'] = df['AlogP'].fillna(alogmode)
    testdf['RO5_violations'] = testdf['RO5_violations'].fillna(ro5mode)
    testdf['AlogP'] = testdf['AlogP'].fillna(alogmode)
elif missdealtype == 3:  # Lagrange interpolation, column by column
    # Use .loc instead of chained indexing (df[i][j] = ...) so the
    # assignment is guaranteed to land in the frame, and only visit the
    # rows that are actually null.
    for frame in (df, testdf):
        for col in frame.columns:
            null_mask = frame[col].isnull()
            for j in null_mask[null_mask].index:
                frame.loc[j, col] = ployinterp_column(frame[col], j)
# Extract the serialized Features vectors and combine them with the four
# scalar columns into one training matrix.
dffeature = df.Features.values
dffeature = np.array(dffeature)
newfeature = []
for line in dffeature:
    # "[v0, v1, ...]" -> strip brackets, split on commas, cast to float64.
    line = line[1:-1]
    arra = line.split(',')
    arra = np.array(arra).astype(np.float64)
    newfeature.append(arra)
newfeature = np.array(newfeature)
newfeatureall = np.column_stack((
    np.array(df.Molecule_max_phase),
    np.array(df['Molecular weight']),
    np.array(df['RO5_violations']),
    np.array(df['AlogP']),
))
newfeatureall = np.concatenate((newfeatureall, newfeature), axis=1)
# ROBUSTNESS FIX: the original hard-coded np.delete(..., 5093, 0) to drop
# "the record with NaN", which silently breaks when the data changes.
# Drop every row that still contains NaN, and keep labels aligned.
newlabel = np.array(newlabel)
keep_rows = ~np.isnan(newfeatureall).any(axis=1)
newfeatureall = newfeatureall[keep_rows]
newlabel = newlabel[keep_rows]
# Test set: identical Features parsing and column stacking as for training.
parsed = [
    np.array(raw[1:-1].split(','), dtype=np.float64)
    for raw in testdf.Features.values
]
testfeaturearr = np.array(parsed)
testfeatureall = np.column_stack((
    np.array(testdf.Molecule_max_phase),
    np.array(testdf['Molecular weight']),
    np.array(testdf['RO5_violations']),
    np.array(testdf['AlogP']),
))
testfeatureall = np.concatenate((testfeatureall, testfeaturearr), axis=1)
# Drop constant columns; the mask is computed on TRAIN and applied to both.
selectcolumn = np.array(
    newfeatureall.max(axis=0) != newfeatureall.min(axis=0)
)
newfeatureselect = newfeatureall[:, selectcolumn]
testfeatureselect = testfeatureall[:, selectcolumn]

newfeaturenorm = 0
testfeaturenorm = 0
if normaltype == 1:
    # z-score scaling, fitted on the training matrix only
    stdScale = StandardScaler().fit(newfeatureselect)
    newfeaturenorm = stdScale.transform(newfeatureselect)
    testfeaturenorm = stdScale.transform(testfeatureselect)
elif normaltype == 2:
    # scale each column by its TRAIN maximum
    train_max = newfeatureselect.max(axis=0)
    newfeaturenorm = newfeatureselect / train_max
    testfeaturenorm = testfeatureselect / train_max
from sklearn.linear_model import Ridge

# Fit ridge regression on the normalized training features and predict
# the test set.
clf = Ridge(alpha=alphacoef)
clf.fit(newfeaturenorm, newlabel)
result = clf.predict(testfeaturenorm)
result
# Clearance is non-negative: clamp predictions at zero in one shot.
result = np.maximum(result, 0)
# FIX: open the submission file with 'with' so it is closed even if a
# write raises (the original leaked the handle on error).
with open('result.csv', 'w') as resultfile:
    resultfile.write("ID,Label\n")
    for sample_id, pred in zip(testid, result):
        resultfile.write(str(sample_id) + "," + str(pred) + "\n")