数模竞赛代码整理---特征提取篇

数模竞赛代码整理—特征提取篇


特征提取,即从众多变量中提取最能预测因变量的少数几个变量。

方法

  • 线性相关:pearson、 spearman、 kendall
  • 非线性相关:距离协方差、灰关联
  • 其他特征选择方法:RFE(递归特征消除)、PCA(主成分分析)

pearson、 spearman、 kendall

# Use the DataFrame's built-in .corr() — nothing more is needed.
# method = 'pearson', 'spearman' or 'kendall'
corr=df.corr(method='spearman')

距离协方差

没有相关的库,只能自己写代码

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA

def distcorr(X, Y):
    """Distance correlation (Székely's dCor) between samples X and Y.

    Inputs are coerced to 2-D ``(n_samples, n_features)`` arrays; the
    sample counts must agree.  Returns a scalar in [0, 1].
    """
    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    # Promote flat vectors to column vectors so pdist sees one feature.
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X, Y = np.atleast_2d(X), np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != n:
        raise ValueError('Number of samples must match')

    # Pairwise Euclidean distance matrices.
    dist_x = squareform(pdist(X))
    dist_y = squareform(pdist(Y))
    # Double-centre each distance matrix (subtract row/col means, add grand mean).
    A = dist_x - dist_x.mean(axis=0)[None, :] - dist_x.mean(axis=1)[:, None] + dist_x.mean()
    B = dist_y - dist_y.mean(axis=0)[None, :] - dist_y.mean(axis=1)[:, None] + dist_y.mean()

    norm = float(n * n)
    dcov2_xy = (A * B).sum() / norm
    dcov2_xx = (A * A).sum() / norm
    dcov2_yy = (B * B).sum() / norm
    return np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))

# Collect the distance correlation of every feature against Y into a
# DataFrame for further analysis.
dcorr = [[distcorr(X.loc[name, :].to_list(), Y), name] for name in names]

ans_dcorr = pd.DataFrame(columns=['pIC50'])
for value, name in dcorr:
    ans_dcorr.loc[name] = value
ans_dcorr

灰关联

from numpy import *

def GRA_ONE(DataFrame, m=0):
    """Grey relational degree of every column against column *m*.

    Returns a one-column DataFrame with one relational degree per column
    of the (min-max normalised) input.
    """
    # Min-max normalise each column to [0, 1].
    norm = (DataFrame - DataFrame.min()) / (DataFrame.max() - DataFrame.min())
    reference = norm.iloc[:, m]   # reference ("standard") sequence
    compare = norm.iloc[:, 0:]    # sequences to compare against it
    n_rows = compare.shape[0]
    n_cols = compare.shape[1]

    # Absolute deviation of every sequence from the reference.
    delta = zeros([n_cols, n_rows])
    for col in range(n_cols):
        for row in range(n_rows):
            delta[col, row] = abs(compare.iloc[row, col] - reference[row])

    # Global extremes of the deviation matrix.
    d_max = amax(delta)
    d_min = amin(delta)

    # Grey relational coefficients (resolution coefficient rho = 0.5).
    coeff = zeros([n_cols, n_rows])
    for col in range(n_cols):
        for row in range(n_rows):
            coeff[col, row] = (d_min + 0.5 * d_max) / (delta[col, row] + 0.5 * d_max)

    # Average per sequence -> grey relational degree.
    degrees = zeros(n_cols)
    for col in range(n_cols):
        degrees[col] = mean(coeff[col, :])
    return pd.DataFrame(degrees)

def GRA(DataFrame):
    """Pairwise grey relational matrix: column i holds GRA_ONE(DataFrame, m=i).

    Returns a DataFrame whose columns are named '0', '1', ... like the
    original implementation intended.
    """
    # The `if s not in [None]` filter was always true — dropped.
    col_names = [str(i) for i in range(len(DataFrame.columns))]
    df_local = pd.DataFrame(columns=col_names)
    for i in range(len(DataFrame.columns)):
        # Fix: assigning via .iloc[:, i] on an empty DataFrame raises;
        # assigning by column label grows the frame correctly.
        df_local[col_names[i]] = GRA_ONE(DataFrame, m=i)[0]
    return df_local

# Grey relational degrees of every column against column 0 (default m=0).
# NOTE(review): `dataframe` is not defined above — presumably the feature
# DataFrame from earlier in the notebook; confirm the intended variable.
gray = GRA_ONE(dataframe)

RFE

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


def cal_RFE(X, Y, names):
    """Rank features by recursive feature elimination with cross-validation.

    Parameters
    ----------
    X : feature matrix with features as rows (transposed internally so
        samples become rows, as scikit-learn expects).
    Y : target vector, one entry per sample.
    names : feature names parallel to the rows of X.

    Returns
    -------
    (res_lr, res_la) : lists of (rank, name) pairs — rank 1 is best —
    from a LinearRegression-based and a Lasso-based RFECV respectively.
    """
    X = X.T  # samples as rows for scikit-learn
    # Rank all features with a linear-regression estimator; elimination
    # continues until the last feature remains.
    lr = LinearRegression()
    rfe = RFECV(lr, step=1, cv=5)
    rfe.fit(X, Y)
    res_lr = list(zip(map(lambda r: round(r, 4), rfe.ranking_), names))
    print("Features sorted by their rank:")
    print(res_lr)
    # Same ranking with a Lasso estimator.
    la = Lasso()
    rfe = RFECV(la, step=1, cv=5)
    rfe.fit(X, Y)
    res_la = list(zip(map(lambda r: round(r, 4), rfe.ranking_), names))
    print("Features sorted by their rank:")
    # Fix: the header above was printed but the result print was
    # commented out, leaving a dangling header line.
    print(res_la)
    return res_lr, res_la



# Split the data into input and target.
X = md_ea_dropY.T.copy()
# pIC50 is scaled by 1000 and truncated to int — presumably to make the
# target usable as discrete labels; confirm against the estimator used.
Y = (md_ea_train.loc[:, 'pIC50'] * 1000).astype(int)

res_lr, res_la = cal_RFE(X, Y, md_ea_dropY.columns.values.tolist())

# Fold both rankings into one-column DataFrames keyed by feature name.
ans_res_lr = pd.DataFrame(columns=['pIC50'])
for rank, name in res_lr:
    ans_res_lr.loc[name] = rank

ans_res_la = pd.DataFrame(columns=['pIC50'])
for rank, name in res_la:
    ans_res_la.loc[name] = rank

PCA

# Write a table to an Excel file / sheet.
def data_write_excel(data, filename, writer=None):
    """Write *data* to an Excel sheet named *filename*.

    With no *writer*, a new file ``<filename>1.xlsx`` is created, written
    and closed here.  With a *writer*, the sheet is added to it and the
    caller stays responsible for saving/closing.
    """
    data = pd.DataFrame(data)
    if writer is None:
        # Own the writer: the context manager saves and closes the file.
        # Fix: ExcelWriter.save() was removed in pandas 2.0 — closing
        # (here implicitly via the context manager) saves the file.
        with pd.ExcelWriter(filename + '1.xlsx') as own_writer:
            data.to_excel(own_writer, float_format='%.5f', sheet_name=filename)
    else:
        data.to_excel(writer, float_format='%.5f', sheet_name=filename)

def cal_PCA(X, n_components=20, path=r'data\pca1.xlsx'):
    """Run PCA on X and dump all results into one Excel workbook.

    Sheets written: 'low' (scores), 'restore' (approximate reconstruction),
    'ratio' (explained-variance ratios), 'variance' (explained variances),
    'component' (loadings, variables x components).

    The component count and output path were hard-coded; they are now
    parameters whose defaults reproduce the original behaviour exactly.
    """
    pca = PCA(n_components=n_components)
    # Data projected onto the principal components.
    data_low = pca.fit_transform(X)
    # Approximate reconstruction from the reduced representation.
    data_restore = pca.inverse_transform(data_low)
    # One writer for the whole workbook; the context manager saves it.
    # Fix: ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(path) as writer:
        data_write_excel(data_low, 'low', writer=writer)
        data_write_excel(data_restore, 'restore', writer=writer)
        data_write_excel(pca.explained_variance_ratio_, 'ratio', writer=writer)
        data_write_excel(pca.explained_variance_, 'variance', writer=writer)
        data_write_excel(pca.components_.T, 'component', writer=writer)
    print(pca.n_components_, ' n_components')

from sklearn.preprocessing import MinMaxScaler

def scale(data):
    """Min-max scale *data* to [0, 1].

    Returns the scaled array plus the fitted ``scale_`` and ``min_`` of
    the last feature (handy for inverting the last column later).
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)
    print(scaler.scale_)
    print(scaler.min_)
    return scaled, scaler.scale_[-1], scaler.min_[-1]

X = md_ea_dropY.copy()
cal_PCA(X)

# Reload the workbook cal_PCA wrote; column 0 of each sheet is the index,
# so useful data starts at column 1.
comp = pd.read_excel(r'data\pca1.xlsx', sheet_name="component")
comp1, inscale_, inmin_ = scale(comp.abs())
ratio = pd.read_excel(r'data\pca1.xlsx', sheet_name="ratio").iloc[:, 1].to_numpy()

# Variable importance: loading magnitudes weighted by each of the 20
# components' explained-variance ratio, accumulated in a loop instead of
# the original 20-term hand-written sum.
pca1 = np.abs(comp1[..., 1]) * ratio[0]
for k in range(1, 20):
    pca1 = pca1 + np.abs(comp1[..., k + 1]) * ratio[k]

pca11, inscale_, inmin_ = scale(pca1.reshape(-1, 1))

# Map the importance scores back onto the variable names.
ans_pca = pd.DataFrame(columns=['pIC50'])
names_pca = md_ea_dropY.columns.to_list()

for idx in range(pca11.shape[0]):
    ans_pca.loc[names_pca[idx]] = pca11[idx]

ans_pca
  • 1
    点赞
  • 15
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值