pos_sample_df 是一个表示 drug 与 ATC 编码之间是否存在关联(associations)的邻接矩阵,我需要根据这个矩阵构造正负样本:邻接矩阵中 drug 和 ATC 之间有边则构造正样本,无边则构造负样本。其中共有 2000 种 drug、3000 个 ATC,drug 和 ATC 的特征均为 500x1 的列向量;邻接矩阵中的 1 为正样本标签,0 为负样本标签。将 drug 特征(500)、ATC 特征(500)与标签(1)拼接后,一个样本为 1001x1 的列向量。
import numpy as np
import pandas as pd
import os
# Input root. The script reads the pos/neg sample CSVs directly from here and
# walks the 'label/ndim=50/' and 'vector/ndim=50/' sub-folders beneath it.
# Paths are built by plain string concatenation, so the trailing '/' is required.
data_dir='C:/Users/Administrator/Desktop/temp1113/zhou/data/'
# Output root. Result CSVs are written as save_dir + <vector file name>,
# so the trailing '/' is required here as well.
save_dir='C:/Users/Administrator/Desktop/temp1113/zhou/result/'
def concat_lab_vec(lab_df, vec_df, pos_sample_df):
    """Build one concatenated sample per positively-labelled (drug, ATC) pair.

    Each row of ``pos_sample_df`` is read positionally as
    ``(drug index, ATC index, label)``; the two indices are 1-based
    (MATLAB-style) column numbers.  For every row whose label equals 1, the
    selected column of ``vec_df`` (drug feature), the selected column of
    ``lab_df`` (ATC feature) and the label itself are joined end to end.
    Rows with any other label are skipped.

    Args:
        lab_df: DataFrame whose columns are the ATC feature vectors.
        vec_df: DataFrame whose columns are the drug feature vectors.
        pos_sample_df: DataFrame with at least three columns, read by
            position: 1-based drug index, 1-based ATC index, label.

    Returns:
        A list of 1-D numpy arrays, each of length
        ``vec_df.shape[0] + lab_df.shape[0] + 1`` with the label last.

    Raises:
        IndexError: if an index points past the last column of ``vec_df`` /
            ``lab_df``, or ``pos_sample_df`` has fewer than three columns.
    """
    # Local import keeps the function usable on its own.
    from numpy import array, hstack

    print('lab_df:', lab_df.shape)
    print('vec_df:', vec_df.shape)

    samples = []
    # Use the function's own argument: the previous body read a module-level
    # `pos_sample` and therefore failed with NameError outside the script.
    for row in pos_sample_df.itertuples(index=False, name=None):
        drug_no, atc_no, label = row[0], row[1], row[2]
        if label != 1:
            continue
        # Subtract 1 because the indices in the table start at 1 while
        # Python positions start at 0; int() tolerates indices stored as
        # floats (e.g. 3.0) in the CSV.
        drug_vec = vec_df.iloc[:, int(drug_no) - 1]
        atc_vec = lab_df.iloc[:, int(atc_no) - 1]
        # hstack flattens column + column + scalar label into one 1-D sample.
        samples.append(array(hstack((drug_vec, atc_vec, label))))
    return samples
if __name__ == "__main__":
    # Pair tables without a header row. concat_lab_vec reads column 0 as the
    # 1-based drug index, column 1 as the 1-based ATC index and column 2 as
    # the label.
    pos_sample=pd.read_csv(data_dir+'enzyme_ATC_matrix_possample.csv',header=None)
    # Loaded but not used anywhere in this section.
    neg_sample=pd.read_csv(data_dir+'enzyme_ATC_matrix_negsample.csv',header=None)

    # Sentinels: stay None when the label file is never found, so the problem
    # can be reported clearly instead of surfacing as a NameError later.
    lab_df=None
    lab_flag=None

    # Walk the label folder and load the enzyme label vectors.
    for path, dirnames, filelist in os.walk(data_dir+'label/ndim=50/'):
        print(filelist)  # e.g. ['enzyme_label2vec.csv', 'GPCR_label2vec.csv', 'IC_label2vec.csv']
        for filename in filelist:
            if filename!='enzyme_label2vec.csv':
                continue
            lab_file = os.path.join(path, filename)
            print('label_file:',lab_file)
            lab_df=pd.read_csv(lab_file,header=None)
            # Dataset tag = file name up to the first underscore ('enzyme').
            lab_flag=filename[:filename.index('_')]
            print('lab_flag:',lab_flag)

    # Walk the vector folder and build the samples for the matching file.
    for path0, dirnames0, filelist0 in os.walk(data_dir+'vector/ndim=50/'):
        # Example listing:
        # ['enzyme_merge2vec.csv', 'enzyme_sim_chem2vec.csv', 'enzyme_sim_inter2vec.csv',
        #  'GPCR_merge2vec.csv', 'GPCR_sim_chem2vec.csv', 'GPCR_sim_inter2vec.csv',
        #  'IC_merge2vec.csv', 'IC_sim_chem2vec.csv', 'IC_sim_inter2vec.csv']
        print(filelist0)
        for filename0 in filelist0:
            if filename0!='enzyme_sim_chem2vec.csv':
                continue
            vec_file = os.path.join(path0, filename0)
            print(vec_file)
            if lab_flag is None:
                raise FileNotFoundError(
                    "enzyme_label2vec.csv was not found under "
                    + data_dir + 'label/ndim=50/'
                )
            # Only pair a vector file with labels from the same dataset.
            if lab_flag not in filename0:
                continue
            print('vec_file:',vec_file)
            vec_df=pd.read_csv(vec_file,header=None)
            res_df=concat_lab_vec(lab_df,vec_df,pos_sample)
            # One sample per row in the output table.
            res_df1=pd.DataFrame(res_df)
            # to_csv does not create missing directories.
            os.makedirs(save_dir,exist_ok=True)
            res_df1.to_csv(save_dir+filename0,index=False,header=False)
# Break the big problem down into a small one to find the general pattern:
# how to join two feature vectors and a scalar label into a single sample.
zhou1=[1,2,3,4,5,6]
zhou2=[1,2,3,4,5,6]
label=1
from numpy import hstack,vstack,array,nan
# vstack needs every piece to have the same width, so the two lists are
# turned into 6x1 columns first (the scalar label becomes a 1x1 block).
# Passing the flat lists together with the scalar raises ValueError.
# Result: a (13, 1) column vector with the label in the last row.
zz=array(vstack((array(zhou1).reshape(-1,1),array(zhou2).reshape(-1,1),label)))
# hstack flattens everything into one 1-D sample of length 13, label last.
YYYtrain=array(hstack((zhou1,zhou2,label)))