python 生存分析或者生存时间预测

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Load the data.
# Training data set
train = pd.read_csv("train_data.csv")
# Test data set
test = pd.read_csv("test_data.csv")
print('训练数据集:', train.shape, '测试数据集:', test.shape)
# Row counts of each set
rowNum_train = train.shape[0]
rowNum_test = test.shape[0]
print('训练数据集行数:', rowNum_train, ',测试数据集行数:', rowNum_test)
# Stack the training rows on top of the test rows under a fresh 0..n-1 index so
# both sets go through the same cleaning steps. DataFrame.append was removed in
# pandas 2.0; pd.concat is the equivalent that works on old and new versions.
full = pd.concat([train, test], ignore_index=True)
# info() prints its report itself and returns None, so wrapping it in print()
# only added a stray "None" line.
full.info()
# Frequency of each distinct value in the NapsinA column
print(full['NapsinA'].value_counts())
#################################################
# Numeric columns: every missing entry is replaced by the mean of that same
# column as currently held in `full`.
for _numeric_col in ('Grade of Differentiation', 'Maximum Diametercm', 'CRPmg/L'):
    _col_mean = full[_numeric_col].mean()
    full[_numeric_col] = full[_numeric_col].fillna(_col_mean)
#############################################
# Integer-encode the text-valued columns with lookup dictionaries.
# Series.map() yields NaN for every value that is not a dictionary key.
Pa = {str(code): code for code in range(5)}
Pa['other'] = 5
# NOTE(review): the keys are strings, so this presumably expects the column to
# have been read as text - confirm against the CSV.
_subtype_codes = full['Pathologic subtypes'].map(Pa)
# Gaps remaining after encoding are filled with the mean of the encoded values.
full['Pathologic subtypes'] = _subtype_codes.fillna(_subtype_codes.mean())

sex_mapDict = {'F': 0, 'M': 1}
site = dict(zip(('I', 'II', 'III', 'IV', 'V'), range(1, 6)))
hs = {'0': 0, 'I': 1, 'II': 2}
t = {'1a': 10, '1b': 11, '1c': 12, '2a': 20, '2b': 21, '3': 3, '4': 4}
ii = {name: code for code, name in enumerate(('surgery', 'puncture', 'other'))}
an = {'0': 0, '-': 1}

# The marker columns share one growing label vocabulary: codes 0-8 are common
# to all of them, 9-15 are added for CK7, and 16-19 for CK5_6 / CD56 / NapsinA.
ae = {
    label: code
    for code, label in enumerate(
        ('0', '-', '+', 'pd', 'ALK-', 'p+', '3+', 'plus+', '1+')
    )
}
_codes_9_to_14 = {
    label: code
    for code, label in enumerate(
        ('Lesions2+', '++', '2+', '+++', 'dispersion+', 'single+'), start=9
    )
}
_codes_16_to_19 = {
    label: code
    for code, label in enumerate(
        ('local lesions+', 'leak+', 'local lesions3+', 'few Lesions1+'), start=16
    )
}
ck = {**ae, **_codes_9_to_14, 'Lesions+': 15}
ck5 = {**ck, **_codes_16_to_19}
cd = dict(ck5)
# NOTE(review): code 15 is keyed 'lesions+' (lowercase l) for NapsinA while the
# other markers use 'Lesions+' - confirm this matches the raw data.
na = {**ae, **_codes_9_to_14, 'lesions+': 15, **_codes_16_to_19}

for _column, _codes in (
    ('Sex', sex_mapDict),
    ('Site', site),
    ('Histologic subtype', hs),
    ('T', t),
    ('inspection item', ii),
    ('ALK_neg', an),
    ('ALK_EML4', ae),
    ('CK7', ck),
    ('CK5_6', ck5),
    ('CD56', cd),
    ('NapsinA', na),
):
    full[_column] = full[_column].map(_codes)
#######################################################
# Show the column summary again after the encoding steps. info() prints its
# report itself and returns None, so it is not wrapped in print().
full.info()
# Pairwise correlation matrix. Restrict to numeric/bool columns explicitly:
# older pandas dropped other columns silently, but since pandas 2.0
# DataFrame.corr() defaults to numeric_only=False and raises if any column
# still holds text.
corrDf = full.select_dtypes(include=['number', 'bool']).corr()
# Disabled alternative pipeline, kept inside a bare string literal so that it is
# never executed. The code in the string prints each column's correlation with
# 'Event', selects eight feature columns, fits a LogisticRegression on the
# first `sourceRow` rows, predicts 'Event' for the remaining rows and writes
# PID/Event pairs to 'pred.csv'. Per the note inside the string, to predict the
# event instead of the survival time, uncomment this part and comment out the
# Time_d pipeline that follows it.
'''
#主要是对生存进行预测下边是对生存时间的预测,如果进行生存预测注释下边,上边取消注释就可以了
print(corrDf['Event'].sort_values(ascending=False))#输出其它列与Event的相关系数并实现降序
full_X = pd.concat( [full['Stage'],full['Maximum Diametercm'],full['Lymphatic metastasis'],full['N'],
full['inspection item'],full['Chemotherapy'],full['Histologic subtype'],full['Sex']] , axis=1 )#选取相关系数高的列
sourceRow=706
source_X = full_X.loc[0:sourceRow-1,:]   # 行号是从0开始的
#原始数据集:标签
source_y = full.loc[0:sourceRow-1,'Event']   
#预测数据集:特征
pred_X = full_X.loc[sourceRow:,:]
train_X, test_X, train_y, test_y = train_test_split(source_X ,source_y,train_size=.8)
model = LogisticRegression()
model.fit( train_X , train_y )
model.score(test_X, test_y)
pred_Y = model.predict(pred_X)
#查看数据类型
print(pred_Y.dtype) 
pred_Y = pred_Y.astype(int)
passenger_id = full.loc[sourceRow:,'PID']
predDf = pd.DataFrame({'PID': passenger_id,'Event': pred_Y})
predDf.to_csv( 'pred.csv' , index = False )#将预测的数据和PID进行输出,得到一个新的csv文件
'''
# Correlation of every column with the survival time 'Time_d', highest first.
print(corrDf['Time_d'].sort_values(ascending=False))

# Feature columns fed to the survival-time model.
# A wider alternative that was tried additionally used 'Site',
# 'Histologic subtype' and 'Family History'.
feature_cols = [
    'Grade of Differentiation',
    'CRPmg/L',
    'CK5_6',
    'EGFR gene mutation',
    'Smoking status',
]
full_X = full[feature_cols]

# `full` is the training rows followed by the test rows under a 0..n-1 index,
# so the first `rowNum_train` rows are the labelled ones. Using the measured
# row count instead of a hard-coded 706 keeps the split correct whenever
# train_data.csv changes size.
sourceRow = rowNum_train
# Labelled data: features (.loc slices include the end label, hence the -1)
source_X = full_X.loc[0:sourceRow - 1, :]
# Labelled data: target
source_y = full.loc[0:sourceRow - 1, 'Time_d']
# Rows to predict: features
pred_X = full_X.loc[sourceRow:, :]

# NOTE(review): the 20% hold-out (test_X / test_y) is created but never scored,
# and no random_state is set, so each run uses a different split.
train_X, test_X, train_y, test_y = train_test_split(source_X, source_y, train_size=.8)
# NOTE(review): a classifier is fitted on the integer-cast durations, i.e. each
# distinct Time_d value is treated as its own class - consider whether a
# regressor is the intended model.
model1 = RandomForestClassifier(n_estimators=1000, n_jobs=1)
model1.fit(train_X, train_y.astype('int'))
pred_Y = model1.predict(pred_X)

# Write the predicted survival time next to each PID of the rows to predict.
passenger_id = full.loc[sourceRow:, 'PID']
predDf = pd.DataFrame({'PID': passenger_id, 'Time_d': pred_Y})
predDf.to_csv('pred1.csv', index=False)
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值