【中药网络药理学】筛选细胞衰老和预后相关基因(附分类代码和画图代码)

具体代码和详细的中药网络药理学代码以及讲解可以看我的github(欢迎star)doge

https://github.com/qianwei1129/TCM-Network-Pharmacology

1、衰老相关基因

HAGRmsigdb数据获取细胞衰老相关基因,将两者取交集后构建基因蛋白互作网络

HAGR数据库

该库本身提供了下载链接,我在下载后对其进行了清洗

msigdb数据库

以"aging"作为关键词,Search Filters中collection设置为"all collections",source species设置为"Homo sapiens",contributors设置为"all contributions"

在msigdb数据库中共得到56个衰老相关基因集

2、小细胞肝癌预后相关基因_TCGA数据库

library("survival")
library("survminer")

使用TCGA数据库通过TCGA-LIHC数据集(2023年1月,n=377),下载clinical(临床信息),点击Metadata(样本信息),cart(基因文件)[/data/TCGA数据库]获取转录组测序数据与临床数据,并排除了临床数据不完整的患者队列

我使用了Harmony软件包进行批次效应调整,

接下来,基于拟合多因素Cox回归和lasso回归,构建预后效果评价分类模型

以"stranded_first"作为特定基因的表达水平评价量,临床信息中选择vital_status:生存状态(例如,存活、死亡)和days_to_death:诊断后至死亡的天数,作为评价的临床指标,最后整合得到如下表格:

病例ID基因名生存时间结局
ID1GeneA时间1结果1
ID2GeneB时间2结果2
ID3GeneC时间3结果3

描述性分析

首先使用生存分析中的Cox比例风险模型建立一个风险评分系统,然后利用风险评分中位数对其进行分组,将其分为“高风险组”和”低风险组“,并绘制生存分析图(针对筛选出的基因)

# 生存分析
load("files/exp_clinical")

install.packages("survival") 
install.packages("survminer")
install.packages("glmnet")
install.packages("caret")

library(survival)
library(survminer)
library(glmnet)
library(dplyr)
library(caret)

# 绘制Kaplan-Meier曲线
exp_clinical <- as.data.frame(t(exp_clinical))
exp_clinical$time <- as.numeric(exp_clinical$time)
exp_clinical <- exp_clinical[!is.na(exp_clinical$time), ]

# 观察小范围内数据框
View(exp_clinical[(1:50), (1:50)])
View(gene_data[(1:50), (1:50)])
View(normalized_data[(1:50), (1:50)])

gene_data <- exp_clinical[(1:(nrow(exp_clinical)-5)), ]
clinical_data <- exp_clinical[((nrow(exp_clinical)-4) : nrow(exp_clinical)), ]


# 选择至少在一个样本中表达量高于该基因90%分位数的基因
sapply(gene_data, class)

gene_data[is.na(gene_data)] <- 0
gene_data[] <- lapply(gene_data, function(x) as.numeric(as.character(x)))

gene_data_scale <- as.data.frame(scale(gene_data))

normalize_minmax <- function(x) {
  return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
}

gene_data <- as.data.frame(lapply(gene_data_scale, normalize_minmax))

names(gene_data) <- names(clinical_data)
data <- rbind(clinical_data, gene_data)
data <- as.data.frame(t(data))

rownames(data)[6:nrow(data)] <- rownames(exp_clinical)[1:(nrow(exp_clinical)-5)]
save(data, file = "files/gene_clinical_data")


# 做相关性分析
load("files/gene_clinical_data")

View(data[(1:50), (1:50)])

rm(list = ls())

load("files/exp_clinical")
load("files/agging_HCC_tcm_gene")

library(ggplot2)
library(ggpubr)
library(survival)
library(survminer)
library(glmnet)
library(mice)
library(dplyr)
library(car)
library(randomForestSRC)
library(caret)


## 预处理
rownames(exp_clinical) <- data_rownames
exp_clinical <- replace(exp_clinical, is.na(exp_clinical), 0)
temp <- as.data.frame(t(exp_clinical))

temp <- data.frame(lapply(temp, as.numeric))
temp <- replace(temp, is.na(temp), 0)

colnames(temp) <- data_rownames
colnames(temp)

gene_data <- as.data.frame(temp[, unlist(aging_HCC_tcm_gene)])
temp_gene_data <- as.data.frame(lapply(gene_data, as.numeric))

str(temp_gene_data)

temp_gene_data <- scale(temp_gene_data)
gene_data <- as.data.frame(temp_gene_data)
# temp <- scale(gene_data)

rownames(gene_data) <- t(colnames(exp_clinical))

exp_clinical <- as.data.frame(t(exp_clinical))
clinical_data <- as.data.frame(cbind(exp_clinical$time, exp_clinical$flag))

colnames(clinical_data) <- c("time", "flag")

clinical_data <- as.data.frame(lapply(clinical_data, as.numeric))
clinical_data[is.na(clinical_data)] <- 0

sum(clinical_data$flag)

rownames(clinical_data) <- rownames(gene_data)

rows_to_remove <- rownames(clinical_data[clinical_data$time == 0, ])

gene_data <- gene_data[!rownames(gene_data) %in% rows_to_remove, ]
clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ]
# clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ]

# rownames(gene_data) <- t(colnames(exp_clinical))

# clinical_data <- as.matrix(temp[, c("time", "flag")])
# clinical_data <- as.data.frame(clinical_data)
# 
# clinical_data <- as.data.frame(lapply(clinical_data, as.numeric))
# sum(clinical_data$flag)
# 
data <- cbind(clinical_data, gene_data)
# # temp_data <- cbind(clinical_data, temp)
# 
# sum(is.na(clinical_data))
# sum(is.na(gene_data))

# gene_data <- preProcess(gene_data, method = "range")

save(data, gene_data, clinical_data, file = "files/surv_pre_data")


## 风险回归
rm(list = ls())
load("files/surv_pre_data")

str(clinical_data)
str(gene_data)

custom_control <- coxph.control(eps = 1e-09, 
                                iter.max = 10000, 
                                toler.chol = 1e-10)

surv_model <- coxph(Surv(time, flag) ~ ., 
                    data = data, 
                    control = custom_control)

# temp_surv_model <- coxph(
#  Surv(time, flag) ~ ., 
#  data = temp_data, 
#  control = coxph.control(iter.max = 10000)
#)

surv_model$coefficients

# surv_model <- temp_surv_model

coefficients <- as.data.frame(surv_model$coefficients)
coefficients[is.na(coefficients)] <- 0
coefficients

gene_data <- data.frame(gene_data)
gene_coef_data <- rbind(gene_data, t(coefficients))
rownames(gene_coef_data)[nrow(gene_coef_data)] <- c("coef")

surv_results <- sweep(gene_data, 2, t(coefficients), `*`)
surv_row_sums <- rowSums(surv_results)

surv_row_sums[is.na(surv_row_sums)] <- 0
surv_row_sums

surv_median_value <- median(surv_row_sums)
surv_median_value


# 分类并添加分组
risk_group <- ifelse(surv_row_sums > surv_median_value,
                     "High Risk",
                     "Low Risk")
surv_results$risk_group <- risk_group
final_surv_results <- surv_results[, !apply(surv_results, 
                                      2,
                                      function(x) all(x == 0))]

save(data, 
     risk_group,
     gene_data, 
     clinical_data, 
     coefficients, 
     final_surv_results, 
     file = "files/surv_final_data")

# 绘制生存曲线
rm(list = ls())

load("files/surv_final_data")

colnames(final_surv_results)


temp_data <- data
temp_data$time <- ceiling(temp_data$time/30)
# temp_data <- lapply(temp_data, as.numeric)

# 剔除大于5年的
rows_to_remove <- rownames(temp_data[temp_data$time > 60, ])
temp_data <- temp_data[!rownames(temp_data) %in% rows_to_remove, ]
clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ]
risk_group <- as.data.frame(risk_group)
risk_group <- risk_group[!rownames(risk_group) %in% rows_to_remove, ]


temp_surv_obj <- Surv(temp_data$time, temp_data$flag)
temp_data <- cbind(temp_data, risk_group)
temp_surv_fit <- survfit(temp_surv_obj ~ risk_group, 
                    data = temp_data)
temp_max <- max(temp_data$time)
ggsurvplot(temp_surv_fit, 
           data = temp_data,
           risk.table = TRUE, # 显示风险表
           pval = TRUE, # 显示P值
           conf.int = TRUE, # 显示置信区间
           xlim = c(0, temp_max), # 可选:设置X轴的范围
           xlab = "Times(months)", # 设置X轴标签
           ylab = "Survival probability", # 设置Y轴标签
)

coefficients$gene_name <- rownames(coefficients)
colnames(coefficients)[1] <- c("coef")
writexl::write_xlsx(coefficients, "data/washed_data/coef.xlsx")

surv_obj <- Surv(data$time, data$flag)
surv_fit <- survfit(surv_obj ~ risk_group, 
                    data = data)
max <- max(data$time)
ggsurvplot(surv_fit, 
           data = final_surv_results,
           risk.table = TRUE, # 显示风险表
           pval = TRUE, # 显示P值
           conf.int = TRUE, # 显示置信区间
           xlim = c(0, max), # 可选:设置X轴的范围
           xlab = "Times(days)", # 设置X轴标签
           ylab = "Survival probability", # 设置Y轴标签
           title = "Kaplan-Meier 生存曲线" # 设置图标题
)
  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值