具体代码和详细的中药网络药理学代码以及讲解可以看我的github(欢迎star)doge
https://github.com/qianwei1129/TCM-Network-Pharmacology
1、衰老相关基因
从HAGR和msigdb数据获取细胞衰老相关基因,将两者取交集后构建基因蛋白互作网络
HAGR数据库
该库本身提供了下载链接,我在下载后对其进行了清洗
msigdb数据库
以"aging"作为关键词,Search Filters中collection设置为"all collections",source species设置为"Homo sapiens",contributors设置为"all contributions"
在msigdb数据库中共得到56个衰老相关基因集
2、小细胞肝癌预后相关基因_TCGA数据库
library("survival")
library("survminer")
使用TCGA数据库通过TCGA-LIHC数据集(2023年1月,n=377),下载clinical(临床信息),点击Metadata(样本信息),cart(基因文件)[/data/TCGA数据库]获取转录组测序数据与临床数据,并排除了临床数据不完整的患者队列
我使用了Harmony软件包进行批次效应调整,
接下来,基于拟合多因素Cox回归和lasso回归,构建预后效果评价分类模型
以"stranded_first"作为特定基因的表达水平评价量,临床信息中选择vital_status:生存状态(例如,存活、死亡)和days_to_death:诊断后至死亡的天数,作为评价的临床指标,最后整合得到如下表格:
病例ID | 基因名 | 生存时间 | 结局 |
---|---|---|---|
ID1 | GeneA | 时间1 | 结果1 |
ID2 | GeneB | 时间2 | 结果2 |
ID3 | GeneC | 时间3 | 结果3 |
描述性分析
首先使用生存分析中的Cox比例风险模型建立一个风险评分系统,然后利用风险评分中位数对其进行分组,将其分为“高风险组”和”低风险组“,并绘制生存分析图(针对筛选出的基因)
# 生存分析
load("files/exp_clinical")
install.packages("survival")
install.packages("survminer")
install.packages("glmnet")
install.packages("caret")
library(survival)
library(survminer)
library(glmnet)
library(dplyr)
library(caret)
# 绘制Kaplan-Meier曲线
exp_clinical <- as.data.frame(t(exp_clinical))
exp_clinical$time <- as.numeric(exp_clinical$time)
exp_clinical <- exp_clinical[!is.na(exp_clinical$time), ]
# 观察小范围内数据框
View(exp_clinical[(1:50), (1:50)])
View(gene_data[(1:50), (1:50)])
View(normalized_data[(1:50), (1:50)])
gene_data <- exp_clinical[(1:(nrow(exp_clinical)-5)), ]
clinical_data <- exp_clinical[((nrow(exp_clinical)-4) : nrow(exp_clinical)), ]
# 选择至少在一个样本中表达量高于该基因90%分位数的基因
sapply(gene_data, class)
gene_data[is.na(gene_data)] <- 0
gene_data[] <- lapply(gene_data, function(x) as.numeric(as.character(x)))
gene_data_scale <- as.data.frame(scale(gene_data))
normalize_minmax <- function(x) {
return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
}
gene_data <- as.data.frame(lapply(gene_data_scale, normalize_minmax))
names(gene_data) <- names(clinical_data)
data <- rbind(clinical_data, gene_data)
data <- as.data.frame(t(data))
rownames(data)[6:nrow(data)] <- rownames(exp_clinical)[1:(nrow(exp_clinical)-5)]
save(data, file = "files/gene_clinical_data")
# 做相关性分析
load("files/gene_clinical_data")
View(data[(1:50), (1:50)])
rm(list = ls())
load("files/exp_clinical")
load("files/agging_HCC_tcm_gene")
library(ggplot2)
library(ggpubr)
library(survival)
library(survminer)
library(glmnet)
library(mice)
library(dplyr)
library(car)
library(randomForestSRC)
library(caret)
## 预处理
rownames(exp_clinical) <- data_rownames
exp_clinical <- replace(exp_clinical, is.na(exp_clinical), 0)
temp <- as.data.frame(t(exp_clinical))
temp <- data.frame(lapply(temp, as.numeric))
temp <- replace(temp, is.na(temp), 0)
colnames(temp) <- data_rownames
colnames(temp)
gene_data <- as.data.frame(temp[, unlist(aging_HCC_tcm_gene)])
temp_gene_data <- as.data.frame(lapply(gene_data, as.numeric))
str(temp_gene_data)
temp_gene_data <- scale(temp_gene_data)
gene_data <- as.data.frame(temp_gene_data)
# temp <- scale(gene_data)
rownames(gene_data) <- t(colnames(exp_clinical))
exp_clinical <- as.data.frame(t(exp_clinical))
clinical_data <- as.data.frame(cbind(exp_clinical$time, exp_clinical$flag))
colnames(clinical_data) <- c("time", "flag")
clinical_data <- as.data.frame(lapply(clinical_data, as.numeric))
clinical_data[is.na(clinical_data)] <- 0
sum(clinical_data$flag)
rownames(clinical_data) <- rownames(gene_data)
rows_to_remove <- rownames(clinical_data[clinical_data$time == 0, ])
gene_data <- gene_data[!rownames(gene_data) %in% rows_to_remove, ]
clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ]
# clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ]
# rownames(gene_data) <- t(colnames(exp_clinical))
# clinical_data <- as.matrix(temp[, c("time", "flag")])
# clinical_data <- as.data.frame(clinical_data)
#
# clinical_data <- as.data.frame(lapply(clinical_data, as.numeric))
# sum(clinical_data$flag)
#
data <- cbind(clinical_data, gene_data)
# # temp_data <- cbind(clinical_data, temp)
#
# sum(is.na(clinical_data))
# sum(is.na(gene_data))
# gene_data <- preProcess(gene_data, method = "range")
save(data, gene_data, clinical_data, file = "files/surv_pre_data")
## 风险回归
rm(list = ls())
load("files/surv_pre_data")
str(clinical_data)
str(gene_data)
custom_control <- coxph.control(eps = 1e-09,
iter.max = 10000,
toler.chol = 1e-10)
surv_model <- coxph(Surv(time, flag) ~ .,
data = data,
control = custom_control)
# temp_surv_model <- coxph(
# Surv(time, flag) ~ .,
# data = temp_data,
# control = coxph.control(iter.max = 10000)
#)
surv_model$coefficients
# surv_model <- temp_surv_model
coefficients <- as.data.frame(surv_model$coefficients)
coefficients[is.na(coefficients)] <- 0
coefficients
gene_data <- data.frame(gene_data)
gene_coef_data <- rbind(gene_data, t(coefficients))
rownames(gene_coef_data)[nrow(gene_coef_data)] <- c("coef")
surv_results <- sweep(gene_data, 2, t(coefficients), `*`)
surv_row_sums <- rowSums(surv_results)
surv_row_sums[is.na(surv_row_sums)] <- 0
surv_row_sums
surv_median_value <- median(surv_row_sums)
surv_median_value
# 分类并添加分组
risk_group <- ifelse(surv_row_sums > surv_median_value,
"High Risk",
"Low Risk")
surv_results$risk_group <- risk_group
final_surv_results <- surv_results[, !apply(surv_results,
2,
function(x) all(x == 0))]
save(data,
risk_group,
gene_data,
clinical_data,
coefficients,
final_surv_results,
file = "files/surv_final_data")
# 绘制生存曲线
rm(list = ls())
load("files/surv_final_data")
colnames(final_surv_results)
temp_data <- data
temp_data$time <- ceiling(temp_data$time/30)
# temp_data <- lapply(temp_data, as.numeric)
# 剔除大于5年的
rows_to_remove <- rownames(temp_data[temp_data$time > 60, ])
temp_data <- temp_data[!rownames(temp_data) %in% rows_to_remove, ]
clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ]
risk_group <- as.data.frame(risk_group)
risk_group <- risk_group[!rownames(risk_group) %in% rows_to_remove, ]
temp_surv_obj <- Surv(temp_data$time, temp_data$flag)
temp_data <- cbind(temp_data, risk_group)
temp_surv_fit <- survfit(temp_surv_obj ~ risk_group,
data = temp_data)
temp_max <- max(temp_data$time)
ggsurvplot(temp_surv_fit,
data = temp_data,
risk.table = TRUE, # 显示风险表
pval = TRUE, # 显示P值
conf.int = TRUE, # 显示置信区间
xlim = c(0, temp_max), # 可选:设置X轴的范围
xlab = "Times(months)", # 设置X轴标签
ylab = "Survival probability", # 设置Y轴标签
)
coefficients$gene_name <- rownames(coefficients)
colnames(coefficients)[1] <- c("coef")
writexl::write_xlsx(coefficients, "data/washed_data/coef.xlsx")
surv_obj <- Surv(data$time, data$flag)
surv_fit <- survfit(surv_obj ~ risk_group,
data = data)
max <- max(data$time)
ggsurvplot(surv_fit,
data = final_surv_results,
risk.table = TRUE, # 显示风险表
pval = TRUE, # 显示P值
conf.int = TRUE, # 显示置信区间
xlim = c(0, max), # 可选:设置X轴的范围
xlab = "Times(days)", # 设置X轴标签
ylab = "Survival probability", # 设置Y轴标签
title = "Kaplan-Meier 生存曲线" # 设置图标题
)