R语言生存曲线的可视化(超详细)

最新推荐文章于 2024-08-27 12:16:15 发布

医学和生信笔记

最新推荐文章于 2024-08-27 12:16:15 发布

阅读量1.3w

点赞数 19

分类专栏：医学统计学文章标签： r语言开发语言数据挖掘

原文链接：https://mp.weixin.qq.com/s/HUz644esLu4kRBOMZcdOOw

版权

医学统计学专栏收录该内容

64 篇文章 211 订阅

订阅专栏

survminer是专门用来进行生存分析可视化的R包，主要函数如下：

生存曲线
- ggsurvplot():
- arrange_ggsurvplots():
- ggsurvevents():
- surv_summary():
- surv_cutpoint():
- pairwise_survdiff():
Cox模型的诊断
- ggcoxzph():
- ggcoxdiagnostics():
- ggcoxfunctional():
Cox模型总汇总
- ggforest():
- ggcoxadjustedcurves():
竞争风险模型
- ggcompetingrisks():

关于Cox模型诊断和汇总在之前的推文中已经进行过详细的讲解：R语言生存分析：Cox回归

本次主要介绍生存曲线的绘制及细节。

演示数据

使用survival包中的lung数据集用于演示，这是一份关于肺癌患者的生存数据。time是生存时间，以天为单位，status是生存状态，1代表删失，2代表死亡。

library(survival)
library(survminer)

str(lung)

## 'data.frame': 228 obs. of  10 variables:
##  $ inst     : num  3 3 3 5 1 12 7 11 1 7 ...
##  $ time     : num  306 455 1010 210 883 ...
##  $ status   : num  2 2 1 2 2 1 2 2 2 2 ...
##  $ age      : num  74 68 56 57 60 74 68 71 53 61 ...
##  $ sex      : num  1 1 1 1 1 1 2 2 1 1 ...
##  $ ph.ecog  : num  1 0 0 1 0 1 2 2 1 2 ...
##  $ ph.karno : num  90 90 90 90 100 50 70 60 70 70 ...
##  $ pat.karno: num  100 90 90 60 90 80 60 80 80 70 ...
##  $ meal.cal : num  1175 1225 NA 1150 NA ...
##  $ wt.loss  : num  NA 15 15 11 0 0 10 1 16 34 ...

fit <- survfit(Surv(time, status) ~ sex, data = lung)

基本的生存曲线

最基本的生存曲线：

ggsurvplot(fit, data = lung)

删失数据的形状可以更改，默认是+，我们可以改成自己喜欢的：

# 更改删失数据的形状、大小
ggsurvplot(fit, data = lung, censor.shape="|", censor.size = 4)

字体都是可以进行更改的！

ggsurvplot(fit, data = lung,
   title = "Survival curves", 
   subtitle = "Based on Kaplan-Meier estimates",
   caption = "created with survminer",
   font.title = c(16, "bold", "darkblue"), # 大小、粗细、颜色
   font.subtitle = c(15, "bold.italic", "purple"),
   font.caption = c(14, "plain", "orange"),
   font.x = c(14, "bold.italic", "red"),
   font.y = c(14, "bold.italic", "darkred"),
   font.tickslab = c(12, "plain", "darkgreen"))

累积风险曲线：

ggsurvplot(fit,
           fun = "cumhaz", 
           conf.int = TRUE, # 可信区间
           palette = "lancet", # 支持ggsci配色，自定义颜色，brewer palettes中的配色，等
           ggtheme = theme_bw() # 支持ggplot2及其扩展包的主题
)

累积事件曲线：

ggsurvplot(fit,
           fun = "event", 
           conf.int = TRUE, # 可信区间
           palette = "grey",
           ggtheme = theme_pubclean() 
)

增加 risk table

增加多种自定义选项：

ggsurvplot(
  fit,
  data = lung,
  size = 1,                 # 更改线条粗细
  # 配色方案，支持ggsci配色，自定义颜色，brewer palettes中的配色，等
  palette = "lancet",
  conf.int = TRUE,          # 可信区间
  pval = TRUE,              # log-rank P值，也可以提供一个数值
  pval.method = TRUE,       # 计算P值的方法，可参考https://rpkgs.datanovia.com/survminer/articles/Specifiying_weights_in_log-rank_comparisons.html
  log.rank.weights = "1",
  risk.table = TRUE,        # 增加risk table
  risk.table.col = "strata",# risk table根据分组使用不同颜色
  legend.labs = c("Male", "Female"),    # 图例标签
  risk.table.height = 0.25, # risk table高度
  ggtheme = theme_classic2()      # 主题，支持ggplot2及其扩展包的主题
)

ggsurvplot(
   fit,                     
   data = lung,             
   risk.table = TRUE,       
   pval = TRUE,             
   conf.int = TRUE,         
   xlim = c(0,500),         # 横坐标轴范围，相当于局部放大
   xlab = "Time in days",   # 横坐标标题
   break.time.by = 100,     # 横坐标刻度
   ggtheme = theme_light(), 
   risk.table.y.text.col = T, # risk table文字注释颜色
   risk.table.y.text = FALSE # risk table显示条形而不是文字
)

risk table的各种字体也都是可以更改的！

ggsurvplot(fit, data = lung,
   title = "Survival curves", subtitle = "Based on Kaplan-Meier estimates",
   caption = "created with survminer",
   font.title = c(16, "bold", "darkblue"),
   font.subtitle = c(15, "bold.italic", "purple"),
   font.caption = c(14, "plain", "orange"),
   font.x = c(14, "bold.italic", "red"),
   font.y = c(14, "bold.italic", "darkred"),
   font.tickslab = c(12, "plain", "darkgreen"),
   ########## risk table #########,
   risk.table = TRUE,
   risk.table.title = "Note the risk set sizes",
   risk.table.subtitle = "and remember about censoring.",
   risk.table.caption = "source code: website.com",
   risk.table.height = 0.45)

增加删失事件表ncensor plot

ggsurvplot(fit, data = lung, risk.table = TRUE, ncensor.plot = TRUE)

ncensor plot的字体也是支持各种设置的。

ggsurvplot(fit, data = lung,
   title = "Survival curves", subtitle = "Based on Kaplan-Meier estimates",
   caption = "created with survminer",
   font.title = c(16, "bold", "darkblue"),
   font.subtitle = c(15, "bold.italic", "purple"),
   font.caption = c(14, "plain", "orange"),
   font.x = c(14, "bold.italic", "red"),
   font.y = c(14, "bold.italic", "darkred"),
   font.tickslab = c(12, "plain", "darkgreen"),
   ########## risk table #########,
   risk.table = TRUE,
   risk.table.title = "Note the risk set sizes",
   risk.table.subtitle = "and remember about censoring.",
   risk.table.caption = "source code: website.com",
   risk.table.height = 0.2,
   ## ncensor plot ##
   ncensor.plot = TRUE,
   ncensor.plot.title = "Number of censorings",
   ncensor.plot.subtitle = "over the time.",
   ncensor.plot.caption = "data available at data.com",
   ncensor.plot.height = 0.25)

超级无敌精细化自定设置

首先设置好自己的默认样式：

ggsurv <- ggsurvplot(
           fit,                     
           data = lung,             
           risk.table = TRUE,       
           pval = TRUE,             
           conf.int = TRUE,         
           palette = c("#E7B800", "#2E9FDF"),
           xlim = c(0,500),         
           xlab = "Time in days",   
           break.time.by = 100,     
           ggtheme = theme_light(), 
          risk.table.y.text.col = T,
          risk.table.height = 0.25, 
          risk.table.y.text = FALSE,
          ncensor.plot = TRUE,      
          ncensor.plot.height = 0.25,
          conf.int.style = "step",  # customize style of confidence intervals
          surv.median.line = "hv",  
          legend.labs = c("Male", "Female")    
        )
ggsurv

自定义一个函数，用来更改各种样式：

customize_labels <- function (p, font.title = NULL,
                              font.subtitle = NULL, font.caption = NULL,
                              font.x = NULL, font.y = NULL, font.xtickslab = NULL, font.ytickslab = NULL)
{
  original.p <- p
  if(is.ggplot(original.p)) list.plots <- list(original.p)
  else if(is.list(original.p)) list.plots <- original.p
  else stop("Can't handle an object of class ", class (original.p))
  .set_font <- function(font){
    font <- ggpubr:::.parse_font(font)
    ggtext::element_markdown (size = font$size, face = font$face, colour = font$color)
  }
  for(i in 1:length(list.plots)){
    p <- list.plots[[i]]
    if(is.ggplot(p)){
      if (!is.null(font.title)) p <- p + theme(plot.title = .set_font(font.title))
      if (!is.null(font.subtitle)) p <- p + theme(plot.subtitle = .set_font(font.subtitle))
      if (!is.null(font.caption)) p <- p + theme(plot.caption = .set_font(font.caption))
      if (!is.null(font.x)) p <- p + theme(axis.title.x = .set_font(font.x))
      if (!is.null(font.y)) p <- p + theme(axis.title.y = .set_font(font.y))
      if (!is.null(font.xtickslab)) p <- p + theme(axis.text.x = .set_font(font.xtickslab))
      if (!is.null(font.ytickslab)) p <- p + theme(axis.text.y = .set_font(font.ytickslab))
      list.plots[[i]] <- p
    }
  }
  if(is.ggplot(original.p)) list.plots[[1]]
  else list.plots
}

然后分别对上面图形的3个部分（生存曲线、risk table、ncensor plot）进行个性化自定义

# 更改生存曲线的标签
ggsurv$plot <- ggsurv$plot + labs(
  title    = "Survival curves",
  subtitle = "Based on Kaplan-Meier estimates",
  caption  = "created with survminer"
  )

# 更改risk table的标签
ggsurv$table <- ggsurv$table + labs(
  title    = "Note the risk set sizes",
  subtitle = "and remember about censoring.",
  caption  = "source code: website.com"
  )

# 更改ncensor plot的标签 
ggsurv$ncensor.plot <- ggsurv$ncensor.plot + labs(
  title    = "Number of censorings",
  subtitle = "over the time.",
  caption  = "source code: website.com"
  )

# 更改生存曲线，risk table，ncensor plot的字体大小、类型、颜色

ggsurv <- customize_labels(
  ggsurv,
  font.title    = c(16, "bold", "darkblue"),
  font.subtitle = c(15, "bold.italic", "purple"),
  font.caption  = c(14, "plain", "orange"),
  font.x        = c(14, "bold.italic", "red"),
  font.y        = c(14, "bold.italic", "darkred"),
  font.xtickslab = c(12, "plain", "darkgreen")
)

ggsurv

多个组的生存曲线

如果你的分类变量是多个组别的（常见的都是两组比较的），会自动画出多条生存曲线。如果你有多个分类自变量，会自动画出所有组合的生存曲线。

使用colon数据集，其中time是时间，status是生存状态，1为发生终点事件，0为删失，rx是治疗方式，有三种：observation、Levamisole、Levamisole+5-FU，obstruct是肿瘤是否阻塞结肠，有为1，无为0，adhere是肿瘤是否粘附附近器官，有为1，无为0。

rm(list = ls())
library(survival)
library(survminer)

psych::headTail(colon)

##       id study      rx sex age obstruct perfor adhere nodes status differ
## 1      1     1 Lev+5FU   1  43        0      0      0     5      1      2
## 2      1     1 Lev+5FU   1  43        0      0      0     5      1      2
## 3      2     1 Lev+5FU   1  63        0      0      0     1      0      2
## 4      2     1 Lev+5FU   1  63        0      0      0     1      0      2
## ...  ...   ...    <NA> ... ...      ...    ...    ...   ...    ...    ...
## 1855 928     1 Lev+5FU   0  48        1      0      0     4      0      2
## 1856 928     1 Lev+5FU   0  48        1      0      0     4      0      2
## 1857 929     1     Lev   0  66        1      0      0     1      0      2
## 1858 929     1     Lev   0  66        1      0      0     1      0      2
##      extent surg node4 time etype
## 1         3    0     1 1521     2
## 2         3    0     1  968     1
## 3         3    0     0 3087     2
## 4         3    0     0 3087     1
## ...     ...  ...   ...  ...   ...
## 1855      3    1     1 2072     2
## 1856      3    1     1 2072     1
## 1857      3    0     0 1820     2
## 1858      3    0     0 1820     1

# 两个分类变量
fit2 <- survfit( Surv(time, status) ~ rx + obstruct, data = colon )

# 结果会给出所有组合的生存曲线
ggsurvplot(fit2, pval = TRUE, 
           risk.table = TRUE,
           risk.table.height = 0.3
           )

多个分类变量分面绘制

还是以colon数据集为例，这次我们用3个变量：sex/rx/adhere，这3个都是分类变量。

首先构建生存函数：

fit3 <- survfit(Surv(time, status) ~ sex + rx + adhere, data = colon )

然后把生存曲线保存为一个对象：

ggsurv <- ggsurvplot(fit3, data = colon,
  fun = "cumhaz", conf.int = TRUE,
  risk.table = TRUE, risk.table.col="strata",
  ggtheme = theme_bw())

接下来就可以分别提取生存曲线（这里是cumhaz，累积风险曲线）、risk table、删失事件表，根据不同的变量进行分面即可：

# 分面累积风险曲线
curv_facet <- ggsurv$plot + facet_grid(rx ~ adhere)
curv_facet

# 分面risk table，和上面的累积风险曲线分面方法一样
ggsurv$table + facet_grid(rx ~ adhere, scales = "free")+
 theme(legend.position = "none")

# risk table另一种分面方法，由于有3个分类变量，可以选择自己需要的分面方法
tbl_facet <- ggsurv$table + facet_grid(.~ adhere, scales = "free")
tbl_facet + theme(legend.position = "none")

# 重新安排下布局，把生存曲线和risk table画在一起
g2 <- ggplotGrob(curv_facet)
g3 <- ggplotGrob(tbl_facet)
min_ncol <- min(ncol(g2), ncol(g3))
g <- gridExtra::gtable_rbind(g2[, 1:min_ncol], g3[, 1:min_ncol], size="last")
g$widths <- grid::unit.pmax(g2$widths, g3$widths)
grid::grid.newpage()
grid::grid.draw(g)

如果想根据某个变量进行分组绘制生存曲线，然后分面展示，也可以用ggsurvplot_facet()实现：

fit <- survfit( Surv(time, status) ~ sex, data = colon )

# 根据rx进行分组，展示每个组内的生存曲线
ggsurvplot_facet(fit, colon, 
                 facet.by = "rx",
                 palette = "jco", 
                 pval = TRUE)

## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

还可以根据多个变量进行分面展示：

ggsurvplot_facet(fit, colon, facet.by = c("rx", "adhere"),
                palette = "jco", pval = TRUE)

fit2 <- survfit( Surv(time, status) ~ sex + rx, data = colon )
ggsurvplot_facet(fit2, colon, facet.by = "adhere",
                palette = "jco", pval = TRUE)

同时绘制多个生存函数

data(colon)

## Warning in data(colon): data set 'colon' not found

f1 <- survfit(Surv(time, status) ~ adhere, data = colon)
f2 <- survfit(Surv(time, status) ~ rx, data = colon)
fits <- list(sex = f1, rx = f2)

# 一下子画好！在循环出图时有用处
legend.title <- list("sex", "rx")
ggsurvplot_list(fits, colon, legend.title = legend.title)

## $sex

## 
## $rx

## 
## attr(,"class")
## [1] "list"            "ggsurvplot_list"

根据某一个变量分组绘制

比如以colon数据为例，我们想以rx（治疗方式）进行分组，然后看每个组内的生存曲线，可以通过ggsurvplot_group_by()实现。

rm(list = ls())

fit <- survfit( Surv(time, status) ~ sex, data = colon )

# Visualize: grouped by treatment rx
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
ggsurv.list <- ggsurvplot_group_by(fit, colon, group.by = "rx", risk.table = TRUE,
                                 pval = TRUE, conf.int = TRUE, palette = "jco")
names(ggsurv.list)

## [1] "rx.Obs::sex"     "rx.Lev::sex"     "rx.Lev+5FU::sex"

这个图形和上面的分面展示中的ggsurvplot_facet画出来的图形是一样的，区别就是一个是分面，这个是分开多个图形！

可以根据多个变量进行分组，比如下面这个情况，会分别绘制6张生存曲线图：

# Visualize: grouped by treatment rx and adhere
ggsurv.list <- ggsurvplot_group_by(fit, colon, group.by = c("rx", "adhere"),
                                 risk.table = TRUE,
                                 pval = TRUE, conf.int = TRUE, palette = "jco")

# 6张图的名字，图没有画出来，感兴趣的可以自己试试看
names(ggsurv.list)

## [1] "rx:Obs, adhere:0::sex"     "rx:Obs, adhere:1::sex"    
## [3] "rx:Lev, adhere:0::sex"     "rx:Lev, adhere:1::sex"    
## [5] "rx:Lev+5FU, adhere:0::sex" "rx:Lev+5FU, adhere:1::sex"

在原有生存曲线的基础上增加

先画好一个生存曲线图，然后在原图的基础上添加新的生存曲线图，类似于base r中常用的add = T，比如在这篇推文中介绍的：多个时间点和多指标生存曲线

library(survival)

# 注意这里的surv_fit，是survfit的封装
fit <- surv_fit(Surv(time, status) ~ sex, data = lung)

# Visualize survival curves
ggsurvplot(fit, data = lung,
          risk.table = TRUE, pval = TRUE,
          surv.median.line = "hv", palette = "jco")

在上面图形的基础上添加所有人的总的生存曲线：

# Add survival curves of pooled patients (Null model)
# Use add.all = TRUE option
ggsurvplot(fit, data = lung,
          risk.table = TRUE, pval = TRUE,
          surv.median.line = "hv", palette = "jco",
          add.all = TRUE)

多个生存函数画在一起

比如把PFS和OS的生存曲线画在一张图上。

rm(list = ls())
# 构建一个示例数据集
set.seed(123)
demo.data <- data.frame(
   os.time = colon$time,
   os.status = colon$status,
   pfs.time = sample(colon$time),
   pfs.status = colon$status,
   sex = colon$sex, rx = colon$rx, adhere = colon$adhere
 )

# 总体的PFS和OS生存曲线
pfs <- survfit( Surv(pfs.time, pfs.status) ~ 1, data = demo.data)
os <- survfit( Surv(os.time, os.status) ~ 1, data = demo.data)

# Combine on the same plot
fit <- list(PFS = pfs, OS = os)
ggsurvplot_combine(fit, demo.data)

这个情况你用ggsurvplot_list也能画，不过就是分开的两个图形了！

如果是分类变量会自动画出多条生存曲线：

pfs <- survfit( Surv(pfs.time, pfs.status) ~ rx, data = demo.data)
os <- survfit( Surv(os.time, os.status) ~ rx, data = demo.data)
# Combine on the same plot
fit <- list(PFS = pfs, OS = os)
ggsurvplot_combine(fit, demo.data)

参考资料

survminer包帮助文档