单样本GSEA分析即ssGSEA,可以计算肿瘤组织中免疫细胞的比例,从而量化免疫浸润。ssGSEA可以用R 当中的GSVA包来计算。
### 1. 下载,读入免疫细胞特征基因集
# http://cis.hku.hk/TISIDB/download.php 下载基因集
# Gene signatures of 28 tumor-infiltrating lymphocytes (TILs)
# wget http://cis.hku.hk/TISIDB/data/download/CellReports.txt
# TIL abundance across cancer types
# wget http://cis.hku.hk/TISIDB/data/download/TIL_abundance.zip
options(stringsAsFactors = F)
# setwd("/file/path")
# CellReports.txt 第一列为免疫细胞名称,第二列为NA,剩下的列为基因名
# R 读txt文件,每行的列数不一样,只能逐行读取并处理
contents <- readLines("CellReports.txt")
geneSet <- list()
for (i in contents) {
str_i <- strsplit(i,split = "\t")
gene_num <- length(str_i[[1]])-2
name <- str_i[[1]][1]
elements <- str_i[[1]][3:gene_num]
geneSet[[name]] <- elements
#print(str_i[[1]][1])
#print(str_i[[1]][3:gene_num])
}
length(geneSet)
names(geneSet)
# View(geneSet)
### 2.读入基因表达数据
Dat <- read.table("TCGA-READ.htseq_fpkm-uq.tsv",
sep = "\t",
header=T)
Dat[1:3,1:5]
# 行名转化,去掉版本号,如ENSG00000242268.2 -> ENSG00000242268
library(stringr)
Dat$ENSEMBL <- word(Dat$Ensembl_ID,1,sep=fixed(".")) # 提取点号前面的部分
## id转化,ENSEMBL -> SYMBOL, 因为geneSet中是SYMBOL
library(org.Hs.eg.db)
ls("package:org.Hs.eg.db") # 查看包的信息
keytypes(org.Hs.eg.db) # 基因编号系统名称
columns(org.Hs.eg.db)
##columns shows which kinds of data can be returned for the AnnotationDb object.
##keytypes allows the user to discover which keytypes can be passed in to select or ##keys and the keytype argument.
map_df <- select(org.Hs.eg.db, keys=keys(org.Hs.eg.db),
columns=c("SYMBOL","ENSEMBL"))
head(map_df)
Dat2 <- merge(Dat,map_df,by='ENSEMBL')
head(Dat2)
Dat2[1:3,1:5]
## 去除ENTREZID,Ensembl_ID列
Dat2 <- Dat2[,-c(1,2,ncol(Dat2)-1)]
## 检查是否有空值或NA
table(is.na(Dat2$SYMBOL))
table(Dat2$SYMBOL=="")
## 合并相同的symbol
Dat3 <- aggregate(.~SYMBOL,data=Dat2,mean)
head(Dat3)
Dat3[1:3,1:5]
dim(Dat3)
## 转化为最终的表达矩阵
rownames(Dat3) <- Dat3$SYMBOL
Dat3 <- Dat3[,-1] # 去除SYMBOL列
Dat3[1:3,1:5]
dim(Dat3) # [1] 行为基因:35302,列为样本:177
MatrixData <- as.matrix(Dat3)
### 3. 单样本GSEA分析
#安装单样本GSEA分析包:GSVA
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("GSVA")
library(GSVA)
# ?gsva
gsva.es <- gsva(MatrixData,geneSet)
# kcdf 默认为"Gaussian",如果是RNA-seq的count值,参数设置为kcdf="Poisson"
dim(gsva.es)
# Min-Max标准化是指对原始数据进行线性变换
# 将每个样本中不同的免疫细胞比例标准化到0-1之间
gsva.es.norm <- gsva.es
for (i in colnames(gsva.es)) {
gsva.es.norm[,i] <- (gsva.es[,i] -min(gsva.es[,i]))/(max(gsva.es[,i] )-min(gsva.es[,i] ))
}
# dim(gsva.es.norm)
# apply(gsva.es.norm[,1:6], 2, range)
# 查看样本类型
table(word(colnames(gsva.es.norm),4,sep=fixed(".")))
# 根据样本编号,进行分组 -11A结尾为正常
# sample_anno: 行名为样品名,列为分组信息
sample_anno <- data.frame(group = ifelse(grepl("11A$",colnames(gsva.es.norm)),
"normal","cancer"))
rownames(sample_anno)=colnames(gsva.es.norm)
## 画热图
library(pheatmap)
pheatmap(gsva.es.norm,
show_colnames = F,
cluster_rows = T,
cluster_cols = T,
annotation=sample_anno,
fontsize=5)