1、下载临床相关的表格以及原始数据(GDCdata-TCGA-LAML-CLINICDATA里面),这里我们以TCGA-LAML(急性髓细胞白血病)数据为例,下载其临床数据信息。
#####一、下载临床数据#####
setwd("H:\\RDATA\\test/")
# 临床数据
# XML数据-全面信息
library(TCGAbiolinks)
query <- GDCquery(
project = "TCGA-LAML", #癌症TCGA存储文件,可以在官网查询
data.category = "Clinical", #需要下载的文件类型,可以在官网查看可以下在什么信息
data.type = "Clinical Supplement", #需要下载的文件类型
data.format = "BCR XML") #文件格式
GDCdownload(query = query, #下载文件,下载过的话,后面直接将directory改到这个储存位置就可以了
method = "api",
directory = "GDCdata-TCGA-LAML-CLINICDATA",# 默认文件夹名称GDCdata,后面也要相应修改
files.per.chunk = 3) # 网速慢、文件大可以设置files.per.chunk分开下载
# 循环输出临床数据
clinical.info<-c("patient","radiation")
#可以提取的信息
#clinical.info<-c("patient","drug","follow_up","radiation","stage_event","new_tumor_event","admin")
# 循环写出
for(i in clinical.info){
clidata <- GDCprepare_clinic(query, clinical.info = i,directory = 'GDCdata-TCGA-LAML-CLINICDATA')
#directory:临床文件储存的位置
write.csv(clidata, file = paste0('TCGA-LAML_clinical_',i,'.csv'), row.names =F)
}
# 在excel中处理数据,也可以在R处理
Tips:
所有TCGA数据可以在官网查看,关于TCGA使用方法下面有个视频可以参考看看:
GDC (cancer.gov)https://portal.gdc.cancer.gov/TCGA-01-TCGA数据库介绍_哔哩哔哩_bilibili
https://www.bilibili.com/video/BV1824y1H75N/?spm_id_from=333.999.0.0
2、下载矩阵数据(Count矩阵),这里我们下载转录组数据为例
#####二、下载矩阵数据#####
# counts下载
# 以RNA-seq表达矩阵为例
query <- GDCquery(project = "TCGA-LAML",
data.category = "Transcriptome Profiling",
data.type = 'Gene Expression Quantification',
experimental.strategy = "RNA-Seq",
workflow.type = "STAR - Counts")
GDCdownload(query,directory = "GDCdata-TCGA-LAML-COUNTS") #保存的位置
testdata <- GDCprepare(query = query, directory = 'GDCdata-TCGA-LAML-COUNTS') # SummarizedExperiment对象
library(SummarizedExperiment)
names(assays(testdata)) #可以进行提取的矩阵
rowdata <- rowData(testdata) # 查看rowdata内容
names(rowdata)
table(rowdata$gene_type) # 统计基因类型,下面可以提取的选择就在这里
head(rowdata$gene_name,10) # gene_name就是symbol
length(rowdata$gene_name) #有多少gene
test_miRNA <- testdata[rowdata$gene_type == "miRNA"] # 提取miRNA
test_mRNA <- testdata[rowdata$gene_type == "protein_coding",] # 提取mRNA
到这里位置我们有两个文件夹,GDCdata-TCGA-LAML-CLINICDATA和GDCdata-TCGA-LAML-COUNTS,分别存放着临床数据和数值矩阵两个原始文件,如果之后要重新分析,可以不用下载,直接将directory放到这两个文件对应的地方。
三、提取矩阵,这里我们使用counts矩阵为例
#####三、提取表达矩阵#####
# 提取表达矩阵
# mRNA的counts矩阵
test.mRNA.counts <- assay(test_mRNA,"unstranded") # counts矩阵用于后续差异分析
# mRNA的tpm矩阵
test.mRNA.tpm <- assay(test_mRNA,"tpm_unstrand")
# mRNA的fpkm矩阵
test.mRNA.fpkm <- assay(test_mRNA,"fpkm_unstrand")
# 添加gene_symbol,基因名称
mRNA.symbol <- rowData(test_mRNA)$gene_name
# 合并
test.mrna.frame <- cbind(as.data.frame(mRNA.symbol),
as.data.frame(test.mRNA.counts))
dim(test.mrna.frame) #基因/样品数量
四、初步处理并保存该count矩阵
#####四、初步处理#####
# 去重数据
qc = as.matrix(test.mrna.frame)
rownames(qc)=qc[,1] # gene_symbol
exp=qc[,2:ncol(qc)] # matrix
dimnames=list(rownames(exp),colnames(exp))
data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
# 对重复的基因取平均值
data = limma::avereps(data) # 去重,出现多行取平均值
# 过滤表达量低的基因,可根据基因数目进行调整
CHOL.test=data[rowMeans(data)>5,]
output <- rbind(colnames(CHOL.test),CHOL.test)
dim(output)
write.table(output, file="LAML_mRNA矩阵去重.txt", sep="\t", quote=F, col.names=F)
以及根据“TCGA-LAML_clinical_patient”准备好了meta分组文件(怎么准备?用excel吧=-=):
五、RNAseq(使用的DESEQ2)
#####五、RNAseq#####
#####5.1 读入文件并处理#####
rt=read.table("LAML_mRNA矩阵去重.txt",sep="\t",header=T,check.names=F, row.names = 1)
dim(rt)
rt <- round(rt,0) #counts需要取整数
rt <- rt[, substr(names(rt), 14, 16) == "03A"] #仅保留03A的数据
#TCGA命名规则:https://zhuanlan.zhihu.com/p/564801425,这个一定要看一遍
patient_ID <- read.csv("meta.csv") #读入meta文件(根据临床文件自己准备好,这里准备的是性别)
patient_ID <- patient_ID[!duplicated(patient_ID$id), ]#去重复行名(有的病人有多个样本)
patientIDs <- patient_ID[, 1] #提取ID向量
rt <- rbind(colnames(rt), rt) #加一行格式为TCGA-XX-XXXX的行
names(rt) <- substr(names(rt), 1, 12) #加一行格式为TCGA-XX-XXXX的行
selected_rt <- rt[, colnames(rt) %in% patientIDs] #取矩阵和meta数据中都有的病例(因为counts矩阵有时候会有缺失的,或者说没有对应病人的)
colnames(selected_rt) <- as.character(unlist(selected_rt[1,]))#列名改格式为TCGA-XX-XXXX
selected_rt <- selected_rt[-1,]#列名改格式为TCGA-XX-XXXX
names(selected_rt) <- substr(names(selected_rt), 1, 12)#列名改格式为TCGA-XX-XXXX
has_duplicates <- any(duplicated(names(selected_rt)))# 看有无重复列名(一个病例多个样本)
selected_rt <- selected_rt[, !duplicated(names(selected_rt))]#去重复列名
A <- c(colnames(selected_rt)) #处理过的矩阵里面所有病例
B <- c(patient_ID$id) #meta里面所有病例
different_values <- setdiff(B, A) #有临床数据但是没有相对应的矩阵数据的病例
patient_ID <- patient_ID[!patient_ID$id %in% different_values, ] #删掉这些没有对应数据的病例
write.csv(selected_rt,file = "mRNA.csv")
write.csv(patient_ID,file = "mymeta.csv")
#####5.2 进行DESeQ2前的预处理#####
mycounts <- read.csv("mRNA.csv",row.names = 1, check.names = F) #数据矩阵
mymeta <- read.csv("mymeta.csv", stringsAsFactors = T, row.names = 1) #分组矩阵
rownames(mymeta) <- mymeta$id #行名改为ID,但是忘了有啥用了
colnames(mycounts) == mymeta$id #检查行名列名一致性,通常全为FALSE,数据少的时候在EXCEL里手动调一下
#处理行列一致性
mycounts <- as.data.frame(t(mycounts)) #数值矩阵转置
mycounts$V1 <- NA
mycounts$V1 <- rownames(mycounts) #行名(sample名放到最后一列,相比于直接处理行名bug更少)
mycounts_1 <- mycounts[match(mymeta$id, mycounts$V1), ] #将数值矩阵的行顺序按照分组矩阵的顺序更改
mycounts_1 <- mycounts_1[, -ncol(mycounts_1)] #删掉最后一列(行名列)
mycounts_1 <- as.data.frame(t(mycounts_1)) #转置回来
colnames(mycounts_1) == mymeta$id #全为TRUE可进行下面操作,这里也可以存一下矩阵,之后再做可以不用处理直接进行DESEQ2
write.csv(mycounts_1,file = "mRNA.csv")#会直接覆盖之前的
write.csv(mymeta, file = "mymeta.csv")#这个没覆盖,可以覆盖一下免得弄错
#####5.3 DESEQ2一条龙#####
library(DESeq2)
mycounts_1 <- read.csv("mRNA.csv",row.names = 1, check.names = F) #数据矩阵
mymeta <- read.csv("mymeta.csv", stringsAsFactors = T, row.names = 1) #分组矩阵
dds <- DESeqDataSetFromMatrix(countData = mycounts_1, #数值矩阵
colData = mymeta, #分组矩阵
design = ~gender) #分组依据的那个列名
dds
dds <- dds[rowSums(counts(dds)) > 5,] #过滤一遍,基因表达之和得大于5
dds$gender<-relevel(dds$gender, ref="MALE")#谁作为参照组
dds$gender
table(dds$gender) #样品计数,这里要记下来一下,后面会用
dds <- DESeq(dds) #进行自动差异分析
res <- results(dds) #提取结果文件
head(res)
class(res)
res_1 <- data.frame(res) #数据框化
library(dplyr)
res_1 %>%
mutate(group = case_when(
log2FoldChange >= 0 & pvalue <=0.05 ~"UP",
log2FoldChange <= 0 & pvalue <=0.05 ~"DOWN",
TRUE ~ "NOT_CHANGE"
)) -> res_2 #区分上下调基因,其中条件可自选,条件在列名里看
head(res_2)
table(res_2$group) #上下调基因数量
res_2$symbol <- row.names(res_2) #加一列存下symbol后面会用
write.csv(res_2, file = "diff_expr_results.csv") #保存差异基因文件,可以自己根据这个分析,也可以跟我这走走流程
六、可视化
#####六、富集分析#####
######6.1 GO分析#####
#采取方法up和down gene分别富集
library(org.Hs.eg.db)
library(clusterProfiler)
library(stringr)
library(ggplot2)
upgene <- res_2[res_2$group == "UP", ]
upggoBP <- enrichGO(upgene$symbol, org.Hs.eg.db, keyType = "SYMBOL",ont = "BP",
pvalueCutoff = 0.05, pAdjustMethod = "BH", qvalueCutoff = 0.2,
minGSSize = 10, maxGSSize = 500, readable = FALSE, pool = FALSE)
upGOBP <- upggoBP@result %>%
mutate(GeneRatio_a = as.numeric(str_extract(GeneRatio, "\\d+")),
BgRatio_a = as.numeric(str_extract(BgRatio, "\\d+")),
RichFactor = GeneRatio_a / BgRatio_a)
write.csv(upGOBP, 'UPGOBP.csv')
downgene <- res_2[res_2$group == "DOWN", ]
downggoBP <- enrichGO(downgene$symbol, org.Hs.eg.db, keyType = "SYMBOL",ont = "BP",
pvalueCutoff = 0.05, pAdjustMethod = "BH", qvalueCutoff = 0.2,
minGSSize = 10, maxGSSize = 500, readable = FALSE, pool = FALSE)
downGOBP <- downggoBP@result %>%
mutate(GeneRatio_a = as.numeric(str_extract(GeneRatio, "\\d+")),
BgRatio_a = as.numeric(str_extract(BgRatio, "\\d+")),
RichFactor = GeneRatio_a / BgRatio_a)
write.csv(downGOBP, 'downGOBP.csv')
# GO分析的双侧柱状图
upGOBP <- read.csv("UPGOBP.csv",row.names = 1)
downGOBP <- read.csv("downGOBP.csv", row.names = 1)
upGOBP$Group <- NA
upGOBP$Group = 1
downGOBP$Group <- NA
downGOBP$Group = -1
upGOBP$Description<-sub("(.)", "\\U\\1",upGOBP$Description,perl=TRUE)
downGOBP$Description<-sub("(.)", "\\U\\1",downGOBP$Description,perl=TRUE)
upGOBP.sorted <- upGOBP[order(upGOBP$pvalue), ] # 根据pvalue从小到大排序
upGOBP <- upGOBP.sorted[1:20, ] # 选择前20行
downGOBP.sorted <- downGOBP[order(downGOBP$pvalue), ] # 根据pvalue从小到大排序
downGOBP <- downGOBP.sorted[1:20, ] # 选择前20行
dat=rbind(upGOBP,downGOBP)
colnames(dat)
dat$group <- NA
dat$group[which(dat$Group >0)]='up'
dat$group[which(dat$Group <0)]='down'
dat$RichFactor = dat$RichFactor*dat$Group
dat <- dat[dat$pvalue<0.05,]
gk_plot <-
ggplot(dat,aes(reorder(Description, RichFactor),RichFactor,fill = group))+
geom_col()+
theme_bw()+
theme(panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.border = element_blank(),
legend.title = element_blank(),
axis.text = element_text(color="black",size=15),
axis.line.x = element_line(color='black'),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
legend.position = 'right')+
coord_flip()+
geom_segment(aes(y=0, yend=0,x=0,xend=41))+ #y轴长度调整
geom_text(data = dat[which(dat$RichFactor>0),],aes(x=Description, y=-0.01, label=Description),
hjust=1, size=5)+
geom_text(data = dat[which(dat$RichFactor<0),],aes(x=Description, y=0.01, label=Description),
hjust=0, size=5)+
geom_text(data = dat[which(dat$RichFactor>0),],aes(label=sprintf("%.2e", pvalue)),
hjust=-0.1, size=5, color='red')+
geom_text(data = dat[which(dat$RichFactor<0),],aes(label=sprintf("%.2e", pvalue)),
hjust=1.1, size=5, color="red")+
scale_fill_manual(values = c("#fb6a4a","#3182bd"), breaks = c("up", "down"))+
scale_x_discrete(expand = expansion(mult = c(0,0)))+
ylim(-1, 1)+
labs(x='', y='RichFactor')+
theme(legend.text = element_text(size = 12),
axis.title.x = element_text(size = 14))+
ggtitle("GO BP of The Top 20 Most Significant P-values")+
theme(plot.title = element_text(hjust = 0.5, size = 20))
gk_plot
#####6.2 KEGG分析#####
#唯一注意的一点就是把它换为ENTREZID,再换回去
library(clusterProfiler)
library(stringr)
upgene <- res_2[res_2$group == "UP", ]
genes <- bitr(upgene$symbol, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
genes <- genes[,2]
ekegg <- enrichKEGG(genes, organism = 'hsa',
pvalueCutoff = 0.05,pAdjustMethod = 'BH',qvalueCutoff = 0.2,
minGSSize=10,maxGSSize=500,use_internal_data=F)
ekegg <- setReadable(ekegg, 'org.Hs.eg.db','ENTREZID')
upKEGG <- ekegg@result %>%
mutate(GeneRatio_a = as.numeric(str_extract(GeneRatio, "\\d+")),
BgRatio_a = as.numeric(str_extract(BgRatio, "\\d+")),
RichFactor = GeneRatio_a / BgRatio_a)
write.csv(upKEGG, 'UPKEGG.csv')
downgene <- res_2[res_2$group == "DOWN", ]
genes <- downgene$symbol
genes <- bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
genes <- genes[,2]
ekegg <- enrichKEGG(genes, organism = 'hsa',
pvalueCutoff = 0.05,pAdjustMethod = 'BH',qvalueCutoff = 0.2,
minGSSize=10,maxGSSize=500,use_internal_data=F)
ekegg <- setReadable(ekegg, 'org.Hs.eg.db','ENTREZID')
downKEGG <- ekegg@result %>%
mutate(GeneRatio_a = as.numeric(str_extract(GeneRatio, "\\d+")),
BgRatio_a = as.numeric(str_extract(BgRatio, "\\d+")),
RichFactor = GeneRatio_a / BgRatio_a)
write.csv(downKEGG, 'downKEGG.csv')
#画图偷懒直接移花接木
#upGOBP <- read.csv("UPKEGG.csv",row.names = 1)
#downGOBP <- read.csv("downKEGG.csv", row.names = 1)
#####七、单个基因分析#####
#基因表达箱线图
pTSPAN6d <- plotCounts(dds, gene="TSPAN6", intgroup="gender", returnData=TRUE) #提取数据
library(ggsignif)
pTSPAN6 <- ggplot(pTSPAN6d, aes(x=gender, y=count)) +
geom_boxplot(aes(fill=gender)) +
scale_y_log10() +
theme_bw() +
labs(x = " ", y = "Relative Counts", fill = "Gender") +
ggtitle("TSPAN6")+
#theme(legend.position = "none")+
#加上就是不要图例
geom_signif(comparisons = list(c("MALE", "FEMALE")),
annotations = c("***"), #这里查看差异基因表手动改相关系数星星
y_position = c(3, 3), #这里调p值横线线的高低
tip_length = c(0, 0)) + #这里调小撇撇的长短
annotate("text", x = 1, y = 0, label = "(n=13)", #数量看前面那个记住的东西
vjust = 4, hjust = 0.5, size = 4, fontface = "bold") +
annotate("text", x = 2, y = 0, label = "(n=14)", #数量看前面那个记住的东西
vjust = 4, hjust = 0.5, size = 4, fontface = "bold") +
coord_cartesian(clip = "off") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"),
axis.text = element_text(size = 18, face = "bold"),
plot.title = element_text(size = 18, face = "bold.italic"),
legend.title = element_text(size = 10, face = "bold"))
pTSPAN6
#其余同理还可以拼图
library(patchwork)
(pTSPAN6 + pa) / (pb + pc) +
plot_layout(guides = 'collect')
#####八、火山图#####
library(ggrepel)
library(ggplot2)
library(pheatmap)
##筛选条件设置
log2FC_cutoff = 0.5
pvalue_cutoff = 0.05
##选取差异分析结果
need_DEG <- res_1[,c(2,5)] #选取log2FoldChange, pvalue信息
colnames(need_DEG) <- c('log2FoldChange','pvalue')
need_DEG$significance <- as.factor(ifelse(need_DEG$pvalue < pvalue_cutoff & abs(need_DEG$log2FoldChange) > log2FC_cutoff,
ifelse(need_DEG$log2FoldChange > log2FC_cutoff ,'UP','DOWN'),'NOT_CHANGE'))
title <- paste0(' Up : ',nrow(need_DEG[need_DEG$significance =='UP',]) ,
'\n Down : ',nrow(need_DEG[need_DEG$significance =='DOWN',]),
'\n |Log2FoldChange| >= ',round(log2FC_cutoff,3))
ggplot(data=need_DEG,
aes(x=log2FoldChange, y=-log10(pvalue),
color=significance)) +
#点和背景
geom_point(alpha=0.4, size=1) +
theme_classic()+ #无网格线
#坐标轴
xlab("log2 ( FoldChange )") +
ylab("-log10 ( pvalue)") +
#标题文本
ggtitle( title ) +
#分区颜色
scale_colour_manual(values = c('blue','grey','red'))+
#辅助线
geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff),lty=4,col="grey",lwd=0.8) +
geom_hline(yintercept = -log10(pvalue_cutoff),lty=4,col="grey",lwd=0.8) +
#图例标题间距等设置
theme(plot.title = element_text(hjust = 0.5),
plot.margin=unit(c(2,2,2,2),'lines'), #上右下左
legend.title = element_blank(), #不显示图例标题
legend.position="right") + #图例位置
#添加标签
geom_text_repel(data = filter(res_2, abs(log2FoldChange) > 5 & -log10(pvalue) > 5),
max.overlaps = getOption("ggrepel.max.overlaps", default = 20),
aes(label = symbol,
color = group),
size = 2)
#####九、差异表达热图#####
#needDEG见上
gene_up <- rownames(need_DEG[with(need_DEG,log2FoldChange>log2FC_cutoff & pvalue<pvalue_cutoff),])
gene_down <- rownames(need_DEG[with(need_DEG,log2FoldChange< -log2FC_cutoff & pvalue<pvalue_cutoff),])
df02 <- c(head(gene_up, 10), #取前10 padj上下调基因名
head(gene_down, 10))
#df02 <- c(gene_up,gene_down) #取所有
#用counts来画图
diff_expr <- mycounts_1[intersect(rownames(mycounts_1),df02),]
## 用标准化(标准化Counts值) 来画图
#vsd <- vst(dds, blind = FALSE)
#normalizeExp <- assay(vsd)
## 差异基因的标准化Count
#diff_expr <- normalizeExp[intersect(rownames(normalizeExp),df02),]
#head(diff_expr)
## 差异热图
library(pheatmap)
annotation_col <- data.frame(Group = factor(c(rep("MALE",78), rep("FEMALE",59))))#前面记住的
rownames(annotation_col) <- colnames(diff_expr)
p <- pheatmap(diff_expr,
annotation_col = annotation_col,
color = colorRampPalette(c("#8854d0", "#ffffff","#fa8231"))(50),
cluster_cols = F,
show_rownames = F,
show_colnames = F,
scale = "row", ## none, row, column
fontsize = 12,
fontsize_row = 12,
fontsize_col = 6,
border = FALSE)
p