1.TCGA-GBM数据下载
{
library(TCGAbiolinks)
library(SummarizedExperiment)
query <- GDCquery(project = 'TCGA-GBM',
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts")
GDCdownload(query, method = "api", files.per.chunk = 100)
expdat <- GDCprepare(query = query)
expr = assay(expdat)
expr = as.data.frame(expr)
save(expr,file = 'expr.Rdata')
}
2.提取mRNA数据
{
library(rtracklayer)
library(dplyr)
gtf <- import('Homo_sapiens.GRCh38.97.chr.gtf')
gtf_df <- as.data.frame(gtf)
gene_df <- select(gtf_df,
c(gene_id,gene_name,gene_biotype))
index <- duplicated(gene_df$gene_id)
gene_df = gene_df[!index,]
dim(gene_df)
mRNA_df = gene_df[gene_df$gene_biotype == 'protein_coding',]
dim(mRNA_df)
save(mRNA_df,file = 'mRNA_df.Rdata')
load("D:/个人空间/文件保存/R-project/GBM/expr.Rdata")
exprSet = expr[match(mRNA_df$gene_id,rownames(expr)),]
dim(exprSet)
exprSet = na.omit(exprSet)
dim(exprSet)
save(exprSet, file = 'exprSet.Rdata')
}
3.ID转换,ensemblID转换为symbolID
{
library(limma)
load("D:/个人空间/文件保存/R-project/GBM/exprSet.Rdata")
exprSet$names = rownames(exprSet)
exprSet$names = mRNA_df[match(exprSet$names,mRNA_df$gene_id),2]
dim(exprSet)
table(duplicated(exprSet$names)) # 有3个重复的基因名
# 对重复基因名取平均表达量,然后将基因名作为行名
exprSet = avereps(exprSet[,-ncol(exprSet)],ID = exprSet$names)
dim(exprSet)
save(exprSet, file = 'exprSet_names_by_symbol.Rdata')
}
4.数据整理
{
# 4.1 去除低表达量的基因
load("D:/个人空间/文件保存/R-project/GBM/exprSet_names_by_symbol.Rdata")
pick_row <- apply(exprSet, 1, function(x){
sum(x == 0) < 40
})
exprSet1 <- exprSet[pick_row,]
# 4.2 分组(癌症组织和癌旁组织)
library(stringr)
tumor <- colnames(exprSet1)[as.integer(substr(colnames(exprSet1),14,15)) < 10]
normal <- colnames(exprSet1)[as.integer(substr(colnames(exprSet1),14,15)) >= 10]
tumor_sample <- exprSet1[,tumor]
normal_sample <- exprSet1[,normal]
exprSet_by_group <- cbind(tumor_sample,normal_sample)
group_list <- c(rep('tumor',ncol(tumor_sample)),rep('normal',ncol(normal_sample)))
save(exprSet_by_group, group_list, file = 'exprSet_by_group_list.Rdata')
}
5.PCA
{
library(FactoMineR)
library(factoextra)
load("D:/个人空间/文件保存/R-project/GBM/exprSet_by_group_list.Rdata")
data = as.data.frame(t(exprSet_by_group))
data <- cbind(data,group = as.factor(group_list))
pca <- PCA(data[,-ncol(data)], graph = FALSE)
eig.val <- get_eigenvalue(pca)# 一列:特征值,二列:特征值的方差贡献度,三列:累计方差贡献度
fviz_eig(pca, addlabels = TRUE, ylim = c(0, 100))
fviz_pca_ind(pca,
geom.ind = "point",
col.ind = data$group,
palette = "jco",
addEllipses = TRUE,
legend.title = "Groups",
title = 'PCA')
}
6.差异表达
- 差异表达
{
library(limma)
library(edgeR)
DGElist <- DGEList( counts = exprSet_by_group, group = factor(group_list))
# 挑选感兴趣基因
keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2
table(keep_gene)
DGElist <- DGElist[ keep_gene,keep.lib.sizes = FALSE]
DGElist <- calcNormFactors(DGElist) # 计算归一化因子以对齐计数矩阵的列
DGElist$samples
design <- model.matrix( ~0 + factor(group_list))
rownames(design) <- colnames(DGElist)
colnames(design) <- levels(factor(group_list))
# 转换RNA-Seq数据为线性建模做好准备
v <- voom(DGElist, design, plot = TRUE, normalize = "quantile")
# 给出一系列阵列,为每个基因拟合线性模型
fit <- lmFit(v, design)
# 构造对应于一组参数的指定对比的对比矩阵。
cont.matrix <- makeContrasts(contrasts = c('tumor-normal'), levels = design)
# 给定适合微阵列数据的线性模型,计算给定对比组的估计系数和标准误差
fit2 <- contrasts.fit(fit, cont.matrix)
# 差分表达的经验贝叶斯统计
fit2 <- eBayes(fit2)
nrDEG_limma_voom = topTable(fit2, coef = 'tumor-normal', n = Inf)
nrDEG_limma_voom = na.omit(nrDEG_limma_voom)
head(nrDEG_limma_voom)
save(nrDEG_limma_voom,file = 'nrDEG.Rdata')
}
- 火山图
library(ggplot2)
library(ggrepel)
nrDEG <- nrDEG_limma_voom
nrDEG$change <- ifelse(nrDEG$adj.P.Val < 0.01 & abs(nrDEG$logFC) > 2.5,
ifelse(nrDEG$logFC > 2.5,'UP','DOWN'),
'NOT')
table(nrDEG$change)
save(nrDEG,file = 'nrDEG_by_group.Rdata')
# 重点关注基因
nrDEG$sign <- ifelse(nrDEG$adj.P.Val < 0.001 & abs(nrDEG$logFC) > 6.5,
rownames(nrDEG),
NA)
table(nrDEG$sign)
ggplot(data= nrDEG, aes(x = logFC, y = -log10(adj.P.Val), color = change)) +
geom_point(alpha=0.8, size = 1) +
theme_bw(base_size = 15) +
theme(plot.title=element_text(hjust=0.5), # 标题居中
panel.grid.minor = element_blank(),
panel.grid.major = element_blank()) + # 网格线设置为空白
geom_hline(yintercept=2 ,linetype=4) +
geom_vline(xintercept=c(-2.5,2.5) ,linetype=4 ) +
scale_color_manual(name = "",
values = c("red", "green", "black"),
limits = c("UP", "DOWN", "NOT")) +
geom_label_repel(aes(label=sign), # 防止标签过多重叠
fontface="bold",
color="grey50",
box.padding=unit(0.35, "lines"), # 文本框周边填充
point.padding=unit(0.5, "lines"), # 点周边填充
segment.colour = "grey50", # 连接点与标签的线段的颜色
force = T) +
labs(title = 'GBM DEG volcano')
- 热图
library( "pheatmap" )
nrDEG_Z = nrDEG[ order( nrDEG$logFC ), ]
nrDEG_F = nrDEG[ order( -nrDEG$logFC ), ]
choose_gene = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:50] )
choose_matrix = exprSet_by_group[ choose_gene, ]
choose_matrix = t( scale( t( choose_matrix ) ) )
choose_matrix[choose_matrix > 2] = 2
choose_matrix[choose_matrix < -2] = -2
annotation_col = data.frame( group = factor( group_list ) )
rownames( annotation_col ) = colnames( exprSet_by_group )
pheatmap( fontsize_row = 4,
choose_matrix,
annotation_col = annotation_col,
show_rownames = T,
show_colnames = F,
annotation_legend = T,
cluster_cols = T,
filename = 'heatmap.png')
7.富集分析
- kegg_enrichment_analysis
library( "clusterProfiler" )
library( "org.Hs.eg.db" )
load("nrDEG_by_group.Rdata")
nrDEG$SYMBOL <- rownames(nrDEG)
df <- bitr( rownames( nrDEG ), fromType = "SYMBOL", toType = c( "ENTREZID" ),
OrgDb = org.Hs.eg.db )
head( df )
nrDEG = merge( nrDEG, df, by = 'SYMBOL' )
head( nrDEG )
gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ]
gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ]
gene_diff = c( gene_up, gene_down )
gene_all = as.character(nrDEG[ ,'ENTREZID'] )
g_list = list( gene_up = gene_up, gene_down = gene_down, gene_diff = gene_diff)
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01)
kk.dowm <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01)
kegg_down_dt <- as.data.frame(kk.dowm)
kegg_up_dt <- as.data.frame( kk.up )
down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue < 0.05, ]
down_kegg$group <- 'down_pathway'
up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue < 0.05, ]
up_kegg$group <- 'up_pathway'
dat = rbind(up_kegg,down_kegg)
dat$pvalue = -log10(dat$pvalue)
dat$group = factor(dat$group)
library(ggpubr)
ggbarplot(dat,x = 'Description',y = 'pvalue',
fill = 'group',
color = 'white',
palette = 'jco',
sort.val = 'asc',
xlab = 'Pathway names',
ylab = '-log10 P-value',
title = 'Pathway enrichment') +
rotate() +
theme_minimal()
- GO_enrichment_analysis
BP <- enrichGO( gene = gene_diff,
universe = gene_all,
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = 'BP',
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
barplot(BP,showCategory=20)
dotplot(BP,showCategory=20)
CC <- enrichGO( gene = gene_diff,
universe = gene_all,
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = 'CC',
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
barplot(CC,showCategory=20)
dotplot(CC,showCategory=20)
MF <- enrichGO( gene = gene_diff,
universe = gene_all,
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = 'MF',
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
barplot(MF,showCategory=20)
dotplot(MF,showCategory=20) +
scale_x_continuous(limits = c(0,0.08), breaks = c(0.00,0.04,0.08))
- DO_enrichment_analysis
library(DOSE)
enrich.do <- enrichDO(gene = gene_diff,
universe = gene_all,
ont = 'DO',
pvalueCutoff = 0.05,
pAdjustMethod = 'BH',
minGSSize = 5,
maxGSSize = 500,
qvalueCutoff = 0.05,
readable = F)
barplot(enrich.do)
dotplot(enrich.do)
本博客内容将同步更新到个人微信公众号:生信玩家。欢迎大家关注~~~