好记性不如烂笔头
年龄越大越明显
还是要留下一些痕迹
1. 装载R包以及配置环境
options("repos"=c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor")
if(!requireNamespace("tidyr",quietly = TRUE)) install.packages("tidyr",update = F,ask = F)
if(!requireNamespace("dplyr",quietly = TRUE)) install.packages("dplyr",update = F,ask = F)
if(!requireNamespace("ggplot2",quietly = TRUE)) install.packages("ggplot2",update = F,ask = F)
if(!requireNamespace("data.table",quietly = TRUE)) install.packages("data.table",update = F,ask = F)
if(!requireNamespace("ggrepel",quietly = TRUE)) install.packages("ggrepel",update = F,ask = F)
if(!requireNamespace("pheatmap",quietly = TRUE)) install.packages("pheatmap",update = F,ask = F)
if(!requireNamespace("ggfortify",quietly = TRUE)) install.packages("ggfortify",update = F,ask = F)
if(!requireNamespace("ggpubr",quietly = TRUE)) install.packages("ggpubr",update = F,ask = F)
if(!requireNamespace("ggsignif",quietly = TRUE)) install.packages("ggsignif",update = F,ask = F)
if(!requireNamespace("tibble",quietly = TRUE)) install.packages("tibble",update = F,ask = F)
if(!requireNamespace("cowplot",quietly = TRUE)) install.packages("cowplot",update = F,ask = F)
if(!requireNamespace("future.apply",quietly = TRUE)) install.packages("future.apply",update = F,ask = F)
if(!requireNamespace("BiocManager",quietly = TRUE)) install.packages("BiocManager",update = F,ask = F)
if(!requireNamespace("GSEABase",quietly = TRUE)) BiocManager::install("GSEABase",update = F,ask = F)
if(!requireNamespace("GSVA",quietly = TRUE)) BiocManager::install("GSVA",update = F,ask = F)
2. 寻找数据
可以从TCGA原网下载,
可以从整理好的数据集直接下载UCSC Xena,### http://xena.ucsc.edu/
直接链接###UCSC Xena (xenabrowser.net)
需要下载基因表达数据以及临床信息数据
其他数据下载类似
3. 处理数据
### 处理TCGA数据#设置工作目录
setwd("/your/desired/directory/path")
rm(list = ls()) #清空既往数据
dd <- data.table::fread("data/tcga_RSEM_gene_tpm",data.table = F)
test <- dd[1:10,1:10] #测试数据用于查看数据内容
### 清洁数据
### 1.基因注释 2.行列转置 3.增加分组
### 临床信息
clin <- data.table::fread("Survival_SupplementalTable_S1_20171025_xena_sp",data.table = F)
colnames(clin)[2:3] <- c("TCGA_id","type")
### 提取交集
index <- clin$sample %in% colnames(dd)
clin <- clin[index,1:3]
dd <- dd[,c("sample",clin$sample)]
colnames(dd)[1] <- "gene_id"
# 探针转换
# 下载后先解压gencode.v23.annotation.gtf.gz
# 读入需要时间,耐心等待
# gtf1 <- rtracklayer::import('gencode.v23.annotation.gtf')
# gtf_df <- as.data.frame(gtf1)
# save(gtf_df,file = "resource/gtf_df.Rdata")
load(file = "resource/gtf_df.Rdata")
### 提取编码mRNA,基因注释,行列转置
library(dplyr)
library(tibble)
#1mRNA_exprSet <- gtf_df %>%
#筛选gene,和编码指标
filter(type=="gene",gene_type=="protein_coding") %>%
#选取两列
dplyr::select(c(gene_name,gene_id)) %>%
#和表达量数据合并
inner_join(dd,by ="gene_id") %>%
#去除一列
dplyr::select(-gene_id) %>%
#去重
distinct(gene_name,.keep_all = T) %>%
#列名变成行名
column_to_rownames("gene_name") %>%
#转置
t() %>%
#转为数据框
as.data.frame() %>%
#行名变成列名
rownames_to_column("sample")
#2稍做修改
mRNA_exprSet <- gtf_df %>%
#筛选gene,和编码指标
#filter(type=="gene",gene_type=="protein_coding") %>%
#选取两列
dplyr::select(c(gene,gene_id)) %>%
#和表达量数据合并
inner_join(dd,by ="gene_id") %>%
#去除一列
dplyr::select(-gene_id) %>%
#去重
distinct(gene,.keep_all = T) %>%
#列名变成行名
column_to_rownames("gene") %>%
#转置
t() %>%
#转为数据框
as.data.frame() %>%
#行名变成列名
rownames_to_column("sample")
save(mRNA_exprSet,file = "mRNA_exprSet_Pan_cancer.Rdata")
load(file = "mRNA_exprSet_Pan_cancer.Rdata")
test <- mRNA_exprSet[1:10,1:10]
### 添加分组信息,查看分组
### https://mp.weixin.qq.com/s/Ph1O6V5RkxkyrKpVmB5ODA
table(substring(mRNA_exprSet$sample,14,15))
mRNA_exprSet <- cbind(clin,subtype=substring(mRNA_exprSet$sample,14,15),mRNA_exprSet[,-1])
test <- mRNA_exprSet[1:10,1:10]
### 去除癌旁组织和重复组织
tcga_mRNA_exprSet <- mRNA_exprSet %>%
filter(subtype != "11") %>%
distinct(sample,.keep_all = T) %>%
as.data.frame()
save(tcga_mRNA_exprSet,file = "output/tcga_mRNA_exprSet.Rdata")
##########################################################################################
### 批量相关性分析
rm(list=ls())
load(file = "output/tcga_mRNA_exprSet.Rdata")
data <- tcga_mRNA_exprSet
test <- data[1:10,1:10]
### 测试数据和相关性分析的方法
gene1 <- as.numeric(data[,"METTL3"])
gene2 <- as.numeric(data[,"SETD2"])
dd <- cor.test(gene1,gene2,method="spearman")
dd$estimate
dd$p.value
### 写一个函数批量计算每个癌症中的相关性系数
### 输入两个基因名称和清洁格式的数据框
splitdata <- split(data,data$type)
getpancordata <- function(gene1,gene2,splitdata,method="spearman"){
do.call(rbind,lapply(splitdata, function(x){
dd <- cor.test(as.numeric(x[,gene1]),as.numeric(x[,gene2]),method=method)
data.frame(type=x$type[1],cor=dd$estimate,p.value=dd$p.value,method = method)
}))
}
plotdf <- getpancordata("WTAP","SETD2",splitdata)
得到相关性开始画图
### 画图
library(ggplot2)
ggplot(plotdf,aes(-log10(p.value),cor))+
geom_point()
library(ggplot2)
options(scipen = 2)
ggplot(data = plotdf,aes(-log10(p.value),cor))+
## 画点
geom_point(size=6,fill="purple",shape = 21,colour="black",stroke = 2)+
## 改变y轴
scale_y_continuous(expand = c(0,0),limits = c(-1.1,1.1),breaks = seq(-1,1,0.2))+
## 改变x轴
scale_x_log10(limits = c(0.01, 1000),breaks = c(0.01,0.1,10,1000))+
## 加横线
geom_hline(yintercept = 0,size=1.5)+
## 加竖线
geom_vline(xintercept = -log10(0.05),size=1.5)+
## 坐标轴
labs(x=bquote(-log[10]~italic("P")),y=paste0(plotdf$method[1]," correlation (r)"))+
theme(axis.title=element_text(size=20),
axis.text = element_text(face = "bold",size = 16),
axis.ticks.length=unit(.4, "cm"),
axis.ticks = element_line(colour = "black", size = 1),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1.5),
plot.margin = margin(1, 1, 1, 1, "cm"))
## 加载R包
## https://github.com/tomwenseleers/export
library(export)
## 导成PPT可编辑的格式
graph2ppt(file="output/dotplot.pptx")
## 导成AI可以编辑的状态
graph2eps(file="output/dotplot2.eps")
###################################################################################
####看单个癌症中的相关性
library(dplyr)
library(ggplot2)
library(ggpubr)
## 选取单个癌症的数据,比如BRCA
dd <- data %>%
filter(type=="BRCA")
ggplot(dd,aes(METTL3,SETD2))+
# 打点
geom_point(col="#984ea3")+
# 加线
geom_smooth(method=lm, se=T,na.rm=T, fullrange=T,size=2,col="#fdc086")+
# 加边线
geom_rug(col="#7fc97f")+
# 相关性系数
stat_cor(method = "pearson",digits = 3,size=5)+
# 修饰
theme_bw()+
theme(plot.title = element_text(hjust = 0.5),
plot.margin = margin(1, 1, 1, 1, "cm"))
###############################################################################
## 可以写成函数,方便操作
sincorplot <- function(a,b,type="ALL",method = "spearman",title=T,source = "TCGA"){
if(type=="ALL"){
plot_df <- data[,c(a,b)]
type = "Data"
}else{
plot_df <- data[data$type %in% type,c(a,b)]
}
if(title){title = paste0(type," from ",source)}else{title=NULL}
names(plot_df) <- c("geneA","geneB")
ggplot(plot_df,aes(geneA,geneB))+
geom_point(col="#984ea3")+
geom_smooth(method=lm, se=T,na.rm=T, fullrange=T,size=2,col="#fdc086")+
geom_rug(col="#7fc97f")+
stat_cor(method = method,digits = 3,size=5)+
xlab(a)+
ylab(b)+
labs(x=a,y=b,title = title)+
theme_bw()+
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
}
## 前提是,数据框已经存在,而且名称是data
## 测试一下
sincorplot("METTL3","SETD2")
sincorplot("METTL3","SETD2","BRCA")
sincorplot("FOXA1","ESR1","BRCA")
sincorplot("SAMD11","ISG15","OV",title=F)
## 加载R包
library(export)
## 导成PPT可编辑的格式
graph2ppt(file="corplot.pptx")
##################################
rm(list = ls())
load(file = "output/tcga_mRNA_exprSet.Rdata")
source("resource/GTBA_functions.R")
data <- tcga_mRNA_exprSet
splitdata <- split(data,data$type)
gene1 ="METTL3"
gene2 = "FOXA1"
plotdf <- getpancordata(gene1,gene2,data=splitdata,method = "spearman")
pancorplot(plotdf)
pancorplot(plotdf,label = "all")
pancorplot(plotdf,label = "none")
pancorplot(plotdf,label = "sig")
pancorplot(plotdf,label = "BRCA")
pancorplot(plotdf,label = c("BRCA","THYM"))
pancorplot(plotdf,label = c("BRCA","THYM"),legend.position="none")
## shape 21-24
## shape =21,圆形
pancorplot(plotdf,label = "all",shape =21)
## shape =22,正方形
pancorplot(plotdf,label = "all",shape =22)
## shape =24,三角形
pancorplot(plotdf,label = "all",shape =24)
### 单样本
sincorplot("METTL3","SETD2",source = "TCGA")
sincorplot("METTL3","SETD2","BRCA",source = "TCGA")
sincorplot("FOXA1","ESR1","BRCA",source = "TCGA")
sincorplot("SAMD11","ISG15","OV",source = "TCGA")
GTEx数据
#### GTEX的数据
rm(list = ls())
dd <- data.table::fread("data/gtex_RSEM_gene_tpm",data.table = F)
test <- dd[1:100,1:100]
### 清洁数据三部曲
### 临床信息
clin <- data.table::fread("data/GTEX_phenotype",data.table = F)
colnames(clin)[c(1,3)] <- c("sample","type")
index <- clin$sample %in% colnames(dd)
clin <- clin[index,c(1,3)]
dd <- dd[,c("sample",clin$sample)]
colnames(dd)[1] <- "gene_id"
# 探针转换
# 下载后先解压gencode.v23.annotation.gtf.gz
# 读入需要时间,耐心等待
# gtf1 <- rtracklayer::import('gencode.v23.annotation.gtf')
# gtf_df <- as.data.frame(gtf1)
# save(gtf_df,file = "resource/gtf_df.Rdata")
load(file = "resource/gtf_df.Rdata")
library(dplyr)
library(tibble)
## 提取mRNA
mRNA_exprSet <- gtf_df %>%
filter(type=="gene",gene_type=="protein_coding") %>%
dplyr::select(c(gene_name,gene_id)) %>%
inner_join(dd,by ="gene_id") %>%
dplyr::select(-gene_id) %>%
distinct(gene_name,.keep_all = T) %>%
column_to_rownames("gene_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("sample")
test <- mRNA_exprSet[1:10,1:10]
mRNA_exprSet <- cbind(clin,mRNA_exprSet[,-1])
test <- mRNA_exprSet[1:10,1:10]
### 去除无信息组织
table(mRNA_exprSet$type)
gtex_mRNA_exprSet <- mRNA_exprSet %>%
filter(type != "<not provided>") %>%
as.data.frame()
test <- gtex_mRNA_exprSet[1:10,1:10]
save(gtex_mRNA_exprSet,file = "output/gtex_mRNA_exprSet.Rdata")
##########################################################################################
rm(list = ls())
load(file = "output/gtex_mRNA_exprSet.Rdata")
source("resource/GTBA_functions.R")
data <- gtex_mRNA_exprSet
splitdata <- split(data,data$type)
gene1 ="METTL3"
gene2 = "SETD2"
plotdf <- getpancordata(gene1,gene2,data=splitdata,method = "spearman")
pancorplot(plotdf)
pancorplot(plotdf,label = "all")
pancorplot(plotdf,label = "none")
pancorplot(plotdf,label = "sig")
pancorplot(plotdf,label = "Brain")
## shape 21-24
pancorplot(plotdf,label = "all",shape =21)
pancorplot(plotdf,label = "all",shape =22)
pancorplot(plotdf,label = "all",shape =24)
### 单样本
sincorplot("METTL3","SETD2",data = gtex_mRNA_exprSet,source = "GTEX")
table(data$type)
sincorplot("METTL3","SETD2","Brain",data = gtex_mRNA_exprSet,source = "GTEX")
sincorplot("METTL3","SETD2","Small Intestine",data = gtex_mRNA_exprSet,source = "GTEX")
CCLE
### CCLE数据处理
### https://depmap.org/portal/download/
rm(list = ls())
### 读入表达量数据
dd <- data.table::fread("data/CCLE_RNAseq_rsem_genes_tpm_20180929.txt",data.table = F)
test <- dd[1:100,1:100]
#### 批量log2,为了跟TCGA和GTEx统一起来
datalog <- as.data.frame(apply(dd[,-c(1,2)]+0.001,2,log2))
dd <- cbind(gene_id=dd$gene_id,datalog)
test <- dd[1:100,1:100]
### 读入注释文件
clin <- data.table::fread("data/Cell_lines_annotations_20181226.txt",data.table = F)
colnames(clin)[c(1,5)] <- c("sample","type")
index <- clin$sample %in% colnames(dd)
clin <- clin[index,c(1,5)]
### 提取表达量数据
dd <- dd[,c("gene_id",clin$sample)]
test <- dd[1:100,1:100]
### 清洁数据三部曲
### 探针转换
### 下载后先解压gencode.v19.genes.v7_model.patched_contigs.gtf
## 读入需要时间,耐心等待
gtf1 <- rtracklayer::import('resource/gencode.v19.genes.v7_model.patched_contigs.gtf')
gtf_df <- as.data.frame(gtf1)
library(dplyr)
library(tibble)
## 提取mRNA
mRNA_exprSet <- gtf_df %>%
#筛选gene,和编码指标
filter(type=="gene",gene_type=="protein_coding") %>%
dplyr::select(c(gene_name,gene_id)) %>%
inner_join(dd,by ="gene_id") %>%
dplyr::select(-gene_id) %>%
distinct(gene_name,.keep_all = T) %>%
column_to_rownames("gene_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("sample")
test <- mRNA_exprSet[1:10,1:10]
mRNA_exprSet <- cbind(clin,mRNA_exprSet[,-1])
test <- mRNA_exprSet[1:10,1:10]
table(mRNA_exprSet$type)
sort(table(mRNA_exprSet$type))
### 其中有一些样本条目太少,不能单独做相关性分析。
library(dplyr)
ccle_mRNA_exprSet <- mRNA_exprSet %>%
group_by(type) %>%
mutate(number = length(type)) %>%
dplyr::select(number,everything()) %>%
ungroup() %>%
filter(number>5) %>%
dplyr::select(-number) %>%
as.data.frame()
class(ccle_mRNA_exprSet)
save(ccle_mRNA_exprSet,file = "output/ccle_mRNA_exprSet.Rdata")
##########################################################################################
rm(list = ls())
load(file = "output/ccle_mRNA_exprSet.Rdata")
source("resource/GTBA_functions.R")
data <- ccle_mRNA_exprSet
splitdata <- split(data,data$type)
gene1 ="METTL3"
gene2 = "SETD2"
plotdf <- getpancordata(gene1,gene2,data=splitdata,method = "spearman")
pancorplot(plotdf)
pancorplot(plotdf,label = "all")
pancorplot(plotdf,label = "none")
pancorplot(plotdf,label = "sig")
pancorplot(plotdf,label = "kidney")
## shape 21-24
pancorplot(plotdf,label = "all",shape =21)
pancorplot(plotdf,label = "all",shape =22)
pancorplot(plotdf,label = "all",shape =24)
### 单样本
sincorplot("METTL3","SETD2",data = ccle_mRNA_exprSet,source = "CCLE")
table(data$type)
sincorplot("METTL3","SETD2","breast",data = ccle_mRNA_exprSet,source = "CCLE")
sincorplot("METTL3","SETD2","lung",data = ccle_mRNA_exprSet,source = "CCLE")
(都是果子老师的财富)