【bulk RNA-seq】DESeq2差异分析 硬核

心血来潮写了个bulk RNA-seq的代码,该有的基本都有了。省去了路径和具体的基因。以后更新一下用公共数据跑出来的图吧。最近比较忙。 包非原创,代码是原创。转载或引用请注明出处或与我联系。

版本:R 4.1

################################
## 导入需要用到的R package以及自定义函数
################################
library(DESeq2) #差异分析的包
library(clusterProfiler) #基因名转换&GO富集分析的包
library(dplyr)
library(RColorBrewer) #调色包
library(gplots)
library(corrplot)
MyName2Color <- function(name,
                       palette = brewer.pal(9,"Set1"),
                       is.row = FALSE,
                       na.value = NULL){
  name.factor <- as.factor(name)
  if(!is.null(names(palette))){
    palette=palette[levels(name.factor)]
  }else{
    print("your palette order must adapt to you names level order")
  }
  name.color <- palette[name.factor]
  name.color <- as.matrix(name.color)
  if(is.row){
    name.color <- t(name.color)
  }
  if(!is.null(na.value)){
    name.color[is.na(name.color)]=na.value
  }
  return(name.color)
} #自定义函数
################################
## 导入原始数据 并标注样本信息
################################
setwd("/home/") #设置工作路径
read.count <- read.delim("read_count.tab") #读表达矩阵
rownames(read.count) <- read.count$ID
# sample.information <- read.csv() #读样品信息
read.count <- read.count[,-1] #去掉第一列
sample.information <- as.data.frame(colnames(read.count)) #变为dataframe格式
colnames(sample.information) <- "ID"
row.names(sample.information) <- sample.information$ID
sample.information$condition <- c("KO","KO","KO","WT","WT","WT") #设置样本信息

################################
## DEseq2
################################
dds <- DESeqDataSetFromMatrix(countData = read.count, colData = sample.information, design = ~condition) #构建DEseq矩阵
dds.temporary <- rowSums(counts(dds) >= 10) >= 3   #过滤低表达基因,至少在3个样品中都有5个reads 
dds <- dds[dds.temporary, ] 
dds <- DESeq(dds) #运行DEseq2

##### PCA 看看样品间的差异(比如有无明显批次效应) 
vsdata <- rlog(dds, blind=FALSE)  #归一化(样本数小于30用rlog函数)
plotPCA(vsdata, intgroup = "condition") #绘制样本的PCA图

result <- results(dds,alpha = 0.1) #导出结果
summary(result) #查看结果的概览
sum(result$padj<0.1, na.rm = TRUE) #检查pvalue<0.1的基因个数
res_data <- merge(as.data.frame(result), #DEseq2的检验结果
                  as.data.frame(counts(dds,normalize=TRUE)), #用DEseq2 normalize后的表达矩阵
                  by="row.names",sort=FALSE) # 合并DEseq2的结果与标准化之后的表达矩阵

Symbol <- bitr(res_data$Row.names,fromType = 'ENSEMBL',toType = 'SYMBOL',OrgDb = "org.Mm.eg.db") # ENSEMBL ID转基因名
res_data <- res_data[res_data$Row.names%in%Symbol$ENSEMBL,] #有部分ID转换失败,去除之
Symbol <- Symbol[!duplicated(Symbol$ENSEMBL),] #有部分ID转换失败,去除之
rownames(res_data) <- res_data$Row.names #改变行名称
rownames(Symbol) <- Symbol$ENSEMBL #改变行名称
res_data <- cbind(res_data, Symbol) #把基因名放到res_data里,列名为SYMBOL
res_data <- res_data[!duplicated(res_data$SYMBOL),] #去掉SYMBOL列里重复的行

up_DEG <- subset(res_data, padj < 0.1 & log2FoldChange > 0) #筛选上调基因
up_DEG$cluster <- "up"
down_DEG <- subset(res_data, padj < 0.1 & log2FoldChange < 0) #筛选下调基因
down_DEG$cluster <- "down"
gene.info <- read.csv("/home/xxx.csv")#基因信息的表格
rownames(gene.info) <- gene.info$EnsemblGeneID
down_DEG_merge <- cbind(down_DEG,gene.info[rownames(gene.info)%in%rownames(down_DEG),])
up_DEG_merge <- cbind(up_DEG,gene.info[rownames(gene.info)%in%rownames(up_DEG),])

####### 将全部基因的信息以及上下调基因的信息写入csv
write.csv(res_data, "all.csv") #写入csv表格,保存在工作目录下
write.csv(up_DEG_merge, "up.csv")
write.csv(down_DEG_merge, "down.csv")

################################
up_and_down_gene <- rbind(down_DEG, up_DEG) #合并两个表格
rownames(up_and_down_gene) <- up_and_down_gene$SYMBOL

##### 绘制heatmap
pdf("Normalized_count.pdf",20,25)
heatmap.2(x = as.matrix(log2(up_and_down_gene[,c("")]+1)),
          scale = "none", col=bluered(100), 
          trace = "none", density.info = "none",
          RowSideColors = MyName2Color(up_and_down_gene$cluster, palette = c("#d53e4f", "#3288bd")))
dev.off()

##### 样本间相关性分析
cor.sample <- cor(up_and_down_gene[,c("")])#相关性分析函数
pdf("correlation_afterNormalization.pdf",9,7) #做相关性分析图
corrplot(cor.sample, method="number", type="upper", order="hclust", tl.col="black", tl.srt=45)
dev.off()

##### GO富集分析
GO_database <- 'org.Mm.eg.db' #设置GO数据集 这个数据集用BiocManager::install("org.Mm.eg.db")可以安装
universe <- bitr(Symbol$SYMBOL,fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = GO_database) #选取背景基因 并转换为ENTREZ ID
gene.UPregulate <- bitr(up_DEG$SYMBOL,
             fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = GO_database)
gene.DOWNregulate <- bitr(down_DEG$SYMBOL,
                          fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = GO_database)

GO.UP <- enrichGO(gene.UPregulate$ENTREZID,#GO富集分析
               OrgDb = GO_database,
               keyType = "ENTREZID",#设定读取的gene ID类型
               ont = "BP",#Biological Process)
               pvalueCutoff = 0.05,#设定p值阈值
               qvalueCutoff = 0.05,#设定q值阈值
               readable = T)
GO.DOWN <- enrichGO(gene.DOWNregulate$ENTREZID,#GO富集分析
                  OrgDb = GO_database,
                  keyType = "ENTREZID",#设定读取的gene ID类型
                  ont = "BP",
                  pvalueCutoff = 0.05,#设定p值阈值
                  qvalueCutoff = 0.05,#设定q值阈值
                  readable = T)

pdf("GOplot.upregulate.pdf",12,5)
barplot(GO.UP)
dev.off()

pdf("GOplot.downregulate.pdf",12,5)
barplot(GO.DOWN)
dev.off()

##### xxx相关基因表达量的变化
#在kegg上找到xxx所在的xxx通路的下游基因xxx:即xxx,是受体基因  下游有xxx
rownames(res_data) <- res_data$SYMBOL
gene.count_normalized <- res_data[,c("")]
MyBoxplot_genecount <- function(read.count, gene){
  df <- as.data.frame(t(read.count[gene,]))
  df$sample2 <- c("KO","KO","KO","WT","WT","WT")
  colnames(df) <- c("Read_Count","Sample")
  # print(df)
  ggplot(df, aes(x = Sample, y = Read_Count)) + 
    geom_boxplot() + 
    labs(title = gene) + 
    theme_bw() + #设置背景为无色
    theme(plot.title = element_text(face = "bold.italic", size = 19)) + #设置表格字体为斜体,以及字号大小
    theme(panel.grid = element_blank()) #设置标尺线为无色
}#写个函数方便批量做图

pdf("xxx_associated_genes.pdf",4,4) #批量做图
MyBoxplot_genecount(gene.count_normalized, "xxx")
MyBoxplot_genecount(gene.count_normalized, "xxx")
MyBoxplot_genecount(gene.count_normalized, "xxx")
MyBoxplot_genecount(gene.count_normalized, "xxx")
dev.off()


##### 火山图
data_VolcanoMap <- res_data[,c("log2FoldChange", "padj", "SYMBOL")]
data_VolcanoMap$UpDown <- "invariable" #加新的一列,列名为UpDown,方便做图的时候分类
data_VolcanoMap[data_VolcanoMap$SYMBOL%in%up_DEG$SYMBOL,]$UpDown <- "UP_regulated"
data_VolcanoMap[data_VolcanoMap$SYMBOL%in%down_DEG$SYMBOL,]$UpDown <- "DOWN_regulated"
data_VolcanoMap$nega_logPadj <- -log10(data_VolcanoMap$padj) #对Pvalue adj取负log10

pdf("VolcanoMap.pdf",6.6,5)
ggplot(data = data_VolcanoMap) +
  geom_point(aes(x=log2FoldChange, y=nega_logPadj, color=UpDown)) +
  labs(title="Volcano Plot") +
  theme_bw() +
  theme(panel.grid = element_blank())
dev.off()





  • 5
    点赞
  • 24
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值