1.准备文件
数据文件及分组文件
2.EDGER包代码
检测的数据文件:是count值
gene_name | sample1 | sample2 | sample3 | sample4 | sample5 |
gene1 | 0 | 0 | 0 | 0 | 0 |
gene2 | 0 | 0 | 0 | 0 | 0 |
gene3 | 1106 | 1278 | 1486 | 933 | 1799 |
gene4 | 412 | 329 | 241 | 351 | 246 |
gene5 | 0 | 0 | 0 | 0 | 10 |
gene6 | 296 | 70 | 348 | 134 | 603 |
gene7 | 2 | 3 | 2 | 2 | 1 |
gene8 | 144 | 579 | 220 | 515 | 266 |
gene9 | 1 | 0 | 0 | 2 | 22 |
gene10 | 0 | 0 | 0 | 0 | 0 |
gene11 | 0 | 0 | 0 | 0 | 0 |
gene12 | 0 | 0 | 0 | 0 | 0 |
gene13 | 51 | 39 | 29 | 36 | 80 |
gene14 | 0 | 0 | 0 | 0 | 0 |
分组文件:如下表所示
group | |
samples1 | low |
samples2 | high |
samples3 | low |
samples4 | high |
samples5 | low |
##加载需要的R包
library(ggplot2)
library(edgeR)
library(DESeq2)
#输入数据文件
gene_expr <- read_excel("gene_expr.xlsx", sheet = 1)
gene_expr <- as.data.frame(gene_expr)
rownames(gene_expr) <- gene_expr$gene_name
gene_expr <- gene_expr[,-1]
#输入分组文件
group <- read_excel("Surv_risk_score_plot_data.xlsx", sheet = 1)
colnames(group)[1] <- "samples"
gene_expr_f <- gene_expr[,c(group$samples)]
group_f <- factor(group$group, levels = c("low", "high"))
#####edger差异分析
dgelist <- DGEList(counts = gene_expr_f, group = group_f)
keep <- rowSums(cpm(dgelist) > 1 ) >= 2
dgelist <- dgelist[keep, , keep.lib.sizes = FALSE]
dgelist_norm <- calcNormFactors(dgelist, method = 'TMM')
design <- model.matrix(~group_f)
dge <- estimateDisp(dgelist_norm, design, robust = TRUE)
##glmFit
fit <- glmFit(dge, design, robust = TRUE)
lrt <- topTags(glmLRT(fit), n = nrow(dgelist$counts))
write.table(lrt, 'control_treat.glmLRT.txt', sep = '\t', col.names = NA, quote = FALSE)
##glmQLFit
fit <- glmQLFit(dge, design, robust = TRUE)
lrt <- topTags(glmQLFTest(fit), n = nrow(dgelist$counts))
write.table(lrt, 'control_treat.glmQLFit.txt', sep = '\t', col.names = NA, quote = FALSE)
y <- calcNormFactors(dgelist)
normalized_counts_e <- t(t(y$counts) / y$samples$lib.size / y$samples$norm.factors) * 1000000
##base_mean
base_mean <- apply(normalized_counts_e,1,mean)
base_mean <- data.frame(base_mean)
##low_risk_mean
group_l <- group[which(group$group == "low"),]
normalized_counts_l <- normalized_counts_e[,group_l$samples]
low_risk_mean <- apply(normalized_counts_l,1,mean)
low_risk_mean <- data.frame(low_risk_mean)
##high_risk_mean
group_h <- group[which(group$group == "high"),]
normalized_counts_h <- normalized_counts_e[,group_h$samples]
high_risk_mean <- apply(normalized_counts_h,1,mean)
high_risk_mean <- data.frame(high_risk_mean)
##所有元素列整合
gene_diff_edger <- cbind(as.data.frame(lrt), base_mean, low_risk_mean, high_risk_mean)
gene_diff_edger <- gene_diff_edger[,c(6:8,1:5)]
##根据自己的需求筛选结果,这里是|logfc|>1 & p<0.05
diff_gene_Group <- subset(gene_diff_edger, FDR < 0.05 & abs(logFC) > 1)
##输出结果文件
write.csv(gene_diff_edger, file = "gene_diff_deseq2.csv")
write.csv(diff_gene_Group, file = "gene_diff_deseq2_filter.csv")
结果如下图所示:包含全部样本均值、各分组均值、logfc、logcpm、p值等。
base_mean | low_risk_mean | high_risk_mean | logFC | logCPM | PValue | FDR | |
gene1 | 108.9773 | 91.74475 | 126.6202 | 7.168478 | 1.467689 | 3.28E-13 | 7.73E-09 |
gene2 | 30.81182 | 28.97566 | 32.69171 | 4.753182 | 3.222326 | 1.06E-12 | 1.24E-08 |
gene3 | 25.79494 | 24.87987 | 26.73181 | 4.64686 | -1.48684 | 1.25E-10 | 9.84E-07 |
gene4 | 0.263818 | 0.434894 | 0.088669 | 4.565269 | -1.33146 | 9.75E-10 | 5.74E-06 |
gene5 | 30.71506 | 31.82798 | 29.57565 | 4.070106 | 6.886127 | 4.87E-09 | 2.10E-05 |
gene6 | 0.555596 | 0.825402 | 0.279365 | 4.922281 | -0.09559 | 5.34E-09 | 2.10E-05 |
gene7 | 6.363766 | 7.166553 | 5.541866 | 5.154932 | -0.54138 | 7.71E-09 | 2.10E-05 |
gene8 | 0.712746 | 0.711988 | 0.713522 | 3.39454 | -1.91466 | 7.77E-09 | 2.10E-05 |
gene9 | 2.958876 | 3.130801 | 2.782858 | -4.07627 | -1.53769 | 8.01E-09 | 2.10E-05 |
gene10 | 58.42198 | 58.40859 | 58.43569 | 3.36764 | -1.84871 | 1.38E-08 | 3.26E-05 |
gene11 | 113.1053 | 105.7111 | 120.6755 | -3.73657 | -1.85478 | 1.66E-08 | 3.55E-05 |
gene12 | 69.6251 | 73.90783 | 65.2404 | -4.54293 | 1.299065 | 2.37E-08 | 4.65E-05 |
gene13 | 0.5174 | 0.619828 | 0.412533 | 4.82702 | 0.457257 | 2.59E-08 | 4.70E-05 |
gene14 | 0.750647 | 0.684489 | 0.818379 | 4.660751 | -0.88345 | 3.15E-08 | 5.01E-05 |
gene15 | 41.46281 | 44.56856 | 38.28311 | 4.480072 | -0.59418 | 3.19E-08 | 5.01E-05 |
3.DESEQ2包代码
##deseq2差异分析
condition <- factor(group_f, levels = c("low", "high"))
coldata <- data.frame(row.names = colnames(gene_expr_f), condition)
dat <- gene_expr_f
dds <- DESeqDataSetFromMatrix(countData = dat, colData = coldata, design= ~condition)
dds1 <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = FALSE)
res <- results(dds1, contrast = c('condition', 'high', 'low'))
res
dds2 <- estimateSizeFactors(dds)
sizeFactors(dds2)
normalized_counts <- counts(dds2, normalized=TRUE)
##low_risk_mean
group_l <- group[which(group$group == "low"),]
normalized_counts_l <- normalized_counts[,group_l$samples]
low_risk_mean <- apply(normalized_counts_l,1,mean)
low_risk_mean <- data.frame(low_risk_mean)
##high_risk_mean
group_h <- group[which(group$group == "high"),]
normalized_counts_h <- normalized_counts[,group_h$samples]
high_risk_mean <- apply(normalized_counts_h,1,mean)
high_risk_mean <- data.frame(high_risk_mean)
gene_diff_deseq2 <- as.data.frame(res)
##所有元素列整合
gene_diff_deseq2 <- cbind(gene_diff_deseq2, low_risk_mean, high_risk_mean)
gene_diff_deseq2 <- gene_diff_deseq2[,c(1,7:8,2:6)]
##根据自己的需求筛选结果,这里是|logfc|>1 & p<0.05
diff_gene_Group2 <- subset(gene_diff_deseq2, padj < 0.05 & abs(log2FoldChange) > 1)
write.csv(gene_diff_deseq2, file = "gene_diff_deseq2.csv")
write.csv(diff_gene_Group2, file = "gene_diff_deseq2_filter.csv")
结果和EDGER结果类似,就不再赘述了。得到差异基因后,可以做一些可视化,火山图之类的,也可以再往下做一下富集分析等等。