差异分析(R包——DESEQ2&EDGER)

1.准备文件

数据文件及分组文件

2.EDGER包代码

检测的数据文件:是count值

gene_namesample1sample2sample3sample4sample5
gene100000
gene200000
gene31106127814869331799
gene4412329241351246
gene5000010
gene629670348134603
gene723221
gene8144579220515266
gene9100222
gene1000000
gene1100000
gene1200000
gene135139293680
gene1400000

分组文件:如下表所示

group
samples1low
samples2high
samples3low
samples4high
samples5low

##加载需要的R包
library(ggplot2)
library(edgeR)
library(DESeq2)

#输入数据文件
gene_expr <- read_excel("gene_expr.xlsx", sheet = 1)
gene_expr <- as.data.frame(gene_expr)
rownames(gene_expr) <- gene_expr$gene_name
gene_expr <- gene_expr[,-1]

#输入分组文件
group <- read_excel("Surv_risk_score_plot_data.xlsx", sheet = 1)
colnames(group)[1] <- "samples"
gene_expr_f <- gene_expr[,c(group$samples)]
group_f <- factor(group$group, levels = c("low", "high"))

#####edger差异分析
dgelist <- DGEList(counts = gene_expr_f, group = group_f)

keep <- rowSums(cpm(dgelist) > 1 ) >= 2
dgelist <- dgelist[keep, , keep.lib.sizes = FALSE]
dgelist_norm <- calcNormFactors(dgelist, method = 'TMM')

design <- model.matrix(~group_f)
dge <- estimateDisp(dgelist_norm, design, robust = TRUE)

##glmFit
fit <- glmFit(dge, design, robust = TRUE)
lrt <- topTags(glmLRT(fit), n = nrow(dgelist$counts))

write.table(lrt, 'control_treat.glmLRT.txt', sep = '\t', col.names = NA, quote = FALSE)

##glmQLFit
fit <- glmQLFit(dge, design, robust = TRUE)
lrt <- topTags(glmQLFTest(fit), n = nrow(dgelist$counts))

write.table(lrt, 'control_treat.glmQLFit.txt', sep = '\t', col.names = NA, quote = FALSE)

y <- calcNormFactors(dgelist)
normalized_counts_e <- t(t(y$counts) / y$samples$lib.size / y$samples$norm.factors) * 1000000

##base_mean
base_mean <- apply(normalized_counts_e,1,mean)
base_mean <- data.frame(base_mean)

##low_risk_mean
group_l <- group[which(group$group == "low"),]
normalized_counts_l <- normalized_counts_e[,group_l$samples]
low_risk_mean <- apply(normalized_counts_l,1,mean)
low_risk_mean <- data.frame(low_risk_mean)

##high_risk_mean
group_h <- group[which(group$group == "high"),]
normalized_counts_h <- normalized_counts_e[,group_h$samples]
high_risk_mean <- apply(normalized_counts_h,1,mean)
high_risk_mean <- data.frame(high_risk_mean)

##所有元素列整合
gene_diff_edger <- cbind(as.data.frame(lrt), base_mean, low_risk_mean, high_risk_mean)
gene_diff_edger <- gene_diff_edger[,c(6:8,1:5)]

##根据自己的需求筛选结果,这里是|logfc|>1 & p<0.05
diff_gene_Group <- subset(gene_diff_edger, FDR < 0.05 & abs(logFC) > 1)

##输出结果文件
write.csv(gene_diff_edger, file = "gene_diff_deseq2.csv")
write.csv(diff_gene_Group, file = "gene_diff_deseq2_filter.csv")


结果如下图所示:包含全部样本均值、各分组均值、logfc、logcpm、p值等。

base_meanlow_risk_meanhigh_risk_meanlogFClogCPMPValueFDR
gene1108.977391.74475126.62027.1684781.4676893.28E-137.73E-09
gene230.8118228.9756632.691714.7531823.2223261.06E-121.24E-08
gene325.7949424.8798726.731814.64686-1.486841.25E-109.84E-07
gene40.2638180.4348940.0886694.565269-1.331469.75E-105.74E-06
gene530.7150631.8279829.575654.0701066.8861274.87E-092.10E-05
gene60.5555960.8254020.2793654.922281-0.095595.34E-092.10E-05
gene76.3637667.1665535.5418665.154932-0.541387.71E-092.10E-05
gene80.7127460.7119880.7135223.39454-1.914667.77E-092.10E-05
gene92.9588763.1308012.782858-4.07627-1.537698.01E-092.10E-05
gene1058.4219858.4085958.435693.36764-1.848711.38E-083.26E-05
gene11113.1053105.7111120.6755-3.73657-1.854781.66E-083.55E-05
gene1269.625173.9078365.2404-4.542931.2990652.37E-084.65E-05
gene130.51740.6198280.4125334.827020.4572572.59E-084.70E-05
gene140.7506470.6844890.8183794.660751-0.883453.15E-085.01E-05
gene1541.4628144.5685638.283114.480072-0.594183.19E-085.01E-05

3.DESEQ2包代码

##deseq2差异分析
condition <- factor(group_f, levels = c("low", "high"))
coldata <- data.frame(row.names = colnames(gene_expr_f), condition)
dat <- gene_expr_f

dds <- DESeqDataSetFromMatrix(countData = dat, colData = coldata, design= ~condition)
dds1 <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = FALSE)
res <- results(dds1, contrast = c('condition', 'high', 'low'))
res


dds2 <- estimateSizeFactors(dds)
sizeFactors(dds2)
normalized_counts <- counts(dds2, normalized=TRUE)

##low_risk_mean
group_l <- group[which(group$group == "low"),]
normalized_counts_l <- normalized_counts[,group_l$samples]
low_risk_mean <- apply(normalized_counts_l,1,mean)
low_risk_mean <- data.frame(low_risk_mean)

##high_risk_mean
group_h <- group[which(group$group == "high"),]
normalized_counts_h <- normalized_counts[,group_h$samples]
high_risk_mean <- apply(normalized_counts_h,1,mean)
high_risk_mean <- data.frame(high_risk_mean)

gene_diff_deseq2 <- as.data.frame(res)

##所有元素列整合
gene_diff_deseq2 <- cbind(gene_diff_deseq2, low_risk_mean, high_risk_mean)
gene_diff_deseq2 <- gene_diff_deseq2[,c(1,7:8,2:6)]

##根据自己的需求筛选结果,这里是|logfc|>1 & p<0.05
diff_gene_Group2 <- subset(gene_diff_deseq2, padj < 0.05 & abs(log2FoldChange) > 1)

write.csv(gene_diff_deseq2, file = "gene_diff_deseq2.csv")
write.csv(diff_gene_Group2, file = "gene_diff_deseq2_filter.csv")

结果和EDGER结果类似,就不再赘述了。得到差异基因后,可以做一些可视化,火山图之类的,也可以再往下做一下富集分析等等。

  • 7
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
差异基因分析是一种常用的生物信息学分析方法,用于找出在不同条件下表达量差异显著的基因。在R语言中,可以使用一些常见的(例如edgeR, DESeq2)进行差异基因分析。 下面是一个使用DESeq2进行差异基因分析的示例代码: ```R # 导入DESeq2 library(DESeq2) # 导入原始表达矩阵数据 counts <- read.table("expression_counts.txt", header = TRUE, row.names = 1) # 创建一个DESeq2对象 dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design = ~ condition) # 进行基因表达分析 dds <- DESeq(dds) # 查找差异表达基因 res <- results(dds) # 筛选差异表达基因 sig_genes <- subset(res, padj < 0.05 & abs(log2FoldChange) > 1) # 输出差异表达基因 write.table(sig_genes, file = "differential_genes.txt", sep = "\t", quote = FALSE, col.names = NA) ``` 以上代码中,首先导入DESeq2,然后读取原始的基因表达量数据,并使用DESeqDataSetFromMatrix函数创建一个DESeq2对象。接下来,使用DESeq函数对基因表达进行分析,并使用results函数查找差异表达基因。最后,通过设置阈值来筛选出差异表达显著的基因,并将结果输出到"differential_genes.txt"文件中。 需要注意的是,该示例只是基础的差异基因分析流程,具体的分析方法和参数设置还需要根据实际情况进行调整。此外,还可以结合一些可视化方法(如绘制热图、富集分析等)进一步探索差异表达基因的生物学功能和通路注释等信息。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值