library(edgeR)
#表达矩阵
genesymbol<-read.delim("/Users/Desktop/ESCC_RNA_count_data/ESCC_count_cancer_expression.txt",header = TRUE,check.names = F)
#样本信息
condition<-read.delim("/Users/Desktop/ESCC_RNA_count_data/clinical merge.txt",header=TRUE)
#对重复的基因名求和
genesymbol<-avereps(genesymbol[,-1],ID=genesymbol$id)
condition<-condition[,c("submitter_id","alcohol_history")]
#id保持一致
for(i in 1:ncol(genesymbol)){
colnames(genesymbol)[i]<-substr(colnames(genesymbol)[i],1,12)
}
condition<-condition[!condition$alcohol_history=="not_reported",]
genesymbol_alcohol<-genesymbol[,colnames(genesymbol)%in%condition$submitter_id]
#将样本分类数据转化为因子
row.names(condition)<-condition$submitter_id
condition[,2]<-as.factor(condition[,2])
condition$alcohol_history<-relevel(condition$alcohol_history,ref="0")
以上是对数据的处理,下面开始使用edgeR进行分析
#构建DGElist对象
dge<-DGEList(counts = genesymbol_alcohol,group = condition$alcohol_history)
#过滤基因表达量(至少在两个样本中表达量大于1)
keep <- rowSums(cpm(dge)>1) >= 2
table(keep)
#重置文库大小
dge<-dge[keep,,keep.lib.sizes=FALSE]
#对数据进行标准化
dge<-calcNormFactors(dge)
#聚类与热图,edgeR建议用logCPM这个指标来画图(prior.count=2 也就是logCPM=log2(CPM+2/L),避免得到的logCPM为0)
logCPM<-cpm(dge,log=TRUE,prior.count=2)
#差异分析
#构建分组矩阵
design<-model.matrix(~0+condition$alcohol_history)
colnames(design)<-levels(condition$alcohol_history)
rownames(design)<-colnames(dge)
#对于多因素实验,edgeR方法计算离散度,通过一个设计矩阵拟合GLM模型来考虑多因素影响
dge<-estimateDisp(dge,design)#估计离散值
fit<- glmFit(dge, design) #拟合模型
fit2 <- glmLRT(fit, contrast=c(-1,1))
DEG2<-topTags(fit2, n=nrow(dge))
DEG2<-as.data.frame(DEG2)
#不同阈值输出结果
logFC_cutoff_1<-c(1)
DEG2$change_1=as.factor(
ifelse(DEG2$PValue<0.05&abs(DEG2$logFC)>logFC_cutoff_1,
ifelse(DEG2$logFC>logFC_cutoff_1,"up","down"),"NOT")
)
logFC_cutoff_2<-c(2)
DEG2$change_2=as.factor(
ifelse(DEG2$PValue<0.05&abs(DEG2$logFC)>logFC_cutoff_2,
ifelse(DEG2$logFC>logFC_cutoff_2,"up","down"),"NOT")
)
logFC_cutoff_sd<-with(DEG2,mean(abs(logFC))+2*sd(abs(logFC)))
DEG2$change_sd=as.factor(
ifelse(DEG2$PValue<0.05&abs(DEG2$logFC)>logFC_cutoff_sd,
ifelse(DEG2$logFC>logFC_cutoff_sd,"up","down"),"NOT")
)
write.table(DEG2,'/Users/Desktop/ESCC_RNA_count_data/DE_edgeR.txt',quote = F,sep = '\t')