用limma包和WGCNA包进行RNA-seq数据分析
GE<-read.table('TCGA-COAD.htseq_counts.tsv',header=T,sep='\t',stringsAsFactors = F)
group<-rep('tumor',512)
group[grep("11A", colnames(GE)[-1], ignore.case = FALSE, perl = FALSE)]<-"normal"
library("limma")
design <- model.matrix(~0+factor(group))
colnames(design)=levels(factor(group))
rownames(design)=colnames(GE)[-1]
head(design)
exprSet<-GE[,-1]
rownames(exprSet)<-GE[,1]
norm <- voom(exprSet, design, plot = TRUE)
tag<-apply(exprSet,1,function(x) {
if (length(which(x==0))>512*0.8) {
re=0
}else {
re=1
}
})
table(tag)
exprSet<-exprSet[which(tag==1),]
norm <- voom(exprSet, design, plot = TRUE)
par(mfrow = c(2, 2))
boxplot(exprSet)
plotDensities(exprSet)
boxplot(norm$E)
plotDensities(norm$E)
fit <- lmFit(norm, design, method = 'ls')
contrast <- makeContrasts('treat-control', levels = design)
contrast
fit2 <- contrasts.fit(fit, contrast)
fit2 <- eBayes(fit2)
qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
abline(0,1)
diff_gene <- topTable(fit2, number = Inf, adjust.method = 'fdr')
head(diff_gene, 10)
write.table(diff_gene, 'gene_diff.txt', col.names = NA, sep = '\t', quote = FALSE)
diff_gene[which(diff_gene$logFC >= 1 & diff_gene$adj.P.Val < 0.01),'sig'] <- 'red'
diff_gene[which(diff_gene$logFC <= -1 & diff_gene$adj.P.Val < 0.01),'sig'] <- 'blue'
diff_gene[which(abs(diff_gene$logFC) < 1 | diff_gene$adj.P.Val >= 0.01),'sig'] <- 'gray'
log2FoldChange <- diff_gene$logFC
FDR <- diff_gene$adj.P.Val
plot(log2FoldChange, -log10(FDR), pch = 20, col = diff_gene$sig)
abline(v = 1, lty = 2)
abline(v = -1, lty = 2)
abline(h = -log(0.01, 10), lty = 2)
upGenes<-diff_gene[which(diff_gene$logFC >= 1 & diff_gene$adj.P.Val < 0.01),]
downGenes<-diff_gene[which(diff_gene$logFC <= -1 & diff_gene$adj.P.Val < 0.01),]
sig_genes<-rbind(upGenes,downGenes)
dim(upGenes)
dim(downGenes)
dim(sig_genes)
gencode<-read.table('gencode.v22.annotation.gene.probeMap',header = T,sep = '\t',stringsAsFactors = F)
FPKM<-read.table('TCGA-COAD.htseq_fpkm.tsv',header=T,sep='\t',stringsAsFactors=F)
sig_genes<-data.frame(rownames(sig_genes))
colnames(sig_genes)<-'Ensembl_ID'
sigE<-merge(FPKM,sig_genes,by.x = 'Ensembl_ID',by.y = 'Ensembl_ID')
sigSymbolE<-merge(gencode[,1:2],sigE,by.x='id',by.y='Ensembl_ID')
sigSymbolE<-sigSymbolE[,-1]
probe2gene = aggregate(sigSymbolE[,-1], by=list(sigSymbolE[,"gene"]),mean)
write.table(probe2gene,'DEG.txt',sep='\t',row.names = F,quote = F)
save.image('.Rdata')
library(WGCNA)
library(stringr)
library(flashClust)
library(iterators)
femData <- read.table("DEG.txt",header=TRUE,stringsAsFactors=F)
datExpr0 = as.data.frame(t(log2(femData[,-1]+1)))
colnames(datExpr0) = femData[,1]
rownames(datExpr0) = names(femData[,-1])
gsg = goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK
sampleTree = hclust(dist(datExpr0), method = "average");
sizeGrWindow(12,9)
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
abline(h = 32, col = "red");
clust = cutreeStatic(sampleTree, cutHeight = 32, minSize = 10)
table(clust)
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
abline(h=0.90,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
net = blockwiseModules(datExpr, power = 6,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "coadTOM",
verbose = 3)
table(net$colors)
sizeGrWindow(12, 9)
mergedColors = labels2colors(net$colors)
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];
sample<-rownames(datExpr)
tumor<-rep(1,length(sample))
normal<-rep(0,length(sample))
tumor[grep("11A", sample)]<-0
normal[grep("11A", sample)]<-1
table(tumor)
table(normal)
datTraits<-data.frame(tumor,normal)
rownames(datTraits)<-sample
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
sizeGrWindow(10,6)
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);
plotTOM = dissTOM^7;
diag(plotTOM) = NA;
sizeGrWindow(9,9)
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
关于WGCNA包中还有更丰富的功能,请参见Bioconductor中的官方说明。