写在前面
前前后后大概折腾了三次WGCNA了,每次都是写完代码就丢,结果每次都要重新再搜,重新再写,这次学聪明了,直接把代码贴这么吧
原理解释:https://www.jianshu.com/p/dcae5fb86559?u_atoken=e16b2ac9-9664-42b9-bb1c-49a45f1a11b0&u_asession=01MLkAxi-M40JjZ5r6WTKoU2sbbcp7KOA240ayukKB6WSkMiUXOr1oyIfTkEGvdg_8X0KNBwm7Lovlpxjd_P_q4JsKWYrT3W_NKPr8w6oU7K-puYzLRlqCmpUoXcuGAHVxCvvWHyhA8I9G3hxoTho1LGBkFo3NEHBv0PZUm6pbxQU&u_asig=05l2QgGJthwgv4Uw3jz0mPDRkh58doK6-Ieui0r9LbOgyAPib376puxdfX0tzTCLvwXfAgpkdDexTBule-QHbsYa3YSKUlHavAtPIJ4pbovWINYJ43nfhtpWabcF-0-H1CZguQB3aXXjl0gtvWw6IBztUYLjEOfil5CV6hEEAZByr9JS7q8ZD7Xtz2Ly-b0kmuyAKRFSVJkkdwVUnyHAIJzXcYLzxfgYKGLtpraF9KrFAe1SYMszt5zK0d5JHkSNU2ChTz2MQxpCmDDGYlh3aZze3h9VXwMyh6PgyDIVSG1W_8WoYkjdpAgnJEM0hdeCYmS0gA-YCVD9qXtlq4leRItJe4rWFDKW9yzkBwkP99mWZkyMZ3kbspbzJ3w_m6YfhwmWspDxyAEEo4kbsryBKb9Q&u_aref=cU92VpXaXPCZosCyj67OA5bHh30%3D
还有英文教程:https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/
结果
自己大概基本整个流程
#########基本设置
options(stringsAsFactors = F)
rm(list = ls())
if (TRUE){
library(data.table)
library(tidyr)
library(dplyr)
library(tibble)
library(caret)
library(limma)
}
pwd = "xxxxxx"
setwd(pwd)
library(readr)
library(dplyr)
library(readxl)
# 数据,临床数据
data_1 = read_xlsx("gse122063_expr..xlsx" ,sheet = 1 ) %>% as.data.frame()
data_2 = read_xlsx("gse37263_expr.xlsx" ,sheet = 1)%>% as.data.frame()
group_1 = read_xlsx("122063分組.xlsx" ,sheet = 1)%>% as.data.frame()
group_2 = read_xlsx("37263分组.xlsx" ,sheet = 1)%>% as.data.frame()
con_gene = intersect(data_1[,1],data_2[,1])
rownames(data_1) = data_1[,1]
rownames(data_2) = data_2[,1]
data_1 = data_1[,-1]
data_2 = data_2[,-1]
# 合并,去批次
data_all = cbind(data_1[con_gene,],data_2[con_gene,])
colnames(group_1) = colnames(group_2)
pheno = rbind(group_1,group_2)
batch = c(rep(1,dim(group_1)[1]),rep(2,dim(group_2)[1]))
edata = data_all
edata = matrix(data = as.matrix(data_all), nrow=4748, ncol = 116)
mode(edata) = "numeric"
combat_edata = ComBat(dat=edata, batch=batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
colnames(combat_edata) = c(group_1$SAMPLE,group_2$SAMPLE)
rownames(combat_edata) = con_gene
# WGCNA部分
library(WGCNA)
options(stringsAsFactors = FALSE)
enableWGCNAThreads()
##筛选方差前25%的基因##我这边没做
#m.mad <- apply(dataExpr,1,mad)
#dataExprVar <- dataExpr[which(m.mad >
# max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
expro = combat_edata
datExpr=as.data.frame(t(combat_edata));
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
##样本聚类检查离群值##
gsg = goodSamplesGenes(datExpr, verbose = 3);
gsg$allOK
sampleTree = hclust(dist(datExpr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers"
, sub="", xlab="")
save(datExpr, file = "FPKM-01-dataInput.RData")
##没有离群值,不作处理##
##软阈值筛选+作图##
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
abline(h=0.90,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
##作图,看R方##
betal = 10
dev.off()
k.dataOne <- softConnectivity(datExpr, power = betal) -1
scaleFreePlot(k.dataOne, main = paste(" power=", betal))
##一步法网络构建:One-step network construction and module detection##
net = blockwiseModules(datExpr, power = betal, maxBlockSize = 6000,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "AS-green-FPKM-TOM",
verbose = 3)
table(net$colors)
##绘画结果展示##
# open a graphics window
#sizeGrWindow(12, 9)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
##结果保存
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
table(moduleColors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];
save(MEs, moduleLabels, moduleColors, geneTree,
file = "AS-green-FPKM-02-networkConstruction-auto.RData")
##表型与模块相关性+图##
trainDt= pheno
rownames(trainDt) = rownames(datExpr)
table(rownames(trainDt) == rownames(datExpr))
trainDt$AD = 0
trainDt$CON = 0
trainDt$AD[trainDt$CLASS=="AD"] = 1
trainDt$CON[trainDt$CLASS=="control"] = 1
moduleLabelsAutomatic = net$colors
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
moduleColorsWW = moduleColorsAutomatic
MEs0 = moduleEigengenes(datExpr, moduleColorsWW)$eigengenes
MEsWW = orderMEs(MEs0)
samples = t(combat_edata)
modTraitCor = cor(MEsWW, trainDt[,c(3,4)], use = "p")
colnames(MEsWW)
modlues = MEsWW
modTraitP = corPvalueStudent(modTraitCor, nSamples)
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
dev.off()
labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(trainDt[,c(3,4)]), yLabels = names(MEsWW), cex.lab = 0.5, yColorWidth=0.01,
xColorWidth = 0.03,
ySymbols = colnames(modlues), colorLabels = FALSE, colors = blueWhiteRed(50),
textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1)
, main = paste("Module-trait relationships"))
## 保存模块与基因
gene = cbind(con_gene,mergedColors)
write.csv(gene,"module_gene.csv")
write.csv(combat_edata,"profile_remove_batch.csv")
# -------------------------------------- #
# 6.WGCNA #
# -------------------------------------- #
#清楚缓存
rm(list = ls())
#=====================================================================================
#
# Code chunk 1
#
#=====================================================================================
# Display the current working directory
getwd();
# If necessary, change the path below to the directory where the data files are stored.
# "." means current directory. On Windows use a forward slash / instead of the usual \.
workingDir = ".";
setwd(workingDir);
# Load the WGCNA package
library(WGCNA);
# The following setting is important, do not omit.
options(stringsAsFactors = FALSE);
#Read in the Expr_matrix_IRG data set
IRGData = read.csv("./result/Expr_matrix_IRGs.csv");
# Take a quick look at what is in the data set:
dim(IRGData);
names(IRGData);
#=====================================================================================
#
# Code chunk 2
#
#=====================================================================================
datExpr0 = as.data.frame(t(IRGData[, -c(1)]));
names(datExpr0) = IRGData$X;
rownames(datExpr0) = names(IRGData)[-c(1)];
a = cbind(substr(rownames(datExpr0),1,12),substr(rownames(datExpr0),13,16))
datExpr0 = datExpr0[order(rownames(datExpr0)),]
b = match(unique(substr(rownames(datExpr0),1,12)) , substr(rownames(datExpr0),1,12))
datExpr0 = datExpr0[b,]
rownames(datExpr0) = substr(rownames(datExpr0),1,12)
#datExpr0 就是一个以每行为样本,一列为一个基因的数据框
#=====================================================================================
#
# Code chunk 3
#
#=====================================================================================
gsg = goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK
#=====================================================================================
#
# Code chunk 4
#
#=====================================================================================
if (!gsg$allOK)
{
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
# Remove the offending genes and samples from the data:
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
#注:
#在这一步之后,一般会有一个筛选基因的过程。因为电脑的配置问题,一般来说运行几万个基因的表达量来构建TOM矩阵
#和邻接矩阵是非常吃力的,我试过把全部的基因都用来构建网络,但是构建矩阵的过程非常费时间,而且很难能运行成功。
#所以在完成剔除异常值之后,我们还需要进一步挑选基因。至于怎么筛选基因要看自己的目的,粗暴一点的可以按表达量
#为前5000的进行提取,通常用5000个基因进行分析。(具体讨论可见WGCNA分析进阶版常见问题整理)。
#筛选表达量前50000的基因
#WGCNA_matrix = datExpr0[,order(apply(datExpr0,2,mad), decreasing = T)[1:5000]]
#datExpr0 <- WGCNA_matrix ## top 5000 mad genes
#=====================================================================================
#
# Code chunk 5
#
#=====================================================================================
sampleTree = hclust(dist(datExpr0), method = "average");
plot(sampleTree,main = "Sample clustering to detect outliers",sub="",xlab="");
abline(h = 1000000, col = "red");
dev.new()
dev.off()
#=====================================================================================
#
# Code chunk 6
#
#=====================================================================================
# Plot a line to show the cut
abline(h = 1000000, col = "red");
# Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 1000000, minSize = 10)
table(clust)
# clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#=====================================================================================
#
# Code chunk 7
#
#=====================================================================================
traitData = read.csv("./result/TCGA-GBM-clinical.csv");
dim(traitData)
names(traitData)
# remove columns that hold information we do not need.
allTraits = traitData[, -c(1,6,7,8,9)];
allTraits = allTraits[, -c(4,5) ];
dim(allTraits)
names(allTraits)
# Form a data frame analogous to expression data that will hold the clinical traits.
tumorSamples = rownames(datExpr);
a<-gsub("[.]","-",tumorSamples)
a1<-substr(a,1,12)
a1
traitRows =unique(match(a1, allTraits$Tumor_Sample_Barcode)) ;
traitRows
datTraits = allTraits[traitRows, -1];
rownames(datTraits) = allTraits[traitRows, 1];
collectGarbage();
#datTraits表示样本中每个性状的表达情况
#=====================================================================================
#
# Code chunk 8
#
#=====================================================================================
# Re-cluster samples
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits),
main = "Sample dendrogram and trait heatmap")