WGCNA

愿航

已于 2023-06-03 16:33:32 修改

阅读量732

点赞数

文章标签： r语言

于 2021-10-01 09:41:38 首次发布

本文链接：https://blog.csdn.net/wish_to_top/article/details/120575537

版权

写在前面

前前后后大概折腾了三次WGCNA了，每次都是写完代码就丢，结果每次都要重新再搜，重新再写，这次学聪明了，直接把代码贴这么吧
原理解释：https://www.jianshu.com/p/dcae5fb86559?u_atoken=e16b2ac9-9664-42b9-bb1c-49a45f1a11b0&u_asession=01MLkAxi-M40JjZ5r6WTKoU2sbbcp7KOA240ayukKB6WSkMiUXOr1oyIfTkEGvdg_8X0KNBwm7Lovlpxjd_P_q4JsKWYrT3W_NKPr8w6oU7K-puYzLRlqCmpUoXcuGAHVxCvvWHyhA8I9G3hxoTho1LGBkFo3NEHBv0PZUm6pbxQU&u_asig=05l2QgGJthwgv4Uw3jz0mPDRkh58doK6-Ieui0r9LbOgyAPib376puxdfX0tzTCLvwXfAgpkdDexTBule-QHbsYa3YSKUlHavAtPIJ4pbovWINYJ43nfhtpWabcF-0-H1CZguQB3aXXjl0gtvWw6IBztUYLjEOfil5CV6hEEAZByr9JS7q8ZD7Xtz2Ly-b0kmuyAKRFSVJkkdwVUnyHAIJzXcYLzxfgYKGLtpraF9KrFAe1SYMszt5zK0d5JHkSNU2ChTz2MQxpCmDDGYlh3aZze3h9VXwMyh6PgyDIVSG1W_8WoYkjdpAgnJEM0hdeCYmS0gA-YCVD9qXtlq4leRItJe4rWFDKW9yzkBwkP99mWZkyMZ3kbspbzJ3w_m6YfhwmWspDxyAEEo4kbsryBKb9Q&u_aref=cU92VpXaXPCZosCyj67OA5bHh30%3D
还有英文教程：https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/

结果

在这里插入图片描述
自己大概基本整个流程

#########基本设置
options(stringsAsFactors = F)
rm(list = ls()) 
if (TRUE){
  library(data.table)
  library(tidyr)
  library(dplyr)
  library(tibble)
  library(caret)
  library(limma)
}

pwd = "xxxxxx"
setwd(pwd)
library(readr)
library(dplyr)
library(readxl)

# 数据,临床数据
data_1 = read_xlsx("gse122063_expr..xlsx" ,sheet = 1 ) %>% as.data.frame()
data_2 = read_xlsx("gse37263_expr.xlsx" ,sheet = 1)%>% as.data.frame()
group_1 = read_xlsx("122063分組.xlsx" ,sheet = 1)%>% as.data.frame()
group_2 = read_xlsx("37263分组.xlsx" ,sheet = 1)%>% as.data.frame()
con_gene = intersect(data_1[,1],data_2[,1])
rownames(data_1) = data_1[,1]
rownames(data_2) = data_2[,1]
data_1 = data_1[,-1]
data_2 = data_2[,-1]

# 合并，去批次 
data_all = cbind(data_1[con_gene,],data_2[con_gene,])
colnames(group_1) = colnames(group_2)
pheno = rbind(group_1,group_2)
batch = c(rep(1,dim(group_1)[1]),rep(2,dim(group_2)[1]))
edata = data_all
edata = matrix(data = as.matrix(data_all), nrow=4748, ncol = 116) 
mode(edata) = "numeric"
combat_edata = ComBat(dat=edata, batch=batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
colnames(combat_edata) =  c(group_1$SAMPLE,group_2$SAMPLE)
rownames(combat_edata) = con_gene


# WGCNA部分
library(WGCNA)
options(stringsAsFactors = FALSE)
enableWGCNAThreads()

##筛选方差前25%的基因##我这边没做
#m.mad <- apply(dataExpr,1,mad)
#dataExprVar <- dataExpr[which(m.mad >
# max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
expro = combat_edata
datExpr=as.data.frame(t(combat_edata));
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

##样本聚类检查离群值##
gsg = goodSamplesGenes(datExpr, verbose = 3);
gsg$allOK
sampleTree = hclust(dist(datExpr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers"
     , sub="", xlab="")
save(datExpr, file = "FPKM-01-dataInput.RData")

##没有离群值，不作处理##
##软阈值筛选+作图##
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
abline(h=0.90,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

##作图，看R方##
betal = 10
dev.off()
k.dataOne <- softConnectivity(datExpr, power = betal) -1
scaleFreePlot(k.dataOne, main = paste(" power=", betal))

##一步法网络构建：One-step network construction and module detection##
net = blockwiseModules(datExpr, power = betal, maxBlockSize = 6000,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       saveTOMs = TRUE,
                       saveTOMFileBase = "AS-green-FPKM-TOM",
                       verbose = 3)
table(net$colors)

##绘画结果展示##
# open a graphics window
#sizeGrWindow(12, 9)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

##结果保存
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
table(moduleColors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];
save(MEs, moduleLabels, moduleColors, geneTree,
     file = "AS-green-FPKM-02-networkConstruction-auto.RData")

##表型与模块相关性+图##
trainDt= pheno
rownames(trainDt) = rownames(datExpr)
table(rownames(trainDt) == rownames(datExpr))
trainDt$AD = 0
trainDt$CON = 0
trainDt$AD[trainDt$CLASS=="AD"] = 1
trainDt$CON[trainDt$CLASS=="control"] = 1
moduleLabelsAutomatic = net$colors
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
moduleColorsWW = moduleColorsAutomatic
MEs0 = moduleEigengenes(datExpr, moduleColorsWW)$eigengenes
MEsWW = orderMEs(MEs0)
samples = t(combat_edata)
modTraitCor = cor(MEsWW, trainDt[,c(3,4)], use = "p")
colnames(MEsWW)
modlues = MEsWW
modTraitP = corPvalueStudent(modTraitCor, nSamples)
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
dev.off()
labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(trainDt[,c(3,4)]), yLabels = names(MEsWW), cex.lab = 0.5,  yColorWidth=0.01, 
               xColorWidth = 0.03,
               ySymbols = colnames(modlues), colorLabels = FALSE, colors = blueWhiteRed(50), 
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1)
               , main = paste("Module-trait relationships"))

## 保存模块与基因
gene = cbind(con_gene,mergedColors)
write.csv(gene,"module_gene.csv")
write.csv(combat_edata,"profile_remove_batch.csv")

# -------------------------------------- #
#               6.WGCNA                  #
# -------------------------------------- #
#清楚缓存
rm(list = ls())
#=====================================================================================
#
#  Code chunk 1
#
#=====================================================================================

# Display the current working directory
getwd();
# If necessary, change the path below to the directory where the data files are stored. 
# "." means current directory. On Windows use a forward slash / instead of the usual \.
workingDir = ".";
setwd(workingDir); 
# Load the WGCNA package
library(WGCNA);
# The following setting is important, do not omit.
options(stringsAsFactors = FALSE);
#Read in the Expr_matrix_IRG data set
IRGData = read.csv("./result/Expr_matrix_IRGs.csv");
# Take a quick look at what is in the data set:
dim(IRGData);
names(IRGData);


#=====================================================================================
#
#  Code chunk 2
#
#=====================================================================================


datExpr0 = as.data.frame(t(IRGData[, -c(1)]));
names(datExpr0) = IRGData$X;
rownames(datExpr0) = names(IRGData)[-c(1)];
a = cbind(substr(rownames(datExpr0),1,12),substr(rownames(datExpr0),13,16))
datExpr0 = datExpr0[order(rownames(datExpr0)),]
b = match(unique(substr(rownames(datExpr0),1,12)) , substr(rownames(datExpr0),1,12))
datExpr0 = datExpr0[b,]
rownames(datExpr0) = substr(rownames(datExpr0),1,12)
#datExpr0 就是一个以每行为样本，一列为一个基因的数据框

#=====================================================================================
#
#  Code chunk 3
#
#=====================================================================================


gsg = goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK


#=====================================================================================
#
#  Code chunk 4
#
#=====================================================================================


if (!gsg$allOK)
{
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0) 
    printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
  if (sum(!gsg$goodSamples)>0) 
    printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
  # Remove the offending genes and samples from the data:
  datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}

#注：
#在这一步之后，一般会有一个筛选基因的过程。因为电脑的配置问题，一般来说运行几万个基因的表达量来构建TOM矩阵
#和邻接矩阵是非常吃力的，我试过把全部的基因都用来构建网络，但是构建矩阵的过程非常费时间，而且很难能运行成功。
#所以在完成剔除异常值之后，我们还需要进一步挑选基因。至于怎么筛选基因要看自己的目的，粗暴一点的可以按表达量
#为前5000的进行提取，通常用5000个基因进行分析。（具体讨论可见WGCNA分析进阶版常见问题整理）。

#筛选表达量前50000的基因
#WGCNA_matrix = datExpr0[,order(apply(datExpr0,2,mad), decreasing = T)[1:5000]]
#datExpr0 <- WGCNA_matrix  ## top 5000 mad genes
#=====================================================================================
#
#  Code chunk 5
#
#=====================================================================================


sampleTree = hclust(dist(datExpr0), method = "average");
plot(sampleTree,main = "Sample clustering to detect outliers",sub="",xlab="");
abline(h = 1000000, col = "red");
dev.new()
dev.off()


#=====================================================================================
#
#  Code chunk 6
#
#=====================================================================================


# Plot a line to show the cut
abline(h = 1000000, col = "red");
# Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 1000000, minSize = 10)
table(clust)
# clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)


#=====================================================================================
#
#  Code chunk 7
#
#=====================================================================================


traitData = read.csv("./result/TCGA-GBM-clinical.csv");
dim(traitData)
names(traitData)

# remove columns that hold information we do not need.
allTraits = traitData[, -c(1,6,7,8,9)];
allTraits = allTraits[, -c(4,5) ];
dim(allTraits)
names(allTraits)

# Form a data frame analogous to expression data that will hold the clinical traits.

tumorSamples = rownames(datExpr);
a<-gsub("[.]","-",tumorSamples)
a1<-substr(a,1,12)
a1
traitRows =unique(match(a1, allTraits$Tumor_Sample_Barcode)) ;
traitRows
datTraits = allTraits[traitRows, -1];
rownames(datTraits) = allTraits[traitRows, 1];

collectGarbage();

#datTraits表示样本中每个性状的表达情况

#=====================================================================================
#
#  Code chunk 8
#
#=====================================================================================

# Re-cluster samples
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits), 
                    main = "Sample dendrogram and trait heatmap")