WGCNA

写在前面

前前后后大概折腾了三次WGCNA了,每次都是写完代码就丢,结果每次都要重新再搜,重新再写,这次学聪明了,直接把代码贴这么吧
原理解释:https://www.jianshu.com/p/dcae5fb86559?u_atoken=e16b2ac9-9664-42b9-bb1c-49a45f1a11b0&u_asession=01MLkAxi-M40JjZ5r6WTKoU2sbbcp7KOA240ayukKB6WSkMiUXOr1oyIfTkEGvdg_8X0KNBwm7Lovlpxjd_P_q4JsKWYrT3W_NKPr8w6oU7K-puYzLRlqCmpUoXcuGAHVxCvvWHyhA8I9G3hxoTho1LGBkFo3NEHBv0PZUm6pbxQU&u_asig=05l2QgGJthwgv4Uw3jz0mPDRkh58doK6-Ieui0r9LbOgyAPib376puxdfX0tzTCLvwXfAgpkdDexTBule-QHbsYa3YSKUlHavAtPIJ4pbovWINYJ43nfhtpWabcF-0-H1CZguQB3aXXjl0gtvWw6IBztUYLjEOfil5CV6hEEAZByr9JS7q8ZD7Xtz2Ly-b0kmuyAKRFSVJkkdwVUnyHAIJzXcYLzxfgYKGLtpraF9KrFAe1SYMszt5zK0d5JHkSNU2ChTz2MQxpCmDDGYlh3aZze3h9VXwMyh6PgyDIVSG1W_8WoYkjdpAgnJEM0hdeCYmS0gA-YCVD9qXtlq4leRItJe4rWFDKW9yzkBwkP99mWZkyMZ3kbspbzJ3w_m6YfhwmWspDxyAEEo4kbsryBKb9Q&u_aref=cU92VpXaXPCZosCyj67OA5bHh30%3D
还有英文教程:https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/

结果

在这里插入图片描述
自己大概基本整个流程

#########基本设置
options(stringsAsFactors = F)
rm(list = ls()) 
if (TRUE){
  library(data.table)
  library(tidyr)
  library(dplyr)
  library(tibble)
  library(caret)
  library(limma)
}

pwd = "xxxxxx"
setwd(pwd)
library(readr)
library(dplyr)
library(readxl)

# 数据,临床数据
data_1 = read_xlsx("gse122063_expr..xlsx" ,sheet = 1 ) %>% as.data.frame()
data_2 = read_xlsx("gse37263_expr.xlsx" ,sheet = 1)%>% as.data.frame()
group_1 = read_xlsx("122063分組.xlsx" ,sheet = 1)%>% as.data.frame()
group_2 = read_xlsx("37263分组.xlsx" ,sheet = 1)%>% as.data.frame()
con_gene = intersect(data_1[,1],data_2[,1])
rownames(data_1) = data_1[,1]
rownames(data_2) = data_2[,1]
data_1 = data_1[,-1]
data_2 = data_2[,-1]

# 合并,去批次 
data_all = cbind(data_1[con_gene,],data_2[con_gene,])
colnames(group_1) = colnames(group_2)
pheno = rbind(group_1,group_2)
batch = c(rep(1,dim(group_1)[1]),rep(2,dim(group_2)[1]))
edata = data_all
edata = matrix(data = as.matrix(data_all), nrow=4748, ncol = 116) 
mode(edata) = "numeric"
combat_edata = ComBat(dat=edata, batch=batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
colnames(combat_edata) =  c(group_1$SAMPLE,group_2$SAMPLE)
rownames(combat_edata) = con_gene


# WGCNA部分
library(WGCNA)
options(stringsAsFactors = FALSE)
enableWGCNAThreads()

##筛选方差前25%的基因##我这边没做
#m.mad <- apply(dataExpr,1,mad)
#dataExprVar <- dataExpr[which(m.mad >
# max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
expro = combat_edata
datExpr=as.data.frame(t(combat_edata));
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

##样本聚类检查离群值##
gsg = goodSamplesGenes(datExpr, verbose = 3);
gsg$allOK
sampleTree = hclust(dist(datExpr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers"
     , sub="", xlab="")
save(datExpr, file = "FPKM-01-dataInput.RData")

##没有离群值,不作处理##
##软阈值筛选+作图##
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
abline(h=0.90,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

##作图,看R方##
betal = 10
dev.off()
k.dataOne <- softConnectivity(datExpr, power = betal) -1
scaleFreePlot(k.dataOne, main = paste(" power=", betal))

##一步法网络构建:One-step network construction and module detection##
net = blockwiseModules(datExpr, power = betal, maxBlockSize = 6000,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       saveTOMs = TRUE,
                       saveTOMFileBase = "AS-green-FPKM-TOM",
                       verbose = 3)
table(net$colors)

##绘画结果展示##
# open a graphics window
#sizeGrWindow(12, 9)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

##结果保存
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
table(moduleColors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];
save(MEs, moduleLabels, moduleColors, geneTree,
     file = "AS-green-FPKM-02-networkConstruction-auto.RData")

##表型与模块相关性+图##
trainDt= pheno
rownames(trainDt) = rownames(datExpr)
table(rownames(trainDt) == rownames(datExpr))
trainDt$AD = 0
trainDt$CON = 0
trainDt$AD[trainDt$CLASS=="AD"] = 1
trainDt$CON[trainDt$CLASS=="control"] = 1
moduleLabelsAutomatic = net$colors
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
moduleColorsWW = moduleColorsAutomatic
MEs0 = moduleEigengenes(datExpr, moduleColorsWW)$eigengenes
MEsWW = orderMEs(MEs0)
samples = t(combat_edata)
modTraitCor = cor(MEsWW, trainDt[,c(3,4)], use = "p")
colnames(MEsWW)
modlues = MEsWW
modTraitP = corPvalueStudent(modTraitCor, nSamples)
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
dev.off()
labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(trainDt[,c(3,4)]), yLabels = names(MEsWW), cex.lab = 0.5,  yColorWidth=0.01, 
               xColorWidth = 0.03,
               ySymbols = colnames(modlues), colorLabels = FALSE, colors = blueWhiteRed(50), 
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1)
               , main = paste("Module-trait relationships"))

## 保存模块与基因
gene = cbind(con_gene,mergedColors)
write.csv(gene,"module_gene.csv")
write.csv(combat_edata,"profile_remove_batch.csv")
# -------------------------------------- #
#               6.WGCNA                  #
# -------------------------------------- #
#清楚缓存
rm(list = ls())
#=====================================================================================
#
#  Code chunk 1
#
#=====================================================================================

# Display the current working directory
getwd();
# If necessary, change the path below to the directory where the data files are stored. 
# "." means current directory. On Windows use a forward slash / instead of the usual \.
workingDir = ".";
setwd(workingDir); 
# Load the WGCNA package
library(WGCNA);
# The following setting is important, do not omit.
options(stringsAsFactors = FALSE);
#Read in the Expr_matrix_IRG data set
IRGData = read.csv("./result/Expr_matrix_IRGs.csv");
# Take a quick look at what is in the data set:
dim(IRGData);
names(IRGData);


#=====================================================================================
#
#  Code chunk 2
#
#=====================================================================================


datExpr0 = as.data.frame(t(IRGData[, -c(1)]));
names(datExpr0) = IRGData$X;
rownames(datExpr0) = names(IRGData)[-c(1)];
a = cbind(substr(rownames(datExpr0),1,12),substr(rownames(datExpr0),13,16))
datExpr0 = datExpr0[order(rownames(datExpr0)),]
b = match(unique(substr(rownames(datExpr0),1,12)) , substr(rownames(datExpr0),1,12))
datExpr0 = datExpr0[b,]
rownames(datExpr0) = substr(rownames(datExpr0),1,12)
#datExpr0 就是一个以每行为样本,一列为一个基因的数据框

#=====================================================================================
#
#  Code chunk 3
#
#=====================================================================================


gsg = goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK


#=====================================================================================
#
#  Code chunk 4
#
#=====================================================================================


if (!gsg$allOK)
{
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0) 
    printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
  if (sum(!gsg$goodSamples)>0) 
    printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
  # Remove the offending genes and samples from the data:
  datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}

#注:
#在这一步之后,一般会有一个筛选基因的过程。因为电脑的配置问题,一般来说运行几万个基因的表达量来构建TOM矩阵
#和邻接矩阵是非常吃力的,我试过把全部的基因都用来构建网络,但是构建矩阵的过程非常费时间,而且很难能运行成功。
#所以在完成剔除异常值之后,我们还需要进一步挑选基因。至于怎么筛选基因要看自己的目的,粗暴一点的可以按表达量
#为前5000的进行提取,通常用5000个基因进行分析。(具体讨论可见WGCNA分析进阶版常见问题整理)。

#筛选表达量前50000的基因
#WGCNA_matrix = datExpr0[,order(apply(datExpr0,2,mad), decreasing = T)[1:5000]]
#datExpr0 <- WGCNA_matrix  ## top 5000 mad genes
#=====================================================================================
#
#  Code chunk 5
#
#=====================================================================================


sampleTree = hclust(dist(datExpr0), method = "average");
plot(sampleTree,main = "Sample clustering to detect outliers",sub="",xlab="");
abline(h = 1000000, col = "red");
dev.new()
dev.off()


#=====================================================================================
#
#  Code chunk 6
#
#=====================================================================================


# Plot a line to show the cut
abline(h = 1000000, col = "red");
# Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 1000000, minSize = 10)
table(clust)
# clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)


#=====================================================================================
#
#  Code chunk 7
#
#=====================================================================================


traitData = read.csv("./result/TCGA-GBM-clinical.csv");
dim(traitData)
names(traitData)

# remove columns that hold information we do not need.
allTraits = traitData[, -c(1,6,7,8,9)];
allTraits = allTraits[, -c(4,5) ];
dim(allTraits)
names(allTraits)

# Form a data frame analogous to expression data that will hold the clinical traits.

tumorSamples = rownames(datExpr);
a<-gsub("[.]","-",tumorSamples)
a1<-substr(a,1,12)
a1
traitRows =unique(match(a1, allTraits$Tumor_Sample_Barcode)) ;
traitRows
datTraits = allTraits[traitRows, -1];
rownames(datTraits) = allTraits[traitRows, 1];

collectGarbage();

#datTraits表示样本中每个性状的表达情况

#=====================================================================================
#
#  Code chunk 8
#
#=====================================================================================

# Re-cluster samples
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits), 
                    main = "Sample dendrogram and trait heatmap")
WGCNA (Weighted Gene Co-expression Network Analysis) 是一种基于基因共表达网络的数据分析方法,旨在揭示基因之间的关系以及它们与样本性状之间的关联。Python是一种流行的编程语言,可以用于实现WGCNA。 Python中有许多库可以用来进行WGCNA分析,如scipy、networkx和numpy等。首先,我们需要根据基因表达数据构建基因共表达网络。可以使用scipy库中的函数计算基因之间的相关性,然后根据相关性构建共表达网络。接下来,可以使用networkx库对网络进行分析,例如计算基因的度中心性和介数中心性等指标。这些指标可以帮助我们了解网络中的重要基因。 在WGCNA中,为了实现模块化,通常会将相似的基因分组到同一个模块中。在Python中,可以使用numpy库中的函数执行这个步骤。首先,通过hierarchical clustering算法对基因进行聚类分析,然后使用动态切割算法将聚类结果划分为不同的模块。这些模块可以表示不同的生物学功能模块。 最后,可以使用WGCNA分析来探索基因模块与样本性状之间的关联。可以使用scipy库中的统计函数计算基因模块与样本性状之间的相关性,并进行统计学显著性检验。这可以帮助我们找到与样本性状相关的基因模块,并了解这些基因模块在样本分类和特征选择中的重要性。 总之,Python是一种强大的工具,可以用于实现WGCNA分析。通过使用Python中的一些库和函数,我们可以构建基因共表达网络,进行模块化分析,并揭示基因模块与样本性状之间的关联。这有助于我们在基因表达数据中发现重要的生物学信息。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值