1.WGCNA基本概念
加权基因共表达网络分析 (WGCNA, Weighted correlation network analysis)是用来描述不同样品之间基因关联模式的系统生物学方法,可以用来鉴定高度协同变化的基因集,并根据基因集的内连性和基因集与表型之间的关联鉴定候补生物标记基因或治疗靶点。相比于只关注差异表达的基因,WGCNA利用数千或近万个变化最大的基因或全部基因的信息识别感兴趣的基因集,并与表型进行显著性关联分析。一是充分利用了信息,二是把数千个基因与表型的关联转换为数个基因集与表型的关联,免去了多重假设检验校正的问题。
基本分析流程如下
-
构建基因共表达网络:使用加权的表达相关性。
-
识别基因集:基于加权相关性,进行层级聚类分析,并根据设定标准切分聚类结果,获得不同的基因模块,用聚类树的分枝和不同颜色表示。
-
如果有表型信息,计算基因模块与表型的相关性,鉴定性状相关的模块。
-
研究模型之间的关系,从系统层面查看不同模型的互作网络。
-
从关键模型中选择感兴趣的驱动基因,或根据模型中已知基因的功能推测未知基因的功能。
-
导出TOM矩阵,绘制相关性图。
参考链接:https://www.jianshu.com/p/e9cc3f43441d
实践:
1.所需数据:GDC,TCGA官网所下载的数据(GDC)
得到WGCNA分析所需的TPM或者FPKM格式,或者经各种方式归一化的数据。
2.建立工作路径,清空环境变量,加载包
setwd("C:/Users/Graduate/Desktop/cr")
rm(list = ls())
#1.基因聚类并绘制热图
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("impute")
library(WGCNA)
library(reshape2)
library(stringr)
3.预处理数据,counts转TPM格式
#预处理 counts转TPM----
library(rtracklayer)
library(tidyverse)
gtf=rtracklayer::import("C:/Users/Graduate/Desktop/gencode.v43.basic.annotation.gtf.gz")
class(gtf)
gtf = as.data.frame(gtf)
dim(gtf)
table(gtf$type)
#挑选基因的最长转录本
library(tidyverse)
tra = gtf[gtf$type=="transcript",
c("start","end","gene_name")]
length(unique(tra$gene_name))
glt = mutate(tra, length = end - start) %>%
arrange(desc(length)) %>%
filter(!duplicated(gene_name)) %>%
select(c(3,4))
head(glt)
write.csv(glt,'gene_最长转录本长度.csv')
#count值转TPM
library(xlsx)
library(readxl)
exp<- read.csv("Gene Symbol_matrix1未匹配.csv",header = T)#读取基因文件
input<- read.csv("C:/Users/Graduate/Desktop/cr/gene_最长转录本长度.csv",row.names = 1)
#自己在Excel中把网盘里的txt文件基因和长度共两列提取出来
library(dplyr)
merge<-merge(exp,input,by="gene_name")#根据基因那列进行合并
merge <- na.omit(merge)#删除错误值行
write.csv(merge,file = "merge.csv")#读出文件,直接往下运行也行
#2.计算TPM
mycounts<-read.csv("merge.csv",check.names = F,row.names = 1)
head(mycounts)
rownames(mycounts)<-mycounts$gene_name
mycounts<-mycounts[,-1]
head(mycounts)#最后一列Length是基因长度
#TPM计算
kb <- mycounts$length / 1000
kb
countdata <- mycounts[,1:614]
rpk <- countdata / kb
rpk
tpm <- t(t(rpk)/colSums(rpk) * 1000000)
head(tpm)
write.csv(tpm,file="82_KIRC_tpm.csv")
#TPM转FPKM----
fpkm <- t(t(rpk)/colSums(tpm) * 10^6)
nrow(fpkm)
nrow(tpm)
#标准化转换后基因数不变
write.csv(fpkm,file = "./差异基因fpkm.csv")
4.读取表达矩阵
#1.1读取express,去除表达量为0的样本
expres=read.csv("差异基因fpkm.csv",header = T)
#expres=read.table('Annotated_TCGA_matrix.txt',sep="\t",row.names=1,header=T,check.names=F,quote="!")
expres <- aggregate( . ~ symbol_id,data=expres, max)
row.names(expres) <- expres[,1]
expres <- expres[,-1]
expres=expres[which(rowSums(expres)>0),]
expres=expres[order(rowSums(expres),decreasing = T),]
##1.2剔除正常样本,留下肿瘤样本
expres[]=lapply(expres, as.numeric)
tumor <- colnames(expres)[as.integer(substr(colnames(expres),14,15)) < 10] #样本名的14、15位为1-9的是肿瘤样本,>=10为正常或癌旁样本
tumor_sample <- expres[,tumor]
expres <- tumor_sample
datExpr=t(expres)
datExpr
#####1.3样本聚类
sampleTree = hclust(dist(datExpr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
####
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
##sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
5.软阈值选择
######Powerz值选择beta1
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
#######
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
power_select=sft[["powerEstimate"]]
power=power_select
power_select
######module detection
##一步法网络构建:One-step network construction and module detection##
# power: 上一步计算的软阈值
# maxBlockSize: 计算机能处理的最大模块的基因数量 (默认5000);
# 4G内存电脑可处理8000-10000个,16G内存电脑可以处理2万个,32G内存电脑可
# 以处理3万个
# 计算资源允许的情况下最好放在一个block里面。
# corType: pearson or bicor
# numericLabels: 返回数字而不是颜色作为模块的名字,后面可以再转换为颜色
# saveTOMs:最耗费时间的计算,存储起来,供后续使用
# mergeCutHeight: 合并模块的阈值,越大模块越少
6.设置参数
####参数
# 为与原文档一致,故未修改
type = "unsigned"
# 相关性计算
# 官方推荐 biweight mid-correlation & bicor
# corType: pearson or bicor
# 为与原文档一致,故未修改
corType = "pearson"
cor <- WGCNA::cor
corFnc = ifelse(corType=="pearson",cor,bicor)
# 对二元变量,如样本性状信息计算相关性时,
# 或基因表达严重依赖于疾病状态时,需设置下面参数
maxPOutliers = ifelse(corType=="pearson",1,0.05)
# 关联样品性状的二元变量时,设置
robustY = ifelse(corType=="pearson",T,F)
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
7.网络的构建,以无向网络为例
cor <- WGCNA::cor
net = blockwiseModules(datExpr, power = power, maxBlockSize = nGenes,
TOMType = type, minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs=TRUE, corType = corType,
maxPOutliers=maxPOutliers, loadTOMs=TRUE,
saveTOMFileBase = paste0("AS-green-FPKM-TOM"),
verbose = 3)
save(net,datExpr,file = './WGCNA.Rdata')
cor<-stats::cor
table(net$colors)
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
# Plot the dendrogram and the module colors underneath
# 如果对结果不满意,还可以recutBlockwiseTrees,节省计算时间
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
######
ADJ= adjacency(datExpr,power = power_select)
vis=exportNetworkToCytoscape(ADJ,edgeFile="edge1.txt",nodeFile="node.txt",threshold = 0.4)
##########
####箱线图
# module eigengene, 可以绘制线图,作为每个模块的基因表达趋势的展示
MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(
as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
# 根据基因间表达量进行聚类所得到的各模块间的相关性图
# marDendro/marHeatmap 设置下、左、上、右的边距
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
marDendro = c(3,3,2,4),
marHeatmap = c(3,4,2,2), plotDendrograms = T,
xLabelsAngle = 90)
save(plotEigengeneNetworks,file = './marHeatmapWGCNA.Rdata')
########TOM PLOT
load(net$TOMFiles[1], verbose=T)
## Loading objects:
## TOM
TOM <- as.matrix(TOM)
dissTOM = 1-TOM
# Transform dissTOM with a power to make moderately strong
# connections more visible in the heatmap
plotTOM = dissTOM^7
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA
# Call the plot function
# 这一部分特别耗时,行列同时做层级聚类
TOMplot(plotTOM, net$dendrograms, moduleColors,
main = "Network heatmap plot, all genes")
save(TOMplot,file = '.plotTOMWGCNA.Rdata')
######网络图
probes = colnames(datExpr)
dimnames(TOM) <- list(probes, probes)
# Export the network into edge and node list files Cytoscape can read
# threshold 默认为0.5, 可以根据自己的需要调整,也可以都导出后在
# cytoscape中再调整
cyt = exportNetworkToCytoscape(TOM,threshold = 0,
nodeNames = probes, nodeAttr = moduleColors)
cyt$edgeData
cyt$nodeData
8.基因与患者表型,性状关联
#2.患者表型、性状与基因模块相关联
clinical=read.csv("clinical.csv",header = T)
#2.1读取clinical
#clinical=read.table("clinical.txt",sep = "\t",header = T)
#clinical$days_to_death[is.na(clinical$days_to_death)]=0
#clinical$days_to_last_followup[is.na(clinical$days_to_last_followup)]=0
#clinical$surv_time=clinical$days_to_last_followup+clinical$days_to_death
clinical=clinical[-which(clinical$surv_time==0),]
####去重
clinical=clinical[!duplicated(clinical$TCGA_id),]
k=rownames(datExpr)
k=substr(k,1,12)
row.names(datExpr)=k
datExpr=datExpr[!duplicated(row.names(datExpr)),]
row=match(row.names(datExpr),clinical$TCGA_id)
dcli=clinical[row,]
dcli=dcli[!duplicated(dcli$TCGA_id),]
dcli = na.omit(dcli)
rownames(dcli)=dcli$TCGA_id
dcli=dcli[,-1]
dclic=subset(dcli,select = c('age_at_index','gender',
'surv_time','EMT_group','ajcc_pathologic_stage'))
##############样本临床数据聚类
library(flashClust)
A = adjacency(t(datExpr), type = "distance")
# this calculates the whole network connectivity
k = as.numeric(apply(A, 2, sum)) - 1
# standardized connectivity
Z.k = scale(k)
thresholdZ.k = -5 # often -2.5
sampleTree = flashClust(as.dist(1 - A), method = "average")
# Convert traits to a color representation: where red indicates high
# values
traitColors = data.frame(numbers2colors(dclic, signed = FALSE))
dimnames(traitColors)[[2]] = names(dclic)
outlierColor = ifelse(Z.k < thresholdZ.k, "red", "blue")
datColors = data.frame(outlier = outlierColor, traitColors)
plotDendroAndColors(sampleTree, groupLabels = names(datColors), colors = datColors,
main = "Sample dendrogram and trait heatmap",cex.colorLabels = 1,cex.rowText = 0.4,cex.dendroLabels = 0.1)
9.关联表型数据,绘制热图
####2.2关联表型数据
moduleLabelsAutomatic = net$colors
# Convert labels to colors for plotting
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
moduleColorsFemale = moduleColorsAutomatic
# Define numbers of genes and samples
t_rt=datExpr
nGenes = ncol(t_rt)
nSamples = nrow(t_rt)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(t_rt, moduleColorsFemale)$eigengenes
MEsFemale = orderMEs(MEs0)
modTraitCor = cor(MEsFemale, dclic, use = "p")
modTraitP = corPvalueStudent(modTraitCor, nSamples)
# Since we have a moderately large number of modules and traits, a
# suitable graphical representation will help in reading the table. We
# color code each association by the correlation value: Will display
# correlations and their p-values
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")",
sep = "")
dim(textMatrix) = dim(modTraitCor)
par(mar = c(4, 8, 1, 2))
# Display the correlation values within a heatmap plot
#tiff('c.tiff',width = 4000,height = 2700,res = 300)
labeledHeatmap(Matrix = modTraitCor, xLabels = c('age_at_index','gender',
'surv_time','EMT_group','ajcc_pathologic_stage'),yLabels = names(MEsFemale),
ySymbols = names(MEsFemale), colorLabels = FALSE, colors = blueWhiteRed(50),
textMatrix = textMatrix, setStdMargins = FALSE,
cex.text = 0.6, zlim = c(-1, 1), main = paste("Module-trait relationships"),font.lab.x = 0.5,font.lab.y = 0.5)
save(labeledHeatmap,file = './labeledHeatmapWGCNA.Rdata')
10.基因与性状的关联矩阵
#####基因模块结果
blocknumber=1
datColors = data.frame(moduleColorsAutomatic)[net$blockGenes[[blocknumber]], ]
datKME = signedKME(t_rt, MEsFemale)
datSummary=rownames(expres)
datout=data.frame(datSummary,datColors,datKME )
datout
#######
# 计算性状与基因的相关性矩阵
## 只有连续型性状才能进行计算,如果是离散变量,在构建样品表时就转为0-1矩阵。
if (corType=="pearson") {
geneModuleMembership = as.data.frame(cor(datExpr, MEs_col, use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(
as.matrix(geneModuleMembership), nSamples))
} else {
geneModuleMembershipA = bicorAndPvalue(datExpr, MEs_col, robustY=robustY)
geneModuleMembership = geneModuleMembershipA$bicor
MMPvalue = geneModuleMembershipA$p
}
if (corType=="pearson") {
geneTraitCor = as.data.frame(cor(datExpr, dclic, use = "p"))
geneTraitP = as.data.frame(corPvalueStudent(
as.matrix(geneTraitCor), nSamples))
} else {
geneTraitCorA = bicorAndPvalue(datExpr, dclic, robustY=robustY)
geneTraitCor = as.data.frame(geneTraitCorA$bicor)
geneTraitP = as.data.frame(geneTraitCorA$p)
}
#######
table(net$colors)
module = "turquoise"
pheno = "EMT_group"
modNames = substring(colnames(MEs_col), 3)
# 获取关注的列
module_column = match(module, modNames)
pheno_column = match(pheno,colnames(dclic))
# 获取模块内的基因
moduleGenes = moduleColors == module
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
# 与性状高度相关的基因,也是与性状相关的模型的关键基因
verboseScatterplot(abs(geneModuleMembership[moduleGenes, module_column]),
abs(geneTraitCor[moduleGenes, pheno_column]),
xlab = paste("Module Membership in", module, "module"),
ylab = paste("Gene significance for", pheno),
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)