qgg包-续2-大数据集教程

最新推荐文章于 2023-10-11 18:08:15 发布

谁曾经不是菜鸟啦

最新推荐文章于 2023-10-11 18:08:15 发布

阅读量910

点赞数

分类专栏：笔记 R语言

原文链接：http://psoerensen.github.io/qgg/articles/prepare_ukb_phenotypes.html

版权

笔记同时被 2 个专栏收录

18 篇文章 2 订阅

订阅专栏

R语言

5 篇文章 0 订阅

订阅专栏

qgg包-续2-大数据集教程

使用1000基因组项目数据的教程

遗传数据来自公开可获得的国际基因组样本资源- 1000基因组项目。

加载库

library(devtools)
install_github("psoerensen/qgg")
library(qgg)

下载1000 基因组数据

download.file(url="https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase1_plinkfiles.tgz",dest="./1000G_Phase1_plinkfiles.tgz")

download.file(url="https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase3_plinkfiles.tgz",dest="./1000G_Phase3_plinkfiles.tgz")

cmd <- "tar -xvzf 1000G_Phase1_plinkfiles.tgz"
cmd <- "tar -xvzf 1000G_Phase3_plinkfiles.tgz"
system(cmd)

准备qgg的基因型数据

bedfiles <- paste("./1000G_EUR_Phase3_plink/1000G.EUR.QC.",1:22,".bed",sep="")
bimfiles <- paste("./1000G_EUR_Phase3_plink/1000G.EUR.QC.",1:22,".bim",sep="")
famfiles <- paste("./1000G_EUR_Phase3_plink/1000G.EUR.QC.",1:22,".fam",sep="")

fnRAW <- "./1000G.raw"
  
Glist <- gprep(study="1000G", fnRAW=fnRAW, bedfiles=bedfiles, bimfiles=bimfiles, famfiles=famfiles, overwrite=TRUE)

save(Glist, file="./Glist_1000G.Rdata", compress=FALSE)

摸拟数据

随机样本因果标记集。

rsids1 <- sample(Glist$rsids,100)
rsids2 <- sample(Glist$rsids,900)
rsids <- c(rsids1,rsids2)

提取因果标记的基因型。

W1 <- getW(Glist=Glist, rsids=rsids1, scale=TRUE, impute=TRUE) 
W2 <- getW(Glist=Glist, rsids=rsids2, scale=TRUE, impute=TRUE)

单标记方差

S1 <- 0.5/length(rsids1)
S2 <- 0.5/length(rsids2)
Se <- 1

单一标记效应

s1 <- rnorm(n=length(rsids1), mean=0, sd=sqrt(S1))
s2 <- rnorm(n=length(rsids2), mean=0, sd=sqrt(S2))

总遗传效应

g1 <- W1%*%s1
g2 <- W2%*%s2

残差

e <- rnorm(Glist$n)

表型(命名向量或矩阵)

y <- rowSums(cbind(g1,g2,e))
data <- data.frame(y = y, mu = 1)

为固定效果创建模型矩阵

X <- model.matrix(y ~ 0 + mu , data = data)

计算基因组关系矩阵(GRM)

fnG <- "./G.grm"
fnG1 <- "./G1.grm"
fnG2 <- "./G2.grm"

GRMlist <- grm(Glist=Glist, ids=Glist$ids, rsids=rsids, scale=TRUE, msize=100, ncores=4, fnG=fnG, overwrite=TRUE)

GRMlist1 <- grm(Glist=Glist, ids=Glist$ids, rsids=rsids1, scale=TRUE, msize=100, ncores=4, fnG=fnG1, overwrite=TRUE)

GRMlist2 <- grm(Glist=Glist, ids=Glist$ids, rsids=rsids2, scale=TRUE, msize=100, ncores=4, fnG=fnG2, overwrite=TRUE)

GRMlist12 <- mergeGRM(list(GRMlist1,GRMlist2)) 

W <- getW(Glist=Glist, rsids=rsids, scale=TRUE, impute=TRUE) 
W1 <- getW(Glist=Glist, rsids=rsids1, scale=TRUE, impute=TRUE) 
W2 <- getW(Glist=Glist, rsids=rsids2, scale=TRUE, impute=TRUE)

G <- grm(W=W)
G1 <- grm(W=W1)
G2 <- grm(W=W2)

G <- getGRM(GRMlist=GRMlist, ids=Glist$ids)
G1 <- getGRM(GRMlist=GRMlist1, ids=Glist$ids)
G2 <- getGRM(GRMlist=GRMlist2, ids=Glist$ids)

G <- grm(Glist=Glist, ids=Glist$ids, rsids=rsids, scale=TRUE, msize=100, ncores=4, fnG=fnG, overwrite=TRUE, returnGRM=TRUE)

G1 <- grm(Glist=Glist, ids=Glist$ids, rsids=rsids1, scale=TRUE, msize=100, ncores=4, fnG=fnG1, overwrite=TRUE, returnGRM=TRUE)

G2 <- grm(Glist=Glist, ids=Glist$ids, rsids=rsids2, scale=TRUE, msize=100, ncores=4, fnG=fnG2, overwrite=TRUE, returnGRM=TRUE)

使用GRMlist(适用于大数据)的示例

使用REML拟合线性混合模型-估计方差分量

fitG <- greml(y=y, X=X, GRMlist=GRMlist, theta=c(0.1,0.4), ncores=4)
fitG1 <- greml(y=y, X=X, GRMlist=GRMlist1, theta=c(0.1,0.4), ncores=4)
fitG2 <- greml(y=y, X=X, GRMlist=GRMlist2, theta=c(0.1,0.4), ncores=4)
fitG12 <- greml(y=y, X=X, GRMlist=GRMlist12, theta=c(0.1,0.1,0.4), ncores=4)

估计方差分量

fitG$theta
fitG1$theta
fitG2$theta
fitG12$theta

Genetic values

head(fitG$g)
head(fitG1$g)
head(fitG2$g)
head(fitG12$g)

g <- gblup(GRMlist=GRMlist,fit=fitG)
g1 <- gblup(GRMlist=GRMlist1,fit=fitG1)
g2 <- gblup(GRMlist=GRMlist2,fit=fitG2)
g12 <- gblup(GRMlist=GRMlist12,fit=fitG12)

使用GRM(适用于小数据)的示例

使用REML拟合线性混合模型-估计方差分量

G <- grm(Glist=Glist, ids=Glist$ids, rsids=rsids, scale=TRUE, msize=100, ncores=4, fnG=fnG, overwrite=TRUE, returnGRM=TRUE)

G1 <- grm(Glist=Glist, ids=Glist$ids, rsids=rsids1, scale=TRUE, msize=100, ncores=4, fnG=fnG1, overwrite=TRUE, returnGRM=TRUE)

G2 <- grm( Glist=Glist, ids=Glist$ids, rsids=rsids2, scale=TRUE, msize=100, ncores=4, fnG=fnG2, overwrite=TRUE, returnGRM=TRUE)

fitG <- greml(y=y, X=X, GRM=list(G=G), theta=c(0.1,0.4), ncores=4)
fitG1 <- greml(y=y, X=X, GRM=list(G1=G1), theta=c(0.1,0.4), ncores=4)
fitG2 <- greml(y=y, X=X, GRM=list(G2=G2), theta=c(0.1,0.4), ncores=4)
fitG12 <- greml(y=y, X=X, GRM=list(G1=G1,G2=G2), theta=c(0.1,0.1,0.4), ncores=4)

估计方差分量

fitG$theta
fitG1$theta
fitG2$theta
fitG12$theta

Genetic values

head(fitG$g)
head(fitG1$g)
head(fitG2$g)
head(fitG12$g)

REML分析和交叉验证

n <- length(y)
fold <- 10
nsets <- 5

validate <- replicate(nsets, sample(1:n, as.integer(n/fold)))

cvG <- greml(y=y, X=X, GRM=list(G=G), validate = validate)
cvG1 <- greml(y=y, X=X, GRM=list(G1=G1), validate = validate)
cvG2 <- greml(y=y, X=X, GRM=list(G2=G2), validate = validate)
cvG12 <- greml(y=y, X=X, GRM=list(G1=G1,G2=G2), validate = validate)

cvG$accuracy
cvG1$accuracy
cvG2$accuracy
cvG12$accuracy

S <- matrix(0,nrow=1000,ncol=3)
rownames(S) <- c(rsids1,rsids2)
colnames(S) <- c("Set12","Set1","Set2") 
S[,1]  <- c(s1,s2)
S[1:100,2]  <- s1
S[101:1000,3]  <- s2

gt <- gscore(Glist=Glist,S=S,scale=FALSE)

单标记关联和基因集富集分析

线性模型分析和单标记关联检验

ma <- lma(y=y,X=X, Glist = Glist)
head(ma)

创建标记集

sets <- list(set1=rsids1,set2=rsids2)

Set test based on sums

mma <- gsea(stat = as.matrix(ma)[,"stat"]**2, sets = sets, method = "sum", nperm = 10000)
head(mma)

Set test based on hyperG

mma <- gsea(stat = as.matrix(ma)[,"p"], sets = sets, method = "hyperg", threshold = 0.05)
head(mma)

使用UK生物库表型数据

加载库

library(devtools)
install_github("psoerensen/qgg")
library(qgg)

为qgg准备表型数据

本教程的表型数据来自UK生物库。
表型文件获得使用ukbtools

load(file="./data/phenotypes/ukb_data.Rdata")
rel <- read.table(file="./data/genotypes/ukb31269_rel_s488363.dat",sep=" ", header=TRUE)
  
ids1 <- rel[,1]
ids2 <- rel[,2]
ids_related <- ids2[!ids2%in%ids1]
ids_related <- ids2

df <- ukb_data
is.British <- df[,63]=="British"
df <- df[is.British,]
dim(df)

is.white <- df[,81]=="Caucasian"
is.white[is.na(is.white)] <- FALSE 
df <- df[is.white,]
dim(df)

is.related <- df[,1]%in%ids_related
df <- df[!is.related,]
dim(df)

is.aneuploidy <- df[,124]=="Yes"
is.aneuploidy[is.na(is.aneuploidy)] <- FALSE 
df <- df[!is.aneuploidy,]
dim(df)

is.missing <- df[,80]<=0.05
df <- df[is.missing,]
dim(df)

df$eid <- as.character(df$eid)
rownames(df) <- df$eid 

clsCovar <- c("sex_0_0",
       "year_of_birth_0_0",
           "genetic_principal_components_0_1",
           "genetic_principal_components_0_2",
           "genetic_principal_components_0_3",
           "genetic_principal_components_0_4",
           "genetic_principal_components_0_5",
           "genetic_principal_components_0_6",
           "genetic_principal_components_0_7",
           "genetic_principal_components_0_8",
           "genetic_principal_components_0_9",
           "genetic_principal_components_0_10")


covar <- df[,clsCovar]
colnames(covar) <- c("sex","age",paste("pc",1:10,sep=""))
covar[,2] <- 2018-covar[,2] 

save(covar,file="./data/covar_ukb.Rdata")

clsPhenotypes <- c(4:12,20:43,45:62,66:71,180:183)
 
y <- df[,clsPhenotypes] 
isNA <- colSums(is.na(y))<23000
y <- y[,isNA]
dim(y)
y <- scale(y)

colnames(y) <- gsub("_0_0","",colnames(y))

df <- data.frame(covar,y)

save(df,file="./data/phenotypes/df_wbu_ukb.Rdata")

for ( i in 1:ncol(y)) {
isNA <- is.na(y[,i])
X <- model.matrix( ~ sex + age + pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10, data=covar[!isNA,])
fit <- fastlm(y=y[!isNA,i],X=X)
y[!isNA,i] <- y[!isNA,i]-fit$yhat 
print(i)
    }

y <- scale(y)

colnames(y) <- gsub("_0_0","",colnames(y))

save(y,file="./data/phenotypes/phenotypes_scaled_ukb.Rdata")


load(file="./data/phenotypes_scaled_ukb.Rdata")

noNA <- rowSums(is.na(y))==0
idsT <- rownames(y)[noNA] 

ntrain <- rep(c(100000,150000,200000,250000,300000), each=5) 

train <- sapply(ntrain, function(x){sample(idsT,x)})

ytrain <- NULL 
for ( i in 1:length(train)) {
yobs <- y
isT <- rownames(yobs)%in%train[[i]]
yobs[!isT,] <- NA  
ytrain <- cbind(ytrain,as.matrix(yobs))
print(i)
    } 

reps <- as.character(rep(rep(1:5,each=10),times=5))
reps <- paste(rep(c("100K","150K","200K","250K","300K"),each=50),reps,sep="_")

colnames(ytrain) <- gsub("_0_0","",colnames(ytrain))
colnames(ytrain) <- paste(colnames(ytrain),reps,sep="_")

save(ytrain,file="./data/ytrain_scaled_ukb.Rdata")
save(train,file="./data/train_ukb.Rdata")

cls <- grep("height",colnames(ytrain))
ytrain <- ytrain[,cls]
save(ytrain, file="./data/ytrain_scaled_height_ukb.Rdata")

使用英国生物库数据指南

本教程的表型和遗传数据来自英国生物库。

准备qgg的基因型数据

加载库

library(devtools)
install_github("psoerensen/qgg")
library(qgg)

表型数据加载

load(file="./data/phenotypes/df_wbu_ukb.Rdata")
idsWBU <- rownames(df)

指定bed,bim,fam文件(用于PLINK)

bimfiles <- paste("./data/genotypes/ukb_snp_chr",1:22,"_v2.bim",sep="")
bedfiles <- paste("./data/genotypes/ukb_cal_chr",1:22,"_v2.bed",sep="")
famfiles <- paste("./data/genotypes/ukb31269_cal_chr",1:22,"_v2_s488363.fam",sep="")

指定项目中应该使用哪些个体(这是可选的)

ids <- idsWBU

指定所有qgg分析中使用的基因型文件的名称(扩展名)。“raw”现在是必需的)

fnRAW <- "./data/genotypes/wbu_ukb.raw"

准备带有基因型信息的Glist结构

Glist <- gprep(study="WBU_UKB", fnRAW=fnRAW, bedfiles=bedfiles, bimfiles=bimfiles, famfiles=famfiles, ids=ids)
  
save(Glist, file="./data/genotypes/Glist_WBU_UKB.Rdata", compress=FALSE)

REML分析

加载库

library(qgg)

加载表型

load(file="/data/phenotypes/df_wbu_ukb.Rdata")
y <- na.omit(as.matrix(df[,"standing_height",drop=FALSE])[,1])
X <- model.matrix( ~ sex + age + pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10, data=df[names(y),])

加载基因型和过滤低质量和MHC标记

load(file="/data/genotypes/Glist_WBU_UKB.Rdata")
isMAF <- Glist$maf<=0.01
isMISS <- Glist$nmiss/length(Glist$study_ids) > 0.05
isMHC <- Glist$chr==6 & Glist$position > 25602429 & Glist$position < 33471466
delete <- isMAF | isMISS | isMHC 

rsids <- Glist$rsids[!delete]

使用不同研究群体大小估计遗传力

ncores <- 12
nrep <- 5
ntrain <- c(10000,20000,30000,50000) 
   
for ( i in c(1:nrep) ) {
idsG <- sample(names(y),max(ntrain))
fnG <- "/faststorage/project/ukbiobank/grm/G.grm" 
GRMlist <- grm( Glist=Glist, ids=idsG, rsids=rsids, scale=TRUE, msize=100, ncores=12, fnG=fnG, overwrite=TRUE)
fit <- vector(length=length(ntrain),mode="list")
     for ( j in 1:length(ntrain) ) {
       idsT <- idsG[1:ntrain[j]] 
       fit[[j]] <- greml( y=y[idsT], X=as.matrix(X[idsT,]), GRMlist=GRMlist, theta=c(0.25,0.05), ncores=ncores, interface="fortran", maxit=10, verbose=TRUE)
       print(c(i,ntrain[j]))
     }
fnGREML <- paste("./results/greml_rep",i,"_100K_WBU_UKB.Rdata",sep="")
save(fit, file=fnGREML)
   }

利用染色体信息进行遗传能力的分配

nrep <- 5
ntrain <- 50000

fit <- vector(length=nrep,mode="list")

for ( i in 1:nrep) { 
     idsG <- sample(names(y),ntrain)
     fnG <- paste("/data/grm/G",1:22,".grm",sep="") 
     GRMlist <- NULL 
     for ( chr in 1:22 ) { 
       rsidsChr <- Glist$rsids[Glist$chr==chr]
       rsidsChr <- rsidsChr[rsidsChr%in%rsids] 
       GRMlist[[chr]] <- grm( Glist=Glist, ids=idsG, rsids=rsidsChr, scale=TRUE,   msize=100, ncores=16, fnG=fnG[chr], overwrite=TRUE)
     }
names(GRMlist)<- paste("CHR",1:22,sep="")
GRMlist <- mergeGRM( GRMlist ) 

fit[[i]] <- greml( y=y[idsG], X=as.matrix(X[idsG,]), GRMlist=GRMlist, theta=rep(0.01,23), ncores=16, interface="fortran", maxit=20, verbose=TRUE)

save( fit, file="./results/greml_chr_50K_WBU_UKB.Rdata")
   }

单标记分析

加载库

library(qgg)

Load phenotypes

load(file="./data/ytrain_scaled_height_ukb.Rdata")

加载基因型和过滤低质量和MHC标记

load(file="/data/genotypes/Glist_WBU_UKB.Rdata")
     isMAF <- Glist$maf<=0.01
     isMISS <- Glist$nmiss/length(Glist$study_ids) > 0.05
     isMHC <- Glist$chr==6 & Glist$position > 25602429 & Glist$position < 33471466
     delete <- isMAF | isMISS | isMHC 

rsids <- Glist$rsids[!delete]

使用不同研究人群规模的单一标记关联

ma <- lma( y=ytrain, Glist=Glist, rsids=rsids, scale=FALSE)

利用高斯-塞德尔求解线性混合模型

Load libraries

library(qgg)

为训练集加载表现型

load(file="/data/ytrain_scaled_height_ukb.Rdata")

Load genotypes

Load genotypes

加载lma结果

load(file="/results/ma_wbu_ukb.Rdata")

for ( top in c(10000,30000,50000,80000) ) {
     
nt <- ncol(ytrain)

g <- matrix(NA,nrow=Glist$n,ncol=nt)
       rownames(g) <- Glist$ids
       colnames(g) <- colnames(ytrain)

s <- vector(length=nt,mode="list")
       names(s) <- colnames(ytrain)

fnG <- paste("/results/ghat_top",top,"_wbu_ukb.Rdata",sep="")
fnS <- paste("/results/shat_top",top,"_wbu_ukb.Rdata",sep="")

for ( i in 1:nt) {

   rsids <- rownames(ma$p)[order(ma$p[,i],decreasing=FALSE)][1:top]

   yt <- na.omit(ytrain[,i])
   Xt <- model.matrix(yt~1)
   h2 <- 0.7
   lambda <- length(rsids)*(1-h2)/h2

   fit <- gsolve( y=yt, X=Xt, Glist=Glist, rsids=rsids, lambda=lambda, maxit=10, tol=1e-5 )
      names(fit$s) <- rsids
           g[,i] <- fit$g
           s[[i]] <- fit$s
           save(g, file=fnG)
           save(s, file=fnS)
      }
    }

总结预测精度

res <- NULL
for ( top in c("10000","30000","50000","80000")) { 
     fnG <- paste("./results/ghat_top",top,"_wbu_ukb.Rdata",sep="")
     load(file=fnG)
     accuracy <- NULL
     
     for ( i in 1:25) {
          yt <- na.omit(ytrain[,i])
          idsT <- names(yt)
          idsV <- rownames(y)[!rownames(y)%in%idsT]
          yobs <- y[idsV,"standing_height"]
          ypred <- g[idsV,i]
          accuracy <- rbind(accuracy,acc(yobs=yobs,ypred=ypred))
     }
     
     accuracy <- matrix(accuracy[,2],ncol=5)
     colnames(accuracy) <- c("100K","150K","200K","250K","300K")
     res[[as.character(top)]] <- accuracy
}
r2 <- sapply(res,colMeans)

基因集合富集分析

Load libraries
library(qgg)

Load Glist

load(file="data/genotypes/Glist_WBU_UKB.Rdata")
isMAF <- Glist$maf<=0.01
isMISS <- Glist$nmiss/length(Glist$study_ids) > 0.05
isMHC <- Glist$chr==6 & Glist$position > 25602429 & Glist$position < 33471466
delete <- isMAF | isMISS | isMHC 

rsids <- Glist$rsids[!delete]

根据汇总统计数据为人类的身高准备标记集
（数据源自https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium）
Load Giant results

download.file( url="https://portals.broadinstitute.org/collaboration/giant/images/0/01/GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt.gz",
destfile="./data/annotation/giant.txt.gz")

snp <- read.delim2(file="/data/annotation/giant.txt.gz")
snp <- snp[snp[,1]%in%rsids,]
o <- order(as.numeric(as.character(snp[,7])), decreasing=FALSE)
head(snp[o,]).
p <- as.numeric(as.character(snp$p))
names(p) <- snp$MarkerName

rsids_ordered <- as.character(snp[o,1])

sets <- split(rsids_ordered, ceiling(seq_along(rsids_ordered)/100))
save(sets, file="/data/annotation/setsGiant.Rdata")

加载单标记汇总统计(来自lma分析)

load(file="/data/results/ma_wbu_ukb.Rdata")

进行基因集富集分析

res <- mma(stat=ma$stat**2, sets=sets, ncores=16, np=1000000)
save(res,file="/data/results/mma_wbu_ukb.Rdata")

head(res$p)

谁曾经不是菜鸟啦

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
qgg包-续2-大数据集教程

qgg包-续2-大数据集教程目录qgg包-续2-大数据集教程使用1000基因组项目数据的教程下载1000 基因组数据准备qgg的基因型数据摸拟数据计算基因组关系矩阵(GRM)使用GRMlist(适用于大数据)的示例使用GRM(适用于小数据)的示例REML分析和交叉验证单标记关联和基因集富集分析使用UK生物库表型数据为qgg准备表型数据使用英国生物库数据指南准备qgg的基因型数据REML分析单标记分析利用高斯-塞德尔求解线性混合模型基因集合富集分析使用1000基因组项目数据的教程遗传数据来自公开可获得的
复制链接

扫一扫

专栏目录