MR分析（主要记录一下，有多个分析时，一些循环的代码，以及分析前数据清洗准备工作）

onlooker_zsq

已于 2024-06-28 17:12:37 修改

阅读量449

点赞数 6

文章标签： r语言

于 2024-04-24 14:47:37 首次发布

本文链接：https://blog.csdn.net/onlookerq/article/details/137714288

版权

setwd('H:\\Puberty_SEM\\MR_GWAS')
library(TwoSampleMR)
library(dplyr)
files = c('ukb-a-25')
files[1]

这里用计算继续F值，但是由于要用到samplesize，所以先用if语句判断一下数据中是否含有samplesize值，如果是NA值，就跳过，这样就减少了观看每个文档的时间；这条命令也适用于其他的很多种场景中；所以简单记录一下；

for (i in seq_along(files)){
exposure_dat <- extract_instruments(outcomes = files[i])
if(any(is.na(exposure_dat$samplesize.exposure))){
  print(paste("列", "column_name", "包含NA值，跳过这个步骤。"))
}else{
exposure_dat = mutate(exposure_dat,R=get_r_from_bsen(exposure_dat$beta.exposure,
                                                     exposure_dat$se.exposure,
                                                     exposure_dat$samplesize.exposure))
exposure_dat = mutate(exposure_dat,F=(exposure_dat$samplesize.exposure-2)*((R*R)/(1-R*R)))
exposure_dat = exposure_dat %>% filter(F>10)
}
outcome_dat <- read_outcome_data(
  snps = exposure_dat$SNP,
  filename = "PSY_factor.txt",
  sep = "\t",
  snp_col = "rsID",
  beta_col = "BETA",
  se_col = "SE",
  effect_allele_col = "EA",
  other_allele_col = "NEA",
  pval_col = "P",
  samplesize_col = "N"
)
dat <- harmonise_data(
  exposure_dat = exposure_dat, 
  outcome_dat = outcome_dat
)
filename1 <- paste0("dat_",files[i], ".txt")#这里给输出文件命名一下
write.table(dat,file=filename1,row.names = F,sep = '\t',quote = F)
res <- mr(dat)
res = generate_odds_ratios(res)
filename2 <- paste0("res_",files[i], ".txt")
write.table(res,file=filename2,row.names = F,sep = '\t',quote = F)
het <- mr_heterogeneity(dat)
filename3 <- paste0("het_",files[i], ".txt")
write.table(res,file=filename3,row.names = F,sep = '\t',quote = F)
plt <- mr_pleiotropy_test(dat)
filename4 <- paste0("plt_",files[i], ".txt")
write.table(res,file=filename4,row.names = F,sep = '\t',quote = F)
#这里一样用了一个if循环判断一下
if(any(is.na(exposure_dat$samplesize.exposure))){
  print(paste("列", "column_name", "包含NA值，跳过这个步骤。"))
}else{
out <- directionality_test(dat)
filename5 <- paste0("out_",files[i], ".txt")
}
write.table(res,file=filename5,row.names = F,sep = '\t',quote = F)
}

##批量提取某GWAS数据中多个snp 1Mb范围内p值小于5e-8的所有snplibrary(data.table)

library(dplyr)
df = fread('age of menarche.txt',header = T)
files = c(##多个snp名，进行循环
)
files[1]
for (i in seq_along(files)){
  df1 = df[(df$RSID==files[i]),] #df 数据框中RSID列等于files的那一行
  a = df1$CHR  #df 数据框中RSID列等于files的那一行，的CHR
  a = as.numeric(a)
  b = df1$POS
  b = as.numeric(b)
  subset_df = subset(df,CHR==a & POS<(b+500000) & POS>(b-500000) & P<5e-8) #提取df数据框中CHR==a，POS 1Mb范围内P值小于5e-8的SNP
  filename1 <- paste0("voice_",files[i], ".txt")
  write.table(subset_df,file = filename1,row.names = F,sep = '\t',quote = F)
}

#RSID获取其chr和bp

library(dplyr)
library(data.table)
library(tidyr)
match = fread("snp150_hg19.txt",header=T,check.names=F,sep="\t")#用hg19作为reference 进行转换
files = c(#多个GWAS数据
)
files[1]
for (i in seq_along(files)){
tes = fread(files[i],header=T,check.names=F,sep="\t") 
colnames(tes)[1] = 'name'#因为snp150_hg19.txt数据中列名为name，所以转换以下
need=dplyr::left_join(tes,match,by="name")
need = na.omit(need)
need=temp%>%separate(`chromosome:start`,c('CHR','BP'),'[:]') 
colnames(need)=c('SNP','A1','A2','Freq','BETA','SE','P','N','CHR','BP') 

need = select(need,c('SNP','CHR','BP','A1','A2','Freq','BETA','SE','P','N'))

write.table(need,file = filest[i],append = F,quote = F ,sep = "\t",row.names=F)
}

#批量添加表型列和按不同的阈值筛选IVs

setwd('H:\\Puberty_SEM\\MR_GWAS\\Exposure')
library(dplyr)
library(data.table)
library(tidyr)
filest <- list.files(pattern = "*.txt")
filelent <- length(filest)
filest[2]
lf <-list.files(pattern = ".txt$") #以report.tsv 结尾的
files <- gsub("\\.txt", "", lf) 
files[2]
for(i in 1:length(filest)){
  temp <-fread(filest[i],header = T,sep = "\t")
  column_to_check <- "Phenotype"#检查是否有表型列，如果没有添加文件名作为表型列
  if(column_to_check %in% colnames(temp)){
    print(paste(column_to_check, "exist in the dataframe"))
  } else {
    temp = mutate(temp,Phenotype = files[i])
  }
  less_than_5e_8_count <- sum(temp$P < 5e-8)#检查GWAS summary data数据是否用P值小于5e-8的值，如果有用5e-8作为阈值，如果没有用1e-5作为阈值；
  if (less_than_5e_8_count < 1000) {
    temp1e_5 = temp %>% filter(P<1e-5)
    temp1e_5 = 
    select(temp1e_5,c('SNP','CHR','BP','A1','A2','BETA','SE','P','N','Phenotype'))
    filename1 <- paste(files[i],'.1E_5')
    write.table(temp1e_5,file = filename1,append = F,quote = F ,sep = "\t",row.names=F)
  } else{
    temp5e_8 = temp %>% filter(P<5e-8)
    temp5e_8 = 
    select(temp5e_8,c('SNP','CHR','BP','A1','A2','BETA','SE','P','N','Phenotype'))#为了合并时列数和列名均一致
    filename2 <- paste(files[i],'.5E_8')
    write.table(temp5e_8,file = filename2,append = F,quote = F ,sep = "\t",row.names=F)
  }
}
#将1e-5和5e-8的数据分别合并
filest <- list.files(pattern = "*.1E_5")
filelent <- length(filest)
newdatat <- c()
for(i in 1:length(filest)){
  temp <-read.table(filest[i],header = T,sep = "\t")
  newdatat = rbind(newdatat,temp)
}
write.table(x = newdatat,file = "Exposure_1e_5.txt",append = F,quote = F ,sep = "\t",row.names=F)

filest <- list.files(pattern = "*.5E_8")
filelent <- length(filest)
newdatat <- c()
for(i in 1:length(filest)){
  temp <-read.table(filest[i],header = T,sep = "\t")
  newdatat = rbind(newdatat,temp)
}
write.table(x = newdatat,file = "Exposure_5E_8.txt",append = F,quote = F ,sep = "\t",row.names=F)

##ieu opengwas 数据（.vcf.gz）转换为twosampleMR的格式

setwd('H:\\Puberty_SEM\\MR_GWAS\\Biomarkers\\biomarkers')
library(VariantAnnotation)
library(gwasglue)
library(TwoSampleMR)
library(ieugwasr)
library(dplyr)
lf <-list.files(pattern = ".gz$")#以report.tsv 结尾的
files <- gsub("\\.gz", "", lf) 
files[1]
for(i in 1:length(lf)){
exposure_vcf <- readVcf(lf[i])
exposure_data <- gwasvcf_to_TwoSampleMR(exposure_vcf)
write.table(exposure_data,file = files[i],row.names = F,sep = '\t',quote = F)
}
library(data.table)
library(dplyr)
lf <-list.files(pattern = ".txt$")
for(i in 1:length(lf)){
  temp <-fread(lf[i],header = T,sep = "\t")
  colnames(temp)[colnames(temp)=='chr.exposure'] = 'CHR'
  colnames(temp)[colnames(temp)=='pos.exposure'] = 'POS'
  colnames(temp)[colnames(temp)=='other_allele.exposure'] = 'A2'
  colnames(temp)[colnames(temp)=='effect_allele.exposure'] = 'A1'
  colnames(temp)[colnames(temp)=='beta.exposure'] = 'BETA'
  colnames(temp)[colnames(temp)=='se.exposure'] = 'SE'
  colnames(temp)[colnames(temp)=='pval.exposure'] = 'P'
  colnames(temp)[colnames(temp)=='eaf.exposure'] = 'freq'
  colnames(temp)[colnames(temp)=='samplesize.exposure'] = 'N'
  colnames(temp)[colnames(temp)=='SNP'] = 'SNP'
  temp = select(temp,c('SNP','CHR','POS','A1','A2','freq','BETA','SE','P','N'))
  write.table(temp,file = lf[i],append = F,quote = F ,sep = "\t",row.names=F)
}

setwd('H:\\Puberty_SEM\\MR_GWAS\\Biomarkers\\biomarkers')
library(TwoSampleMR)
filest <- list.files(pattern = "*.vcf")
filelent <- length(filest)
filest[2]
files <- gsub("\\.vcf", "", filest)   
files[1]
exposure_dat = read.table('./../PSY_factor_clump.txt',header = T,sep = '\t')
for (i in 1:length(filest )){
    outcome_dat <- read_outcome_data(
    snps = exposure_dat$SNP,
    filename = filest[i],
    sep = "\t",
    snp_col = "SNP",
    beta_col = "BETA",
    se_col = "SE",
    effect_allele_col = "A1",
    other_allele_col = "A2",
    pval_col = "P",
    samplesize_col = "N",
    phenotype_col = 'Phenotype'
  )
  dat <- harmonise_data(
    exposure_dat = exposure_dat, 
    outcome_dat = outcome_dat
  )
  filename1 <- paste0("dat_",files[i], ".txt")
  write.table(dat,file=filename1,row.names = F,sep = '\t',quote = F)
  res <- mr(dat)
  res = generate_odds_ratios(res)
  filename2 <- paste0("res_",files[i], ".txt")
  write.table(res,file=filename2,row.names = F,sep = '\t',quote = F)
  het <- mr_heterogeneity(dat)
  filename3 <- paste0("het_",files[i], ".txt")
  write.table(het,file=filename3,row.names = F,sep = '\t',quote = F)
  plt <- mr_pleiotropy_test(dat)
  filename4 <- paste0("plt_",files[i], ".txt")
  write.table(plt,file=filename4,row.names = F,sep = '\t',quote = F)
  if(any(is.na(outcome_dat$samplesize.outcome))){
    print(paste("列", "column_name", "包含NA值，跳过这个步骤。"))
  }else{
    out <- directionality_test(dat)
    filename5 <- paste0("out_",files[i], ".txt")
  }
  write.table(out,file=filename5,row.names = F,sep = '\t',quote = F)
}

setwd('H:\\Puberty_SEM\\MR_GWAS\\Biomarkers\\biomarkers\\res')
filest <- list.files(pattern = "*.txt")
filelent <- length(filest)
newdatat <- c()
for(i in 1:length(filest)){
  temp <-read.table(filest[i],header = T,sep = '\t')
  newdatat = rbind(newdatat,temp)
}
write.table(x = newdatat,file = "res.txt",append = F,quote = F ,sep = "\t",row.names=F)

library(MRlap)
library(data.table)
library(dplyr)
setwd('H:\\Puberty_SEM\\MR_GWAS\\Biomarkers\\biomarkers')
filest <- list.files(pattern = "*.vcf")
filelent <- length(filest)
filest[2]
files <- gsub("\\.vcf", "", filest)   
files[1]
outcome_dat = fread('PSY_factor.txt',header = T)
colnames(outcome_dat)[colnames(outcome_dat)=='BP'] = 'pos'
colnames(outcome_dat)[colnames(outcome_dat)=='CHR'] = 'chr'
user_SNP_input = read.table('PSY_factor_clump.txt',header = T)
user_SNP_input = user_SNP_input$SNP
for (i in seq_along(filest)){
  exposure_dat = fread(filest[i],header = T)
  colnames(exposure_dat)[colnames(exposure_dat)=='BP'] = 'pos'
  colnames(exposure_dat)[colnames(exposure_dat)=='CHR'] = 'chr'
  exposure_dat$A1 = toupper(exposure_dat$A1)
  exposure_dat$A2 = toupper(exposure_dat$A2)
    A = MRlap(exposure = outcome_dat,
              exposure_name = 'mvPuberty',
              outcome = exposure_dat,
              outcome_name = files[i],
              ld = "./../../../eur_w_ld_chr",
              hm3 = "./../../../eur_w_ld_chr/w_hm3.snplist",
              do_pruning = FALSE,
              user_SNPsToKeep = user_SNP_input)
    filename1 <- paste0(files[i],"_MRlap", ".RData")
    save(A,file = filename1)
  }
}

setwd('H:\\Puberty_SEM\\MR_GWAS\\Biomarkers\\biomarkers\\MRlap')
library(dplyr)
filest <- list.files(pattern = ".RData")
files <- gsub("\\.RData", "", filest) 
filelent <- length(filest)
for(i in 1:length(filest)){
  load(filest[i])
  df1 = unlist(A["MRcorrection"])
  df1 = t(df1)
  df1 = as.data.frame(df1)
  df1 = select(df1,c('MRcorrection.observed_effect',
                     'MRcorrection.observed_effect_se',
                     'MRcorrection.m_IVs',
                     'MRcorrection.observed_effect_p',
                     'MRcorrection.corrected_effect',
                     'MRcorrection.corrected_effect_se',
                     'MRcorrection.corrected_effect_p',
                     'MRcorrection.test_difference',
                     'MRcorrection.p_difference'))
  df2 = unlist(A["LDSC"])
  df2 = t(df2)
  df2 = as.data.frame(df2)
  filename1 <- paste(files[i],'.txt')
  filename2 <- paste(files[i],'_ldsc','.txt')
  write.table(df1,file = filename1,append = F,quote = F ,sep = "\t",row.names=F)
  write.table(df2,file = filename2,append = F,quote = F ,sep = "\t",row.names=F)
}
setwd('H:\\Puberty_SEM\\MR_GWAS\\Biomarkers\\biomarkers\\MRlap\\MRlap')
filest <- list.files(pattern = "*.txt")
filelent <- length(filest)
files <- gsub("\\_MRlap.txt", "", filest)
merge.data <- read.table(file = filest[1],header=T,sep="\t")   
merge.data <- cbind(files[1], merge.data )#左侧增加列/cbind( merge.data, dir[1])#右边增加列
colnames(merge.data)[1]='ID'
for(i in 2:40){
  new.data <- read.table(file = filest[i], header=T, sep="\t")
  new.data <- cbind(files[i], new.data)
  colnames(new.data)[1]='ID'
  merge.data <- rbind(merge.data,new.data)
}
write.table(x = merge.data,file = "MRlap.txt",append = F,quote = F ,sep = "\t",row.names=F)

setwd('G:\\materal_prengancy\\New_analysis')
library(TwoSampleMR)
library(dplyr)
library(ieugwasr)
library(data.table)
exposure_dat = read_exposure_data('./BW.txt',
                                  sep = "\t",
                                  snp_col = "SNP",
                                  beta_col = "BETA",
                                  se_col = "SE",
                                  effect_allele_col = "A1",
                                  other_allele_col = "A2",
                                  pval_col = "P",
                                  samplesize_col = 'N',
                                  phenotype_col = 'Phenotype')
colnames(exposure_dat)[1] ='rsid'
colnames(exposure_dat)[6] = 'pval'
colnames(exposure_dat)[11] = 'id'
exposure_dat=ld_clump(dat = exposure_dat,
                      plink_bin = 'C:/Users/DELL/Documents/R/win-library/4.1/plinkbinr/bin/plink_Windows.exe',
                      bfile = "H:\\Puberty_SEM\\MR_GWAS\\EUR\\EUR",#链接：http://fileserve.mrcieu.ac.uk/ld/1kg.v3.tgz
                      clump_kb = 1000,
                      clump_r2 = 0.01
)
colnames(exposure_dat)[1] ='SNP'
colnames(exposure_dat)[6] = 'pval.exposure'
colnames(exposure_dat)[11] = 'id.exposure'
write.table(exposure_dat,file = './../Offspring_BW.txt',row.names = F,sep = '\t',quote = F)

exposure_dat = mutate(exposure_dat,R=get_r_from_bsen(exposure_dat$beta.exposure,
                                                     exposure_dat$se.exposure,
                                                     exposure_dat$samplesize.exposure))
exposure_dat = mutate(exposure_dat,F=(exposure_dat$samplesize.exposure-2)*((R*R)/(1-R*R)))
exposure_dat = exposure_dat %>% filter(F>10)
exposure_dat = read.table('BW.txt',header = T,sep = '\t')
outcome_path = 'G:\\materal_prengancy\\New_analysis\\outcome/'
lf <-list.files(path = outcome_path,pattern = ".txt$") 
files <- gsub("\\.txt", "", lf) 
filest = paste('./outcome/',lf,sep = '')
files[1]
for (i in 1:length(filest)){
    outcome_dat <- read_outcome_data(
    snps = exposure_dat$SNP,
    filename = filest[i],
    sep = "\t",
    snp_col = "SNP",
    beta_col = "BETA",
    se_col = "SE",
    effect_allele_col = "A1",
    other_allele_col = "A2",
    pval_col = "P",
    samplesize_col = "N",
    phenotype_col = 'Phenotype'
  )
  dat <- harmonise_data(
    exposure_dat = exposure_dat, 
    outcome_dat = outcome_dat
  )
  filename1 <- paste('BW_',files[i],'_dat','.txt')
  write.table(dat,file=filename1,row.names = F,sep = '\t',quote = F)
  res <- mr(dat)
  res = generate_odds_ratios(res)
  filename2 <- paste('BW_',files[i],'_res','.txt')
  write.table(res,file=filename2,row.names = F,sep = '\t',quote = F)
  het <- mr_heterogeneity(dat)
  filename3 <- paste('BW_',files[i],'_het','.txt')
  write.table(het,filename3,row.names = F,sep = '\t',quote = F)
  plt <- mr_pleiotropy_test(dat)
  filename4 <- paste('BW_',files[i],'_plt','.txt')
  write.table(plt,file=filename4,row.names = F,sep = '\t',quote = F)
  if(any(is.na(outcome_dat$samplesize.outcome))){
    print(paste("列", "column_name", "包含NA值，跳过这个步骤。"))
  }else{
    out <- directionality_test(dat)
    filename5 <- paste('BW_',files[i],'_out','.txt')
  }
  write.table(out,file=filename5,row.names = F,sep = '\t',quote = F)
}

#######################################################################MR-Lasso
setwd('G:\\materal_prengancy\\New_analysis\\BW_Puberty_MR/dat/')
library(MendelianRandomization)
library(dplyr)
filest <- list.files(pattern = "*.txt")
filelent <- length(filest)
files <- gsub("\\.txt", "", filest)
for(i in 1:length(filest)){
df = read.table(filest[i],header = T,sep = '\t')
df = split(df,df$exposure)
df1 = df[['Offspring_BW']]
filename1 <- paste('Offspring_BW_',files[i],'.txt')
write.table(df1,file = filename1,row.names = F,sep = '\t',quote = F)
df2 = df[['Fetal_BW']]
filename2 <- paste('Fetal_BW_',files[i],'.txt')
write.table(df2,file = filename2,row.names = F,sep = '\t',quote = F)
df3 = df[['Maternal_BW']]
filename3 <- paste('Maternal_BW_',files[i],'.txt')
write.table(df3,file = filename3,row.names = F,sep = '\t',quote = F)
}
setwd('G:\\materal_prengancy\\New_analysis\\BW_Puberty_MR/dat/MR-LASSO/')
filest <- list.files(pattern = "*.txt")
filelent <- length(filest)
files <- gsub("\\.txt", "", filest)
for(i in 1:length(filest)){
  data = read.table(filest[i],header = T,sep = '\t')
  data = select(data,c('SNP','beta.exposure','se.exposure','beta.outcome','se.outcome','exposure','outcome'))
  colnames(data) = c('snps','bx','bxse','by','byse','exposure','outcome')
  data$by = as.numeric(data$by)
  data$byse = as.numeric(data$byse)
  data$bx = as.numeric(data$bx)
  data$bxse = as.numeric(data$bxse)
  data =mr_input(bx = data$bx,bxse = data$bxse,by = data$by,byse=data$byse)
  df = mr_lasso(data,distribution = "normal", alpha = 0.05)
  filename4 <- paste(files[i],'.RData')
  save(df,file = filename4)
}
filest <- list.files(pattern = "*.RData")
filelent <- length(filest)
files <- gsub("\\.RData", "", filest)
for(i in 1:length(filest)){
  load(filest[i])
  df1 = c(df@Valid,df@Estimate,df@StdError,df@Pvalue,df@Lambda)
  df1 = as.data.frame(df1)
  df1 = t(df1)
  df1 = as.data.frame(df1)
  colnames(df1)=c('SNPS','beta','se','p','lambda')
  df1 = mutate(df1,ID=files[i])
  filename5 <- paste('MR_Lasso_',files[i],'.txt')
  write.table(df1,file =filename5,row.names = F,sep = '\t',quote = F)
}

setwd('G:\\materal_prengancy\\New_analysis\\BW_Puberty_MR\\dat/MR-LASSO/')
filest <- list.files(pattern = "*.txt")
filelent <- length(filest)
newdatat <- c()
for(i in 1:length(filest)){
  temp <-read.table(filest[i],header = T,sep = '\t')
  newdatat = rbind(newdatat,temp)
}
write.table(x = newdatat,file = "BW_Puberty_MR-LASSO.txt",append = F,quote = F ,sep = "\t",row.names=F)