ReactomePA数据库下载及后续格式处理

Reactome分析需要用到ReactomePA包处理

由于硬件能力有限,只能借助多个脚本实现(主打一个那个方便上手用哪个),总之自用能解决问题,及时记录一下,可能后面还能用的到的

最终想要得到的结果类型如下:

文件1:pathway_TERM2NAME.txt

文件2:pathway_TERM2SYMBOL.txt

第一步ReactomePA数据库下载:

rm(list = ls())
# 安装 ReactomePA
BiocManager::install("ReactomePA")
library(ReactomePA)
library(reactome.db)
ls("package:reactome.db")
keytypes(reactome.db)
library(dplyr)

PathwayId_Genes <- as.list(reactomePATHID2EXTID) %>% .[grep("HSA",names(.))]
PathwayId_PathwayName <- as.list(reactomePATHID2NAME) %>% .[grep("HSA",names(.))]
PathwayName_PathwayId <- as.list(reactomePATHNAME2ID) %>% .[grep("HSA",.)]
Genes_PathwayId <- as.list(reactomeEXTID2PATHID) %>% .[grep("HSA",.)]
name_id <- unlist(lapply(PathwayName_PathwayId, function(x) x[[1]]))
unpair <- PathwayId_PathwayName[!names(PathwayId_PathwayName) %in% name_id]
names(unpair)

##PathwayId_Genes,列表:pathwayID genes
##PathwayId_PathwayName列表:pathwayID 功能描述信息
##通过pathwayID使得两个文件的共同列具有ID顺序一致
PathwayId_PathwayName <- PathwayId_PathwayName[names(PathwayId_Genes)] #保持相同顺序

##HSA_list列表包括三个元素,
HSA_list <-  mapply(c, PathwayId_Genes, PathwayId_PathwayName, SIMPLIFY=FALSE) # 合并list

# 提取 pathway描述信息pathwayID、genes、PathwayName
#unlist(strsplit(x[length(x)],":"))[2]
#unlist(strsplit(hsa_list$`R-HSA-109582`[623],":"))[2]


description = unlist(lapply(HSA_list , function(x) {
  paste(unlist(strsplit(x[length(x)],":"))[2])
}))

# 提取 pathway 所有 genes list
genes = unlist(lapply(HSA_list , function(x) {
  paste(unlist(x)[1:(length(x)-1)],collapse =';')
}))
# 合并
HSA_pathway = data.frame(Pathway_Name = description, Genes = genes)


write.table(HSA_pathway, file = "HSA.txt", append = FALSE, quote = FALSE, sep = "\t",
                 eol = "\n", na = "NA", dec = ".", row.names = TRUE,
                 col.names = TRUE, qmethod = c("escape", "double"),
                 fileEncoding = "")

得到的HSA.txt结果文件如下:

第二步:进行结果格式处理

(1)通过shell简单处理

le HSA.txt  | awk -F "\t" '{print $1"\t"$2}' | sed '1d' | sed '1ipathway\tdescription' >hsa/pathway_TERM2NAME.txt

le HSA.txt  | awk -F "\t" '{print $1"\t"$3}' | sed '1d' | sed '1ipathway\tgeneID' >hsa/TERM2SYMBOL.txt

(2)通过python处理

python sy.py

pathway_dic = {}
with open ("TERM2SYMBOL.txt", 'r') as fr, open ("pathway_geneid.txt", 'w') as fw1:
    for line in fr:
        if "pathway" in line:
            print(line.strip(), file=fw1)
        else:
            line=line.strip().split("\t")
            pathwayid=line[0]
            geneid = line[1]
            if geneid in pathway_dic:pass
            else:
                pathway_dic[pathwayid] = geneid

    for k, v in pathway_dic.items():
        g = v.strip().split(";")
        for i in g:
            print(k+"\t"+i,file=fw1)

输出文件pathway_geneid.txt格式如下:

(3)进行ID格式转换,利用R脚本进行ENTREZID和SYMBOL转换

library(org.Hs.eg.db)
library(org.Mm.eg.db)
library(org.Rn.eg.db)
pathway_gene<- read.table("pathway_geneid.txt", header=T)
g_list<-as.character(pathway_gene$geneID)
g_symbol = mapIds(x = org.Hs.eg.db,#注释包
                   keys = g_list, #需要转换的基因ID
                   keytype = "ENTREZID", #需要转换的类型
                   column = "SYMBOL")  #需要转换为的类型

##小鼠
g_symbol = mapIds(x = org.Mm.eg.db,#注释包
                   keys = g_list, #需要转换的基因ID
                   keytype = "ENTREZID", #需要转换的类型
                   column = "SYMBOL")  #需要转换为的类型
##大鼠
g_symbol = mapIds(x = org.Rn.eg.db,#注释包
                   keys = g_list, #需要转换的基因ID
                   keytype = "ENTREZID", #需要转换的类型
                   column = "SYMBOL")  #需要转换为的类型
symbol_names<-as.character(g_symbol)
df <- data.frame(geneID=names(g_symbol), SYMBOL=symbol_names)
write.table(df, file = "genid_symblo.txt", append = FALSE, quote = F, sep = "\t",
                 eol = "\n", na = "NA", dec = ".", row.names = F,
                 col.names = TRUE, qmethod = c("escape", "double"),
                 fileEncoding = "")

输出文件(genid_symblo.txt)如下:

(4)根据以上两个输出文件的共同列,生成最终文件(pathway_TERM2SYMBOL.txt)

join pathway_geneid.txt  genid_symblo.txt  -1 2 -2 1 -t $'\t'  | cut -f2,3 >pathway_TERM2SYMBOL.txt

pathway_TERM2SYMBOL.txt格式如下

  • 3
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值