生信基础 | 人-小鼠基因之间的比较

ca35e235cbb9a45c3b1fac920e9e47bc.gif

接前文:生信基础 | 人-小鼠同源基因之间的转换

00ae1b48986797bbdf9803509d7bfef0.png

先看看MSigDB中的基因ENTREZID是否可以全部转化为SYMBOL。

library(biomaRt)
library(clusterProfiler)
library(org.Hs.eg.db)
library(org.Mm.eg.db)
library(AnnotationDbi)
load("F:/BioInfoStudy/data/genesets/mouse_H_v5p2.rdata")
load("F:/BioInfoStudy/data/genesets/human_H_v5p2.rdata")
hsa_ent <- c()
for(x in names(Hs.H)){
  hsa_ent <- c(hsa_ent,Hs.H[[x]])
}
hsa_ent <- unique(hsa_ent)
length(hsa_ent)


hsa_ent2symb <- bitr(hsa_ent,
                     fromType = "ENTREZID",
                     toType = "SYMBOL",
                     OrgDb = org.Hs.eg.db)
dim(hsa_ent2symb)


diff_ent <- setdiff(hsa_ent,hsa_ent2symb$ENTREZID)


ent2symb <- toTable(org.Hs.egSYMBOL)
ent2symb[match(diff_ent,ent2symb$gene_id),]


AnnotationDbi::select(org.Hs.eg.db,
                      keys=diff_ent,columns=c("SYMBOL"),
                      keytype="ENTREZID" )
mget(diff_ent,
     org.Hs.egSYMBOL,
     ifnotfound=NA

通过多种转换方式,发现人的ENTREZID有3个无法找到对应的SYMBOL,NCBI上检索了一下,也没找到这3个基因。

> diff_ent
[1] "285464" "11217"  "80761"

再看看小鼠的。

###--------------------------------------小鼠
mus_ent <- c()
for(x in names(Mm.H)){
  mus_ent <- c(mus_ent,Mm.H[[x]])
}
mus_ent <- unique(mus_ent)
length(mus_ent)


mus_ent2symb <- bitr(mus_ent,
                     fromType = "ENTREZID",
                     toType = "SYMBOL",
                     OrgDb = org.Mm.eg.db)
dim(mus_ent2symb)


diff_ent <- setdiff(mus_ent,mus_ent2symb$ENTREZID)


ent2symb <- toTable(org.Hs.egSYMBOL)
ent2symb[match(diff_ent,ent2symb$gene_id),]


AnnotationDbi::select(org.Hs.eg.db,
                      keys=diff_ent,columns=c("SYMBOL"),
                      keytype="ENTREZID" )


#org.Mm.egMAPCOUNTS
mget(diff_ent,
     org.Mm.egSYMBOL,
     ifnotfound=NA)

居然有15个没有找到对应的symbol。

> diff_ent
 [1] "675165"    "100038993" "100861969" "100043385" "100504735" "105245585" "546061"   
 [8] "625758"    "668131"    "665343"    "667941"    "11641"     "100039053" "100043918"
[15] "100043921"

写一个函数,用于人-小鼠同源基因的转换;

####------------------------------
conversion_BtHsaAndMusGenes <- function(genes,species){
  human <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                            dataset = "hsapiens_gene_ensembl")
  mouse <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                            dataset = "mmusculus_gene_ensembl")
  if(species == "human"){
    gt <- biomaRt::getLDS(attributes = c("hgnc_symbol"),
                          filters = "hgnc_symbol",
                          values = genes,
                          mart = human,
                          attributesL = c("mgi_symbol"),
                          martL = mouse,uniqueRows = T)
  }else if(species == "mouse"){
    gt <- biomaRt::getLDS(attributes = c("mgi_symbol"),
                          filters = "mgi_symbol",
                          values = genes , mart = mouse,
                          attributesL = c("hgnc_symbol"),
                          martL = human, uniqueRows=T)
  }else{stop("Wrong species information")}
  return(gt)
}

加载我自己从gtf文件读入的所有基因信息进行转换。

load(file = "K:/BioInfoFiles/human/gencode.v35_human_hsaGeneInfo.Rdata")
head(hsaGeneInfo)[1:3,]
# > head(hsaGeneInfo)
# ensembl   symbol      gene_type geneLength
# 1 ENSG00000000003   TSPAN6 protein_coding       4536
# 2 ENSG00000000005     TNMD protein_coding       1476
# 3 ENSG00000000419     DPM1 protein_coding       1207


load(file = "K:/BioInfoFiles//mouse/gencode.vM27_mouse_musGeneInfo.Rdata")
head(musGeneInfo)
# > head(musGeneInfo)[1:4,]
# ensembl symbol      gene_type geneLength
# 1 ENSMUSG00000000001  Gnai3 protein_coding       3262
# 2 ENSMUSG00000000003   Pbsn protein_coding        902
# 3 ENSMUSG00000000028  Cdc45 protein_coding       3506
# 4 ENSMUSG00000000031    H19         lncRNA       2460


###把小鼠基因转换为人的同源基因
m2h <- conversion_BtHsaAndMusGenes(genes = musGeneInfo$symbol,species = "mouse")


head(m2h)
# > head( m2h)[1:2,]
# MGI.symbol HGNC.symbol
# 1    mt-Atp6     MT-ATP6
# 2     mt-Co2      MT-CO2
dim(m2h)
# > dim(m2h)
# [1] 23075     2
###把人基因转换为小鼠的同源基因
h2m <- conversion_BtHsaAndMusGenes(genes = hsaGeneInfo$symbol,species = "human")
h2m[1:2,]
# > h2m[1:2,]
# HGNC.symbol MGI.symbol
# 1      MT-CO2     mt-Co2
# 2      MT-CO1     mt-Co1
dim(h2m)
# > dim(h2m)
# [1] 22714     2


##有重复的
length(unique(m2h$HGNC.symbol))
length(unique(m2h$MGI.symbol))
# > length(unique(m2h$HGNC.symbol))
# [1] 18411
# > length(unique(m2h$MGI.symbol))
# [1] 19308

这里比较MSigDB数据集中小鼠与人基因之间的差异并统计。

statgeneinfo <- data.frame()
identical(names(Hs.H),names(Mm.H))
#gs <- names(Mm.H)[1]
for(gs in names(Mm.H)){
  total_pre_hsa <- length(Hs.H[[gs]])
  total_pre_mus <- length(Mm.H[[gs]])
  hsa_symbol <- bitr(Hs.H[[gs]],
                     fromType = "ENTREZID",
                     toType = "SYMBOL",
                     OrgDb = org.Hs.eg.db)[,2] %>% na.omit()
  total_after_hsa <- length(hsa_symbol)
  mus_symbol <- bitr(Mm.H[[gs]],
                     fromType = "ENTREZID",
                     toType = "SYMBOL",
                     OrgDb = org.Mm.eg.db)[,2] %>% na.omit()
  total_after_mus <- length(mus_symbol)


  hsa_mus_con_g <- intersect(hsa_symbol,m2h[match(mus_symbol,mus_ent2symb$SYMBOL),2])
  hsa_mus_con_len <- length(hsa_mus_con_g)
  hsa_uniq <- total_pre_hsa - hsa_mus_con_len
  mus_uniq <- total_pre_mus - hsa_mus_con_len
  hsa_na <- total_pre_hsa - total_after_hsa
  mus_na <- total_pre_mus - total_after_mus


  info <- data.frame(dataset = gs,
                     total_pre_hsa = total_pre_hsa,
                     total_pre_mus = total_pre_mus,
                     total_after_hsa = total_after_hsa,
                     total_after_mus = total_after_mus,
                     hsa_mus_con_len = hsa_mus_con_len,
                     hsa_uniq = hsa_uniq,
                     mus_uniq = mus_uniq,
                     hsa_na = hsa_na,
                     mus_na = mus_na)
  statgeneinfo <- rbind(statgeneinfo,info)
}

ec6f2b963dba371c4ed05b3a6658c920.png

total_pre_hsa和total_pre_mus:原始数据集中的基因个数

total_after_hsa和total_after_mus:进行基因转换后的基因个数

hsa_mus_con_len:人和小鼠共有的基因个数

hsa_uniq和mus_uniq:人和小鼠中各特有的基因个数。

hsa_na和mus_na:未能将ENTREZID转化为SYMBOL的个数


参考:

生信基础 | 人-小鼠同源基因之间的转换

biomaRt包实现不同物种之间同源基因转换

生信中各种ID转换

基于小鼠的基因集数据库资源

765522fb5bdb3d5f1fbe6f22db056c83.png

d7e9b3a08217951b103ae985806009b8.png

1d79cd1d75cff011a32bba1f2ff59e13.png

7d9b1b257ba6452fb97544acbccb0fb4.png

05257874987191a956acd6df0c402bf4.png

483621fcf3a8a7a3702121aef6ef1bfd.png

2f36b991ee55a43f800556e4558e6bf2.png

592178e1ce08110b8e8d4d173ad7ee4e.png

e3a925562b4039557ba3d0984eb2f475.png

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

【云森】

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值