msigdbr hallmarks gsea broad研究所-CSDN博客

本文链接：https://blog.csdn.net/qq_52813185/article/details/134063222

关注微信：生信小博士

使用msigdbr r包

#BiocManager::install("msigdb")
#https://www.gsea-msigdb.org/gsea/msigdb
#https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html
#https://bioconductor.org/packages/release/data/experiment/vignettes/msigdb/inst/doc/msigdb.html#the-molecular-signatures-database-msigdb
#https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp
library(msigdb)

library(ExperimentHub)
library(GSEABase)

#6提取并制备人的hallmarks列表---------
all_gene_sets_hs = msigdbr::msigdbr(species = "Homo sapiens") #Mus musculus

all_gene_sets_hs 
all_gene_sets_hs$gs_name %>%table()
all_gene_sets_hs$gs_cat %>%table()
all_gene_sets_hs$gs_subcat  %>%table()
all_gene_sets_hs$gs_id  %>%table() %>%tail()

all_gene_sets_hs_list=split(x = all_gene_sets_hs$gene_symbol,f=all_gene_sets_hs$gs_name )

all_gene_sets_hs_list
length(all_gene_sets_hs_list)

MSIGDB_CANONICAL= all_gene_sets_hs %>% dplyr::filter(gs_cat=="H")
MSIGDB_CANONICAL

MSIGDB_CANONICAL_list=split(x=MSIGDB_CANONICAL$gene_symbol,f = MSIGDB_CANONICAL$gs_name)

length(MSIGDB_CANONICAL_list)

完整代码如下


.libPaths(c("/home/data/t040413/R/x86_64-pc-linux-gnu-library/4.2",
            "/home/data/t040413/R/yll/usr/local/lib/R/site-library", 
            "/usr/local/lib/R/library",
            "/home/data/refdir/Rlib/"))


#BiocManager::install("msigdb")
#https://www.gsea-msigdb.org/gsea/msigdb
#https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html
#https://bioconductor.org/packages/release/data/experiment/vignettes/msigdb/inst/doc/msigdb.html#the-molecular-signatures-database-msigdb
#https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp
library(msigdb)

library(ExperimentHub)
library(GSEABase)

#To download the data, we first need to get a list of the data available in the msigdb package and determine the unique identifiers for each data. The query() function assists in getting this list.

1#1 获取总的基因集合的名字--------
eh = ExperimentHub()
all_gene_sets_name=query(eh , 'msigdb')
all_gene_sets_name

#Data can then be downloaded using the unique identifier.
eh[['EH5421']]


#use the custom accessor to select a specific version of MSigDB
msigdb.hs = getMsigdb(org = 'hs', id = 'SYM', version = '7.4')
msigdb.hs




#Each signature is stored in a GeneSet object and can be processed using functions in the GSEABase R/Bioconductor package.

gs = msigdb.hs[[1000]]
gs
geneIds(gs)


#get collection type
collectionType(gs)


#get MSigDB category
bcCategory(collectionType(gs))


#get MSigDB subcategory
bcSubCategory(collectionType(gs))


#get description
description(gs)



#calculate the number of signatures in each category
table(sapply(lapply(msigdb.hs, collectionType), bcCategory))



#calculate the number of signatures in each subcategory
table(sapply(lapply(msigdb.hs, collectionType), bcSubCategory))

#plot the distribution of sizes
hist(sapply(lapply(msigdb.hs, geneIds), length),
     main = 'MSigDB signature size distribution',
     xlab = 'Signature size')


listCollections(msigdb.hs)
#> [1] "c1" "c2" "c3" "c4" "c5" "c6" "c7" "c8" "h"
listSubCollections(msigdb.hs)


#retrieeve the hallmarks gene sets
subsetCollection(msigdb.hs, 'h')



#retrieve the biological processes category of gene ontology
subsetCollection(msigdb.hs, 'c5', 'GO:BP')



#4提取所有人类基因集合数据----------
all_species=msigdbr::msigdbr_species()
all_species
all_gene_sets_hs = msigdbr::msigdbr(species = "Homo sapiens") #Mus musculus

head(all_gene_set_hs)

#查看所有的collections------
all_collections=msigdbr::msigdbr_collections()
all_collections


#5 提取人 鼠 特定的ollection------
#You can retrieve data for a specific collection, such as the hallmark gene sets.

m_gene_sets = msigdbr::msigdbr(species = "mouse", category = "H")
head(m_gene_sets)

h_gene_sets = msigdbr::msigdbr(species = "human", category = "H")
head(h_gene_sets)



h_gene_sets$gs_name

#6提取并制备人的hallmarks列表---------
all_gene_sets_hs = msigdbr::msigdbr(species = "Homo sapiens") #Mus musculus
#saveRDS(all_gene_sets_hs,file="~/datasets/all_gene_sets_hs_msigdb.rds")

all_gene_sets_hs 
all_gene_sets_hs$gs_name %>%table()
all_gene_sets_hs$gs_cat %>%table()
all_gene_sets_hs$gs_subcat  %>%table()
all_gene_sets_hs$gs_id  %>%table() %>%tail()
all_gene_sets_hs_list=split(x = all_gene_sets_hs$gene_symbol,f=all_gene_sets_hs$gs_name )
all_gene_sets_hs_list
length(all_gene_sets_hs_list)
MSIGDB_CANONICAL= all_gene_sets_hs %>% dplyr::filter(gs_cat=="H")
MSIGDB_CANONICAL
MSIGDB_CANONICAL_list=split(x=MSIGDB_CANONICAL$gene_symbol,f = MSIGDB_CANONICAL$gs_name)
length(MSIGDB_CANONICAL_list)
names(MSIGDB_CANONICAL_list)

#saveRDS(MSIGDB_CANONICAL_list,file = "~/datasets/Genesets_Dec19.rds")