Attach necessary libraries:
library(ASURATDB)
library(DOSE) # For using `data(DO2EG)`
The ASURATDB function `format_DO()` reformats a Disease Ontology database.
# Load the DO2EG dataset (available once DOSE is attached).
data(DO2EG)

# Run enrichDO() on all gene IDs found in DO2EG, with the p-/q-value cutoffs
# set to 1 and the gene set size limits set to 0 and 1e+10.
do_gene_ids <- unlist(DO2EG)
dict_DO <- enrichDO(
  do_gene_ids,
  ont = "DO",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  minGSSize = 0,
  maxGSSize = 1e+10,
  qvalueCutoff = 1,
  readable = FALSE
)

# Reformat the result table together with the gene IDs kept on the object.
human_DO <- format_DO(
  dict = dict_DO@result,
  all_geneIDs = dict_DO@gene,
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)

# Save data.
# save(human_DO, file = "genes2bioterm/20201213_human_DO.rda")
The data were stored in the following repositories:
The ASURATDB functions `collect_CO()` and `format_CO()` load a Cell Ontology database using the ontoProc package and reformat the database, respectively.
Tips: As of December 2020, the Cell Ontology database might not be complete enough for some biological contexts. For example, the well-known marker genes for pancreatic beta cells, Ins1 and Ins2, were not registered for “type B pancreatic cell” with ID “CL:0000169”.
# Human: collect the Cell Ontology dictionary, then reformat it.
orgdb_human <- org.Hs.eg.db::org.Hs.eg.db
dict_CO <- collect_CO(orgdb = orgdb_human)
human_CO <- format_CO(dict = dict_CO, orgdb = orgdb_human)
# Save data.
# save(human_CO, file = "genes2bioterm/20201213_human_CO.rda")

# Mouse: same two steps with the mouse annotation; dict_CO is overwritten.
orgdb_mouse <- org.Mm.eg.db::org.Mm.eg.db
dict_CO <- collect_CO(orgdb = orgdb_mouse)
mouse_CO <- format_CO(dict = dict_CO, orgdb = orgdb_mouse)
# Save data.
# save(mouse_CO, file = "genes2bioterm/20201211_mouse_CO.rda")
The data were stored in the following repositories:
The ASURATDB functions `collect_GO()` and `format_GO()` load a Gene Ontology database using the clusterProfiler package and reformat the database, respectively.
Currently, only human and mouse data are acceptable.
# Human
dict_GO <- collect_GO(orgdb = org.Hs.eg.db::org.Hs.eg.db)
human_GO <- format_GO(dict = dict_GO, orgdb = org.Hs.eg.db::org.Hs.eg.db)
# Human reduced: for each ontology, restrict the similarity matrix to the
# terms whose Count is at least 2. The term tables themselves are unchanged.
human_GO_red <- human_GO
onts <- c("MF", "BP", "CC")
for (ont in onts) {
  terms <- human_GO[[ont]]
  ids <- terms$ID[which(terms$Count >= 2)]
  # drop = FALSE keeps a matrix even if only one term passes the filter.
  human_GO_red$similarity_matrix[[ont]] <-
    human_GO$similarity_matrix[[ont]][ids, ids, drop = FALSE]
}
# Save data.
# save(human_GO_red, file = "genes2bioterm/20201213_human_GO_red.rda")

# Mouse
dict_GO <- collect_GO(orgdb = org.Mm.eg.db::org.Mm.eg.db)
mouse_GO <- format_GO(dict = dict_GO, orgdb = org.Mm.eg.db::org.Mm.eg.db)
# Mouse reduced: same restriction as for human.
mouse_GO_red <- mouse_GO
onts <- c("MF", "BP", "CC")
for (ont in onts) {
  terms <- mouse_GO[[ont]]
  ids <- terms$ID[which(terms$Count >= 2)]
  # drop = FALSE keeps a matrix even if only one term passes the filter.
  mouse_GO_red$similarity_matrix[[ont]] <-
    mouse_GO$similarity_matrix[[ont]][ids, ids, drop = FALSE]
}
# Save data.
# save(mouse_GO_red, file = "genes2bioterm/20201211_mouse_GO_red.rda")
The data were stored in the following repositories:
The ASURATDB functions `collect_KEGG()` and `format_KEGG()` load a KEGG database using the KEGGREST package via the internet and reformat the database, respectively.
The arguments of `collect_KEGG()` are `organism` and `categories`. Here, `organism` must obey the naming rule of KEGG (see the KEGGREST function `listDatabases()`) and `categories` must be one of "pathway", "module", and "drug" (only for human) in the current version.
# Human: only the entries under "success" are passed on to format_KEGG().
dict_KEGG <- collect_KEGG(organism = "hsa", categories = "pathway")
hsa_pathways <- dict_KEGG[["pathway"]][["success"]]
human_KEGG <- format_KEGG(
  dict = list(pathway = hsa_pathways),
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)
# Save data.
# save(human_KEGG, file = "genes2bioterm/20201213_human_KEGG.rda")

# Mouse
dict_KEGG <- collect_KEGG(organism = "mmu", categories = "pathway")
mmu_pathways <- dict_KEGG[["pathway"]][["success"]]
mouse_KEGG <- format_KEGG(
  dict = list(pathway = mmu_pathways),
  orgdb = org.Mm.eg.db::org.Mm.eg.db
)
# Save data.
# save(mouse_KEGG, file = "genes2bioterm/20201211_mouse_KEGG.rda")

# Human (drug)
dict_KEGG_drug <- collect_KEGG(organism = "hsa", categories = "drug")
hsa_drugs <- dict_KEGG_drug[["drug"]][["success"]]
human_KEGG_drug <- format_KEGG(
  dict = list(drug = hsa_drugs),
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)
# Save data.
# save(human_KEGG_drug, file = "genes2bioterm/20221102_human_KEGG_drug.rda")
Note that `collect_KEGG()` uses the KEGGREST function `keggGet()`, which may produce both successful and unsuccessful results.
The data were stored in the following repositories:
Load databases, where category is “H” (hallmark gene sets) and species is human (cf. `msigdbr::msigdbr_species()`).
# Human gene sets of MSigDB category "H".
dbtable <- msigdbr::msigdbr(
  species = "Homo sapiens",
  category = "H"
)
Reformat the database.
# Lookup table of the unique (gs_name, gs_id) pairs.
dbtable_gsetID <- dbtable[, colnames(dbtable) %in% c("gs_name", "gs_id")]
dbtable_gsetID <- unique(dbtable_gsetID)

# Entrez IDs and gene symbols grouped per gene set.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))

# split() orders its groups by the sorted gs_name values, which need not be
# the row order of dbtable_gsetID, so gene set IDs are looked up by name
# rather than by position.
gset_names <- as.character(names(dbtable_geneID))
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# One row per gene set, built in a single step; this also gives an empty
# table (instead of an error) when there are no gene sets.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  IC = rep(NA, length(gset_names)),
  row.names = NULL
)
human_MSigDB_Hallmark <- list(hallmark = res)
# Save data.
# save(human_MSigDB_Hallmark, file = "genes2bioterm/20230127_human_MSigDB_Hallmark.rda")
The data were stored in the following repositories:
Load databases, where category is “C2” (curated gene sets), subcategory is “CP:BIOCARTA”, and species is human (cf. `msigdbr::msigdbr_species()`).
# Human gene sets of MSigDB category "C2", restricted to the rows whose
# subcategory is "CP:BIOCARTA".
dbtable <- msigdbr::msigdbr(
  species = "Homo sapiens",
  category = "C2"
)
is_biocarta <- dbtable$gs_subcat %in% "CP:BIOCARTA"
dbtable <- dbtable[is_biocarta, ]
Reformat the database.
# Lookup table of the unique (gs_name, gs_id) pairs.
dbtable_gsetID <- dbtable[, colnames(dbtable) %in% c("gs_name", "gs_id")]
dbtable_gsetID <- unique(dbtable_gsetID)

# Entrez IDs and gene symbols grouped per gene set.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))

# split() orders its groups by the sorted gs_name values, which need not be
# the row order of dbtable_gsetID, so gene set IDs are looked up by name
# rather than by position.
gset_names <- as.character(names(dbtable_geneID))
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# One row per gene set, built in a single step; this also gives an empty
# table (instead of an error) when there are no gene sets.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  IC = rep(NA, length(gset_names)),
  row.names = NULL
)
human_MSigDB_BIOCARTA <- list(BIOCARTA = res)
# Save data.
# save(human_MSigDB_BIOCARTA, file = "genes2bioterm/20230211_human_MSigDB_BIOCARTA.rda")
The data were stored in the following repositories:
Load databases, where category is “C3” (regulatory target gene sets), subcategory is “TFT:GTRD”, and species is human (cf. `msigdbr::msigdbr_species()`).
# Human gene sets of MSigDB category "C3", restricted to the rows whose
# subcategory is "TFT:GTRD".
dbtable <- msigdbr::msigdbr(
  species = "Homo sapiens",
  category = "C3"
)
is_gtrd <- dbtable$gs_subcat %in% "TFT:GTRD"
dbtable <- dbtable[is_gtrd, ]
Reformat the database.
# Lookup table of the unique (gs_name, gs_id) pairs.
dbtable_gsetID <- dbtable[, colnames(dbtable) %in% c("gs_name", "gs_id")]
dbtable_gsetID <- unique(dbtable_gsetID)

# Entrez IDs and gene symbols grouped per gene set.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))

# split() orders its groups by the sorted gs_name values, which need not be
# the row order of dbtable_gsetID, so gene set IDs are looked up by name
# rather than by position.
gset_names <- as.character(names(dbtable_geneID))
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# One row per gene set, built in a single step; this also gives an empty
# table (instead of an error) when there are no gene sets.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  IC = rep(NA, length(gset_names)),
  row.names = NULL
)
human_MSigDB_GTRD <- list(GTRD = res)
# Save data.
# save(human_MSigDB_GTRD, file = "genes2bioterm/20230211_human_MSigDB_GTRD.rda")
The data were stored in the following repositories:
Load databases.
# Fetch the clustermole marker table, then print the distinct values of its
# `db` column in sorted order.
dbtable <- clustermole::clustermole_markers()
sort(unique(dbtable[["db"]]))
[1] "ARCHS4" "CellMarker" "MSigDB" "PanglaoDB" "SaVanT" "TISSUES"
[7] "xCell"
Select species and databases.
# Keep the human rows that come from MSigDB, then add an empty geneID column
# to be filled in later.
is_human_msigdb <- dbtable$species == "Human" & dbtable$db == "MSigDB"
dbtable <- dbtable[which(is_human_msigdb), ]
dbtable[["geneID"]] <- NA
Change gene symbols into entrez IDs.
# Build a SYMBOL -> ENTREZID dictionary for the gene symbols in dbtable.
# (`keys` is spelled out; the previous `key =` relied on partial matching.)
dictionary <- AnnotationDbi::select(
  org.Hs.eg.db::org.Hs.eg.db,
  keys = dbtable$gene_original,
  columns = c("SYMBOL", "ENTREZID"),
  keytype = "SYMBOL"
)
# Keep the first row per symbol and drop rows without a symbol.
dictionary <- dictionary[!duplicated(dictionary$SYMBOL), ]
dictionary <- dictionary[which(!is.na(dictionary$SYMBOL)), ]

# Vectorised lookup. A symbol with no dictionary row gets NA here, whereas
# the former per-row loop stopped with "replacement has length zero".
dbtable$geneID <- dictionary$ENTREZID[
  match(dbtable$gene_original, dictionary$SYMBOL)
]
Reformat the database. Here, the identifier of each biological term is prefixed with “MSigDBID:”.
# Entrez IDs and gene symbols grouped per cell type.
dbtable_geneID <- split(x = dbtable$geneID, f = dbtable$celltype)
dbtable_symbol <- split(x = dbtable$gene_original, f = dbtable$celltype)
stopifnot(identical(length(dbtable_geneID), length(dbtable_symbol)))

# One row per cell type, built in a single step instead of growing a data
# frame with rbind() inside a 1:length() loop. IDs are numbered in the order
# of the split() groups. sprintf() is used because, unlike paste0(), it
# returns a zero-length result for zero-length input, so no cell types gives
# an empty table rather than an error.
celltypes <- as.character(names(dbtable_geneID))
res <- data.frame(
  ID = sprintf("MSigDBID:%d", seq_along(celltypes)),
  Description = celltypes,
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  IC = rep(NA, length(celltypes)),
  row.names = NULL
)
human_MSigDB <- list(cell = res)
# Save data.
# save(human_MSigDB, file = "genes2bioterm/20220308_human_MSigDB.rda")
The data were stored in the following repositories:
Load databases.
# Fetch the clustermole marker table, then print the distinct values of its
# `db` column in sorted order.
dbtable <- clustermole::clustermole_markers()
sort(unique(dbtable[["db"]]))
[1] "ARCHS4" "CellMarker" "MSigDB" "PanglaoDB" "SaVanT" "TISSUES"
[7] "xCell"
Select species and databases.
# Keep the human rows that come from CellMarker, then add an empty geneID
# column to be filled in later.
is_human_cellmarker <- dbtable$species == "Human" & dbtable$db == "CellMarker"
dbtable <- dbtable[which(is_human_cellmarker), ]
dbtable[["geneID"]] <- NA
Change gene symbols into entrez IDs.
# Build a SYMBOL -> ENTREZID dictionary for the gene symbols in dbtable.
# (`keys` is spelled out; the previous `key =` relied on partial matching.)
dictionary <- AnnotationDbi::select(
  org.Hs.eg.db::org.Hs.eg.db,
  keys = dbtable$gene_original,
  columns = c("SYMBOL", "ENTREZID"),
  keytype = "SYMBOL"
)
# Keep the first row per symbol and drop rows without a symbol.
dictionary <- dictionary[!duplicated(dictionary$SYMBOL), ]
dictionary <- dictionary[which(!is.na(dictionary$SYMBOL)), ]

# Vectorised lookup. A symbol with no dictionary row gets NA here, whereas
# the former per-row loop stopped with "replacement has length zero".
dbtable$geneID <- dictionary$ENTREZID[
  match(dbtable$gene_original, dictionary$SYMBOL)
]
Reformat the database. Here, the identifier of each biological term is prefixed with “CellMarkerID:”.
# Entrez IDs and gene symbols grouped per cell type.
dbtable_geneID <- split(x = dbtable$geneID, f = dbtable$celltype)
dbtable_symbol <- split(x = dbtable$gene_original, f = dbtable$celltype)
stopifnot(identical(length(dbtable_geneID), length(dbtable_symbol)))

# One row per cell type, built in a single step instead of growing a data
# frame with rbind() inside a 1:length() loop. IDs are numbered in the order
# of the split() groups. sprintf() is used because, unlike paste0(), it
# returns a zero-length result for zero-length input, so no cell types gives
# an empty table rather than an error.
celltypes <- as.character(names(dbtable_geneID))
res <- data.frame(
  ID = sprintf("CellMarkerID:%d", seq_along(celltypes)),
  Description = celltypes,
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  IC = rep(NA, length(celltypes)),
  row.names = NULL
)
human_CellMarker <- list(cell = res)
# Save data.
# save(human_CellMarker, file = "genes2bioterm/20220308_human_CellMarker.rda")
The data were stored in the following repositories:
Create a cell type-related database by combining the Cell Ontology and MSigDB databases for analyzing human single-cell transcriptome data.
# Load human_CO and human_MSigDB from the repository, then stack their
# "cell" tables into one combined table.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
for (rda in c("20201213_human_CO.rda", "20220308_human_MSigDB.rda")) {
  load(url(paste0(urlpath, rda, "?raw=true")))
}
res <- rbind(human_CO[["cell"]], human_MSigDB[["cell"]])
human_CB <- list(cell = res)
Create a cell type-related database by combining the Cell Ontology, MSigDB, and CellMarker databases for analyzing human single-cell transcriptome data.
# Load human_CO, human_MSigDB and human_CellMarker from the repository, then
# stack their "cell" tables into one combined table.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=true")))
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=true")))
# The file name uses the date under which human_CellMarker is saved
# (20220308); the earlier "20220304" did not match it.
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=true")))
res <- rbind(
  human_CO[["cell"]],
  human_MSigDB[["cell"]],
  human_CellMarker[["cell"]]
)
human_CB <- list(cell = res)
Create a cell type-related database by combining the Disease Ontology, Cell Ontology, and MSigDB databases for analyzing complex human single-cell transcriptome data.
# Load human_DO, human_CO and human_MSigDB from the repository, then stack
# the "disease" table and the two "cell" tables into one combined table.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
rda_files <- c(
  "20201213_human_DO.rda",
  "20201213_human_CO.rda",
  "20220308_human_MSigDB.rda"
)
for (rda in rda_files) {
  load(url(paste0(urlpath, rda, "?raw=true")))
}
res <- rbind(
  human_DO[["disease"]],
  human_CO[["cell"]],
  human_MSigDB[["cell"]]
)
human_CB <- list(cell = res)
Create a cell type-related database by combining the Disease Ontology, Cell Ontology, MSigDB, and CellMarker databases for analyzing complex human single-cell transcriptome data.
# Load human_DO, human_CO, human_MSigDB and human_CellMarker from the
# repository, then stack the "disease" table and the three "cell" tables
# into one combined table.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
load(url(paste0(urlpath, "20201213_human_DO.rda?raw=true")))
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=true")))
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=true")))
# The file name uses the date under which human_CellMarker is saved
# (20220308); the earlier "20220304" did not match it.
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=true")))
res <- rbind(
  human_DO[["disease"]],
  human_CO[["cell"]],
  human_MSigDB[["cell"]],
  human_CellMarker[["cell"]]
)
human_CB <- list(cell = res)
# Print the R version, platform, locale, and attached/loaded packages of the
# session that produced this document.
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur ... 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.31 R6_2.5.1 jsonlite_1.8.4 evaluate_0.20
#> [5] cachem_1.0.6 rlang_1.0.6 cli_3.6.0 rstudioapi_0.14
#> [9] jquerylib_0.1.4 bslib_0.4.2 rmarkdown_2.20 tools_4.2.1
#> [13] xfun_0.37 yaml_2.3.7 fastmap_1.1.0 compiler_4.2.1
#> [17] htmltools_0.5.4 knitr_1.42 sass_0.4.5