Attach necessary libraries:
library(ASURATDB)
library(DOSE) # For using `data(DO2EG)`
ASURATDB function format_DO() reformats a Disease
Ontology database.
# Load the DO2EG object (made available by the DOSE package attached above).
data(DO2EG)
# Run enrichDO() on every gene listed in DO2EG with all cutoffs left fully
# open (p-value and q-value cutoffs of 1, no effective gene-set size limits).
all_do_genes <- unlist(DO2EG)
dict_DO <- enrichDO(
  gene = all_do_genes,
  ont = "DO",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  minGSSize = 0,
  maxGSSize = 1e+10,
  qvalueCutoff = 1,
  readable = FALSE
)
# Reformat the result table, passing the input gene IDs and the human
# annotation package to format_DO().
human_DO <- format_DO(
  dict = dict_DO@result,
  all_geneIDs = dict_DO@gene,
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)
# Save data.
# save(human_DO, file = "genes2bioterm/20201213_human_DO.rda")
The data were stored in the following repositories:
ASURATDB functions collect_CO() and
format_CO() load a Cell Ontology database using ontoProc
package and reformat the database, respectively.
Tips: As of December 2020, the Cell Ontology database might not be complete enough for some biological contexts. For example, the well-known marker genes for pancreatic beta cells, Ins1 and Ins2, were not registered for “type B pancreatic cell” (ID “CL:0000169”).
# Human: collect the Cell Ontology dictionary, then reformat it.
human_orgdb <- org.Hs.eg.db::org.Hs.eg.db
dict_CO <- collect_CO(orgdb = human_orgdb)
human_CO <- format_CO(dict = dict_CO, orgdb = human_orgdb)
# Save data.
# save(human_CO, file = "genes2bioterm/20201213_human_CO.rda")

# Mouse: same two steps with the mouse annotation package.
# Note that dict_CO is overwritten here.
mouse_orgdb <- org.Mm.eg.db::org.Mm.eg.db
dict_CO <- collect_CO(orgdb = mouse_orgdb)
mouse_CO <- format_CO(dict = dict_CO, orgdb = mouse_orgdb)
# Save data.
# save(mouse_CO, file = "genes2bioterm/20201211_mouse_CO.rda")
The data were stored in the following repositories:
ASURATDB functions collect_GO() and
format_GO() load a Gene Ontology database using
clusterProfiler package and reformat the database, respectively.
Currently, only human and mouse data are acceptable.
# Gene Ontology categories whose similarity matrices are reduced below.
onts <- c("MF", "BP", "CC")

# Human: collect the Gene Ontology dictionary, then reformat it.
dict_GO <- collect_GO(orgdb = org.Hs.eg.db::org.Hs.eg.db)
human_GO <- format_GO(dict = dict_GO, orgdb = org.Hs.eg.db::org.Hs.eg.db)
# Human reduced: copy the database and, for each category, restrict the
# similarity matrix to the terms whose Count is at least 2. Only the
# similarity matrices are reduced; the per-category tables are left as is.
human_GO_red <- human_GO
for (ont in onts) {
  tbl <- human_GO[[ont]]
  ids <- tbl$ID[which(tbl$Count >= 2)]
  # drop = FALSE keeps a two-dimensional result even when fewer than two
  # terms pass the filter (otherwise `[` collapses it to a vector/scalar).
  mat <- human_GO$similarity_matrix[[ont]][ids, ids, drop = FALSE]
  human_GO_red$similarity_matrix[[ont]] <- mat
}
# Save data.
# save(human_GO_red, file = "genes2bioterm/20201213_human_GO_red.rda")

# Mouse: same steps with the mouse annotation package.
# Note that dict_GO is overwritten here.
dict_GO <- collect_GO(orgdb = org.Mm.eg.db::org.Mm.eg.db)
mouse_GO <- format_GO(dict = dict_GO, orgdb = org.Mm.eg.db::org.Mm.eg.db)
# Mouse reduced: same reduction as for human.
mouse_GO_red <- mouse_GO
for (ont in onts) {
  tbl <- mouse_GO[[ont]]
  ids <- tbl$ID[which(tbl$Count >= 2)]
  mat <- mouse_GO$similarity_matrix[[ont]][ids, ids, drop = FALSE]
  mouse_GO_red$similarity_matrix[[ont]] <- mat
}
# Save data.
# save(mouse_GO_red, file = "genes2bioterm/20201211_mouse_GO_red.rda")
The data were stored in the following repositories:
ASURATDB functions collect_KEGG() and
format_KEGG() load a KEGG database using KEGGREST package
via the internet and reformat the database, respectively.
The arguments of collect_KEGG() are
organism and categories. Here,
organism must obey the naming rule of KEGG (see
KEGGREST function listDatabases()) and
categories must be one of "pathway",
"module", and "drug" (only for human) in the
current version.
# Human pathways: download, keep the successfully retrieved entries, reformat.
dict_KEGG <- collect_KEGG(organism = "hsa", categories = "pathway")
human_pathways <- dict_KEGG[["pathway"]][["success"]]
human_KEGG <- format_KEGG(
  dict = list(pathway = human_pathways),
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)
# Save data.
# save(human_KEGG, file = "genes2bioterm/20201213_human_KEGG.rda")

# Mouse pathways: same steps with organism "mmu".
# Note that dict_KEGG is overwritten here.
dict_KEGG <- collect_KEGG(organism = "mmu", categories = "pathway")
mouse_pathways <- dict_KEGG[["pathway"]][["success"]]
mouse_KEGG <- format_KEGG(
  dict = list(pathway = mouse_pathways),
  orgdb = org.Mm.eg.db::org.Mm.eg.db
)
# Save data.
# save(mouse_KEGG, file = "genes2bioterm/20201211_mouse_KEGG.rda")

# Human (drug): same steps for the "drug" category.
dict_KEGG_drug <- collect_KEGG(organism = "hsa", categories = "drug")
human_drugs <- dict_KEGG_drug[["drug"]][["success"]]
human_KEGG_drug <- format_KEGG(
  dict = list(drug = human_drugs),
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)
# Save data.
# save(human_KEGG_drug, file = "genes2bioterm/20221102_human_KEGG_drug.rda")
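Note that collect_KEGG() uses the KEGGREST function
keggGet(), which may produce both successful and
unsuccessful results; in the code above, only the "success" element
of each category is passed to format_KEGG(). The data were stored in the following
repositories: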
Load databases, where category is “H” (hallmark gene sets) and
species is human (cf. msigdbr::msigdbr_species()).
# Fetch the msigdbr table for species "Homo sapiens", category "H".
dbtable <- msigdbr::msigdbr(species = "Homo sapiens", category = "H")
Reformat the database.
# Table of unique (gs_name, gs_id) pairs, one row per gene set.
dbtable_gsetID <- dbtable[, which(colnames(dbtable) %in% c("gs_name", "gs_id"))]
dbtable_gsetID <- unique(dbtable_gsetID)
# Entrez IDs and gene symbols grouped by gene set; split() names each group
# by its gs_name and orders the groups by factor level, which need not be the
# row order of dbtable_gsetID.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))
# Look up each gene set's identifier by name rather than by position, so IDs
# and descriptions always belong to the gene lists they are paired with.
gset_names <- as.character(names(dbtable_geneID))
inds <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(inds))
# Build all rows at once (one row per gene set); genes are "/"-separated.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[inds],
  Description = gset_names,
  IC = rep(NA, length(dbtable_geneID)),
  Count = unname(lengths(dbtable_geneID)),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  row.names = NULL
)
human_MSigDB_Hallmark <- list(hallmark = res)
# Save data.
# save(human_MSigDB_Hallmark, file = "genes2bioterm/20230127_human_MSigDB_Hallmark.rda")
The data were stored in the following repositories:
Load databases, where category is “C2” (curated gene sets), subcategory is “CP:BIOCARTA”,
and species is human (cf. msigdbr::msigdbr_species()).
# Fetch the human category "C2" table, then keep the rows whose gs_subcat
# is "CP:BIOCARTA".
dbtable <- msigdbr::msigdbr(species = "Homo sapiens", category = "C2")
is_biocarta <- dbtable$gs_subcat %in% "CP:BIOCARTA"
dbtable <- dbtable[is_biocarta, ]
Reformat the database.
# Table of unique (gs_name, gs_id) pairs, one row per gene set.
dbtable_gsetID <- dbtable[, which(colnames(dbtable) %in% c("gs_name", "gs_id"))]
dbtable_gsetID <- unique(dbtable_gsetID)
# Entrez IDs and gene symbols grouped by gene set; split() names each group
# by its gs_name and orders the groups by factor level, which need not be the
# row order of dbtable_gsetID.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))
# Look up each gene set's identifier by name rather than by position, so IDs
# and descriptions always belong to the gene lists they are paired with.
gset_names <- as.character(names(dbtable_geneID))
inds <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(inds))
# Build all rows at once (one row per gene set); genes are "/"-separated.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[inds],
  Description = gset_names,
  IC = rep(NA, length(dbtable_geneID)),
  Count = unname(lengths(dbtable_geneID)),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  row.names = NULL
)
human_MSigDB_BIOCARTA <- list(BIOCARTA = res)
# Save data.
# save(human_MSigDB_BIOCARTA, file = "genes2bioterm/20230211_human_MSigDB_BIOCARTA.rda")
The data were stored in the following repositories:
Load databases, where category is “C3” (regulatory target gene sets), subcategory is “TFT:GTRD”,
and species is human (cf. msigdbr::msigdbr_species()).
# Fetch the human category "C3" table, then keep the rows whose gs_subcat
# is "TFT:GTRD".
dbtable <- msigdbr::msigdbr(species = "Homo sapiens", category = "C3")
is_gtrd <- dbtable$gs_subcat %in% "TFT:GTRD"
dbtable <- dbtable[is_gtrd, ]
Reformat the database.
# Table of unique (gs_name, gs_id) pairs, one row per gene set.
dbtable_gsetID <- dbtable[, which(colnames(dbtable) %in% c("gs_name", "gs_id"))]
dbtable_gsetID <- unique(dbtable_gsetID)
# Entrez IDs and gene symbols grouped by gene set; split() names each group
# by its gs_name and orders the groups by factor level, which need not be the
# row order of dbtable_gsetID.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))
# Look up each gene set's identifier by name rather than by position, so IDs
# and descriptions always belong to the gene lists they are paired with.
gset_names <- as.character(names(dbtable_geneID))
inds <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(inds))
# Build all rows at once (one row per gene set); genes are "/"-separated.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[inds],
  Description = gset_names,
  IC = rep(NA, length(dbtable_geneID)),
  Count = unname(lengths(dbtable_geneID)),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  row.names = NULL
)
human_MSigDB_GTRD <- list(GTRD = res)
# Save data.
# save(human_MSigDB_GTRD, file = "genes2bioterm/20230211_human_MSigDB_GTRD.rda")
The data were stored in the following repositories:
Load databases.
# Fetch the marker table provided by clustermole.
dbtable <- clustermole::clustermole_markers()
# List the distinct values of the `db` column (the source databases).
sort(unique(dbtable$db))
[1] "ARCHS4" "CellMarker" "MSigDB" "PanglaoDB" "SaVanT" "TISSUES"
[7] "xCell"
Select species and databases.
# Keep only the human rows that come from the "MSigDB" source.
is_human_msigdb <- dbtable$species %in% "Human" & dbtable$db %in% "MSigDB"
dbtable <- dbtable[is_human_msigdb, ]
# Placeholder column for the Entrez IDs.
dbtable$geneID <- NA
Change gene symbols into entrez IDs.
# Build a SYMBOL -> ENTREZID dictionary for the gene symbols in dbtable.
# The argument is spelled out as `keys` (the original `key` only worked
# through partial argument matching).
dictionary <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
                                    keys = dbtable$gene_original,
                                    columns = c("SYMBOL", "ENTREZID"),
                                    keytype = "SYMBOL")
# Keep the first row for each symbol and drop rows without a symbol.
dictionary <- dictionary[!duplicated(dictionary$SYMBOL), ]
dictionary <- dictionary[!is.na(dictionary$SYMBOL), ]
# Vectorized lookup: one pass instead of scanning the dictionary per row.
# Symbols with no dictionary row (e.g. NA symbols) get NA instead of
# raising "replacement has length zero".
inds <- match(dbtable$gene_original, dictionary$SYMBOL)
dbtable$geneID <- dictionary$ENTREZID[inds]
Reformat the database. Here, the identifiers of the biological terms are named “MSigDBID.”
# Entrez IDs and gene symbols grouped by cell type; split() names each group
# by its celltype value.
dbtable_geneID <- split(x = dbtable$geneID, f = dbtable$celltype)
dbtable_symbol <- split(x = dbtable$gene_original, f = dbtable$celltype)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))
# Build all rows at once (one row per cell type) instead of growing the data
# frame with rbind() inside a loop; this also works when there are no groups.
# IDs are "MSigDBID:<running index>"; genes are "/"-separated.
res <- data.frame(
  ID = sprintf("MSigDBID:%d", seq_along(dbtable_geneID)),
  Description = as.character(names(dbtable_geneID)),
  IC = rep(NA, length(dbtable_geneID)),
  Count = unname(lengths(dbtable_geneID)),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  row.names = NULL
)
human_MSigDB <- list(cell = res)
# Save data.
# save(human_MSigDB, file = "genes2bioterm/20220308_human_MSigDB.rda")
The data were stored in the following repositories:
Load databases.
# Fetch the marker table provided by clustermole.
dbtable <- clustermole::clustermole_markers()
# List the distinct values of the `db` column (the source databases).
sort(unique(dbtable$db))
[1] "ARCHS4" "CellMarker" "MSigDB" "PanglaoDB" "SaVanT" "TISSUES"
[7] "xCell"
Select species and databases.
# Keep only the human rows that come from the "CellMarker" source.
is_human_cellmarker <- dbtable$species %in% "Human" &
  dbtable$db %in% "CellMarker"
dbtable <- dbtable[is_human_cellmarker, ]
# Placeholder column for the Entrez IDs.
dbtable$geneID <- NA
Change gene symbols into entrez IDs.
# Build a SYMBOL -> ENTREZID dictionary for the gene symbols in dbtable.
# The argument is spelled out as `keys` (the original `key` only worked
# through partial argument matching).
dictionary <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
                                    keys = dbtable$gene_original,
                                    columns = c("SYMBOL", "ENTREZID"),
                                    keytype = "SYMBOL")
# Keep the first row for each symbol and drop rows without a symbol.
dictionary <- dictionary[!duplicated(dictionary$SYMBOL), ]
dictionary <- dictionary[!is.na(dictionary$SYMBOL), ]
# Vectorized lookup: one pass instead of scanning the dictionary per row.
# Symbols with no dictionary row (e.g. NA symbols) get NA instead of
# raising "replacement has length zero".
inds <- match(dbtable$gene_original, dictionary$SYMBOL)
dbtable$geneID <- dictionary$ENTREZID[inds]
Reformat the database. Here, the identifiers of the biological terms are named “CellMarkerID.”
# Entrez IDs and gene symbols grouped by cell type; split() names each group
# by its celltype value.
dbtable_geneID <- split(x = dbtable$geneID, f = dbtable$celltype)
dbtable_symbol <- split(x = dbtable$gene_original, f = dbtable$celltype)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))
# Build all rows at once (one row per cell type) instead of growing the data
# frame with rbind() inside a loop; this also works when there are no groups.
# IDs are "CellMarkerID:<running index>"; genes are "/"-separated.
res <- data.frame(
  ID = sprintf("CellMarkerID:%d", seq_along(dbtable_geneID)),
  Description = as.character(names(dbtable_geneID)),
  IC = rep(NA, length(dbtable_geneID)),
  Count = unname(lengths(dbtable_geneID)),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE),
  row.names = NULL
)
human_CellMarker <- list(cell = res)
# Save data.
# save(human_CellMarker, file = "genes2bioterm/20220308_human_CellMarker.rda")
The data were stored in the following repositories:
Create a cell type-related database by combining Cell ontology and MSigDB databases for analyzing human single-cell transcriptome data.
# Base URL of the stored .rda files.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
# Load each stored database into the current environment; the code below
# expects this to provide human_CO and human_MSigDB.
for (rda_file in c("20201213_human_CO.rda?raw=true",
                   "20220308_human_MSigDB.rda?raw=true")) {
  load(url(paste0(urlpath, rda_file)))
}
# Stack the two "cell" tables into a single table.
res <- do.call("rbind", list(human_CO[["cell"]], human_MSigDB[["cell"]]))
human_CB <- list(cell = res)
Create a cell type-related database by combining Cell ontology, MSigDB, and CellMarker databases for analyzing human single-cell transcriptome data.
# Base URL of the stored .rda files.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
# Load the stored databases; the code below expects these calls to provide
# human_CO, human_MSigDB and human_CellMarker.
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=true")))
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=true")))
# File name matches the one used when human_CellMarker was saved
# ("20220308_...", not "20220304_...").
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=true")))
# Stack the three "cell" tables into a single table.
res <- do.call("rbind", list(human_CO[["cell"]], human_MSigDB[["cell"]],
                             human_CellMarker[["cell"]]))
human_CB <- list(cell = res)
Create a cell type-related database by combining Disease Ontology, Cell ontology and MSigDB databases for analyzing complex human single-cell transcriptome data.
# Base URL of the stored .rda files.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
# Load each stored database into the current environment; the code below
# expects this to provide human_DO, human_CO and human_MSigDB.
for (rda_file in c("20201213_human_DO.rda?raw=true",
                   "20201213_human_CO.rda?raw=true",
                   "20220308_human_MSigDB.rda?raw=true")) {
  load(url(paste0(urlpath, rda_file)))
}
# Stack the "disease" table and the two "cell" tables into a single table.
res <- rbind(human_DO[["disease"]], human_CO[["cell"]],
             human_MSigDB[["cell"]])
human_CB <- list(cell = res)
Create a cell type-related database by combining Disease Ontology, Cell ontology, MSigDB, and CellMarker databases for analyzing complex human single-cell transcriptome data.
# Base URL of the stored .rda files.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
# Load the stored databases; the code below expects these calls to provide
# human_DO, human_CO, human_MSigDB and human_CellMarker.
load(url(paste0(urlpath, "20201213_human_DO.rda?raw=true")))
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=true")))
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=true")))
# File name matches the one used when human_CellMarker was saved
# ("20220308_...", not "20220304_...").
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=true")))
# Stack the "disease" table and the three "cell" tables into a single table.
res <- do.call("rbind", list(human_DO[["disease"]], human_CO[["cell"]],
                             human_MSigDB[["cell"]], human_CellMarker[["cell"]]))
human_CB <- list(cell = res)
# Print the R version, platform, and loaded packages of this session.
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur ... 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.31 R6_2.5.1 jsonlite_1.8.4 evaluate_0.20
#> [5] cachem_1.0.6 rlang_1.0.6 cli_3.6.0 rstudioapi_0.14
#> [9] jquerylib_0.1.4 bslib_0.4.2 rmarkdown_2.20 tools_4.2.1
#> [13] xfun_0.37 yaml_2.3.7 fastmap_1.1.0 compiler_4.2.1
#> [17] htmltools_0.5.4 knitr_1.42 sass_0.4.5