MacBook Pro (Big Sur, 16-inch, 2019), Processor (2.4 GHz 8-Core Intel Core i9), Memory (64 GB 2667 MHz DDR4).
Attach necessary libraries:
library(ASURAT)
library(SingleCellExperiment)
library(SummarizedExperiment)
Create random variables following Normal distributions and compute separation indices.
<- 1000
size for(i in c(3, 4, 5, 9)){
# Preparation
set.seed(1)
<- as.numeric(rnorm(size, mean = 5, sd = 1))
scores_0 names(scores_0) <- seq_len(size)
<- as.numeric(rnorm(size, mean = 1 + 1 * (i - 1), sd = 1))
scores_1 names(scores_1) <- size + seq_len(size)
<- c(scores_0, scores_1)
scores <- sort(scores, decreasing = FALSE)
scores <- ifelse(as.integer(names(scores)) <= size, 0, 1)
vec
# Count the number of steps in bubble sort.
<- bubble_sort(list(vec, 0))[[2]] # dist(vec, (0,...,1)).
dist1 <- bubble_sort(list(1 - vec, 0))[[2]] # dist(vec, (1,...,0)).
dist2 <- round((dist2 - dist1) / (dist2 + dist1), digits = 6)
sepI
# Plot the result.
<- data.frame(s = scores, l = vec)
df <- paste("I = ", round(sepI, digits = 3), sep = "")
mytext <- c(rainbow(3)[1], rainbow(3)[3])
mycolors <- factor(df$l, levels = c(1, 0))
color <- ggplot2::ggplot(df, ggplot2::aes(x = s, y = 0, color = color)) +
p ::geom_jitter(width = 0, alpha = 0.4, size = .5) +
ggplot2::theme_classic(base_size = 18) +
ggplot2::scale_y_continuous(breaks = NULL) +
ggplot2::scale_colour_manual(values = mycolors) +
ggplot2::labs(title = "", x = "Score", y = "", color = "Label") +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, vjust = -1,
ggplot2size = 16)) +
::annotate("text", x = Inf, y = Inf, label = mytext,
ggplot2hjust = 1, vjust = 1, size = 5) +
::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- ggExtra::ggMarginal(p, type = "density", margins = "x", size = 0.8,
p groupColour = TRUE, groupFill = TRUE)
<- sprintf("figures/figure_90_%04d.png", 19 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 4, height = 3)
ggplot2 }
Select GO IDs of interest.
<- c("GO:0070371", # ERK1 and ERK2 cascade
GOOI "GO:1904666") # regulation of ubiquitin protein ligase activity
Load the result of the section “Normalize data” in SCLC data analyses.
<- readRDS("<file path>") sclc
Load the result of the section “Compute correlation matrices” in SCLC data analyses.
<- readRDS("<file path>") cormat
Load databases.
<- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
urlpath load(url(paste0(urlpath, "20201213_human_GO_red.rda?raw=TRUE"))) # GO
The reformatted knowledge-based data were available from the following repositories:
Add formatted databases to metadata(sce)$sign
.
<- list(GO = sclc)
sclcs metadata(sclcs$GO) <- list(sign = human_GO[["BP"]])
Redefines functional gene sets for the input database by removing
genes, which are not included in rownames(sce)
, and further
removes biological terms including too few or too many genes.
$GO <- remove_signs(sce = sclcs$GO, min_ngenes = 2, max_ngenes = 1000) sclcs
Clusters functional gene sets using a correlation graph-based decomposition method, producing strongly, variably, and weakly correlated gene sets (SCG, VCG, and WCG, respectively).
set.seed(1)
$GO <- cluster_genesets(sce = sclcs$GO, cormat = cormat,
sclcsth_posi = 0.20, th_nega = -0.20)
Select IDs and prepare a correlation submatrix.
<- GOOI[1]
goid <- metadata(sclcs$GO)$sign
df <- unlist(strsplit(df[df$SignID == goid, ]$StrgCorrGene, "/"))
goi_strg <- unlist(strsplit(df[df$SignID == goid, ]$VariCorrGene, "/"))
goi_vari <- unlist(strsplit(df[df$SignID == goid, ]$WeakCorrGene, "/"))
goi_weak <- goi_strg[seq_len(6)]
goi_strg <- goi_weak[seq_len(10)]
goi_weak <- c(goi_weak[1], goi_weak[2], goi_strg[1:6], goi_weak[3:7],
goi_all 1:2], goi_weak[8:10])
goi_vari[#goi_all <- unique(c(goi_strg, goi_vari, goi_weak))
<- cormat[goi_all, goi_all]
cmat diag(cmat) <- 0
Plot the correlation graph.
<- goi_all
label which(goi_all %in% goi_strg)] <- 1
label[which(goi_all %in% goi_vari)] <- 2
label[which(goi_all %in% goi_weak)] <- 3
label[<- label
colors which(colors == 1)] <- "grey20"
colors[which(colors == 2)] <- "grey70"
colors[which(colors == 3)] <- "white"
colors[<- label
text_colors which(text_colors == "1")] <- "white"
text_colors[which(text_colors == "2")] <- "black"
text_colors[which(text_colors == "3")] <- "black"
text_colors[<- 9
node_sizes
<- "GO:0070371 (ERK1 and ERK2 cascade)"
title <- "figures/figure_90_0001"
filename ::qgraph(input = cmat, title = title, title.cex = 0.3, filename = filename,
qgraphfiletype = "png", height = 1, width = 1,
color = colors, vsize = node_sizes, label.color = text_colors,
border.width = 3, edge.labels = FALSE,
posCol = "red", negCol = "blue", threshold = 0)
where the average correlation coefficients of SCG, VCG, and WCG are 0.3389764, 0.2280671, and 0.011150431, respectively.
Perform the following ASURAT functions.
# Keep only GO terms of interest.
<- metadata(sclcs$GO)$sign
df <- df[which(df$SignID %in% GOOI), ]
df metadata(sclcs$GO)$sign <- df
# Creates signs.
$GO <- create_signs(sce = sclcs$GO, min_cnt_strg = 2, min_cnt_vari = 2)
sclcs
# Create sign-by-sample matrices.
$GO <- makeSignMatrix(sce = sclcs$GO, weight_strg = 0.5, weight_vari = 0.5) sclcs
Install an escape package v1.5, which requires R (>= 4.1).
# devtools::install_github("ncborcherding/escape")
packageVersion("escape")
[1] ‘1.5.1’
Set a GO term of interest.
<- c("GOBP_ERK1_AND_ERK2_CASCADE",
GOOI "GOBP_REGULATION_OF_UBIQUITIN_PROTEIN_LIGASE_ACTIVITY")
Load the result of the section “Normalize data” in SCLC data analyses.
<- readRDS("<file path>") sclc
Create Seurat objects.
<- Seurat::CreateSeuratObject(counts = as.matrix(assay(sclc, "normalized")),
sclc project = "SCLC")
Perform getGeneSets()
with an argument
library = "C5"
(“ontology gene sets” in MSigDB)
and select GO terms of interest.
@misc[["getGeneSets"]] <- escape::getGeneSets(species = "Homo sapiens",
sclclibrary = "C5")
<- sclc@misc[["getGeneSets"]]
df <- df[grepl("GOBP", names(df))]
df @misc[["getGeneSets"]] <- df
sclc@misc[["GO_TERMS"]] <- df[grepl(paste(GOOI, collapse="|"), names(df))] sclc
Perform enrichIt()
, estimating ssGSEA scores, in which
the arguments are the same with those in the vignettes in escape
package.
set.seed(1)
<- escape::enrichIt(obj = sclc, gene.sets = sclc@misc[["GO_TERMS"]],
ES groups = 1000, cores = 4)
@misc[["enrichIt"]] <- ES sclc
[1] "Using sets of 1000 cells. Running 3 times."
Setting parallel calculations through a SnowParam back-end
with workers=4 and tasks=100.
Estimating ssGSEA scores for 2 gene sets.
[1] "Calculating ranks..."
[1] "Calculating absolute values from ranks..."
Setting parallel calculations through a SnowParam back-end
with workers=4 and tasks=100.
Estimating ssGSEA scores for 2 gene sets.
[1] "Calculating ranks..."
[1] "Calculating absolute values from ranks..."
Setting parallel calculations through a SnowParam back-end
with workers=4 and tasks=100.
Estimating ssGSEA scores for 2 gene sets.
[1] "Calculating ranks..."
[1] "Calculating absolute values from ranks..."
packageVersion("pagoda2")
[1] ‘1.0.10’
Set GO terms of interest.
<- c("GO:0070371", # ERK1 and ERK2 cascade
GOOI "GO:1904666") # regulation of ubiquitin protein ligase activity
Load the result of the section “Normalize data” in SCLC data analyses.
<- readRDS("<file path>") sclc
Create a pagoda2 object.
<- as.matrix(assay(sclc, "normalized"))
mat <- pagoda2::Pagoda2$new(x = mat, modelType = "plain", n.cores = 1,
sclc log.scale = TRUE, min.cells.per.gene = 0)
2283 cells, 6346 genes; normalizing ...
Using plain model
log scale ...
done.
Adjust the variance.
$adjustVariance(plot = FALSE) sclc
calculating variance fit ...
using gam
1278 overdispersed genes ... 1278
persisting ...
done.
Perform principal component analysis, where n.odgenes
is
the number of overdispersed genes.
$calculatePcaReduction(nPcs = 50, n.odgenes = 1278) sclc
running PCA using 1278 OD genes .
.
.
.
done
Perform pathway overdispersion analysis. Here, computations were partially performed on the IPR server at Institute for Protein Research, Osaka University.
# Translate gene names to ids.
suppressMessages(library(org.Hs.eg.db))
<- unlist(lapply(mget(colnames(sclc$counts), org.Hs.egALIAS2EG,
ids ifnotfound = NA), function(x) x[1]))
# Reverse map
<- names(ids)
rids names(rids) <- ids
# List all the ids per GO category.
<- list2env(eapply(org.Hs.egGO2ALLEGS,
go.env function(x) as.character(na.omit(rids[x]))))
# Remove unused GO terms.
<- names(go.env)
goterms for(i in seq_along(goterms)){
if(goterms[i] %in% GOOI){
next
else{
}::env_unbind(go.env, goterms[i])
rlang
}
}
# Pathway overdispersion analysis
$testPathwayOverdispersion(setenv = go.env, verbose = TRUE,
sclccorrelation.distance.threshold = 0.8,
min.pathway.size = 1, max.pathway.size = 100,
recalculate.pca = FALSE)
determining valid pathways
processing 2 valid pathways
scoring pathway od signifcance
compiling pathway reduction
clustering aspects based on gene loading ... 2 aspects remaining
clustering aspects based on pattern similarity ... 2 aspects remaining
Load the above computational results.
<- readRDS("<file path>")
sclc_asurat <- readRDS("<file path>")
sclc_ssgsea <- readRDS("<file path>") sclc_pagoda
Create score-by-sample (cell) matrices.
<- as.matrix(assay(sclc_asurat$GO, "counts"))
mat_asurat <- t(as.matrix(sclc_ssgsea@misc[["enrichIt"]]))
mat_ssgsea <- sclc_pagoda[["misc"]][["pathwayOD"]][["xv"]]
mat_pagoda rownames(mat_ssgsea) <- c("GO:0070371", "GO:1904666")
rownames(mat_pagoda) <- c(sclc_pagoda[["misc"]][["pathwayOD"]][["cnam"]][["aspect1"]],
"misc"]][["pathwayOD"]][["cnam"]][["aspect2"]])
sclc_pagoda[[
# Select rows.
<- as.matrix(mat_asurat[c(1, 3), ])
submat_asurat <- as.matrix(mat_ssgsea[1, ])
submat_ssgsea <- as.matrix(mat_pagoda[2, ])
submat_pagoda <- t(submat_ssgsea) ; rownames(submat_ssgsea) <- rownames(mat_ssgsea)[1]
submat_ssgsea <- t(submat_pagoda) ; rownames(submat_pagoda) <- rownames(mat_pagoda)[2]
submat_pagoda
# Select columns by random sampling.
set.seed(1)
<- sample(ncol(mat_asurat), size = 1000, replace = FALSE)
inds <- as.matrix(submat_asurat[, inds])
submat_asurat <- as.matrix(submat_ssgsea[, inds])
submat_ssgsea <- as.matrix(submat_pagoda[, inds])
submat_pagoda <- t(submat_ssgsea) ; rownames(submat_ssgsea) <- rownames(mat_ssgsea)[1]
submat_ssgsea <- t(submat_pagoda) ; rownames(submat_pagoda) <- rownames(mat_pagoda)[2]
submat_pagoda
# Scale data.
<- t(scale(t(submat_asurat)))
submat_asurat <- t(scale(t(submat_ssgsea)))
submat_ssgsea <- t(scale(t(submat_pagoda))) submat_pagoda
Create gene expression matrices.
<- c("JUN", "MIF")
genes <- assay(altExp(sclc_asurat$GO), "counts")
mat_geneex <- mat_geneex[genes, ]
mat_geneex
# Select columns.
<- as.matrix(mat_geneex[, inds])
submat_geneex
# Scale data.
<- t(scale(t(submat_geneex))) submat_geneex
Examine the scores of samples (cells).
suppressMessages(library(ComplexHeatmap))
<- "figures/figure_90_0010.png"
filename png(file = filename, height = 120, width = 400, res = 80)
#png(file = filename, height = 600, width = 2000, res = 400)
<- ComplexHeatmap::Heatmap(submat_asurat, column_title = "ASURAT",
p name = "Scaled\nSign scores",
cluster_rows = FALSE, show_row_names = TRUE,
row_names_side = "right", show_row_dend = FALSE,
show_column_names = FALSE, column_dend_side = "top",
show_parent_dend_line = FALSE)
<- ComplexHeatmap::Heatmap(submat_geneex, name = "Scaled\nExpression",
q cluster_rows = FALSE, show_row_names = TRUE,
row_names_side = "right", show_row_dend = FALSE,
show_column_names = FALSE,
show_parent_dend_line = FALSE)
<- p %v% q
p
pdev.off()
<- "figures/figure_90_0011.png"
filename png(file = filename, height = 120, width = 400, res = 80)
#png(file = filename, height = 600, width = 2000, res = 400)
<- ComplexHeatmap::Heatmap(submat_ssgsea, column_title = "ssGSEA",
p name = "Scaled\nssGSEA score",
cluster_rows = FALSE, show_row_names = TRUE,
row_names_side = "right", show_row_dend = FALSE,
show_column_names = FALSE, column_dend_side = "top",
show_parent_dend_line = FALSE)
<- ComplexHeatmap::Heatmap(submat_geneex, name = "Scaled\nExpression",
q cluster_rows = FALSE, show_row_names = TRUE,
row_names_side = "right", show_row_dend = FALSE,
show_column_names = FALSE,
show_parent_dend_line = FALSE)
<- p %v% q
p
pdev.off()
<- "figures/figure_90_0012.png"
filename png(file = filename, height = 120, width = 440, res = 80)
#png(file = filename, height = 620, width = 2200, res = 400)
<- ComplexHeatmap::Heatmap(submat_pagoda, column_title = "PAGODA2",
p name = "Scaled\nPC1 score",
cluster_rows = FALSE, show_row_names = TRUE,
row_names_side = "right", show_row_dend = FALSE,
show_column_names = FALSE, column_dend_side = "top",
show_parent_dend_line = FALSE)
<- ComplexHeatmap::Heatmap(submat_geneex, name = "Scaled\nExpression",
q cluster_rows = FALSE, show_row_names = TRUE,
row_names_side = "right", show_row_dend = FALSE,
show_column_names = FALSE,
show_parent_dend_line = FALSE)
<- p %v% q
p
pdev.off()