Attach necessary libraries:
library(ASURAT)
library(SingleCellExperiment)
library(SummarizedExperiment)
A goal of ASURAT is to cluster and characterize individual samples (cells) in terms of cell type (or disease), biological function, signaling pathway activity, and so on (see here).
Having a SingleCellExperiment object (e.g., sce
), one
can use ASURAT by confirming the following requirements:
assays(sce)
contains gene expression data with row and
column names as variable (gene) and sample (cell), respectively,If sce
contains normalized expression data (e.g.,
assay(sce, "logcounts")
), set
assay(sce, "centered")
by subtracting the data with the
mean expression levels across samples (cells).
<- as.matrix(assay(sce, "logcounts"))
mat assay(sce, "centered") <- sweep(mat, 1, apply(mat, 1, mean), FUN = "-")
One may use a Seurat function
Seurat::as.SingleCellExperiment()
for converting Seurat
objects into SingleCellExperiment ones.
Now, ready for the next step here.
Load single-cell RNA sequencing (scRNA-seq) data.
<- TENxPBMCData::TENxPBMCData(dataset = c("pbmc4k"))
sce <- as.matrix(assay(sce, "counts"))
pbmc_counts rownames(pbmc_counts) <- make.unique(rowData(sce)$Symbol_TENx)
colnames(pbmc_counts) <- make.unique(colData(sce)$Barcode)
Here, pbmc_counts
is a read count table of peripheral
blood mononuclear cells (PBMCs).
Below is a head of pbmc_counts
:
1:5, 1:3]
pbmc_counts[#> AAACCTGAGAAGGCCT-1 AAACCTGAGACAGACC-1 AAACCTGAGATAGTCA-1
#> RP11-34P13.3 0 0 0
#> FAM138A 0 0 0
#> OR4F5 0 0 0
#> RP11-34P13.7 0 0 0
#> RP11-34P13.8 0 0 0
Create SingleCellExperiment objects by inputting gene expression data.
<- SingleCellExperiment(assays = list(counts = pbmc_counts),
pbmc rowData = data.frame(gene = rownames(pbmc_counts)),
colData = data.frame(cell = colnames(pbmc_counts)))
Check data sizes.
dim(pbmc)
#> [1] 33694 4340
Remove variables (genes) and samples (cells) with low quality, by processing the following three steps:
First of all, add metadata for both variables and samples using
ASURAT function add_metadata()
.
The arguments are
sce
: SingleCellExperiment object, andmitochondria_symbol
: a string representing for
mitochondrial genes.<- add_metadata(sce = pbmc, mitochondria_symbol = "^MT-") pbmc
One can check the results in rowData(sce)
and
colData(sce)
slots.
ASURAT function remove_variables()
removes variable
(gene) data such that the numbers of non-zero expressing samples (cells)
are less than min_nsamples
.
<- remove_variables(sce = pbmc, min_nsamples = 10) pbmc
Qualities of sample (cell) data are confirmed based on proper
visualization of colData(sce)
.
<- data.frame(x = colData(pbmc)$nReads, y = colData(pbmc)$nGenes)
df ::ggplot() +
ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(x = "Number of reads", y = "Number of genes") +
ggplot2::theme_classic(base_size = 20) ggplot2
<- data.frame(x = colData(pbmc)$nReads, y = colData(pbmc)$percMT)
df ::ggplot() +
ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(x = "Number of reads", y = "Perc of MT reads") +
ggplot2::theme_classic(base_size = 20) ggplot2
ASURAT function remove_samples()
removes sample (cell)
data by setting cutoff values for the metadata.
The arguments are
sce
: SingleCellExperiment object,min_nReads
and max_nReads
: minimum and
maximum number of reads,min_nGenes
and max_nGenes
: minimum and
maximum number of non-zero expressed genes, andmin_percMT
and max_percMT
: minimum and
maximum percent of reads that map to mitochondrial genes, respectively.
If there is no mitochondrial genes, set them as NULL
.<- remove_samples(sce = pbmc, min_nReads = 5000, max_nReads = 20000,
pbmc min_nGenes = 100, max_nGenes = 1e+10,
min_percMT = 0, max_percMT = 10)
Tips: Take care not to set excessive values for the
arguments of remove_samples()
, or it may produce biased
results. Note that min_nReads = 5000
is somewhat large,
which is used only for the tutorial.
Qualities of variable (gene) data are confirmed based on proper
visualization of rowData(sce)
.
<- apply(as.matrix(assay(pbmc, "counts")), 1, mean)
aveexp <- data.frame(x = seq_len(nrow(rowData(pbmc))),
df y = sort(aveexp, decreasing = TRUE))
::ggplot() + ggplot2::scale_y_log10() +
ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(x = "Rank of genes", y = "Mean read counts") +
ggplot2::theme_classic(base_size = 20)
ggplot2#> Warning: Transformation introduced infinite values in continuous y-axis
ASURAT function remove_variables_second()
removes
variable (gene) data such that the mean read counts across samples are
less than min_meannReads
.
<- remove_variables_second(sce = pbmc, min_meannReads = 0.05) pbmc
Normalize data using log transformation with a pseudo count and center the data by subtracting with the mean expression levels across samples (cells). The resulting normalized-and-centered data are used for downstream analyses.
Perform log-normalization with a pseudo count.
assay(pbmc, "logcounts") <- log(assay(pbmc, "counts") + 1)
Center each row data.
<- assay(pbmc, "logcounts")
mat assay(pbmc, "centered") <- sweep(mat, 1, apply(mat, 1, mean), FUN = "-")
Set gene expression data into altExp(sce)
.
Tips: Take care not to use a slot name
“log-normalized” for altExp(sce)
, which may produce an
error when using a Seurat (version 4.0.5) function
as.Seurat()
in the downstream analysis.
<- "logcounts"
sname altExp(pbmc, sname) <- SummarizedExperiment(list(counts = assay(pbmc, sname)))
Add ENTREZ Gene IDs to rowData(sce)
.
<- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
dictionary key = rownames(pbmc),
columns = "ENTREZID", keytype = "SYMBOL")
<- dictionary[!duplicated(dictionary$SYMBOL), ]
dictionary rowData(pbmc)$geneID <- dictionary$ENTREZID
Infer cell or disease types, biological functions, and signaling pathway activity at the single-cell level by inputting related databases.
ASURAT transforms centered read count tables to functional feature matrices, termed sign-by-sample matrices (SSMs). Using SSMs, perform unsupervised clustering of samples (cells).
Prepare correlation matrices of gene expressions.
set.seed(1)
<- 1000
nrand_samples <- t(as.matrix(assay(pbmc, "centered")))
mat <- sample(rownames(mat), nrand_samples, prob = NULL)
ids <- cor(mat[ids, ], method = "spearman") cormat
Load databases.
<- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
urlpath load(url(paste0(urlpath, "20201213_human_CO.rda?raw=TRUE"))) # CO
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=TRUE"))) # MSigDB
load(url(paste0(urlpath, "20201213_human_GO_red.rda?raw=TRUE"))) # GO
load(url(paste0(urlpath, "20201213_human_KEGG.rda?raw=TRUE"))) # KEGG
These knowledge-based data were available from the following repositories:
Create a custom-built (CB) cell type-related database by combining different databases for analyzing human single-cell transcriptome data.
<- list(human_CO[["cell"]], human_MSigDB[["cell"]])
d <- list(cell = do.call("rbind", d)) human_CB
Add formatted databases to metadata(sce)$sign
.
<- list(CB = pbmc, GO = pbmc, KG = pbmc)
pbmcs metadata(pbmcs$CB) <- list(sign = human_CB[["cell"]])
metadata(pbmcs$GO) <- list(sign = human_GO[["BP"]])
metadata(pbmcs$KG) <- list(sign = human_KEGG[["pathway"]])
ASURAT function remove_signs()
redefines functional gene
sets for the input database by removing genes, which are not included in
rownames(sce)
, and further removes biological terms
including too few or too many genes.
The arguments are
sce
: SingleCellExperiment object,min_ngenes
: minimal number of genes> 1 (the default
value is 2), andmax_ngenes
: maximal number of genes> 1 (the default
value is 1000).$CB <- remove_signs(sce = pbmcs$CB, min_ngenes = 2, max_ngenes = 1000)
pbmcs$GO <- remove_signs(sce = pbmcs$GO, min_ngenes = 2, max_ngenes = 1000)
pbmcs$KG <- remove_signs(sce = pbmcs$KG, min_ngenes = 2, max_ngenes = 1000) pbmcs
The results are stored in metadata(sce)$sign
.
ASURAT function cluster_genes()
clusters functional gene
sets using a correlation graph-based decomposition method, which
produces strongly, variably, and weakly correlated gene sets (SCG, VCG,
and WCG, respectively).
The arguments are
sce
: SingleCellExperiment object,cormat
: correlation matrix of gene expressions,th_posi
and th_nega
: threshold values of
positive and negative correlation coefficients, respectively.Tips: Empirically, typical values of
th_posi
and th_nega
are \(0.15 \le {\rm th{\_}posi} \le 0.4\) and
\(-0.4 \le {\rm th{\_}nega} \le
-0.15\), but one cannot avoid trial and error for setting these
values. An exhaustive parameter searching is time-consuming but helpful
for obtaining interpretable results.
set.seed(1)
$CB <- cluster_genesets(sce = pbmcs$CB, cormat = cormat,
pbmcsth_posi = 0.30, th_nega = -0.30)
set.seed(1)
$GO <- cluster_genesets(sce = pbmcs$GO, cormat = cormat,
pbmcsth_posi = 0.30, th_nega = -0.30)
set.seed(1)
$KG <- cluster_genesets(sce = pbmcs$KG, cormat = cormat,
pbmcsth_posi = 0.20, th_nega = -0.20)
The results are stored in metadata(sce)$sign
.
ASURAT function create_signs()
creates signs by the
following criteria:
min_cnt_strg
(the
default value is 2) andmin_cnt_vari
(the
default value is 2),which are independently applied to SCGs and VCGs, respectively.
Tips: Empirically, typical values of
min_cnt_strg
and min_cnt_vari
are \(2 \le {\rm min\_cnt\_strg} = {\rm min\_cnt\_vari}
\le 4\), but one cannot avoid trial and error for setting these
values. An exhaustive parameter searching is time-consuming but helpful
for obtaining interpretable results.
$CB <- create_signs(sce = pbmcs$CB, min_cnt_strg = 2, min_cnt_vari = 2)
pbmcs$GO <- create_signs(sce = pbmcs$GO, min_cnt_strg = 4, min_cnt_vari = 4)
pbmcs$KG <- create_signs(sce = pbmcs$KG, min_cnt_strg = 3, min_cnt_vari = 3) pbmcs
The results are stored in metadata(sce)$sign_all
, where
“CorrType” indicates SCG or VCG, “Corr” means the average correlation
coefficients of SCG or VCG, “CorrWeak” means the average correlation
coefficients of WCG, “CorrGene” means SCG or VCG, and “WeakCorrGene”
means WCG. The orders of gene symbols and ENTREZ IDs, separated by “/”,
are consistent.
Tips: If one would like to recreate signs, reset the
list of objects by, e.g.,
(pbmcs <- list(CB = pbmc, GO = pbmc, KG = pbmc)
), and go
back to remove_signs()
.
If signs have semantic similarity information, one can use ASURAT
function remove_signs_redundant()
for removing redundant
sings using the semantic similarity matrices.
The arguments are
sce
: SingleCellExperiment object,similarity_matrix
: a semantic similarity matrix,threshold
: a threshold value of semantic similarity,
used for regarding biological terms as similar ones, andkeep_rareID
: if TRUE
, biological terms
with the larger ICs are kept.Tips: The optimal value of threshold
depends on the ontology structure as well as the method for computing
semantic similarity matrix.
# pbmcs$CB <- remove_signs_redundant(sce = pbmcs$CB,
# similarity_matrix = human_CO$similarity_matrix$cell,
# threshold = 0.90, keep_rareID = TRUE)
$GO <- remove_signs_redundant(sce = pbmcs$GO,
pbmcssimilarity_matrix = human_GO$similarity_matrix$BP,
threshold = 0.70, keep_rareID = TRUE)
The results are stored in metadata(sce)$sign_SCG
,
metadata(sce)$sign_VCG
,
metadata(sce)$sign_all
, and if there exist,
metadata(sce)$sign_SCG_redundant
and
metadata(sce)$sign_VCG_redundant
.
ASURAT function remove_signs_manually()
removes signs by
specifying IDs (e.g., GOID:XXX
) or descriptions (e.g.,
COVID
) using grepl()
. The arguments are
sce
and keywords
(keywords separated by
|
).
<- "Covid|COVID"
keywords $KG <- remove_signs_manually(sce = pbmcs$KG, keywords = keywords) pbmcs
The results are stored in metadata(sce)$sign_SCG
,
metadata(sce)$sign_VCG
, and
metadata(sce)$sign_all
.
There is another ASURAT function
select_signs_manually()
, a counter part of
remove_signs_manually()
, which removes signs that do not
include keywords
(keywords separated by
|
).
<- "cell|cyte"
keywords <- select_signs_manually(sce = pbmcs$CB, keywords = keywords) test
The results are stored in metadata(sce)$sign_SCG
,
metadata(sce)$sign_VCG
, and
metadata(sce)$sign_all
.
ASURAT function create_sce_signmatrix()
creates a new
SingleCellExperiment object new_sce
, consisting of the
following information:
assayNames(new_sce)
: counts (SSM whose entries are
termed sign scores),names(colData(new_sce))
: nReads, nGenes, percMT,names(rowData(new_sce))
: ParentSignID, Description,
CorrGene, etc.,names(metadata(new_sce))
: sign_SCG, sign_VCG,
etc.,altExpNames(new_sce)
: something if there is data in
altExp(sce)
.The arguments are
sce
: SingleCellExperiment object,weight_strg
: weight parameter for SCG (the default
value is 0.5), andweight_vari
: weight parameter for VCG (the default is
0.5).$CB <- makeSignMatrix(sce = pbmcs$CB, weight_strg = 0.5, weight_vari = 0.5)
pbmcs$GO <- makeSignMatrix(sce = pbmcs$GO, weight_strg = 0.5, weight_vari = 0.5)
pbmcs$KG <- makeSignMatrix(sce = pbmcs$KG, weight_strg = 0.5, weight_vari = 0.5) pbmcs
Below are head and tail of assay(sce, "counts")
:
rbind(head(assay(pbmcs$CB, "counts")[, 1:3], n = 4),
tail(assay(pbmcs$CB, "counts")[, 1:3], n = 4))
#> AAACCTGCAGGCGATA-1 AAACCTGCATGAAGTA-1 AAACCTGGTGCGGTAA-1
#> CL:0000097-S 0.2538591 0.17084896 -0.21152070
#> CL:0000100-S 0.4204088 0.30813515 -0.30692790
#> CL:0000121-S 0.1317555 0.42244834 -0.02095966
#> CL:0000127-S 0.1319279 0.43485496 -0.23608112
#> MSigDBID:265-V 0.2879127 0.01622889 -0.05623750
#> MSigDBID:266-V 0.2971490 0.07279125 -0.18265357
#> MSigDBID:276-V 0.3259615 -0.06231676 0.03452386
#> MSigDBID:293-V 0.1557055 0.12024693 -0.12444035
Perform principal component analysis and t-distributed stochastic neighbor embedding (t-SNE).
<- c(30, 30, 50)
pca_dims <- c(2, 2, 3)
tsne_dims for(i in seq_along(pbmcs)){
set.seed(1)
<- t(as.matrix(assay(pbmcs[[i]], "counts")))
mat <- Rtsne::Rtsne(mat, dim = tsne_dims[i], pca = TRUE,
res initial_dims = pca_dims[i])
reducedDim(pbmcs[[i]], "TSNE") <- res[["Y"]]
}
Show the results of dimensional reduction in t-SNE spaces.
<- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
df ::ggplot() + ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
ggplot2color = "black", size = 1, alpha = 1) +
::labs(title = "PBMC (cell type)", x = "tSNE_1", y = "tSNE_2") +
ggplot2::theme_classic(base_size = 15) ggplot2
<- as.data.frame(reducedDim(pbmcs$GO, "TSNE"))
df ::ggplot() + ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
ggplot2color = "black", size = 1, alpha = 1) +
::labs(title = "PBMC (function)", x = "tSNE_1", y = "tSNE_2") +
ggplot2::theme_classic(base_size = 15) ggplot2
Use ASURAT function plot_dataframe3D()
for plotting
three-dimensional data. See ?plot_dataframe3D
for
details.
<- as.data.frame(reducedDim(pbmcs$KG, "TSNE"))
df plot_dataframe3D(dataframe3D = df, theta = 45, phi = 30, title = "PBMC (pathway)",
xlabel = "tSNE_1", ylabel = "tSNE_2", zlabel = "tSNE_3")
To date (December, 2021), one of the most useful clustering methods in scRNA-seq data analysis is a combination of a community detection algorithm and graph-based unsupervised clustering, developed in Seurat package.
Here, our strategy is as follows:
rowData()
and colData()
must have data),ScaleData()
, RunPCA()
,
FindNeighbors()
, and FindClusters()
,temp
,colData(temp)$seurat_clusters
into
colData(sce)$seurat_clusters
.<- c(0.20, 0.20, 0.10)
resolutions <- list(seq_len(20), seq_len(20), seq_len(20))
ds for(i in seq_along(pbmcs)){
<- Seurat::as.Seurat(pbmcs[[i]], counts = "counts", data = "counts")
surt <- as.matrix(assay(pbmcs[[i]], "counts"))
mat "SSM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "SSM"
Seurat<- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = ds[[i]])
surt <- Seurat::FindClusters(surt, resolution = resolutions[i])
surt <- Seurat::as.SingleCellExperiment(surt)
temp colData(pbmcs[[i]])$seurat_clusters <- colData(temp)$seurat_clusters
}
Show the clustering results in t-SNE spaces.
<- colData(pbmcs$CB)$seurat_clusters
labels <- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
df ::ggplot() +
ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = "PBMC (cell type)", x = "tSNE_1", y = "tSNE_2", color = "") +
ggplot2::theme_classic(base_size = 15) + ggplot2::scale_colour_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4))) ggplot2
<- colData(pbmcs$GO)$seurat_clusters
labels <- as.data.frame(reducedDim(pbmcs$GO, "TSNE"))
df ::ggplot() +
ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = "PBMC (function)", x = "tSNE_1", y = "tSNE_2", color = "") +
ggplot2::theme_classic(base_size = 15) +
ggplot2::scale_colour_brewer(palette = "Set1") +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4))) ggplot2
Use ASURAT function plot_dataframe3D()
for plotting
three-dimensional data. See ?plot_dataframe3D
for
details.
<- colData(pbmcs$KG)$seurat_clusters
labels <- scales::brewer_pal(palette = "Set2")(length(unique(labels)))[labels]
colors <- as.data.frame(reducedDim(pbmcs$KG, "TSNE")[, seq_len(3)])
df plot_dataframe3D(dataframe3D = df, labels = labels, colors = colors,
theta = 45, phi = 30, title = "PBMC (pathway)",
xlabel = "tSNE_1", ylabel = "tSNE_2", zlabel = "tSNE_3")
If there is gene expression data in altExp(sce)
, one can
infer cell cycle phases by using Seurat functions in the similar manner
as above.
<- Seurat::as.Seurat(pbmcs$CB, counts = "counts", data = "counts")
surt <- as.matrix(assay(altExp(pbmcs$CB), "counts"))
mat "GEM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "GEM"
Seurat<- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::CellCycleScoring(surt, s.features = Seurat::cc.genes$s.genes,
surt g2m.features = Seurat::cc.genes$g2m.genes)
<- Seurat::as.SingleCellExperiment(surt)
temp colData(pbmcs$CB)$Phase <- colData(temp)$Phase
Significant signs are analogous to differentially expressed genes but bear biological meanings. Note that naïve usages of statistical tests should be avoided because the row vectors of SSMs are centered.
Instead, ASURAT function compute_sepI_all()
computes
separation indices for each cluster against the others. Briefly, a
separation index “sepI”, ranging from -1 to 1, is a nonparametric
measure of significance of a given sign score for a given subpopulation.
The larger (resp. smaller) sepI is, the more reliable the sign is as a
positive (resp. negative) marker for the cluster.
The arguments are
sce
: SingleCellExperiment object,labels
: a vector of labels of all the samples, andnrand_samples
: an integer for the number of samples
used for random sampling, which samples at least one sample per
cluster.for(i in seq_along(pbmcs)){
set.seed(1)
<- colData(pbmcs[[i]])$seurat_clusters
labels <- compute_sepI_all(sce = pbmcs[[i]], labels = labels,
pbmcs[[i]] nrand_samples = 200)
}
The results are stored in
metadata(sce)$marker_signs
.
When computing separation indices between given clusters, e.g.,
cluster 1 versus clusters 2 and 3, use an ASURAT function
compute_sepI_clusters()
. See
?compute_sepI_clusters
for details.
To date (December, 2021), one of the most useful methods of multiple
statistical tests in scRNA-seq data analysis is to use a Seurat function
FindAllMarkers()
.
If there is gene expression data in altExp(sce)
, one can
investigate differentially expressed genes by using Seurat functions in
the similar manner as described before.
set.seed(1)
<- Seurat::as.Seurat(pbmcs$CB, counts = "counts", data = "counts")
surt <- as.matrix(assay(altExp(pbmcs$CB), "counts"))
mat "GEM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "GEM"
Seurat<- Seurat::SetIdent(surt, value = "seurat_clusters")
surt <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
res min.pct = 0.50, logfc.threshold = 0.50)
metadata(pbmcs$CB)$marker_genes$all <- res
Simultaneously analyze multiple sign-by-sample matrices, which helps us characterize individual samples (cells) from multiple biological aspects.
ASURAT function plot_multiheatmaps()
shows heatmaps
(ComplexHeatmap object) of sign scores and gene expression levels (if
there are), where rows and columns stand for sign (or gene) and sample
(cell), respectively. See ?plot_multiheatmaps
for
details.
First, remove unrelated signs by setting keywords, followed by selecting top significant signs and genes for the clustering results with respect to separation index and adjusted p-value, respectively.
# Significant signs
<- list()
marker_signs <- "MESENCHYMAL|LIMB|PANCREAS"
keywords for(i in seq_along(pbmcs)){
<- metadata(pbmcs[[i]])$marker_signs$all
marker_signs[[i]] <-
marker_signs[[i]] !grepl(keywords, marker_signs[[i]]$Description), ]
marker_signs[[i]][<- dplyr::group_by(marker_signs[[i]], Ident_1)
marker_signs[[i]] <- dplyr::slice_max(marker_signs[[i]], sepI, n = 2)
marker_signs[[i]] <- dplyr::slice_min(marker_signs[[i]], Rank, n = 1)
marker_signs[[i]]
}# Significant genes
<- metadata(pbmcs$CB)$marker_genes$all
marker_genes_CB <- dplyr::group_by(marker_genes_CB, cluster)
marker_genes_CB <- dplyr::slice_min(marker_genes_CB, p_val_adj, n = 2)
marker_genes_CB <- dplyr::slice_max(marker_genes_CB, avg_log2FC, n = 1) marker_genes_CB
Next, prepare the arguments.
# ssm_list
<- list() ; ssm_list <- list()
sces_sub for(i in seq_along(pbmcs)){
<- pbmcs[[i]][rownames(pbmcs[[i]]) %in% marker_signs[[i]]$SignID, ]
sces_sub[[i]] <- assay(sces_sub[[i]], "counts")
ssm_list[[i]]
}names(ssm_list) <- c("SSM_cell", "SSM_function", "SSM_pathway")
# gem_list
<- altExp(pbmcs$CB, "logcounts")
expr_sub <- expr_sub[rownames(expr_sub) %in% marker_genes_CB$gene]
expr_sub <- list(x = t(scale(t(as.matrix(assay(expr_sub, "counts"))))))
gem_list names(gem_list) <- "Scaled\nLogExpr"
# ssmlabel_list
<- list() ; ssmlabel_list <- list()
labels for(i in seq_along(pbmcs)){
<- colData(sces_sub[[i]])$seurat_clusters
tmp <- data.frame(label = tmp)
labels[[i]] <- length(unique(tmp))
n_groups if(i == 1){
$color <- scales::hue_pal()(n_groups)[tmp]
labels[[i]]else if(i == 2){
}$color <- scales::brewer_pal(palette = "Set1")(n_groups)[tmp]
labels[[i]]else if(i == 3){
}$color <- scales::brewer_pal(palette = "Set2")(n_groups)[tmp]
labels[[i]]
}<- labels[[i]]
ssmlabel_list[[i]]
}names(ssmlabel_list) <- c("Label_cell", "Label_function", "Label_pathway")
# gemlabel_list
<- data.frame(label = colData(pbmcs$CB)$Phase, color = NA)
label_CC <- list(CellCycle = label_CC) gemlabel_list
Tips: If one would like to omit some color labels (e.g., labels[[3]]), set the argument as follows:
3]] <- data.frame(label = NA, color = NA) ssmlabel_list[[
Show heatmaps for the selected signs and genes.
set.seed(1)
plot_multiheatmaps(ssm_list = ssm_list, gem_list = gem_list,
ssmlabel_list = ssmlabel_list, gemlabel_list = gemlabel_list,
nrand_samples = 100, show_row_names = TRUE, title = "PBMC")
Tips: If nrand_samples
is set,
plot_multiheatmaps()
downsamples the samples (cells)
without considering clustering labels. Take care of the clusters with
small populations.
Show violin plots for sign score distributions across cell type-related clusters.
<- colData(pbmcs$CB)$seurat_clusters
labels <- "GO:0042100-V"
variable <- "B cell proliferation"
description <- pbmcs$GO[which(rownames(pbmcs$GO) == variable), ]
subsce <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df ::ggplot() +
ggplot2::geom_violin(ggplot2::aes(x = as.factor(labels), y = df[, 1],
ggplot2fill = labels), trim = FALSE, size = 0.5) +
::geom_boxplot(ggplot2::aes(x = as.factor(labels), y = df[, 1]),
ggplot2width = 0.15, alpha = 0.6) +
::labs(title = paste0(variable, "\n", description),
ggplot2x = "Cluster (cell type)", y = "Sign score") +
::theme_classic(base_size = 20) +
ggplot2::theme(legend.position = "none") + ggplot2::scale_fill_hue() ggplot2
Show violin plots for gene expression distributions across cell type-related clusters.
<- "CD79A"
vname <- altExp(pbmcs$CB, "logcounts")
sub <- sub[rownames(sub) == vname, ]
sub <- colData(pbmcs$CB)$seurat_clusters
labels <- as.data.frame(t(assay(sub, "counts")))
df ::ggplot() +
ggplot2::geom_violin(ggplot2::aes(x = as.factor(labels), y = df[, 1],
ggplot2fill = labels), trim = FALSE, size = 0.5) +
::geom_boxplot(ggplot2::aes(x = as.factor(labels), y = df[, 1]),
ggplot2width = 0.15, alpha = 0.6) +
::labs(title = vname, x = "Cluster (cell type)",
ggplot2y = "Normalized expression") +
::theme_classic(base_size = 20) +
ggplot2::theme(legend.position = "none") + ggplot2::scale_fill_hue() ggplot2
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur ... 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] TENxPBMCData_1.15.0 HDF5Array_1.25.2
#> [3] rhdf5_2.41.2 DelayedArray_0.23.2
#> [5] Matrix_1.5-1 SingleCellExperiment_1.19.1
#> [7] SummarizedExperiment_1.27.3 Biobase_2.57.3
#> [9] GenomicRanges_1.49.1 GenomeInfoDb_1.33.16
#> [11] IRanges_2.31.2 S4Vectors_0.35.4
#> [13] BiocGenerics_0.43.4 MatrixGenerics_1.9.1
#> [15] matrixStats_0.62.0 ASURAT_1.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] utf8_1.2.2 reticulate_1.26
#> [3] tidyselect_1.2.0 RSQLite_2.2.18
#> [5] AnnotationDbi_1.59.1 htmlwidgets_1.5.4
#> [7] grid_4.2.1 Rtsne_0.16
#> [9] munsell_0.5.0 codetools_0.2-18
#> [11] ica_1.0-3 future_1.28.0
#> [13] miniUI_0.1.1.1 misc3d_0.9-1
#> [15] withr_2.5.0 spatstat.random_2.2-0
#> [17] colorspace_2.0-3 progressr_0.11.0
#> [19] filelock_1.0.2 highr_0.9
#> [21] knitr_1.40 rstudioapi_0.14
#> [23] Seurat_4.2.0 ROCR_1.0-11
#> [25] tensor_1.5 listenv_0.8.0
#> [27] labeling_0.4.2 GenomeInfoDbData_1.2.9
#> [29] polyclip_1.10-4 bit64_4.0.5
#> [31] farver_2.1.1 parallelly_1.32.1
#> [33] vctrs_0.5.0 generics_0.1.3
#> [35] xfun_0.34 BiocFileCache_2.5.2
#> [37] R6_2.5.1 doParallel_1.0.17
#> [39] clue_0.3-62 bitops_1.0-7
#> [41] rhdf5filters_1.9.0 spatstat.utils_3.0-1
#> [43] cachem_1.0.6 assertthat_0.2.1
#> [45] promises_1.2.0.1 scales_1.2.1
#> [47] rgeos_0.5-9 gtable_0.3.1
#> [49] Cairo_1.6-0 globals_0.16.1
#> [51] goftest_1.2-3 rlang_1.0.6
#> [53] GlobalOptions_0.1.2 splines_4.2.1
#> [55] lazyeval_0.2.2 spatstat.geom_3.0-3
#> [57] BiocManager_1.30.19 yaml_2.3.6
#> [59] reshape2_1.4.4 abind_1.4-5
#> [61] httpuv_1.6.6 tools_4.2.1
#> [63] tcltk_4.2.1 ggplot2_3.3.6
#> [65] ellipsis_0.3.2 spatstat.core_2.4-4
#> [67] jquerylib_0.1.4 RColorBrewer_1.1-3
#> [69] ggridges_0.5.4 Rcpp_1.0.9
#> [71] plyr_1.8.7 zlibbioc_1.43.0
#> [73] purrr_0.3.5 RCurl_1.98-1.9
#> [75] rpart_4.1.19 deldir_1.0-6
#> [77] pbapply_1.5-0 GetoptLong_1.0.5
#> [79] cowplot_1.1.1 zoo_1.8-11
#> [81] SeuratObject_4.1.2 ggrepel_0.9.1
#> [83] cluster_2.1.4 magrittr_2.0.3
#> [85] data.table_1.14.4 scattermore_0.8
#> [87] circlize_0.4.15 lmtest_0.9-40
#> [89] RANN_2.6.1 fitdistrplus_1.1-8
#> [91] patchwork_1.1.2 mime_0.12
#> [93] evaluate_0.17 xtable_1.8-4
#> [95] gridExtra_2.3 shape_1.4.6
#> [97] compiler_4.2.1 tibble_3.1.8
#> [99] KernSmooth_2.23-20 crayon_1.5.2
#> [101] htmltools_0.5.3 mgcv_1.8-41
#> [103] later_1.3.0 tidyr_1.2.1
#> [105] DBI_1.1.3 ExperimentHub_2.5.0
#> [107] dbplyr_2.2.1 ComplexHeatmap_2.13.4
#> [109] MASS_7.3-58.1 rappdirs_0.3.3
#> [111] cli_3.4.1 parallel_4.2.1
#> [113] igraph_1.3.5 pkgconfig_2.0.3
#> [115] sp_1.5-0 plotly_4.10.0
#> [117] spatstat.sparse_3.0-0 foreach_1.5.2
#> [119] bslib_0.4.0 XVector_0.37.1
#> [121] stringr_1.4.1 digest_0.6.30
#> [123] sctransform_0.3.5 RcppAnnoy_0.0.20
#> [125] spatstat.data_3.0-0 Biostrings_2.65.6
#> [127] rmarkdown_2.17 leiden_0.4.3
#> [129] uwot_0.1.14 curl_4.3.3
#> [131] shiny_1.7.3 rjson_0.2.21
#> [133] nlme_3.1-160 lifecycle_1.0.3
#> [135] jsonlite_1.8.3 Rhdf5lib_1.19.2
#> [137] limma_3.53.10 viridisLite_0.4.1
#> [139] fansi_1.0.3 pillar_1.8.1
#> [141] lattice_0.20-45 KEGGREST_1.37.3
#> [143] fastmap_1.1.0 httr_1.4.4
#> [145] survival_3.4-0 interactiveDisplayBase_1.35.1
#> [147] glue_1.6.2 png_0.1-7
#> [149] iterators_1.0.14 plot3D_1.4
#> [151] BiocVersion_3.16.0 bit_4.0.4
#> [153] stringi_1.7.8 sass_0.4.2
#> [155] blob_1.2.3 org.Hs.eg.db_3.16.0
#> [157] AnnotationHub_3.5.2 memoise_2.0.1
#> [159] dplyr_1.0.10 irlba_2.3.5.1
#> [161] future.apply_1.9.1