Analysis of PBMC datasets (Reyes et al., 2020)

Keita Iida

2022-07-14

1 Computational environment

MacBook Pro (Big Sur, 16-inch, 2019), Processor (2.4 GHz 8-Core Intel Core i9), Memory (64 GB 2667 MHz DDR4).


2 Install libraries

Attach necessary libraries:

library(ASURAT)
library(SingleCellExperiment)
library(SummarizedExperiment)


3 Introduction

In this vignette, we analyze single-cell RNA sequencing (scRNA-seq) data obtained from peripheral blood mononuclear cells (PBMCs) of donors with and without bacterial sepsis (Reyes et al., Nat. Med. 26, 2020).


4 Prepare scRNA-seq data

4.1 PBMCs with and without sepsis (Reyes et al., 2020)

The data can be loaded by the following code:

pbmc <- readRDS(url("https://figshare.com/ndownloader/files/36276324"))

The data are stored in DOI:10.6084/m9.figshare.19200254 and the generating process is described below.


The data were obtained from Broad Institute Single Cell Portal: SCP548. Since the file size of the raw read count table is huge (~5.61 GB), we briefly removed gene and cell data such that the numbers of non-zero expressing cells are less than 100 and the numbers of sequencing reads are less than 2000, respectively, by using perl scripts as follows:

perl ../perl/pg_01_add_nsamples.pl
perl ../perl/pg_02_add_nreads.pl
perl ../perl/pg_03_remove_variables.pl
perl ../perl/pg_04_remove_samples.pl

Create a SingleCellExperiment object by inputting a raw read count table.

# Load a raw read count table.
fn <- "rawdata/2020_001_Reyes/SCP548/expression/scp_gex_matrix_red.csv"
pbmc <- read.csv(fn, check.names = FALSE)
genes <- pbmc[-1, 2]
cells <- colnames(pbmc)[-seq_len(2)]
pbmc <- pbmc[-1, -seq_len(2)]
rownames(pbmc) <- make.unique(genes)
colnames(pbmc) <- make.unique(cells)
# Load sample information
info <- read.delim("rawdata/2020_001_Reyes/SCP548/metadata/scp_meta.txt")
info <- info[-1, ]
rownames(info) <- info$NAME
info <- info[, -1]
info <- info[cells, ]
# Create a SingleCellExperiment object.
pbmc <- SingleCellExperiment(assays = list(counts = as.matrix(pbmc)),
                             rowData = data.frame(gene = rownames(pbmc)),
                             colData = as.data.frame(info))
dim(pbmc)
[1] 14973 95853
# Save data.
saveRDS(pbmc, file = "backup/06_001_pbmc130k_data.rds")

# Load data.
pbmc <- readRDS("backup/06_001_pbmc130k_data.rds")


5 Preprocessing

5.1 Control data quality

Remove variables (genes) and samples (cells) with low quality, by processing the following three steps:

  1. remove variables based on expression profiles across samples,
  2. remove samples based on the numbers of reads and nonzero expressed variables,
  3. remove variables based on the mean read counts across samples.

First of all, add metadata for both variables and samples using ASURAT function add_metadata().

pbmc <- add_metadata(sce = pbmc, mitochondria_symbol = "^MT-")


5.1.1 Remove variables based on expression profiles

ASURAT function remove_variables() removes variable (gene) data such that the numbers of non-zero expressing samples (cells) are less than min_nsamples.

pbmc <- remove_variables(sce = pbmc, min_nsamples = 100)


5.1.2 Remove samples based on expression profiles

Qualities of sample (cell) data are confirmed based on proper visualization of colData(sce).

title <- "PBMC (Reyes et al., 2020)"
df <- data.frame(x = colData(pbmc)$nReads, y = colData(pbmc)$nGenes)
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "Number of reads", y = "Number of genes") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
filename <- "figures/figure_06_0010.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5)

df <- data.frame(x = colData(pbmc)$nReads, y = colData(pbmc)$percMT)
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "Number of reads", y = "Perc of MT reads") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
filename <- "figures/figure_06_0011.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5)

ASURAT function remove_samples() removes sample (cell) data by setting cutoff values for the metadata.

pbmc <- remove_samples(sce = pbmc, min_nReads = 1500, max_nReads = 30000,
                       min_nGenes = 100, max_nGenes = 1e+10,
                       min_percMT = 0, max_percMT = 10)


5.1.3 Remove variables based on the mean read counts

Qualities of variable (gene) data are confirmed based on proper visualization of rowData(sce).

title <- "PBMC (Reyes et al., 2020)"
aveexp <- apply(as.matrix(assay(pbmc, "counts")), 1, mean)
df <- data.frame(x = seq_len(nrow(rowData(pbmc))),
                 y = sort(aveexp, decreasing = TRUE))
p <- ggplot2::ggplot() + ggplot2::scale_y_log10() +
  ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "Rank of genes", y = "Mean read counts") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
filename <- "figures/figure_06_0015.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5)

ASURAT function remove_variables_second() removes variable (gene) data such that the mean read counts across samples are less than min_meannReads.

pbmc <- remove_variables_second(sce = pbmc, min_meannReads = 0.05)
dim(pbmc)
[1]  5322 77161
# Save data.
saveRDS(pbmc, file = "backup/06_002_pbmc130k_dataqc.rds")

# Load data.
pbmc <- readRDS("backup/06_002_pbmc130k_dataqc.rds")


5.2 Normalize data

Perform bayNorm() (Tang et al., Bioinformatics, 2020) for attenuating technical biases with respect to zero inflation and variation of capture efficiencies between samples (cells).

BETA <- bayNorm::BetaFun(Data = assay(pbmc, "counts"), MeanBETA = 0.06)
bay_out <- bayNorm::bayNorm(assay(pbmc, "counts"),
                            Conditions = colData(pbmc)$Patient,
                            BETA_vec = BETA[["BETA"]], Prior_type = "GG",
                            mode_version = TRUE)
mat_list <- bay_out[["Bay_out_list"]]
mat <- do.call("cbind", mat_list)
pbmc <- pbmc[, colnames(mat)]
assay(pbmc, "normalized") <- mat

Perform log-normalization with a pseudo count.

assay(pbmc, "logcounts") <- log(assay(pbmc, "normalized") + 1)

Center row data.

mat <- assay(pbmc, "logcounts")
assay(pbmc, "centered") <- sweep(mat, 1, apply(mat, 1, mean), FUN = "-")

Set gene expression data into altExp(sce).

sname <- "logcounts"
altExp(pbmc, sname) <- SummarizedExperiment(list(counts = assay(pbmc, sname)))

Add ENTREZ Gene IDs to rowData(sce).

dictionary <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
                                    key = rownames(pbmc),
                                    columns = "ENTREZID", keytype = "SYMBOL")
dictionary <- dictionary[!duplicated(dictionary$SYMBOL), ]
rowData(pbmc)$geneID <- dictionary$ENTREZID
# Save data.
saveRDS(pbmc, file = "backup/06_003_pbmc130k_normalized.rds")

# Load data.
pbmc <- readRDS("backup/06_003_pbmc130k_normalized.rds")


6 Multifaceted sign analysis

Infer cell or disease types, biological functions, and signaling pathway activity at the single-cell level by inputting related databases.

ASURAT transforms centered read count tables to functional feature matrices, termed sign-by-sample matrices (SSMs). Using SSMs, perform unsupervised clustering of samples (cells).


6.1 Compute correlation matrices

Prepare correlation matrices of gene expressions.

mat <- t(as.matrix(assay(pbmc, "centered")))
cormat <- cor(mat, method = "spearman")
# Save data.
saveRDS(cormat, file = "backup/06_003_pbmc130k_cormat.rds")

# Load data.
cormat <- readRDS("backup/06_003_pbmc130k_cormat.rds")


6.2 Load databases

Load databases.

urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=TRUE")))         # CO
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=TRUE")))     # MSigDB
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=TRUE"))) # CellMarker
load(url(paste0(urlpath, "20201213_human_GO_red.rda?raw=TRUE")))     # GO
load(url(paste0(urlpath, "20201213_human_KEGG.rda?raw=TRUE")))       # KEGG

The reformatted knowledge-based data were available from the following repositories:

Create a custom-built cell type-related databases by combining different databases for analyzing human single-cell transcriptome data.

d <- list(human_CO[["cell"]], human_MSigDB[["cell"]], human_CellMarker[["cell"]])
human_CB <- list(cell = do.call("rbind", d))

Add formatted databases to metadata(sce)$sign.

pbmcs <- list(CB = pbmc, GO = pbmc, KG = pbmc)
metadata(pbmcs$CB) <- list(sign = human_CB[["cell"]])
metadata(pbmcs$GO) <- list(sign = human_GO[["BP"]])
metadata(pbmcs$KG) <- list(sign = human_KEGG[["pathway"]])


6.3 Create signs

ASURAT function remove_signs() redefines functional gene sets for the input database by removing genes, which are not included in rownames(sce), and further removes biological terms including too few or too many genes.

pbmcs$CB <- remove_signs(sce = pbmcs$CB, min_ngenes = 2, max_ngenes = 1000)
pbmcs$GO <- remove_signs(sce = pbmcs$GO, min_ngenes = 2, max_ngenes = 1000)
pbmcs$KG <- remove_signs(sce = pbmcs$KG, min_ngenes = 2, max_ngenes = 1000)

ASURAT function cluster_genes() clusters functional gene sets using a correlation graph-based decomposition method, which produces strongly, variably, and weakly correlated gene sets (SCG, VCG, and WCG, respectively).

set.seed(1)
pbmcs$CB <- cluster_genesets(sce = pbmcs$CB, cormat = cormat,
                             th_posi = 0.20, th_nega = -0.50)
set.seed(1)
pbmcs$GO <- cluster_genesets(sce = pbmcs$GO, cormat = cormat,
                             th_posi = 0.20, th_nega = -0.30)
set.seed(1)
pbmcs$KG <- cluster_genesets(sce = pbmcs$KG, cormat = cormat,
                             th_posi = 0.15, th_nega = -0.20)

ASURAT function create_signs() creates signs by the following criteria:

  1. the number of genes in SCG>= min_cnt_strg (the default value is 2) and
  2. the number of genes in VCG>= min_cnt_vari (the default value is 2),

which are independently applied to SCGs and VCGs, respectively.

pbmcs$CB <- create_signs(sce = pbmcs$CB, min_cnt_strg = 3, min_cnt_vari = 3)
pbmcs$GO <- create_signs(sce = pbmcs$GO, min_cnt_strg = 3, min_cnt_vari = 3)
pbmcs$KG <- create_signs(sce = pbmcs$KG, min_cnt_strg = 4, min_cnt_vari = 4)


6.4 Select signs

If signs have semantic similarity information, one can use ASURAT function remove_signs_redundant() for removing redundant sings using the semantic similarity matrices.

simmat <- human_GO$similarity_matrix$BP
pbmcs$GO <- remove_signs_redundant(sce = pbmcs$GO, similarity_matrix = simmat,
                                   threshold = 0.85, keep_rareID = TRUE)

ASURAT function remove_signs_manually() removes signs by specifying IDs (e.g., GOID:XXX) or descriptions (e.g., metabolic) using grepl().

keywords <- "Covid|COVID"
pbmcs$KG <- remove_signs_manually(sce = pbmcs$KG, keywords = keywords)


6.5 Create sign-by-sample matrices

ASURAT function create_sce_signmatrix() creates a new SingleCellExperiment object new_sce, consisting of the following information:

pbmcs$CB <- makeSignMatrix(sce = pbmcs$CB, weight_strg = 0.5, weight_vari = 0.5)
pbmcs$GO <- makeSignMatrix(sce = pbmcs$GO, weight_strg = 0.5, weight_vari = 0.5)
pbmcs$KG <- makeSignMatrix(sce = pbmcs$KG, weight_strg = 0.5, weight_vari = 0.5)


6.6 Reduce dimensions of sign-by-sample matrices

Perform t-distributed stochastic neighbor embedding.

for(i in seq_along(pbmcs)){
  set.seed(1)
  mat <- t(as.matrix(assay(pbmcs[[i]], "counts")))
  res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
  reducedDim(pbmcs[[i]], "TSNE") <- res[["Y"]]
}

Show the results of dimensional reduction in low-dimensional spaces.

titles <- c("Reyes 2020 (cell type)", "Reyes 2020 (function)",
            "Reyes 2020 (pathway)")
for(i in seq_along(titles)){
  df <- as.data.frame(reducedDim(pbmcs[[i]], "TSNE"))
  p <- ggplot2::ggplot() +
    ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
                        color = "black", size = 0.1, alpha = 1) +
    ggplot2::labs(title = titles[i], x = "tSNE_1", y = "tSNE_2") +
    ggplot2::theme_classic(base_size = 20) +
    ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18))
  filename <- sprintf("figures/figure_06_%04d.png", 19 + i)
  ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 4.1, height = 4.3)
}

# Save data.
saveRDS(pbmcs, file = "backup/06_004_pbmcs130k_ssm.rds")

# Load data.
pbmcs <- readRDS("backup/06_004_pbmcs130k_ssm.rds")


6.7 Cluster cells

6.7.1 Use Seurat functions

To date (December, 2021), one of the most useful clustering methods in scRNA-seq data analysis is a combination of a community detection algorithm and graph-based unsupervised clustering, developed in Seurat package.

Here, our strategy is as follows:

  1. convert SingleCellExperiment objects into Seurat objects (note that rowData() and colData() must have data),
  2. perform ScaleData(), RunPCA(), FindNeighbors(), and FindClusters(),
  3. convert Seurat objects into temporal SingleCellExperiment objects temp,
  4. add colData(temp)$seurat_clusters into colData(sce)$seurat_clusters.
resolutions <- c(0.15, 0.15, 0.15)
dims <- list(seq_len(40), seq_len(40), seq_len(40))
for(i in seq_along(pbmcs)){
  surt <- Seurat::as.Seurat(pbmcs[[i]], counts = "counts", data = "counts")
  mat <- as.matrix(assay(pbmcs[[i]], "counts"))
  surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
  Seurat::DefaultAssay(surt) <- "SSM"
  surt <- Seurat::ScaleData(surt, features = rownames(surt))
  surt <- Seurat::RunPCA(surt, features = rownames(surt))
  surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = dims[[i]])
  surt <- Seurat::FindClusters(surt, resolution = resolutions[i])
  temp <- Seurat::as.SingleCellExperiment(surt)
  colData(pbmcs[[i]])$seurat_clusters <- colData(temp)$seurat_clusters
}

Show the clustering results in low-dimensional spaces.

titles <- c("Reyes 2020 (cell type)", "Reyes 2020 (function)",
            "Reyes 2020 (pathway)")
for(i in seq_along(titles)){
  labels <- colData(pbmcs[[i]])$seurat_clusters
  df <- as.data.frame(reducedDim(pbmcs[[i]], "TSNE"))
  p <- ggplot2::ggplot() +
    ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                        size = 0.2, alpha = 1) +
    ggplot2::labs(title = titles[i], x = "tSNE_1", y = "tSNE_2", color = "") +
    ggplot2::theme_classic(base_size = 20) +
    ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
    ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
  if(i == 1){
    p <- p + ggplot2::scale_colour_hue()
  }else if(i == 2){
    p <- p + ggplot2::scale_colour_brewer(palette = "Dark2")
  }else if(i == 3){
    p <- p + ggplot2::scale_colour_brewer(palette = "Paired")
  }
  filename <- sprintf("figures/figure_06_%04d.png", 29 + i)
  if(i == 1){
    ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.2, height = 4.3)
  }else{
    ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
  }
}


6.8 Investigate significant signs

Significant signs are analogous to differentially expressed genes but bear biological meanings. Note that naïve usages of statistical tests should be avoided because the row vectors of SSMs are centered.

Instead, ASURAT function compute_sepI_all() computes separation indices for each cluster against the others. Briefly, a separation index “sepI”, ranging from -1 to 1, is a nonparametric measure of significance of a given sign score for a given subpopulation. The larger (resp. smaller) sepI is, the more reliable the sign is as a positive (resp. negative) marker for the cluster.

for(i in seq_along(pbmcs)){
  set.seed(1)
  labels <- colData(pbmcs[[i]])$seurat_clusters
  pbmcs[[i]] <- compute_sepI_all(sce = pbmcs[[i]], labels = labels,
                                 nrand_samples = 20000)
}

Perform compute_sepI_all() for investigating significant signs for the clustering results of cell types.

lbs <- colData(pbmcs$CB)$seurat_clusters

pbmcs_LabelCB_SignGO <- pbmcs$GO
metadata(pbmcs_LabelCB_SignGO)$marker_signs <- NULL
set.seed(1)
pbmcs_LabelCB_SignGO <- compute_sepI_all(sce = pbmcs_LabelCB_SignGO,
                                         labels = lbs, nrand_samples = 20000)

pbmcs_LabelCB_SignKG <- pbmcs$KG
metadata(pbmcs_LabelCB_SignKG)$marker_signs <- NULL
set.seed(1)
pbmcs_LabelCB_SignKG <- compute_sepI_all(sce = pbmcs_LabelCB_SignKG,
                                         labels = lbs, nrand_samples = 20000)

for(i in c(0, 2, 3)){
  set.seed(1)
  pbmcs_LabelCB_SignGO <- compute_sepI_clusters(sce = pbmcs_LabelCB_SignGO,
                                                labels = lbs, nrand_samples = 20000,
                                                ident_1 = i,
                                                ident_2 = setdiff(c(0, 2, 3), i))
}

for(i in c(0, 2, 3)){
  set.seed(1)
  pbmcs_LabelCB_SignKG <- compute_sepI_clusters(sce = pbmcs_LabelCB_SignKG,
                                                labels = lbs, nrand_samples = 20000,
                                                ident_1 = i,
                                                ident_2 = setdiff(c(0, 2, 3), i))
}


6.9 Investigate significant genes

6.9.1 Use Seurat function

To date (December, 2021), one of the most useful methods of multiple statistical tests in scRNA-seq data analysis is to use a Seurat function FindAllMarkers().

If there is gene expression data in altExp(sce), one can investigate differentially expressed genes by using Seurat functions in the similar manner as described before.

set.seed(1)
surt <- Seurat::as.Seurat(pbmcs$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(pbmcs$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(pbmcs$CB)$marker_genes$all <- res
# Save data.
saveRDS(pbmcs, file = "backup/06_005_pbmcs130k_desdeg.rds")
saveRDS(pbmcs_LabelCB_SignGO, file = "backup/06_005_pbmcs130k_LabelCB_SignGO.rds")
saveRDS(pbmcs_LabelCB_SignKG, file = "backup/06_005_pbmcs130k_LabelCB_SignKG.rds")

# Load data.
pbmcs <- readRDS("backup/06_005_pbmcs130k_desdeg.rds")
pbmcs_LabelCB_SignGO <- readRDS("backup/06_005_pbmcs130k_LabelCB_SignGO.rds")
pbmcs_LabelCB_SignKG <- readRDS("backup/06_005_pbmcs130k_LabelCB_SignKG.rds")


6.10 Multifaceted analysis

Simultaneously analyze multiple sign-by-sample matrices, which helps us characterize individual samples (cells) from multiple biological aspects.

ASURAT function plot_multiheatmaps() shows heatmaps (ComplexHeatmap object) of sign scores and gene expression levels (if there are), where rows and columns stand for sign (or gene) and sample (cell), respectively.

First, remove unrelated signs by setting keywords, followed by selecting top significant signs and genes for the clustering results with respect to separation index and p-value, respectively.

# Significant signs
marker_signs <- list()
keys <- "foofoo|hogehoge"
for(i in seq_along(pbmcs)){
  if(i == 1){
    marker_signs[[i]] <- metadata(pbmcs[[i]])$marker_signs$all
  }else if(i == 2){
    marker_signs[[i]] <- metadata(pbmcs_LabelCB_SignGO)$marker_signs$all
  }else if(i == 3){
    marker_signs[[i]] <- metadata(pbmcs_LabelCB_SignKG)$marker_signs$all
  }
  marker_signs[[i]] <- marker_signs[[i]][!grepl(keys, marker_signs[[i]]$Description), ]
  marker_signs[[i]] <- dplyr::group_by(marker_signs[[i]], Ident_1)
  marker_signs[[i]] <- dplyr::slice_max(marker_signs[[i]], sepI, n = 1)
  marker_signs[[i]] <- dplyr::slice_min(marker_signs[[i]], Rank, n = 1)
}
# Significant genes
marker_genes_CB <- metadata(pbmcs$CB)$marker_genes$all
marker_genes_CB <- dplyr::group_by(marker_genes_CB, cluster)
marker_genes_CB <- dplyr::slice_min(marker_genes_CB, p_val_adj, n = 1)
marker_genes_CB <- dplyr::slice_max(marker_genes_CB, avg_log2FC, n = 1)

Then, prepare arguments.

# ssm_list
sces_sub <- list() ; ssm_list <- list()
for(i in seq_along(pbmcs)){
  sces_sub[[i]] <- pbmcs[[i]][rownames(pbmcs[[i]]) %in% marker_signs[[i]]$SignID, ]
  ssm_list[[i]] <- assay(sces_sub[[i]], "counts")
}
names(ssm_list) <- c("SSM_celltype", "SSM_function", "SSM_pathway")
# gem_list
expr_sub <- altExp(pbmcs$CB, "logcounts")
expr_sub <- expr_sub[rownames(expr_sub) %in% marker_genes_CB$gene]
gem_list <- list(x = t(scale(t(as.matrix(assay(expr_sub, "counts"))))))
names(gem_list) <- "Scaled\nLogExpr"
# ssmlabel_list
labels <- list() ; ssmlabel_list <- list()
for(i in seq_along(pbmcs)){
  tmp <- colData(sces_sub[[i]])$seurat_clusters
  labels[[i]] <- data.frame(label = tmp)
  n_groups <- length(unique(tmp))
  if(i == 1){
    labels[[i]]$color <- scales::hue_pal()(n_groups)[tmp]
  }else if(i == 2){
    labels[[i]]$color <- scales::brewer_pal(palette = "Dark2")(n_groups)[tmp]
  }else if(i == 3){
    labels[[i]]$color <- scales::brewer_pal(palette = "Paired")(n_groups)[tmp]
  }
  ssmlabel_list[[i]] <- labels[[i]]
}
names(ssmlabel_list) <- c("Label_celltype", "Label_function", "Label_pathway")

Tips: If one would like to omit some color labels (e.g., labels[[3]]), set the argument as follows:

ssmlabel_list[[2]] <- data.frame(label = NA, color = NA)
ssmlabel_list[[3]] <- data.frame(label = NA, color = NA)

Finally, plot heatmaps for the selected signs and genes.

filename <- "figures/figure_06_0040.png"
#png(file = filename, height = 2200, width = 1800, res = 300)
png(file = filename, height = 450, width = 350, res = 60)
set.seed(1)
title <- "PBMC (Reyes et al., 2020)"
plot_multiheatmaps(ssm_list = ssm_list, gem_list = gem_list,
                   ssmlabel_list = ssmlabel_list, gemlabel_list = NULL,
                   nrand_samples = 1000, show_row_names = TRUE, title = title)
dev.off()

Show violin plots for sign score and gene expression distributions across cell type-related clusters.

labels <- colData(pbmcs$CB)$seurat_clusters
vlist <- list(c("GE", "CD14", "(Monocyte)"),
              c("GE", "HLA-DPB1", "(Dendritic cell)"),
              c("GE", "CD68", "(Phagocytosis)"),
              c("GE", "CD4", "(T cell)"),
              c("GE", "CD8B", "(T cell)"),
              c("GE", "TRAC", "(T Cell Receptor Alpha)"),
              c("GE", "LILRA4", "(Plasmacytoid dendritic cells)"),
              c("CB", "CellMarkerID:6-S", "Activated T cell (CD69, KLRG1, ...)"),
              c("CB", "CellMarkerID:184-S", "Natural killer cell (NKG7, GZMB, ...)"),
              c("CB", "CellMarkerID:99-S", "Follicular B cell (MS4A1, CD27, ...)"),
              c("CB", "CellMarkerID:53-S", "...dendritic cell (HLA-DPB1, SNX3, ...)"),
              c("GO", "GO:0032757-S", "...interleukin-8... (CD14, FCN1, ...)"),
              c("GO", "GO:0032647-S", "...interferon-alpha... (IRF7, PTPRS, ...)"),
              c("KG", "path:hsa04660-S", "T cell receptor... (CD8A, CD3E, ...)"),
              c("KG", "path:hsa04664-V", "Fc epsilon RI... (FCER1A, ALOX5AP, ...)"))
ylabels <- list(GE = "Expression", CB = "Sign score", GO = "Sign score",
                KG = "Sign score")
for(i in seq_along(vlist)){
  if(vlist[[i]][1] == "GE"){
    ind <- which(rownames(altExp(pbmcs$CB, "logcounts")) == vlist[[i]][2])
    subsce <- altExp(pbmcs$CB, "logcounts")[ind, ]
    df <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
  }else{
    ind <- which(rownames(pbmcs[[vlist[[i]][1]]]) == vlist[[i]][2])
    subsce <- pbmcs[[vlist[[i]][1]]][ind, ]
    df <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
  }
  p <- ggplot2::ggplot() +
    ggplot2::geom_violin(ggplot2::aes(x = as.factor(labels), y = df[, 1],
                                      fill = labels), trim = FALSE, size = 0.5) +
    ggplot2::geom_boxplot(ggplot2::aes(x = as.factor(labels), y = df[, 1]),
                          width = 0.15, alpha = 0.6) +
    ggplot2::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
                  x = "Cluster (cell type)", y = ylabels[[vlist[[i]][1]]]) +
    ggplot2::theme_classic(base_size = 25) +
    ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
                   legend.position = "none") +
    ggplot2::scale_fill_hue()
  filename <- sprintf("figures/figure_06_%04d.png", 49 + i)
  ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 6, height = 3.7)
}

Show violin plots for the sign score distributions across given cell type-related clusters.

labels <- colData(pbmcs$CB)$seurat_clusters
vlist <- list(c("KG", "path:hsa00071-S", "Fatty acid degradation (ACAT1, ...)"),
              c("KG", "path:hsa00010-S", "Glycolysis / Gluconeogenesis (ENO1, ...)"),
              c("KG", "path:hsa04514-V", "Cell adhesion molecules (ITGA4, ...)"),
              c("GO", "GO:0010498-S", "Proteasomal protein catabolic... (PSMB2, ...)"),
              c("GO", "GO:0044262-S", "Cellular carbohydrate metabolic... (PGD, ...)"))
mycolor <- scales::hue_pal()(11)
mycolor <- c("0" = mycolor[1], "2" = mycolor[3], "3" = mycolor[4])
for(i in seq_along(vlist)){
  inds_sign <- which(rownames(pbmcs[[vlist[[i]][1]]]) == vlist[[i]][2])
  inds_cell <- which(colData(pbmcs$CB)$seurat_clusters %in% c(0, 2, 3))
  subsce <- pbmcs[[vlist[[i]][1]]][inds_sign, inds_cell]
  sublabels <- labels[inds_cell]
  df <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
  p <- ggplot2::ggplot() +
    ggplot2::geom_violin(ggplot2::aes(x = as.factor(sublabels), y = df[, 1],
                                      fill = sublabels), trim = FALSE, size = 0.5) +
    ggplot2::geom_boxplot(ggplot2::aes(x = as.factor(sublabels), y = df[, 1]),
                          width = 0.15, alpha = 0.6) +
    ggplot2::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
                  x = "Cluster (cell type)", y = "Sign score") +
    ggplot2::theme_classic(base_size = 25) +
    ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
                   legend.position = "none") +
    ggplot2::scale_fill_manual(values = mycolor)
  filename <- sprintf("figures/figure_06_%04d.png", 69 + i)
  ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 3.7)
}

Show violin plots for the gene expression distributions across given cell type-related clusters.

labels <- colData(pbmcs$CB)$seurat_clusters
vlist <- list(c("GE", "LILRA4", "(Plasmacytoid dendritic cells)"))
mycolor <- scales::hue_pal()(11)
mycolor <- c("6" = mycolor[7], "7" = mycolor[8], "9" = mycolor[10], "10" = mycolor[11])
for(i in seq_along(vlist)){
  inds_gene <- which(rownames(altExp(pbmcs$CB, "logcounts")) == vlist[[i]][2])
  inds_cell <- which(colData(pbmcs$CB)$seurat_clusters %in% c(6, 7, 9, 10))
  subsce <- altExp(pbmcs$CB, "logcounts")[inds_gene, inds_cell]
  sublabels <- labels[inds_cell]
  df <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
  p <- ggplot2::ggplot() +
    ggplot2::geom_violin(ggplot2::aes(x = as.factor(sublabels), y = df[, 1],
                                      fill = sublabels), trim = FALSE, size = 0.5) +
    ggplot2::geom_boxplot(ggplot2::aes(x = as.factor(sublabels), y = df[, 1]),
                          width = 0.15, alpha = 0.6) +
    ggplot2::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
                  x = "Cluster (cell type)", y = "Expression") +
    ggplot2::theme_classic(base_size = 25) +
    ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
                   legend.position = "none") +
    ggplot2::scale_fill_manual(values = mycolor)
  filename <- sprintf("figures/figure_06_%04d.png", 74 + i)
  ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 4)
}


6.11 Infer cell types

Classify cells into large cell type categories.

colData(pbmcs$CB)$cell_type <- as.character(colData(pbmcs$CB)$seurat_clusters)
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 0] <- "Mono"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 1] <- "T"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 2] <- "Mono"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 3] <- "Mono" # or T
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 4] <- "NK/NKT"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 5] <- "B"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 6] <- "DC"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 7] <- "DC"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 8] <- "T"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 9] <- "DC"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 10] <- "DC"

Classify cells into the detailed categories.

colData(pbmcs$CB)$cell_state <- as.character(colData(pbmcs$CB)$seurat_clusters)
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 0] <- "Mono-1"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 1] <- "T-1"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 2] <- "Mono-2"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 3] <- "Mono-3" # or T
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 4] <- "NK/NKT"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 5] <- "B"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 6] <- "DC-1"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 7] <- "DC-2"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 8] <- "T-2"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 9] <- "DC-3"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 10] <- "DC-4"

Show the annotation results in low-dimensional spaces.

title <- "Reyes 2020 (cell state)"
labels <- factor(colData(pbmcs$CB)$cell_state,
                 levels = c("Mono-1", "T-1", "Mono-2", "Mono-3", "NK/NKT",
                            "B", "DC-1", "DC-2", "T-2", "DC-3", "DC-4"))
df <- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 0.1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "Cell state") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_fill_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_06_0080.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.8, height = 4.3)

Check the reported sample information (Reyes et al., 2020) in low-dimensional spaces.

types <- c("Sort", "Patient", "Cell_Type")
title <- "Reyes 2020 (cell type)"
for(i in seq_along(types)){
  labels <- colData(pbmcs$CB)[[types[i]]]
  df <- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
  p <- ggplot2::ggplot() +
    ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                        size = 0.1, alpha = 1) +
    ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = types[i]) +
    ggplot2::theme_classic(base_size = 20) +
    ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
    ggplot2::scale_fill_hue() +
    ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
  filename <- sprintf("figures/figure_06_%04d.png", 80 + i)
  if(types[i] == "Sort"){
    ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.6, height = 4.3)
  }else if(types[i] == "Patient"){
    ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 9.3, height = 4.3)
  }else if(types[i] == "Cell_Type"){
    ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.8, height = 4.3)
  }
}