Several computations for PBMC datasets

Keita Iida

2022-07-14

1 Computational environment

MacBook Pro (Big Sur, 16-inch, 2019), Processor (2.4 GHz 8-Core Intel Core i9), Memory (64 GB 2667 MHz DDR4).


2 Install libraries

Attach necessary libraries:

library(ASURAT)
library(SingleCellExperiment)
library(SummarizedExperiment)


3 Introduction

In this vignette, we create figures for the computational results of PBMC datasets.


4 Compare population ratios among existing methods

Population ratios inferred by the existing methods and ASURAT are compared.


4.1 PBMC 4k

Load data.

pbmc_asurat <- readRDS("backup/04_006_pbmcs4k_annotation.rds")
pbmc_scran <- readRDS("backup/04_011_pbmc4k_scran.rds")
pbmc_seurat <- readRDS("backup/04_021_pbmc4k_seurat.rds")
pbmc_sccatch <- readRDS("backup/04_022_pbmc4k_seurat_sccatch.rds")
pbmc_monocle3 <- readRDS("backup/04_031_pbmc4k_monocle3.rds")
pbmc_sc3 <- readRDS("backup/04_041_pbmc4k_sc3.rds")

pbmcs <- list(scran = pbmc_scran, Seurat = pbmc_seurat, scCATCH = pbmc_sccatch,
              Monocle3 = pbmc_monocle3, SC3 = pbmc_sc3, ASURAT = pbmc_asurat$CB)

The hypothetical results can be obtained from Cao et al., Front. Genet. (2020).

ct <- c("T", "Mono", "B", "NK/NKT", "Megakaryocyte", "DC", "Unspecified")
cr <- c(0.4226, 0.2707, 0.1509, 0.1546, 0.0012, 0, 0)
res <- data.frame(cell_type = ct, ratio = cr, method = "SCSA")

Prepare a data frame to be plotted.

cells <- c("B", "DC", "Megakaryocyte", "Mono", "NK/NKT", "T", "Unspecified")
for(i in seq_along(pbmcs)){
  if(class(pbmcs[[i]]) == "SingleCellExperiment"){
    df <- as.data.frame(colData(pbmcs[[i]]))
  }else if(class(pbmcs[[i]]) == "cell_data_set"){
    df <- as.data.frame(colData(pbmcs[[i]]))
  }else if(class(pbmcs[[i]]) == "Seurat"){
    df <- pbmcs[[i]][[]]
  }
  df <- dplyr::group_by(df, cell_type)
  df <- dplyr::summarise(df, ratio = dplyr::n())
  df$ratio <- as.numeric(df$ratio) / sum(as.numeric(df$ratio))
  df$method <- names(pbmcs)[i]
  for(k in seq_along(cells)){
    if(!(cells[k] %in% df$cell_type)){
      tmp <- data.frame(cell_type = cells[k], ratio = 0, method = names(pbmcs)[i])
      df <- rbind(df, tmp)
    }
  }
  res <- rbind(res, df)
}

Change the names of cell types.

res$cell_type[res$cell_type == "T"] <- "T cell"
res$cell_type[res$cell_type == "Mono"] <- "Monocyte"
res$cell_type[res$cell_type == "NK/NKT"] <- "NK or NKT cell"
res$cell_type[res$cell_type == "B"] <- "B cell"
res$cell_type[res$cell_type == "DC"] <- "Dendritic cell"

Show the population ratios.

res$method <- factor(res$method, levels = unique(res$method))
n_methods <- length(unique(res$method))
mycolors <- c("black", scales::brewer_pal(palette = "Set1")(n_methods))
mycolors[7] <- mycolors[6] ; mycolors[6] <- "#FFFF33"

ymax <- 0.55
p <- ggplot2::ggplot() +
  ggplot2::geom_bar(ggplot2::aes(x = res$cell_type, y = res$ratio,
                                 fill = as.factor(res$method)),
                    stat = "identity", width = 0.8,
                    position = ggplot2::position_dodge2(width = 1)) +
  ggplot2::theme_classic(base_size = 20, base_family = "Helvetica") +
  ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(angle = 45)) +
  ggplot2::ylim(0, ymax) +
  ggplot2::labs(title = "PBMC 4k", x = "", y = "Population ratio", fill = "Method") +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
                 legend.position = "right") +
  ggplot2::scale_fill_manual(values = mycolors)
filename <- "figures/figure_20_0010.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 11, height = 5)


4.2 PBMC 6k

Load data.

pbmc_scran <- readRDS("backup/05_011_pbmc6k_scran.rds")
pbmc_seurat <- readRDS("backup/05_021_pbmc6k_seurat.rds")
pbmc_sccatch <- readRDS("backup/05_022_pbmc6k_seurat_sccatch.rds")
pbmc_monocle3 <- readRDS("backup/05_031_pbmc6k_monocle3.rds")
pbmc_sc3 <- readRDS("backup/05_041_pbmc6k_sc3.rds")
pbmc_asurat <- readRDS("backup/05_006_pbmcs6k_annotation.rds")

pbmcs <- list(scran = pbmc_scran, Seurat = pbmc_seurat, scCATCH = pbmc_sccatch,
              Monocle3 = pbmc_monocle3, SC3 = pbmc_sc3, ASURAT = pbmc_asurat$CB)

The hypothetical results can be obtained from Cao et al., Front. Genet. (2020).

ct <- c("T", "Mono", "B", "NK/NKT", "Megakaryocyte", "DC", "Unspecified")
cr <- c(0.4920, 0.2652, 0.1292, 0.1102, 0.0035, 0, 0)
res <- data.frame(cell_type = ct, ratio = cr, method = "SCSA")

Prepare a data frame to be plotted.

cells <- c("B", "DC", "Megakaryocyte", "Mono", "NK/NKT", "T", "Unspecified")
for(i in seq_along(pbmcs)){
  if(class(pbmcs[[i]]) == "SingleCellExperiment"){
    df <- as.data.frame(colData(pbmcs[[i]]))
  }else if(class(pbmcs[[i]]) == "cell_data_set"){
    df <- as.data.frame(colData(pbmcs[[i]]))
  }else if(class(pbmcs[[i]]) == "Seurat"){
    df <- pbmcs[[i]][[]]
  }
  df <- dplyr::group_by(df, cell_type)
  df <- dplyr::summarise(df, ratio = dplyr::n())
  df$ratio <- as.numeric(df$ratio) / sum(as.numeric(df$ratio))
  df$method <- names(pbmcs)[i]
  for(k in seq_along(cells)){
    if(!(cells[k] %in% df$cell_type)){
      tmp <- data.frame(cell_type = cells[k], ratio = 0, method = names(pbmcs)[i])
      df <- rbind(df, tmp)
    }
  }
  res <- rbind(res, df)
}

Change the names of cell types.

res$cell_type[res$cell_type == "T"] <- "T cell"
res$cell_type[res$cell_type == "Mono"] <- "Monocyte"
res$cell_type[res$cell_type == "NK/NKT"] <- "NK or NKT cell"
res$cell_type[res$cell_type == "B"] <- "B cell"
res$cell_type[res$cell_type == "DC"] <- "Dendritic cell"

Show the population ratios.

res$method <- factor(res$method, levels = unique(res$method))
n_methods <- length(unique(res$method))
mycolors <- c("black", scales::brewer_pal(palette = "Set1")(n_methods))
mycolors[7] <- mycolors[6] ; mycolors[6] <- "#FFFF33"

ymax <- 0.68
p <- ggplot2::ggplot() +
  ggplot2::geom_bar(ggplot2::aes(x = res$cell_type, y = res$ratio,
                                 fill = as.factor(res$method)),
                    stat = "identity", width = 0.8,
                    position = ggplot2::position_dodge2(width = 1)) +
  ggplot2::theme_classic(base_size = 20, base_family = "Helvetica") +
  ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(angle = 45)) +
  ggplot2::ylim(0, ymax) +
  ggplot2::labs(title = "PBMC 6k", x = "", y = "Population ratio",
                fill = "Method") +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
                 legend.position = "right") +
  ggplot2::scale_fill_manual(values = mycolors)
filename <- "figures/figure_20_0020.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 11, height = 5)


4.3 Reyes et al., 2020

Load data.

pbmc_scran <- readRDS("backup/06_011_pbmc130k_scran.rds")
pbmc_seurat <- readRDS("backup/06_021_pbmc130k_seurat.rds")
pbmc_sccatch <- readRDS("backup/06_022_pbmc130k_seurat_sccatch.rds")
pbmc_monocle3 <- readRDS("backup/06_031_pbmc130k_monocle3.rds")
#pbmc_sc3 <- readRDS("backup/06_041_pbmc130k_sc3.rds")
pbmc_asurat <- readRDS("backup/06_006_pbmcs130k_annotation.rds")

pbmcs <- list(scran = pbmc_scran, Seurat = pbmc_seurat, scCATCH = pbmc_sccatch,
              Monocle3 = pbmc_monocle3, ASURAT = pbmc_asurat$CB)

Load the sample information reported in previous paper (Reyes et al., 2020).

info <- read.delim("rawdata/2020_001_Reyes/SCP548/metadata/scp_meta.txt")
info <- info[-1, ] ; rownames(info) <- info$NAME ; info <- info[, -1]
info[which(info$Cell_Type == "NK"), ]$Cell_Type <- "NK/NKT"
tmp <- dplyr::group_by(info, Cell_Type)
tmp <- dplyr::summarise(tmp, ratio = dplyr::n())
tmp$ratio <- as.numeric(tmp$ratio) / sum(as.numeric(tmp$ratio))
df <- data.frame(cell_type = tmp$Cell_Type, ratio = tmp$ratio, method = "Reyes_2020")
res <- rbind(df, c("Unspecified", 0, "Reyes_2020"))

Prepare a data frame to be plotted.

cells <- c("B", "DC", "Megakaryocyte", "Mono", "NK/NKT", "T", "Unspecified")
for(i in seq_along(pbmcs)){
  if(class(pbmcs[[i]]) == "SingleCellExperiment"){
    df <- as.data.frame(colData(pbmcs[[i]]))
  }else if(class(pbmcs[[i]]) == "cell_data_set"){
    df <- as.data.frame(colData(pbmcs[[i]]))
  }else if(class(pbmcs[[i]]) == "Seurat"){
    df <- pbmcs[[i]][[]]
  }
  df <- dplyr::group_by(df, cell_type)
  df <- dplyr::summarise(df, ratio = dplyr::n())
  df$ratio <- as.numeric(df$ratio) / sum(as.numeric(df$ratio))
  df$method <- names(pbmcs)[i]
  for(k in seq_along(cells)){
    if(!(cells[k] %in% df$cell_type)){
      tmp <- data.frame(cell_type = cells[k], ratio = 0, method = names(pbmcs)[i])
      df <- rbind(df, tmp)
    }
  }
  res <- rbind(res, df)
}
res$ratio <- as.numeric(res$ratio)

Change the names of cell types.

res$cell_type[res$cell_type == "T"] <- "T cell"
res$cell_type[res$cell_type == "Mono"] <- "Monocyte"
res$cell_type[res$cell_type == "NK/NKT"] <- "NK or NKT cell"
res$cell_type[res$cell_type == "B"] <- "B cell"
res$cell_type[res$cell_type == "DC"] <- "Dendritic cell"

Show the population ratios.

res$method <- factor(res$method, levels = unique(res$method))
n_methods <- length(unique(res$method))
mycolors <- c("black", scales::brewer_pal(palette = "Set1")(n_methods + 1))
mycolors[7] <- mycolors[6]

ymax <- 0.65
p <- ggplot2::ggplot() +
  ggplot2::geom_bar(ggplot2::aes(x = res$cell_type, y = res$ratio,
                                 fill = as.factor(res$method)),
                    stat = "identity", width = 0.8,
                    position = ggplot2::position_dodge2(width = 1)) +
  ggplot2::theme_classic(base_size = 20, base_family = "Helvetica") +
  ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(angle = 45)) +
  ggplot2::ylim(0, ymax) +
  ggplot2::labs(title = "Reyes 2020", x = "", y = "Population ratio",
                fill = "Method") +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
                 legend.position = "right") +
  ggplot2::scale_fill_manual(values = mycolors)
filename <- "figures/figure_20_0030.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 11, height = 5)


6 Investigate the dependency of ASURAT on the number of cells

To investigate the dependency of ASURAT on the number of cells, benchmark ASURAT performance by randomly downsampling cells.


6.1 PBMC 4k (try 1)

6.1.1 Load data

load data.

pbmc <- readRDS("backup/04_003_pbmc4k_normalized.rds")

Load databases.

urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=TRUE")))         # CO
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=TRUE")))     # MSigDB
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=TRUE"))) # CellMarker

Create a custom-built cell type-related databases by combining different databases for analyzing human single-cell transcriptome data.

d <- list(human_CO[["cell"]], human_MSigDB[["cell"]], human_CellMarker[["cell"]])
human_CB <- list(cell = do.call("rbind", d))


6.1.2 Random downsampling (4000 cells)

Perform random sampling of 4000 cells.

set.seed(1)
inds <- sample(dim(pbmc)[2], 4000, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")


6.1.2.1 ASURAT

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(40))
surt <- Seurat::FindClusters(surt, resolution = 0.15)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in low-dimensional spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 4000 cells (cell type)", x = "tSNE_1", y = "tSNE_2",
                color = "") +
  ggplot2::theme_classic(base_size = 20) + ggplot2::scale_colour_hue() +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0100.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Investigate significant signs.
set.seed(1)
labels <- colData(sce$CB)$seurat_clusters
sce$CB <- compute_sepI_all(sce = sce$CB, labels = labels, nrand_samples = NULL)
# Investigate significant genes.
set.seed(1)
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(sce$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(sce$CB)$marker_genes$all <- res
View(metadata(sce$CB)$marker_signs$all)
View(metadata(sce$CB)$marker_genes$all)
# Infer cell types.
# 0: T cell          # TRAC (p_val_adj ~0), EEF1A1 (p_val_adj ~0)
# 1: Monocyte        # MSigDBID:13-S (...KUPFFER_CELLS_3) (sepI ~0.99)
# 2: NK/NKT          # MSigDBID:1-V (...NK_NKT_CELLS_1) (sepI ~0.99)
# 3: B cell          # MSigDBID:23-V (...B_CELLS) (sepI ~0.94)
# 4: Dendritic cell  # CellMarkerID:72-S (Dendritic cell) (sepI ~0.98)
colData(sce$CB)$cell_type <- as.character(colData(sce$CB)$seurat_clusters)
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 0] <- "T"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 1] <- "Mono"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 2] <- "NK/NKT"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 3] <- "B"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 4] <- "DC"
# Save data.
saveRDS(sce, file = "backup/20_001_pbmc4000_asurat.rds")
# Load data.
sce <- readRDS("backup/20_001_pbmc4000_asurat.rds")


6.1.2.2 Seurat

Run Seurat protocols.

# Create Seurat objects.
surt <- Seurat::CreateSeuratObject(counts = as.matrix(assay(subpbmc, "counts")),
                                   project = "PBMC")
# Normalization
surt <- Seurat::NormalizeData(surt, normalization.method = "LogNormalize")
# Variance stabilizing transform
n <- round(0.9 * ncol(surt))
surt <- Seurat::FindVariableFeatures(surt, selection.method = "vst", nfeatures = n)
# Scale data
surt <- Seurat::ScaleData(surt)
# Principal component analysis
surt <- Seurat::RunPCA(surt, features = Seurat::VariableFeatures(surt))
# Create k-nearest neighbor graph.
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dim = seq_len(40))
# Cluster cells.
surt <- Seurat::FindClusters(surt, resolution = 0.05)
# Run t-SNE.
surt <- Seurat::RunTSNE(surt, dims.use = seq_len(2), reduction = "pca",
                        dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Show the clustering results.
title <- "PBMC 4000 cells (Seurat)"
labels <- surt$seurat_clusters
df <- surt@reductions[["tsne"]]@cell.embeddings
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0105.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Find differentially expressed genes.
surt@misc$stat <- Seurat::FindAllMarkers(surt, only.pos = TRUE, min.pct = 0.25,
                                         logfc.threshold = 0.25)
View(surt@misc$stat[which(surt@misc$stat$p_val_adj < 10^(-100)), ])
# Infer cell types
# 0: T cell          # TRAC (p_val_adj ~0), EEF1A1 (p_val_adj ~0)
# 1: Monocyte        # S100A8 (p_val_adj ~0), S100A9 (p_val_adj ~0)
# 2: NK/NKT          # NKG7 (p_val_adj ~0), GZMA (p_val_adj ~0)
# 3: B cell          # CD79A (p_val_adj ~0), CD79B (p_val_adj ~0)
# 4: Dendritic cell  # LILRA4 (p_val_adj ~e-271), LILRB4 (p_val_adj ~e-116)
tmp <- as.integer(as.character(surt$seurat_clusters))
surt$cell_type <- tmp
surt$cell_type[surt$cell_type == 0] <- "T"
surt$cell_type[surt$cell_type == 1] <- "Mono"
surt$cell_type[surt$cell_type == 2] <- "NK/NKT"
surt$cell_type[surt$cell_type == 3] <- "B"
surt$cell_type[surt$cell_type == 4] <- "DC"
# Save data.
saveRDS(surt, file = "backup/20_001_pbmc4000_seurat.rds")
# Load data.
surt <- readRDS("backup/20_001_pbmc4000_seurat.rds")


6.1.3 Random downsampling (3000 cells)

Perform random sampling of 3000 cells.

set.seed(1)
inds <- sample(dim(pbmc)[2], 3000, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")


6.1.3.1 ASURAT

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(40))
surt <- Seurat::FindClusters(surt, resolution = 0.10)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in t-SNE spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 3000 cells (cell type)", x = "tSNE_1", y = "tSNE_2",
                color = "") +
  ggplot2::theme_classic(base_size = 20) + ggplot2::scale_colour_hue() +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0110.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Investigate significant signs.
set.seed(1)
labels <- colData(sce$CB)$seurat_clusters
sce$CB <- compute_sepI_all(sce = sce$CB, labels = labels, nrand_samples = NULL)
# Investigate significant genes.
set.seed(1)
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(sce$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(sce$CB)$marker_genes$all <- res
View(metadata(sce$CB)$marker_signs$all)
View(metadata(sce$CB)$marker_genes$all)
# Infer cell types.
# 0: T cell          # EEF1A1 (p_val_adj ~e-259), TRAC (p_val_adj ~e-243)
# 1: Monocyte        # MSigDBID:13-S (...KUPFFER_CELLS_3) (sepI ~0.99)
# 2: NK/NKT          # MSigDBID:1-V (...NK_NKT_CELLS_1) (sepI ~0.99)
# 3: B cell          # MSigDBID:23-V (...B_CELLS) (sepI ~0.94)
# 4: Dendritic cell  # CellMarkerID:72-S (Dendritic cell) (sepI ~0.98)
colData(sce$CB)$cell_type <- as.character(colData(sce$CB)$seurat_clusters)
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 0] <- "T"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 1] <- "Mono"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 2] <- "NK/NKT"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 3] <- "B"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 4] <- "DC"
# Save data.
saveRDS(sce, file = "backup/20_002_pbmc3000_asurat.rds")
# Load data.
sce <- readRDS("backup/20_002_pbmc3000_asurat.rds")


6.1.3.2 Seurat

Run Seurat protocols.

# Create Seurat objects.
surt <- Seurat::CreateSeuratObject(counts = as.matrix(assay(subpbmc, "counts")),
                                   project = "PBMC")
# Normalization
surt <- Seurat::NormalizeData(surt, normalization.method = "LogNormalize")
# Variance stabilizing transform
n <- round(0.9 * ncol(surt))
surt <- Seurat::FindVariableFeatures(surt, selection.method = "vst", nfeatures = n)
# Scale data
surt <- Seurat::ScaleData(surt)
# Principal component analysis
surt <- Seurat::RunPCA(surt, features = Seurat::VariableFeatures(surt))
# Create k-nearest neighbor graph.
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dim = seq_len(40))
# Cluster cells.
surt <- Seurat::FindClusters(surt, resolution = 0.05)
# Run t-SNE.
surt <- Seurat::RunTSNE(surt, dims.use = seq_len(2), reduction = "pca",
                        dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Show the clustering results.
title <- "PBMC 3000 cells (Seurat)"
labels <- surt$seurat_clusters
df <- surt@reductions[["tsne"]]@cell.embeddings
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0115.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Find differentially expressed genes.
surt@misc$stat <- Seurat::FindAllMarkers(surt, only.pos = TRUE, min.pct = 0.25,
                                         logfc.threshold = 0.25)
View(surt@misc$stat[which(surt@misc$stat$p_val_adj < 10^(-100)), ])
# Infer cell types
# 0: T cell          # TRAC (p_val_adj ~e-270), EEF1A1 (p_val_adj ~e-239)
# 1: Monocyte        # S100A8 (p_val_adj ~0), S100A9 (p_val_adj ~0)
# 2: NK/NKT          # NKG7 (p_val_adj ~0), GZMA (p_val_adj ~0)
# 3: B cell          # CD79A (p_val_adj ~0), CD79B (p_val_adj ~0)
# 4: Dendritic cell  # LILRA4 (p_val_adj ~e-204), LILRB4 (p_val_adj ~e-102)
tmp <- as.integer(as.character(surt$seurat_clusters))
surt$cell_type <- tmp
surt$cell_type[surt$cell_type == 0] <- "T"
surt$cell_type[surt$cell_type == 1] <- "Mono"
surt$cell_type[surt$cell_type == 2] <- "NK/NKT"
surt$cell_type[surt$cell_type == 3] <- "B"
surt$cell_type[surt$cell_type == 4] <- "DC"
# Save data.
saveRDS(surt, file = "backup/20_002_pbmc3000_seurat.rds")
# Load data.
surt <- readRDS("backup/20_002_pbmc3000_seurat.rds")


6.1.4 Random downsampling (2000 cells)

Perform random sampling of 2000 cells.

set.seed(1)
inds <- sample(dim(pbmc)[2], 2000, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")


6.1.4.1 ASURAT

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(10))
surt <- Seurat::FindClusters(surt, resolution = 0.13)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in t-SNE spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 2000 cells (cell type)", x = "tSNE_1", y = "tSNE_2",
                color = "") +
  ggplot2::theme_classic(base_size = 20) + ggplot2::scale_colour_hue() +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0120.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Investigate significant signs.
set.seed(1)
labels <- colData(sce$CB)$seurat_clusters
sce$CB <- compute_sepI_all(sce = sce$CB, labels = labels, nrand_samples = NULL)
# Investigate significant genes.
set.seed(1)
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(sce$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(sce$CB)$marker_genes$all <- res
View(metadata(sce$CB)$marker_signs$all)
View(metadata(sce$CB)$marker_genes$all)
# Infer cell types.
# 0: T cell          # EEF1A1 (p_val_adj ~e-178), TRAC (p_val_adj ~e-156)
# 1: Monocyte        # MSigDBID:13-S (...KUPFFER_CELLS_3) (sepI ~0.99)
# 2: NK/NKT          # MSigDBID:1-V (...NK_NKT_CELLS_1) (sepI ~0.97)
# 3: B cell          # MSigDBID:23-V (...B_CELLS) (sepI ~0.94)
# 4: Dendritic cell  # CellMarkerID:72-S (Dendritic cell) (sepI ~0.98)
colData(sce$CB)$cell_type <- as.character(colData(sce$CB)$seurat_clusters)
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 0] <- "T"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 1] <- "Mono"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 2] <- "NK/NKT"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 3] <- "B"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 4] <- "DC"
# Save data.
saveRDS(sce, file = "backup/20_003_pbmc2000_asurat.rds")
# Load data.
sce <- readRDS("backup/20_003_pbmc2000_asurat.rds")


6.1.4.2 Seurat

Run Seurat protocols.

# Create Seurat objects.
surt <- Seurat::CreateSeuratObject(counts = as.matrix(assay(subpbmc, "counts")),
                                   project = "PBMC")
# Normalization
surt <- Seurat::NormalizeData(surt, normalization.method = "LogNormalize")
# Variance stabilizing transform
n <- round(0.9 * ncol(surt))
surt <- Seurat::FindVariableFeatures(surt, selection.method = "vst", nfeatures = n)
# Scale data
surt <- Seurat::ScaleData(surt)
# Principal component analysis
surt <- Seurat::RunPCA(surt, features = Seurat::VariableFeatures(surt))
# Create k-nearest neighbor graph.
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dim = seq_len(10))
# Cluster cells.
surt <- Seurat::FindClusters(surt, resolution = 0.06)
# Run t-SNE.
surt <- Seurat::RunTSNE(surt, dims.use = seq_len(2), reduction = "pca",
                        dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Show the clustering results.
title <- "PBMC 2000 cells (Seurat)"
labels <- surt$seurat_clusters
df <- surt@reductions[["tsne"]]@cell.embeddings
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0125.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Find differentially expressed genes.
surt@misc$stat <- Seurat::FindAllMarkers(surt, only.pos = TRUE, min.pct = 0.25,
                                         logfc.threshold = 0.25)
View(surt@misc$stat[which(surt@misc$stat$p_val_adj < 10^(-100)), ])
# Infer cell types
# 0: T cell          # TRAC (p_val_adj ~e-177), EEF1A1 (p_val_adj ~e-158)
# 1: Monocyte        # FCN1 (p_val_adj ~0), MNDA (p_val_adj ~0)
# 2: NK/NKT          # GZMA (p_val_adj ~e-238), NKG7 (p_val_adj ~e-220)
# 3: B cell          # CD79A (p_val_adj ~e-302), MS4A1 (p_val_adj ~e-300)
# 4: Dendritic cell? # LILRA4 (p_val_adj ~e-129)
tmp <- as.integer(as.character(surt$seurat_clusters))
surt$cell_type <- tmp
surt$cell_type[surt$cell_type == 0] <- "T"
surt$cell_type[surt$cell_type == 1] <- "Mono"
surt$cell_type[surt$cell_type == 2] <- "NK/NKT"
surt$cell_type[surt$cell_type == 3] <- "B"
surt$cell_type[surt$cell_type == 4] <- "DC"
# Save data.
saveRDS(surt, file = "backup/20_003_pbmc2000_seurat.rds")
# Load data.
surt <- readRDS("backup/20_003_pbmc2000_seurat.rds")


6.1.5 Random downsampling (1500 cells)

Perform random sampling of 1500 cells.

set.seed(1)
inds <- sample(dim(pbmc)[2], 1500, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")


6.1.5.1 ASURAT

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(50))
surt <- Seurat::FindClusters(surt, resolution = 0.40)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in t-SNE spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 1500 cells (cell type)", x = "tSNE_1", y = "tSNE_2",
                color = "") +
  ggplot2::theme_classic(base_size = 20) + ggplot2::scale_colour_hue() +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0130.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Investigate significant signs.
set.seed(1)
labels <- colData(sce$CB)$seurat_clusters
sce$CB <- compute_sepI_all(sce = sce$CB, labels = labels, nrand_samples = NULL)
# Investigate significant genes.
set.seed(1)
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(sce$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(sce$CB)$marker_genes$all <- res
View(metadata(sce$CB)$marker_signs$all)
View(metadata(sce$CB)$marker_genes$all)
# Infer cell types.
# 0: T cell          # EEF1A1 (p_val_adj ~e-140), TRAC (p_val_adj ~e-121)
# 1: Monocyte        # MSigDBID:20-S (...KUPFFER_CELLS_5) (sepI ~0.96)
# 2: NK/NKT cell     # MSigDBID:1-V (...NK_NKT_CELLS_5) (sepI ~0.97)
# 3: B cell          # MSigDBID:23-V (...B_CELLS) (sepI ~0.95)
# 4: Monocyte          # MSigDBID:65-S (...MACROPHAGES) (sepI ~0.96)
colData(sce$CB)$cell_type <- as.character(colData(sce$CB)$seurat_clusters)
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 0] <- "T"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 1] <- "Mono"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 2] <- "NK/NKT"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 3] <- "B"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 4] <- "Mono"
# Save data.
saveRDS(sce, file = "backup/20_004_pbmc1500_asurat.rds")
# Load data.
sce <- readRDS("backup/20_004_pbmc1500_asurat.rds")

Run Seurat protocols.

# Create Seurat objects.
surt <- Seurat::CreateSeuratObject(counts = as.matrix(assay(subpbmc, "counts")),
                                   project = "PBMC")
# Normalization
surt <- Seurat::NormalizeData(surt, normalization.method = "LogNormalize")
# Variance stabilizing transform
n <- round(0.9 * ncol(surt))
surt <- Seurat::FindVariableFeatures(surt, selection.method = "vst", nfeatures = n)
# Scale data
surt <- Seurat::ScaleData(surt)
# Principal component analysis
surt <- Seurat::RunPCA(surt, features = Seurat::VariableFeatures(surt))
# Create k-nearest neighbor graph.
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dim = seq_len(50))
# Cluster cells.
surt <- Seurat::FindClusters(surt, resolution = 0.10)
# Run t-SNE.
surt <- Seurat::RunTSNE(surt, dims.use = seq_len(2), reduction = "pca",
                        dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Show the clustering results.
title <- "PBMC 1500 cells (Seurat)"
labels <- surt$seurat_clusters
df <- surt@reductions[["tsne"]]@cell.embeddings
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0135.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Find differentially expressed genes.
surt@misc$stat <- Seurat::FindAllMarkers(surt, only.pos = TRUE, min.pct = 0.25,
                                         logfc.threshold = 0.25)
View(surt@misc$stat[which(surt@misc$stat$p_val_adj < 10^(-100)), ])
# Infer cell types
# 0: T cell          # TRAC (p_val_adj ~e-138), EEF1A1 (p_val_adj ~e-121)
# 1: Monocyte        # MNDA (p_val_adj ~e-228), FCN1 (p_val_adj ~e-217)
# 2: NK/NKT          # GZMA (p_val_adj ~e-173), NKG7 (p_val_adj ~e-164)
# 3: B cell          # CD79A (p_val_adj ~e-231), MS4A1 (p_val_adj ~e-226)
# 4: Dendritic cell? # FCER1A (p_val_adj ~e-176)
tmp <- as.integer(as.character(surt$seurat_clusters))
surt$cell_type <- tmp
surt$cell_type[surt$cell_type == 0] <- "T"
surt$cell_type[surt$cell_type == 1] <- "Mono"
surt$cell_type[surt$cell_type == 2] <- "NK/NKT"
surt$cell_type[surt$cell_type == 3] <- "B"
surt$cell_type[surt$cell_type == 4] <- "DC"
# Save data.
saveRDS(surt, file = "backup/20_004_pbmc1500_seurat.rds")
# Load data.
surt <- readRDS("backup/20_004_pbmc1500_seurat.rds")


6.1.6 Random downsampling (1000 cells)

Perform random sampling of 1000 cells.

set.seed(1)
inds <- sample(dim(pbmc)[2], 1000, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(50))
surt <- Seurat::FindClusters(surt, resolution = 0.50)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in t-SNE spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 1000 cells (cell type)", x = "tSNE_1", y = "tSNE_2",
                color = "") +
  ggplot2::theme_classic(base_size = 20) + ggplot2::scale_colour_hue() +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0140.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Investigate significant signs.
set.seed(1)
labels <- colData(sce$CB)$seurat_clusters
sce$CB <- compute_sepI_all(sce = sce$CB, labels = labels, nrand_samples = NULL)
# Investigate significant genes.
set.seed(1)
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(sce$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(sce$CB)$marker_genes$all <- res
View(metadata(sce$CB)$marker_signs$all)
View(metadata(sce$CB)$marker_genes$all)
# Infer cell types.
# 0: Unspecified     # No significant genes are detected.
# 1: Monocyte        # MSigDBID:15-S (...KUPFFER_CELLS_4) (sepI ~0.99)
# 2: NK/NKT          # MSigDBID:1-V (...NK_NKT_CELLS_1) (sepI ~0.93)
# 3: B cell          # MSigDBID:23-V (...B_CELLS) (sepI ~0.95)
colData(sce$CB)$cell_type <- as.character(colData(sce$CB)$seurat_clusters)
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 0] <- "Unspecified"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 1] <- "Mono"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 2] <- "NK/NKT"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 3] <- "B"
# Save data.
saveRDS(sce, file = "backup/20_005_pbmc1000_asurat.rds")
# Load data.
sce <- readRDS("backup/20_005_pbmc1000_asurat.rds")

Run Seurat protocols.

# Create Seurat objects.
surt <- Seurat::CreateSeuratObject(counts = as.matrix(assay(subpbmc, "counts")),
                                   project = "PBMC")
# Normalization
surt <- Seurat::NormalizeData(surt, normalization.method = "LogNormalize")
# Variance stabilizing transform
n <- round(0.9 * ncol(surt))
surt <- Seurat::FindVariableFeatures(surt, selection.method = "vst", nfeatures = n)
# Scale data
surt <- Seurat::ScaleData(surt)
# Principal component analysis
surt <- Seurat::RunPCA(surt, features = Seurat::VariableFeatures(surt))
# Create k-nearest neighbor graph.
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dim = seq_len(50))
# Cluster cells.
surt <- Seurat::FindClusters(surt, resolution = 0.12)
# Run t-SNE.
surt <- Seurat::RunTSNE(surt, dims.use = seq_len(2), reduction = "pca",
                        dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Show the clustering results.
title <- "PBMC 1000 cells (Seurat)"
labels <- surt$seurat_clusters
df <- surt@reductions[["tsne"]]@cell.embeddings
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_20_0145.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Find differentially expressed genes.
surt@misc$stat <- Seurat::FindAllMarkers(surt, only.pos = TRUE, min.pct = 0.25,
                                         logfc.threshold = 0.25)
View(surt@misc$stat[which(surt@misc$stat$p_val_adj < 10^(-100)), ])
# Infer cell types
# 0: Unspecified     # No significant genes are detected.
# 1: Monocyte        # MNDA (p_val_adj ~e-167), FCN1 (p_val_adj ~e-161)
# 2: NK/NKT          # GZMA (p_val_adj ~e-113), NKG7 (p_val_adj ~e-111)
# 3: B cell          # CD79A (p_val_adj ~e-144), MS4A1 (p_val_adj ~e-132)
tmp <- as.integer(as.character(surt$seurat_clusters))
surt$cell_type <- tmp
surt$cell_type[surt$cell_type == 0] <- "Unspecified"
surt$cell_type[surt$cell_type == 1] <- "Mono"
surt$cell_type[surt$cell_type == 2] <- "NK/NKT"
surt$cell_type[surt$cell_type == 3] <- "B"
# Save data.
saveRDS(surt, file = "backup/20_005_pbmc1000_seurat.rds")
# Load data.
surt <- readRDS("backup/20_005_pbmc1000_seurat.rds")


6.2 PBMC 4k (try 2)

6.2.1 Load data

load data.

pbmc <- readRDS("backup/04_003_pbmc4k_normalized.rds")

Load databases.

urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=TRUE")))         # CO
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=TRUE")))     # MSigDB
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=TRUE"))) # CellMarker

Create a custom-built cell type-related databases by combining different databases for analyzing human single-cell transcriptome data.

d <- list(human_CO[["cell"]], human_MSigDB[["cell"]], human_CellMarker[["cell"]])
human_CB <- list(cell = do.call("rbind", d))


6.2.2 Random downsampling (4000 cells)

Perform random sampling of 4000 cells.

set.seed(2)
inds <- sample(dim(pbmc)[2], 4000, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")


6.2.2.1 ASURAT

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(40))
surt <- Seurat::FindClusters(surt, resolution = 0.15)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in t-SNE spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 4000 cells (cell type)", x = "tSNE_1", y = "tSNE_2",
                color = "") +
  ggplot2::theme_classic(base_size = 20) + ggplot2::scale_colour_hue() +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_21_0100.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Investigate significant signs.
set.seed(1)
labels <- colData(sce$CB)$seurat_clusters
sce$CB <- compute_sepI_all(sce = sce$CB, labels = labels, nrand_samples = NULL)
# Investigate significant genes.
set.seed(1)
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(sce$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(sce$CB)$marker_genes$all <- res
View(metadata(sce$CB)$marker_signs$all)
View(metadata(sce$CB)$marker_genes$all)
# Infer cell types.
# 0: T cell          # TRAC (p_val_adj ~0), EEF1A1 (p_val_adj ~0)
# 1: Monocyte        # MSigDBID:13-S (...KUPFFER_CELLS_3) (sepI ~0.99)
# 2: NK/NKT          # MSigDBID:1-V (...NK_NKT_CELLS_1) (sepI ~0.99)
# 3: B cell          # MSigDBID:23-V (...B_CELLS) (sepI ~0.94)
# 4: Dendritic cell  # CellMarkerID:72-S (Dendritic cell) (sepI ~0.98)
colData(sce$CB)$cell_type <- as.character(colData(sce$CB)$seurat_clusters)
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 0] <- "T"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 1] <- "Mono"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 2] <- "NK/NKT"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 3] <- "B"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 4] <- "DC"
# Save data.
saveRDS(sce, file = "backup/21_001_pbmc4000_asurat.rds")
# Load data.
sce <- readRDS("backup/21_001_pbmc4000_asurat.rds")


6.2.2.2 Seurat

Run Seurat protocols.

# Create Seurat objects.
surt <- Seurat::CreateSeuratObject(counts = as.matrix(assay(subpbmc, "counts")),
                                   project = "PBMC")
# Normalization
surt <- Seurat::NormalizeData(surt, normalization.method = "LogNormalize")
# Variance stabilizing transform
n <- round(0.9 * ncol(surt))
surt <- Seurat::FindVariableFeatures(surt, selection.method = "vst", nfeatures = n)
# Scale data
surt <- Seurat::ScaleData(surt)
# Principal component analysis
surt <- Seurat::RunPCA(surt, features = Seurat::VariableFeatures(surt))
# Create k-nearest neighbor graph.
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dim = seq_len(40))
# Cluster cells.
surt <- Seurat::FindClusters(surt, resolution = 0.05)
# Run t-SNE.
surt <- Seurat::RunTSNE(surt, dims.use = seq_len(2), reduction = "pca",
                        dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Show the clustering results.
title <- "PBMC 4000 cells (Seurat)"
labels <- surt$seurat_clusters
df <- surt@reductions[["tsne"]]@cell.embeddings
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_21_0105.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Find differentially expressed genes.
surt@misc$stat <- Seurat::FindAllMarkers(surt, only.pos = TRUE, min.pct = 0.25,
                                         logfc.threshold = 0.25)
View(surt@misc$stat[which(surt@misc$stat$p_val_adj < 10^(-100)), ])
# Infer cell types
# 0: T cell          # TRAC (p_val_adj ~0), EEF1A1 (p_val_adj ~0)
# 1: Monocyte        # S100A8 (p_val_adj ~0), S100A9 (p_val_adj ~0)
# 2: NK/NKT          # NKG7 (p_val_adj ~0), GZMA (p_val_adj ~0)
# 3: B cell          # CD79A (p_val_adj ~0), CD79B (p_val_adj ~0)
# 4: Dendritic cell  # LILRA4 (p_val_adj ~e-266), LILRB4 (p_val_adj ~e-113)
tmp <- as.integer(as.character(surt$seurat_clusters))
surt$cell_type <- tmp
surt$cell_type[surt$cell_type == 0] <- "T"
surt$cell_type[surt$cell_type == 1] <- "Mono"
surt$cell_type[surt$cell_type == 2] <- "NK/NKT"
surt$cell_type[surt$cell_type == 3] <- "B"
surt$cell_type[surt$cell_type == 4] <- "DC"
# Save data.
saveRDS(surt, file = "backup/21_001_pbmc4000_seurat.rds")
# Load data.
surt <- readRDS("backup/21_001_pbmc4000_seurat.rds")


6.2.3 Random downsampling (3000 cells)

Perform random sampling of 3000 cells.

set.seed(2)
inds <- sample(dim(pbmc)[2], 3000, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")


6.2.3.1 ASURAT

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(40))
surt <- Seurat::FindClusters(surt, resolution = 0.12)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in t-SNE spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 3000 cells (cell type)", x = "tSNE_1", y = "tSNE_2",
                color = "") +
  ggplot2::theme_classic(base_size = 20) + ggplot2::scale_colour_hue() +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_21_0110.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Investigate significant signs.
set.seed(1)
labels <- colData(sce$CB)$seurat_clusters
sce$CB <- compute_sepI_all(sce = sce$CB, labels = labels, nrand_samples = NULL)
# Investigate significant genes.
set.seed(1)
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(sce$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(sce$CB)$marker_genes$all <- res
View(metadata(sce$CB)$marker_signs$all)
View(metadata(sce$CB)$marker_genes$all)
# Infer cell types.
# 0: T cell          # EEF1A1 (p_val_adj ~e-266), TRAC (p_val_adj ~e-225)
# 1: Monocyte        # MSigDBID:13-S (...KUPFFER_CELLS_3) (sepI ~0.99)
# 2: NK/NKT          # MSigDBID:1-V (...NK_NKT_CELLS_1) (sepI ~0.98)
# 3: B cell          # MSigDBID:23-V (...B_CELLS) (sepI ~0.94)
# 4: Dendritic cell  # CellMarkerID:72-S (Dendritic cell) (sepI ~0.97)
colData(sce$CB)$cell_type <- as.character(colData(sce$CB)$seurat_clusters)
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 0] <- "T"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 1] <- "Mono"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 2] <- "NK/NKT"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 3] <- "B"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 4] <- "DC"
# Save data.
saveRDS(sce, file = "backup/21_002_pbmc3000_asurat.rds")
# Load data.
sce <- readRDS("backup/21_002_pbmc3000_asurat.rds")


6.2.3.2 Seurat

Run Seurat protocols.

# Create Seurat objects.
surt <- Seurat::CreateSeuratObject(counts = as.matrix(assay(subpbmc, "counts")),
                                   project = "PBMC")
# Normalization
surt <- Seurat::NormalizeData(surt, normalization.method = "LogNormalize")
# Variance stabilizing transform
n <- round(0.9 * ncol(surt))
surt <- Seurat::FindVariableFeatures(surt, selection.method = "vst", nfeatures = n)
# Scale data
surt <- Seurat::ScaleData(surt)
# Principal component analysis
surt <- Seurat::RunPCA(surt, features = Seurat::VariableFeatures(surt))
# Create k-nearest neighbor graph.
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dim = seq_len(50))
# Cluster cells.
surt <- Seurat::FindClusters(surt, resolution = 0.06)
# Run t-SNE.
surt <- Seurat::RunTSNE(surt, dims.use = seq_len(2), reduction = "pca",
                        dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Show the clustering results.
title <- "PBMC 3000 cells (Seurat)"
labels <- surt$seurat_clusters
df <- surt@reductions[["tsne"]]@cell.embeddings
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_21_0115.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Find differentially expressed genes.
surt@misc$stat <- Seurat::FindAllMarkers(surt, only.pos = TRUE, min.pct = 0.25,
                                         logfc.threshold = 0.25)
View(surt@misc$stat[which(surt@misc$stat$p_val_adj < 10^(-100)), ])
# Infer cell types
# 0: NK/NKT          # GZMA (p_val_adj ~e-268), NKG7 (p_val_adj ~e-162)
# 1: T cell          # EEF1A1 (p_val_adj ~e-164), TRAC (p_val_adj ~e-126)
# 2: Monocyte        # S100A8 (p_val_adj ~0), S100A9 (p_val_adj ~0)
# 3: B cell          # CD79A (p_val_adj ~0), CD79B (p_val_adj ~0)
# 4: Dendritic cell  # LILRA4 (p_val_adj ~e-178)
tmp <- as.integer(as.character(surt$seurat_clusters))
surt$cell_type <- tmp
surt$cell_type[surt$cell_type == 0] <- "NK/NKT"
surt$cell_type[surt$cell_type == 1] <- "T"
surt$cell_type[surt$cell_type == 2] <- "Mono"
surt$cell_type[surt$cell_type == 3] <- "B"
surt$cell_type[surt$cell_type == 4] <- "DC"
# Save data.
saveRDS(surt, file = "backup/21_002_pbmc3000_seurat.rds")
# Load data.
surt <- readRDS("backup/21_002_pbmc3000_seurat.rds")


6.2.4 Random downsampling (2000 cells)

Perform random sampling of 2000 cells.

set.seed(2)
inds <- sample(dim(pbmc)[2], 2000, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")


6.2.4.1 ASURAT

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(10))
surt <- Seurat::FindClusters(surt, resolution = 0.18)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in t-SNE spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 2000 cells (cell type)", x = "tSNE_1", y = "tSNE_2",
                color = "") +
  ggplot2::theme_classic(base_size = 20) + ggplot2::scale_colour_hue() +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_21_0120.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Investigate significant signs.
set.seed(1)
labels <- colData(sce$CB)$seurat_clusters
sce$CB <- compute_sepI_all(sce = sce$CB, labels = labels, nrand_samples = NULL)
# Investigate significant genes.
set.seed(1)
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(sce$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.25, logfc.threshold = 0.25)
metadata(sce$CB)$marker_genes$all <- res
View(metadata(sce$CB)$marker_signs$all)
View(metadata(sce$CB)$marker_genes$all)
# Infer cell types.
# 0: T cell          # EEF1A1 (p_val_adj ~e-169), TRAC (p_val_adj ~e-149)
# 1: Monocyte        # MSigDBID:13-S (...KUPFFER_CELLS_3) (sepI ~0.99)
# 2: NK/NKT          # MSigDBID:1-V (...NK_NKT_CELLS_1) (sepI ~0.97)
# 3: B cell          # MSigDBID:23-V (...B_CELLS) (sepI ~0.94)
# 4: Dendritic cell  # CellMarkerID:72-S (Dendritic cell) (sepI ~0.97)
colData(sce$CB)$cell_type <- as.character(colData(sce$CB)$seurat_clusters)
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 0] <- "T"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 1] <- "Mono"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 2] <- "NK/NKT"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 3] <- "B"
colData(sce$CB)$cell_type[colData(sce$CB)$cell_type == 4] <- "DC"
# Save data.
saveRDS(sce, file = "backup/21_003_pbmc2000_asurat.rds")
# Load data.
sce <- readRDS("backup/21_003_pbmc2000_asurat.rds")


6.2.4.2 Seurat

Run Seurat protocols.

# Create Seurat objects.
surt <- Seurat::CreateSeuratObject(counts = as.matrix(assay(subpbmc, "counts")),
                                   project = "PBMC")
# Normalization
surt <- Seurat::NormalizeData(surt, normalization.method = "LogNormalize")
# Variance stabilizing transform
n <- round(0.9 * ncol(surt))
surt <- Seurat::FindVariableFeatures(surt, selection.method = "vst", nfeatures = n)
# Scale data
surt <- Seurat::ScaleData(surt)
# Principal component analysis
surt <- Seurat::RunPCA(surt, features = Seurat::VariableFeatures(surt))
# Create k-nearest neighbor graph.
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dim = seq_len(10))
# Cluster cells.
surt <- Seurat::FindClusters(surt, resolution = 0.06)
# Run t-SNE.
surt <- Seurat::RunTSNE(surt, dims.use = seq_len(2), reduction = "pca",
                        dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Show the clustering results.
title <- "PBMC 2000 cells (Seurat)"
labels <- surt$seurat_clusters
df <- surt@reductions[["tsne"]]@cell.embeddings
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
  ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
filename <- "figures/figure_21_0125.png"
ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
# Find differentially expressed genes.
surt@misc$stat <- Seurat::FindAllMarkers(surt, only.pos = TRUE, min.pct = 0.25,
                                         logfc.threshold = 0.25)
View(surt@misc$stat[which(surt@misc$stat$p_val_adj < 10^(-100)), ])
# Infer cell types
# 0: T cell          # TRAC (p_val_adj ~e-164), EEF1A1 (p_val_adj ~e-152)
# 1: Monocyte        # FCN1 (p_val_adj ~0), MNDA (p_val_adj ~0)
# 2: NK/NKT          # GZMA (p_val_adj ~e-241), NKG7 (p_val_adj ~e-225)
# 3: B cell          # CD79A (p_val_adj ~e-304), MS4A1 (p_val_adj ~e-303)
# 4: Dendritic cell? # LILRA4 (p_val_adj ~e-146)
tmp <- as.integer(as.character(surt$seurat_clusters))
surt$cell_type <- tmp
surt$cell_type[surt$cell_type == 0] <- "T"
surt$cell_type[surt$cell_type == 1] <- "Mono"
surt$cell_type[surt$cell_type == 2] <- "NK/NKT"
surt$cell_type[surt$cell_type == 3] <- "B"
surt$cell_type[surt$cell_type == 4] <- "DC"
# Save data.
saveRDS(surt, file = "backup/21_003_pbmc2000_seurat.rds")
# Load data.
surt <- readRDS("backup/21_003_pbmc2000_seurat.rds")


6.2.5 Random downsampling (1500 cells)

Perform random sampling of 1500 cells.

set.seed(2)
inds <- sample(dim(pbmc)[2], 1500, prob = NULL, replace = FALSE)
subpbmc <- pbmc[, inds]

Compute a correlation matrix.

mat <- t(as.matrix(assay(subpbmc, "centered")))
cormat <- cor(mat, method = "spearman")


6.2.5.1 ASURAT

Run ASURAT protocols.

# Add formatted databases.
sce <- list(CB = subpbmc)
metadata(sce$CB) <- list(sign = human_CB[["cell"]])
# Remove biological terms including too few or too many genes.
sce$CB <- remove_signs(sce = sce$CB, min_ngenes = 2, max_ngenes = 1000)
# Create signs and sign-by-sample matrices.
set.seed(1)
sce$CB <- cluster_genesets(sce = sce$CB, cormat = cormat,
                           th_posi = 0.2, th_nega = -0.4)
sce$CB <- create_signs(sce = sce$CB, min_cnt_strg = 4, min_cnt_vari = 4)
sce$CB <- makeSignMatrix(sce = sce$CB, weight_strg = 0.5, weight_vari = 0.5)
# Perform dimensional reduction.
set.seed(1)
mat <- t(as.matrix(assay(sce$CB, "counts")))
res <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
reducedDim(sce$CB, "TSNE") <- res[["Y"]]
# Perform unsupervised clustering of cells.
surt <- Seurat::as.Seurat(sce$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(sce$CB, "counts"))
surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "SSM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(20))
surt <- Seurat::FindClusters(surt, resolution = 0.30)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(sce$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in t-SNE spaces.
labels <- colData(sce$CB)$seurat_clusters
df <- as.data.frame(reducedDim(sce$CB, "TSNE"))
p <- ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC 1500 cells (cell type)",