MacBook Pro (Big Sur, 16-inch, 2019), Processor (2.4 GHz 8-Core Intel Core i9), Memory (64 GB 2667 MHz DDR4).
Attach necessary libraries:
library(ASURAT)
library(SingleCellExperiment)
library(SummarizedExperiment)
In this vignette, we analyze single-cell RNA sequencing (scRNA-seq) data obtained from peripheral blood mononuclear cells (PBMCs) of donors with and without bacterial sepsis (Reyes et al., Nat. Med. 26, 2020).
The data can be loaded by the following code:
<- readRDS(url("https://figshare.com/ndownloader/files/36276324")) pbmc
The data are stored in DOI:10.6084/m9.figshare.19200254 and the generating process is described below.
The data were obtained from Broad Institute Single Cell Portal: SCP548. Since the file size of the raw read count table is huge (~5.61 GB), we briefly removed gene and cell data such that the numbers of non-zero expressing cells are less than 100 and the numbers of sequencing reads are less than 2000, respectively, by using perl scripts as follows:
/perl/pg_01_add_nsamples.pl
perl ../perl/pg_02_add_nreads.pl
perl ../perl/pg_03_remove_variables.pl
perl ../perl/pg_04_remove_samples.pl perl ..
Create a SingleCellExperiment object by inputting a raw read count table.
# Load a raw read count table.
<- "rawdata/2020_001_Reyes/SCP548/expression/scp_gex_matrix_red.csv"
fn <- read.csv(fn, check.names = FALSE)
pbmc <- pbmc[-1, 2]
genes <- colnames(pbmc)[-seq_len(2)]
cells <- pbmc[-1, -seq_len(2)]
pbmc rownames(pbmc) <- make.unique(genes)
colnames(pbmc) <- make.unique(cells)
# Load sample information
<- read.delim("rawdata/2020_001_Reyes/SCP548/metadata/scp_meta.txt")
info <- info[-1, ]
info rownames(info) <- info$NAME
<- info[, -1]
info <- info[cells, ]
info # Create a SingleCellExperiment object.
<- SingleCellExperiment(assays = list(counts = as.matrix(pbmc)),
pbmc rowData = data.frame(gene = rownames(pbmc)),
colData = as.data.frame(info))
dim(pbmc)
[1] 14973 95853
# Save data.
saveRDS(pbmc, file = "backup/06_001_pbmc130k_data.rds")
# Load data.
<- readRDS("backup/06_001_pbmc130k_data.rds") pbmc
Remove variables (genes) and samples (cells) with low quality, by processing the following three steps:
First of all, add metadata for both variables and samples using
ASURAT function add_metadata()
.
<- add_metadata(sce = pbmc, mitochondria_symbol = "^MT-") pbmc
ASURAT function remove_variables()
removes variable
(gene) data such that the numbers of non-zero expressing samples (cells)
are less than min_nsamples
.
<- remove_variables(sce = pbmc, min_nsamples = 100) pbmc
Qualities of sample (cell) data are confirmed based on proper
visualization of colData(sce)
.
<- "PBMC (Reyes et al., 2020)"
title <- data.frame(x = colData(pbmc)$nReads, y = colData(pbmc)$nGenes)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Number of reads", y = "Number of genes") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
ggplot2<- "figures/figure_06_0010.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5)
ggplot2
<- data.frame(x = colData(pbmc)$nReads, y = colData(pbmc)$percMT)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Number of reads", y = "Perc of MT reads") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
ggplot2<- "figures/figure_06_0011.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
ASURAT function remove_samples()
removes sample (cell)
data by setting cutoff values for the metadata.
<- remove_samples(sce = pbmc, min_nReads = 1500, max_nReads = 30000,
pbmc min_nGenes = 100, max_nGenes = 1e+10,
min_percMT = 0, max_percMT = 10)
Qualities of variable (gene) data are confirmed based on proper
visualization of rowData(sce)
.
<- "PBMC (Reyes et al., 2020)"
title <- apply(as.matrix(assay(pbmc, "counts")), 1, mean)
aveexp <- data.frame(x = seq_len(nrow(rowData(pbmc))),
df y = sort(aveexp, decreasing = TRUE))
<- ggplot2::ggplot() + ggplot2::scale_y_log10() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Rank of genes", y = "Mean read counts") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
ggplot2<- "figures/figure_06_0015.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
ASURAT function remove_variables_second()
removes
variable (gene) data such that the mean read counts across samples are
less than min_meannReads
.
<- remove_variables_second(sce = pbmc, min_meannReads = 0.05) pbmc
dim(pbmc)
[1] 5322 77161
# Save data.
saveRDS(pbmc, file = "backup/06_002_pbmc130k_dataqc.rds")
# Load data.
<- readRDS("backup/06_002_pbmc130k_dataqc.rds") pbmc
Perform bayNorm()
(Tang et al., Bioinformatics, 2020)
for attenuating technical biases with respect to zero inflation and
variation of capture efficiencies between samples (cells).
<- bayNorm::BetaFun(Data = assay(pbmc, "counts"), MeanBETA = 0.06)
BETA <- bayNorm::bayNorm(assay(pbmc, "counts"),
bay_out Conditions = colData(pbmc)$Patient,
BETA_vec = BETA[["BETA"]], Prior_type = "GG",
mode_version = TRUE)
<- bay_out[["Bay_out_list"]]
mat_list <- do.call("cbind", mat_list)
mat <- pbmc[, colnames(mat)]
pbmc assay(pbmc, "normalized") <- mat
Perform log-normalization with a pseudo count.
assay(pbmc, "logcounts") <- log(assay(pbmc, "normalized") + 1)
Center row data.
<- assay(pbmc, "logcounts")
mat assay(pbmc, "centered") <- sweep(mat, 1, apply(mat, 1, mean), FUN = "-")
Set gene expression data into altExp(sce)
.
<- "logcounts"
sname altExp(pbmc, sname) <- SummarizedExperiment(list(counts = assay(pbmc, sname)))
Add ENTREZ Gene IDs to rowData(sce)
.
<- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
dictionary key = rownames(pbmc),
columns = "ENTREZID", keytype = "SYMBOL")
<- dictionary[!duplicated(dictionary$SYMBOL), ]
dictionary rowData(pbmc)$geneID <- dictionary$ENTREZID
# Save data.
saveRDS(pbmc, file = "backup/06_003_pbmc130k_normalized.rds")
# Load data.
<- readRDS("backup/06_003_pbmc130k_normalized.rds") pbmc
Infer cell or disease types, biological functions, and signaling pathway activity at the single-cell level by inputting related databases.
ASURAT transforms centered read count tables to functional feature matrices, termed sign-by-sample matrices (SSMs). Using SSMs, perform unsupervised clustering of samples (cells).
Prepare correlation matrices of gene expressions.
<- t(as.matrix(assay(pbmc, "centered")))
mat <- cor(mat, method = "spearman") cormat
# Save data.
saveRDS(cormat, file = "backup/06_003_pbmc130k_cormat.rds")
# Load data.
<- readRDS("backup/06_003_pbmc130k_cormat.rds") cormat
Load databases.
<- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
urlpath load(url(paste0(urlpath, "20201213_human_CO.rda?raw=TRUE"))) # CO
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=TRUE"))) # MSigDB
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=TRUE"))) # CellMarker
load(url(paste0(urlpath, "20201213_human_GO_red.rda?raw=TRUE"))) # GO
load(url(paste0(urlpath, "20201213_human_KEGG.rda?raw=TRUE"))) # KEGG
The reformatted knowledge-based data were available from the following repositories:
Create a custom-built cell type-related databases by combining different databases for analyzing human single-cell transcriptome data.
<- list(human_CO[["cell"]], human_MSigDB[["cell"]], human_CellMarker[["cell"]])
d <- list(cell = do.call("rbind", d)) human_CB
Add formatted databases to metadata(sce)$sign
.
<- list(CB = pbmc, GO = pbmc, KG = pbmc)
pbmcs metadata(pbmcs$CB) <- list(sign = human_CB[["cell"]])
metadata(pbmcs$GO) <- list(sign = human_GO[["BP"]])
metadata(pbmcs$KG) <- list(sign = human_KEGG[["pathway"]])
ASURAT function remove_signs()
redefines functional gene
sets for the input database by removing genes, which are not included in
rownames(sce)
, and further removes biological terms
including too few or too many genes.
$CB <- remove_signs(sce = pbmcs$CB, min_ngenes = 2, max_ngenes = 1000)
pbmcs$GO <- remove_signs(sce = pbmcs$GO, min_ngenes = 2, max_ngenes = 1000)
pbmcs$KG <- remove_signs(sce = pbmcs$KG, min_ngenes = 2, max_ngenes = 1000) pbmcs
ASURAT function cluster_genes()
clusters functional gene
sets using a correlation graph-based decomposition method, which
produces strongly, variably, and weakly correlated gene sets (SCG, VCG,
and WCG, respectively).
set.seed(1)
$CB <- cluster_genesets(sce = pbmcs$CB, cormat = cormat,
pbmcsth_posi = 0.20, th_nega = -0.50)
set.seed(1)
$GO <- cluster_genesets(sce = pbmcs$GO, cormat = cormat,
pbmcsth_posi = 0.20, th_nega = -0.30)
set.seed(1)
$KG <- cluster_genesets(sce = pbmcs$KG, cormat = cormat,
pbmcsth_posi = 0.15, th_nega = -0.20)
ASURAT function create_signs()
creates signs by the
following criteria:
min_cnt_strg
(the
default value is 2) andmin_cnt_vari
(the
default value is 2),which are independently applied to SCGs and VCGs, respectively.
$CB <- create_signs(sce = pbmcs$CB, min_cnt_strg = 3, min_cnt_vari = 3)
pbmcs$GO <- create_signs(sce = pbmcs$GO, min_cnt_strg = 3, min_cnt_vari = 3)
pbmcs$KG <- create_signs(sce = pbmcs$KG, min_cnt_strg = 4, min_cnt_vari = 4) pbmcs
If signs have semantic similarity information, one can use ASURAT
function remove_signs_redundant()
for removing redundant
sings using the semantic similarity matrices.
<- human_GO$similarity_matrix$BP
simmat $GO <- remove_signs_redundant(sce = pbmcs$GO, similarity_matrix = simmat,
pbmcsthreshold = 0.85, keep_rareID = TRUE)
ASURAT function remove_signs_manually()
removes signs by
specifying IDs (e.g., GOID:XXX
) or descriptions (e.g.,
metabolic
) using grepl()
.
<- "Covid|COVID"
keywords $KG <- remove_signs_manually(sce = pbmcs$KG, keywords = keywords) pbmcs
ASURAT function create_sce_signmatrix()
creates a new
SingleCellExperiment object new_sce
, consisting of the
following information:
assayNames(new_sce)
: counts (SSM whose entries are
termed sign scores),names(colData(new_sce))
: nReads, nGenes, percMT,names(rowData(new_sce))
: ParentSignID, Description,
CorrGene, etc.,names(metadata(new_sce))
: sign_SCG, sign_VCG,
etc.,altExpNames(new_sce)
: something if there is data in
altExp(sce)
.$CB <- makeSignMatrix(sce = pbmcs$CB, weight_strg = 0.5, weight_vari = 0.5)
pbmcs$GO <- makeSignMatrix(sce = pbmcs$GO, weight_strg = 0.5, weight_vari = 0.5)
pbmcs$KG <- makeSignMatrix(sce = pbmcs$KG, weight_strg = 0.5, weight_vari = 0.5) pbmcs
Perform t-distributed stochastic neighbor embedding.
for(i in seq_along(pbmcs)){
set.seed(1)
<- t(as.matrix(assay(pbmcs[[i]], "counts")))
mat <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
res reducedDim(pbmcs[[i]], "TSNE") <- res[["Y"]]
}
Show the results of dimensional reduction in low-dimensional spaces.
<- c("Reyes 2020 (cell type)", "Reyes 2020 (function)",
titles "Reyes 2020 (pathway)")
for(i in seq_along(titles)){
<- as.data.frame(reducedDim(pbmcs[[i]], "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
ggplot2color = "black", size = 0.1, alpha = 1) +
::labs(title = titles[i], x = "tSNE_1", y = "tSNE_2") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18))
ggplot2<- sprintf("figures/figure_06_%04d.png", 19 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 4.1, height = 4.3)
ggplot2 }
# Save data.
saveRDS(pbmcs, file = "backup/06_004_pbmcs130k_ssm.rds")
# Load data.
<- readRDS("backup/06_004_pbmcs130k_ssm.rds") pbmcs
To date (December, 2021), one of the most useful clustering methods in scRNA-seq data analysis is a combination of a community detection algorithm and graph-based unsupervised clustering, developed in Seurat package.
Here, our strategy is as follows:
rowData()
and colData()
must have data),ScaleData()
, RunPCA()
,
FindNeighbors()
, and FindClusters()
,temp
,colData(temp)$seurat_clusters
into
colData(sce)$seurat_clusters
.<- c(0.15, 0.15, 0.15)
resolutions <- list(seq_len(40), seq_len(40), seq_len(40))
dims for(i in seq_along(pbmcs)){
<- Seurat::as.Seurat(pbmcs[[i]], counts = "counts", data = "counts")
surt <- as.matrix(assay(pbmcs[[i]], "counts"))
mat "SSM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "SSM"
Seurat<- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = dims[[i]])
surt <- Seurat::FindClusters(surt, resolution = resolutions[i])
surt <- Seurat::as.SingleCellExperiment(surt)
temp colData(pbmcs[[i]])$seurat_clusters <- colData(temp)$seurat_clusters
}
Show the clustering results in low-dimensional spaces.
<- c("Reyes 2020 (cell type)", "Reyes 2020 (function)",
titles "Reyes 2020 (pathway)")
for(i in seq_along(titles)){
<- colData(pbmcs[[i]])$seurat_clusters
labels <- as.data.frame(reducedDim(pbmcs[[i]], "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.2, alpha = 1) +
::labs(title = titles[i], x = "tSNE_1", y = "tSNE_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2if(i == 1){
<- p + ggplot2::scale_colour_hue()
p else if(i == 2){
}<- p + ggplot2::scale_colour_brewer(palette = "Dark2")
p else if(i == 3){
}<- p + ggplot2::scale_colour_brewer(palette = "Paired")
p
}<- sprintf("figures/figure_06_%04d.png", 29 + i)
filename if(i == 1){
::ggsave(file = filename, plot = p, dpi = 50, width = 5.2, height = 4.3)
ggplot2else{
}::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
ggplot2
} }
Significant signs are analogous to differentially expressed genes but bear biological meanings. Note that naïve usages of statistical tests should be avoided because the row vectors of SSMs are centered.
Instead, ASURAT function compute_sepI_all()
computes
separation indices for each cluster against the others. Briefly, a
separation index “sepI”, ranging from -1 to 1, is a nonparametric
measure of significance of a given sign score for a given subpopulation.
The larger (resp. smaller) sepI is, the more reliable the sign is as a
positive (resp. negative) marker for the cluster.
for(i in seq_along(pbmcs)){
set.seed(1)
<- colData(pbmcs[[i]])$seurat_clusters
labels <- compute_sepI_all(sce = pbmcs[[i]], labels = labels,
pbmcs[[i]] nrand_samples = 20000)
}
Perform compute_sepI_all()
for investigating significant
signs for the clustering results of cell types.
<- colData(pbmcs$CB)$seurat_clusters
lbs
<- pbmcs$GO
pbmcs_LabelCB_SignGO metadata(pbmcs_LabelCB_SignGO)$marker_signs <- NULL
set.seed(1)
<- compute_sepI_all(sce = pbmcs_LabelCB_SignGO,
pbmcs_LabelCB_SignGO labels = lbs, nrand_samples = 20000)
<- pbmcs$KG
pbmcs_LabelCB_SignKG metadata(pbmcs_LabelCB_SignKG)$marker_signs <- NULL
set.seed(1)
<- compute_sepI_all(sce = pbmcs_LabelCB_SignKG,
pbmcs_LabelCB_SignKG labels = lbs, nrand_samples = 20000)
for(i in c(0, 2, 3)){
set.seed(1)
<- compute_sepI_clusters(sce = pbmcs_LabelCB_SignGO,
pbmcs_LabelCB_SignGO labels = lbs, nrand_samples = 20000,
ident_1 = i,
ident_2 = setdiff(c(0, 2, 3), i))
}
for(i in c(0, 2, 3)){
set.seed(1)
<- compute_sepI_clusters(sce = pbmcs_LabelCB_SignKG,
pbmcs_LabelCB_SignKG labels = lbs, nrand_samples = 20000,
ident_1 = i,
ident_2 = setdiff(c(0, 2, 3), i))
}
To date (December, 2021), one of the most useful methods of multiple
statistical tests in scRNA-seq data analysis is to use a Seurat function
FindAllMarkers()
.
If there is gene expression data in altExp(sce)
, one can
investigate differentially expressed genes by using Seurat functions in
the similar manner as described before.
set.seed(1)
<- Seurat::as.Seurat(pbmcs$CB, counts = "counts", data = "counts")
surt <- as.matrix(assay(altExp(pbmcs$CB), "counts"))
mat "GEM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "GEM"
Seurat<- Seurat::SetIdent(surt, value = "seurat_clusters")
surt <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
res min.pct = 0.25, logfc.threshold = 0.25)
metadata(pbmcs$CB)$marker_genes$all <- res
# Save data.
saveRDS(pbmcs, file = "backup/06_005_pbmcs130k_desdeg.rds")
saveRDS(pbmcs_LabelCB_SignGO, file = "backup/06_005_pbmcs130k_LabelCB_SignGO.rds")
saveRDS(pbmcs_LabelCB_SignKG, file = "backup/06_005_pbmcs130k_LabelCB_SignKG.rds")
# Load data.
<- readRDS("backup/06_005_pbmcs130k_desdeg.rds")
pbmcs <- readRDS("backup/06_005_pbmcs130k_LabelCB_SignGO.rds")
pbmcs_LabelCB_SignGO <- readRDS("backup/06_005_pbmcs130k_LabelCB_SignKG.rds") pbmcs_LabelCB_SignKG
Simultaneously analyze multiple sign-by-sample matrices, which helps us characterize individual samples (cells) from multiple biological aspects.
ASURAT function plot_multiheatmaps()
shows heatmaps
(ComplexHeatmap object) of sign scores and gene expression levels (if
there are), where rows and columns stand for sign (or gene) and sample
(cell), respectively.
First, remove unrelated signs by setting keywords, followed by selecting top significant signs and genes for the clustering results with respect to separation index and p-value, respectively.
# Significant signs
<- list()
marker_signs <- "foofoo|hogehoge"
keys for(i in seq_along(pbmcs)){
if(i == 1){
<- metadata(pbmcs[[i]])$marker_signs$all
marker_signs[[i]] else if(i == 2){
}<- metadata(pbmcs_LabelCB_SignGO)$marker_signs$all
marker_signs[[i]] else if(i == 3){
}<- metadata(pbmcs_LabelCB_SignKG)$marker_signs$all
marker_signs[[i]]
}<- marker_signs[[i]][!grepl(keys, marker_signs[[i]]$Description), ]
marker_signs[[i]] <- dplyr::group_by(marker_signs[[i]], Ident_1)
marker_signs[[i]] <- dplyr::slice_max(marker_signs[[i]], sepI, n = 1)
marker_signs[[i]] <- dplyr::slice_min(marker_signs[[i]], Rank, n = 1)
marker_signs[[i]]
}# Significant genes
<- metadata(pbmcs$CB)$marker_genes$all
marker_genes_CB <- dplyr::group_by(marker_genes_CB, cluster)
marker_genes_CB <- dplyr::slice_min(marker_genes_CB, p_val_adj, n = 1)
marker_genes_CB <- dplyr::slice_max(marker_genes_CB, avg_log2FC, n = 1) marker_genes_CB
Then, prepare arguments.
# ssm_list
<- list() ; ssm_list <- list()
sces_sub for(i in seq_along(pbmcs)){
<- pbmcs[[i]][rownames(pbmcs[[i]]) %in% marker_signs[[i]]$SignID, ]
sces_sub[[i]] <- assay(sces_sub[[i]], "counts")
ssm_list[[i]]
}names(ssm_list) <- c("SSM_celltype", "SSM_function", "SSM_pathway")
# gem_list
<- altExp(pbmcs$CB, "logcounts")
expr_sub <- expr_sub[rownames(expr_sub) %in% marker_genes_CB$gene]
expr_sub <- list(x = t(scale(t(as.matrix(assay(expr_sub, "counts"))))))
gem_list names(gem_list) <- "Scaled\nLogExpr"
# ssmlabel_list
<- list() ; ssmlabel_list <- list()
labels for(i in seq_along(pbmcs)){
<- colData(sces_sub[[i]])$seurat_clusters
tmp <- data.frame(label = tmp)
labels[[i]] <- length(unique(tmp))
n_groups if(i == 1){
$color <- scales::hue_pal()(n_groups)[tmp]
labels[[i]]else if(i == 2){
}$color <- scales::brewer_pal(palette = "Dark2")(n_groups)[tmp]
labels[[i]]else if(i == 3){
}$color <- scales::brewer_pal(palette = "Paired")(n_groups)[tmp]
labels[[i]]
}<- labels[[i]]
ssmlabel_list[[i]]
}names(ssmlabel_list) <- c("Label_celltype", "Label_function", "Label_pathway")
Tips: If one would like to omit some color labels (e.g., labels[[3]]), set the argument as follows:
2]] <- data.frame(label = NA, color = NA)
ssmlabel_list[[3]] <- data.frame(label = NA, color = NA) ssmlabel_list[[
Finally, plot heatmaps for the selected signs and genes.
<- "figures/figure_06_0040.png"
filename #png(file = filename, height = 2200, width = 1800, res = 300)
png(file = filename, height = 450, width = 350, res = 60)
set.seed(1)
<- "PBMC (Reyes et al., 2020)"
title plot_multiheatmaps(ssm_list = ssm_list, gem_list = gem_list,
ssmlabel_list = ssmlabel_list, gemlabel_list = NULL,
nrand_samples = 1000, show_row_names = TRUE, title = title)
dev.off()
Show violin plots for sign score and gene expression distributions across cell type-related clusters.
<- colData(pbmcs$CB)$seurat_clusters
labels <- list(c("GE", "CD14", "(Monocyte)"),
vlist c("GE", "HLA-DPB1", "(Dendritic cell)"),
c("GE", "CD68", "(Phagocytosis)"),
c("GE", "CD4", "(T cell)"),
c("GE", "CD8B", "(T cell)"),
c("GE", "TRAC", "(T Cell Receptor Alpha)"),
c("GE", "LILRA4", "(Plasmacytoid dendritic cells)"),
c("CB", "CellMarkerID:6-S", "Activated T cell (CD69, KLRG1, ...)"),
c("CB", "CellMarkerID:184-S", "Natural killer cell (NKG7, GZMB, ...)"),
c("CB", "CellMarkerID:99-S", "Follicular B cell (MS4A1, CD27, ...)"),
c("CB", "CellMarkerID:53-S", "...dendritic cell (HLA-DPB1, SNX3, ...)"),
c("GO", "GO:0032757-S", "...interleukin-8... (CD14, FCN1, ...)"),
c("GO", "GO:0032647-S", "...interferon-alpha... (IRF7, PTPRS, ...)"),
c("KG", "path:hsa04660-S", "T cell receptor... (CD8A, CD3E, ...)"),
c("KG", "path:hsa04664-V", "Fc epsilon RI... (FCER1A, ALOX5AP, ...)"))
<- list(GE = "Expression", CB = "Sign score", GO = "Sign score",
ylabels KG = "Sign score")
for(i in seq_along(vlist)){
if(vlist[[i]][1] == "GE"){
<- which(rownames(altExp(pbmcs$CB, "logcounts")) == vlist[[i]][2])
ind <- altExp(pbmcs$CB, "logcounts")[ind, ]
subsce <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df else{
}<- which(rownames(pbmcs[[vlist[[i]][1]]]) == vlist[[i]][2])
ind <- pbmcs[[vlist[[i]][1]]][ind, ]
subsce <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df
}<- ggplot2::ggplot() +
p ::geom_violin(ggplot2::aes(x = as.factor(labels), y = df[, 1],
ggplot2fill = labels), trim = FALSE, size = 0.5) +
::geom_boxplot(ggplot2::aes(x = as.factor(labels), y = df[, 1]),
ggplot2width = 0.15, alpha = 0.6) +
::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
ggplot2x = "Cluster (cell type)", y = ylabels[[vlist[[i]][1]]]) +
::theme_classic(base_size = 25) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
ggplot2legend.position = "none") +
::scale_fill_hue()
ggplot2<- sprintf("figures/figure_06_%04d.png", 49 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 6, height = 3.7)
ggplot2 }
Show violin plots for the sign score distributions across given cell type-related clusters.
<- colData(pbmcs$CB)$seurat_clusters
labels <- list(c("KG", "path:hsa00071-S", "Fatty acid degradation (ACAT1, ...)"),
vlist c("KG", "path:hsa00010-S", "Glycolysis / Gluconeogenesis (ENO1, ...)"),
c("KG", "path:hsa04514-V", "Cell adhesion molecules (ITGA4, ...)"),
c("GO", "GO:0010498-S", "Proteasomal protein catabolic... (PSMB2, ...)"),
c("GO", "GO:0044262-S", "Cellular carbohydrate metabolic... (PGD, ...)"))
<- scales::hue_pal()(11)
mycolor <- c("0" = mycolor[1], "2" = mycolor[3], "3" = mycolor[4])
mycolor for(i in seq_along(vlist)){
<- which(rownames(pbmcs[[vlist[[i]][1]]]) == vlist[[i]][2])
inds_sign <- which(colData(pbmcs$CB)$seurat_clusters %in% c(0, 2, 3))
inds_cell <- pbmcs[[vlist[[i]][1]]][inds_sign, inds_cell]
subsce <- labels[inds_cell]
sublabels <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df <- ggplot2::ggplot() +
p ::geom_violin(ggplot2::aes(x = as.factor(sublabels), y = df[, 1],
ggplot2fill = sublabels), trim = FALSE, size = 0.5) +
::geom_boxplot(ggplot2::aes(x = as.factor(sublabels), y = df[, 1]),
ggplot2width = 0.15, alpha = 0.6) +
::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
ggplot2x = "Cluster (cell type)", y = "Sign score") +
::theme_classic(base_size = 25) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
ggplot2legend.position = "none") +
::scale_fill_manual(values = mycolor)
ggplot2<- sprintf("figures/figure_06_%04d.png", 69 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 3.7)
ggplot2 }
Show violin plots for the gene expression distributions across given cell type-related clusters.
<- colData(pbmcs$CB)$seurat_clusters
labels <- list(c("GE", "LILRA4", "(Plasmacytoid dendritic cells)"))
vlist <- scales::hue_pal()(11)
mycolor <- c("6" = mycolor[7], "7" = mycolor[8], "9" = mycolor[10], "10" = mycolor[11])
mycolor for(i in seq_along(vlist)){
<- which(rownames(altExp(pbmcs$CB, "logcounts")) == vlist[[i]][2])
inds_gene <- which(colData(pbmcs$CB)$seurat_clusters %in% c(6, 7, 9, 10))
inds_cell <- altExp(pbmcs$CB, "logcounts")[inds_gene, inds_cell]
subsce <- labels[inds_cell]
sublabels <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df <- ggplot2::ggplot() +
p ::geom_violin(ggplot2::aes(x = as.factor(sublabels), y = df[, 1],
ggplot2fill = sublabels), trim = FALSE, size = 0.5) +
::geom_boxplot(ggplot2::aes(x = as.factor(sublabels), y = df[, 1]),
ggplot2width = 0.15, alpha = 0.6) +
::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
ggplot2x = "Cluster (cell type)", y = "Expression") +
::theme_classic(base_size = 25) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
ggplot2legend.position = "none") +
::scale_fill_manual(values = mycolor)
ggplot2<- sprintf("figures/figure_06_%04d.png", 74 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 4)
ggplot2 }
Classify cells into large cell type categories.
colData(pbmcs$CB)$cell_type <- as.character(colData(pbmcs$CB)$seurat_clusters)
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 0] <- "Mono"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 1] <- "T"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 2] <- "Mono"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 3] <- "Mono" # or T
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 4] <- "NK/NKT"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 5] <- "B"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 6] <- "DC"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 7] <- "DC"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 8] <- "T"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 9] <- "DC"
colData(pbmcs$CB)$cell_type[colData(pbmcs$CB)$cell_type == 10] <- "DC"
Classify cells into the detailed categories.
colData(pbmcs$CB)$cell_state <- as.character(colData(pbmcs$CB)$seurat_clusters)
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 0] <- "Mono-1"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 1] <- "T-1"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 2] <- "Mono-2"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 3] <- "Mono-3" # or T
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 4] <- "NK/NKT"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 5] <- "B"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 6] <- "DC-1"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 7] <- "DC-2"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 8] <- "T-2"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 9] <- "DC-3"
colData(pbmcs$CB)$cell_state[colData(pbmcs$CB)$cell_state == 10] <- "DC-4"
Show the annotation results in low-dimensional spaces.
<- "Reyes 2020 (cell state)"
title <- factor(colData(pbmcs$CB)$cell_state,
labels levels = c("Mono-1", "T-1", "Mono-2", "Mono-3", "NK/NKT",
"B", "DC-1", "DC-2", "T-2", "DC-3", "DC-4"))
<- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "Cell state") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_fill_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0080.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.8, height = 4.3) ggplot2
Check the reported sample information (Reyes et al., 2020) in low-dimensional spaces.
<- c("Sort", "Patient", "Cell_Type")
types <- "Reyes 2020 (cell type)"
title for(i in seq_along(types)){
<- colData(pbmcs$CB)[[types[i]]]
labels <- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = types[i]) +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_fill_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- sprintf("figures/figure_06_%04d.png", 80 + i)
filename if(types[i] == "Sort"){
::ggsave(file = filename, plot = p, dpi = 50, width = 5.6, height = 4.3)
ggplot2else if(types[i] == "Patient"){
}::ggsave(file = filename, plot = p, dpi = 50, width = 9.3, height = 4.3)
ggplot2else if(types[i] == "Cell_Type"){
}::ggsave(file = filename, plot = p, dpi = 50, width = 5.8, height = 4.3)
ggplot2
} }
Load the sample information reported in previous paper (Reyes et al., 2020) and show the reported t-SNE plots.
# Load t-SNE coordinates.
<- read.delim("rawdata/2020_001_Reyes/SCP548/cluster/scp_tsne.txt",
data header = TRUE, skip = 1, row.names = 1)
<- rownames(data)
cells # Load sample information.
<- read.delim("rawdata/2020_001_Reyes/SCP548/metadata/scp_meta.txt")
info <- info[-1, ]
info rownames(info) <- info$NAME
<- info[, -1]
info <- info[cells, ]
info # Show t-SNE plot.
<- "PBMC (Reyes 2020)"
title <- info$Cell_Type
labels <- as.data.frame(data)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "Cell_Type") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_fill_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0084.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.8, height = 4.3) ggplot2
# Save data.
saveRDS(pbmcs, file = "backup/06_006_pbmcs130k_annotation.rds")
# Load data.
<- readRDS("backup/06_006_pbmcs130k_annotation.rds") pbmcs
For each cell states, investigate population ratios across subject types
<- c("Cohort", "cell_state", "seurat_clusters", "Patient")
x <- as.data.frame(colData(pbmcs$CB))[, which(colnames(colData(pbmcs$CB)) %in% x)]
df <- df[!grepl("DC", df$cell_state),]
df <- c("Control", "Leuk-UTI", "Int-URO", "URO", "Bac-SEP", "ICU-SEP", "ICU-NoSEP")
lvs <- sort(factor(unique(df$Cohort), levels = lvs))
cohorts <- c()
res for(i in seq_along(cohorts)){
<- df[which(df$Cohort == cohorts[i]), ]
df_cohorts <- dplyr::group_by(df_cohorts, Patient, cell_state, seurat_clusters,
df_cohorts
Cohort)<- dplyr::summarise(df_cohorts, n = dplyr::n())
df_cohorts <- unique(df_cohorts$Patient)
patients <- c()
res_cohorts for(j in seq_along(patients)){
<- df_cohorts[which(df_cohorts$Patient == patients[j]), ]
tmp $ratio <- tmp$n / sum(tmp$n)
tmp<- rbind(res_cohorts, tmp)
res_cohorts
}<- rbind(res, res_cohorts)
res
}
<- unique(res[, which(colnames(res) %in% c("cell_state", "seurat_clusters"))])
cells <- cells[order(cells$seurat_clusters), ]$cell_state
cells for(i in seq_along(cells)){
<- paste0("PBMC: ", cells[i])
title <- res[which(res$cell_state == cells[i]), ]
df_cells $Cohort <- factor(df_cells$Cohort, levels = lvs)
df_cells<- df_cells[order(df_cells$Cohort), ]
df_cells <- dplyr::group_by(df_cells, Cohort)
df_cells <- ggplot2::ggplot(df_cells) +
p ::geom_boxplot(ggplot2::aes(x = Cohort, y = ratio),
ggplot2fill = "grey90", lwd = 1) +
::labs(title = title, x = "", y = "Population ratio (w/o DC)") +
ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(angle = 45)) +
ggplot2::theme_classic(base_size = 25) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 25),
ggplot2legend.position = "none")
<- sprintf("figures/figure_06_%04d.png", 89 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 7, height = 6.5)
ggplot2 }
Focusing on monocytes, investigate population ratios across subject types.
<- c("Cohort", "cell_type", "Patient")
x <- as.data.frame(colData(pbmcs$CB))[, which(colnames(colData(pbmcs$CB)) %in% x)]
df <- df[!grepl("DC", df$cell_type),]
df <- c("Control", "Leuk-UTI", "Int-URO", "URO", "Bac-SEP", "ICU-SEP", "ICU-NoSEP")
lvs <- sort(factor(unique(df$Cohort), levels = lvs))
cohorts <- c()
res for(i in seq_along(cohorts)){
<- df[which(df$Cohort == cohorts[i]), ]
df_cohorts <- dplyr::group_by(df_cohorts, Patient, cell_type, Cohort)
df_cohorts <- dplyr::summarise(df_cohorts, n = dplyr::n())
df_cohorts <- unique(df_cohorts$Patient)
patients <- c()
res_cohorts for(j in seq_along(patients)){
<- df_cohorts[which(df_cohorts$Patient == patients[j]), ]
tmp $ratio <- tmp$n / sum(tmp$n)
tmp<- rbind(res_cohorts, tmp)
res_cohorts
}<- rbind(res, res_cohorts)
res
}<- res[which(res$cell_type == "Mono"), ]
res
<- "PBMC: Monocyte"
title $Cohort <- factor(res$Cohort, levels = lvs)
res<- res[order(res$Cohort), ]
res <- dplyr::group_by(res, Cohort)
res <- ggplot2::ggplot(res) +
p ::geom_boxplot(ggplot2::aes(x = Cohort, y = ratio),
ggplot2fill = "grey90", lwd = 1) +
::labs(title = title, x = "", y = "Population ratio (w/o DC)") +
ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(angle = 45)) +
ggplot2::theme_classic(base_size = 25) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 25),
ggplot2legend.position = "none")
<- "figures/figure_06_0100.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 7, height = 6.5) ggplot2
Simultaneously show population ratios of monocyte subpopulation across subject types.
<- c("Cohort", "cell_state", "Patient")
x <- as.data.frame(colData(pbmcs$CB))[, which(colnames(colData(pbmcs$CB)) %in% x)]
df <- df[!grepl("DC", df$cell_state),]
df <- c("Control", "Leuk-UTI", "Int-URO", "URO", "Bac-SEP", "ICU-SEP", "ICU-NoSEP")
lvs <- sort(factor(unique(df$Cohort), levels = lvs))
cohorts <- c()
res2 for(i in seq_along(cohorts)){
<- df[which(df$Cohort == cohorts[i]), ]
df_cohorts <- dplyr::group_by(df_cohorts, Patient, cell_state, Cohort)
df_cohorts <- dplyr::summarise(df_cohorts, n = dplyr::n())
df_cohorts <- unique(df_cohorts$Patient)
patients <- c()
res_cohorts for(j in seq_along(patients)){
<- df_cohorts[which(df_cohorts$Patient == patients[j]), ]
tmp $ratio <- tmp$n / sum(tmp$n)
tmp<- rbind(res_cohorts, tmp)
res_cohorts
}<- rbind(res2, res_cohorts)
res2
}<- res2[which(res2$cell_state == "Mono-2"), ]
res2 colnames(res)[2] <- "cell_state"
$cell_state[which(res$cell_state == "Mono")] <- "Mono-all"
res<- rbind(res, res2)
res
# res[which(res$Cohort == "Control"), ]$Cohort <- "Control (19)"
# res[which(res$Cohort == "Leuk-UTI"), ]$Cohort <- "Leuk-UTI (10)"
# res[which(res$Cohort == "Int-URO"), ]$Cohort <- "Int-URO (7)"
# res[which(res$Cohort == "URO"), ]$Cohort <- "URO (10)"
# res[which(res$Cohort == "Bac-SEP"), ]$Cohort <- "Bac-SEP (4)"
# res[which(res$Cohort == "ICU-SEP"), ]$Cohort <- "ICU-SEP (8)"
# res[which(res$Cohort == "ICU-NoSEP"), ]$Cohort <- "ICU-NoSEP (7)"
# lvs = unique(res$Cohort)
# res$Cohort <- factor(res$Cohort, levels = lvs)
<- "PBMC: Monocyte"
title $Cohort <- factor(res$Cohort, levels = lvs)
res$cell_state <- factor(res$cell_state, levels = c("Mono-all", "Mono-2"))
res<- res[order(res$Cohort), ]
res <- dplyr::group_by(res, Cohort)
res <- ggplot2::ggplot(res, ggplot2::aes(x = Cohort, y = ratio, fill = cell_state)) +
p ::geom_boxplot(alpha = 1, lwd = 1.5) +
ggplot2::labs(title = title, x = "", y = "Population ratio (w/o D1-D4)",
ggplot2fill = "Cell state") +
::scale_x_discrete(guide = ggplot2::guide_axis(angle = 45)) +
ggplot2::theme_classic(base_size = 25) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 25),
ggplot2legend.position = "right") +
::scale_fill_manual(values = c("grey80", scales::hue_pal()(11)[3]))
ggplot2<- "figures/figure_06_0101.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 10, height = 6.5) ggplot2
Load the normalized data (see here).
<- readRDS("backup/06_003_pbmc130k_normalized.rds") pbmc
Prepare a SingleCellExperiment object.
<- as.matrix(assay(pbmc, "counts"))
mat <- SingleCellExperiment(assays = list(counts = as.matrix(mat)),
pbmc rowData = data.frame(gene = rownames(mat)),
colData = data.frame(sample = colnames(mat)))
According to the scran protocol, perform cell clustering for computing size factors for individual clusters, normalize data within individual clusters, and perform a variance modeling for each gene.
# Quick cell clustering
<- scran::quickCluster(pbmc)
clusters # scran normalization
<- scran::computeSumFactors(pbmc, clusters = clusters)
pbmc <- scater::logNormCounts(pbmc)
pbmc # Perform a variance modeling
metadata(pbmc)$dec <- scran::modelGeneVar(pbmc)
Show the results of variance modeling.
<- data.frame(x = metadata(pbmc)$dec$mean, y = metadata(pbmc)$dec$total,
df z = metadata(pbmc)$dec$tech)
<- "PBMC (Reyes et al., 2020)"
title <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[,1], y = df[,2]), color = "black",
ggplot2alpha = 1, size = 2) +
::geom_line(ggplot2::aes(x = df[,1], y = df[,3]), color = "red",
ggplot2alpha = 1, size = 2) +
::labs(title = title, x = "Mean log-expression", y = "Variance") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
ggplot2<- "figures/figure_06_0110.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 4.0, height = 4.0) ggplot2
According to the scran protocol, choose top variable genes using
getTopHVGs()
, performe denoisPCA()
by
inputting the variable genes, and perform k-nearest neighbor graph-based
clustering, where k is a resolution parameter.
# Choose top variable genes.
<- scran::getTopHVGs(metadata(pbmc)$dec, n = round(0.9 * dim(pbmc)[1]))
hvg # Principal component analysis
<- scran::denoisePCA(pbmc, metadata(pbmc)$dec, subset.row = hvg)
pbmc # Cell clustering
set.seed(1)
<- scran::buildSNNGraph(pbmc, use.dimred = "PCA", k = 950)
g <- igraph::cluster_louvain(g)$membership
c colData(pbmc)$scran_clusters <- as.factor(c)
Perform t-distributed stochastic neighbor embedding.
set.seed(1)
<- reducedDim(pbmc, "PCA")
mat <- Rtsne::Rtsne(mat, dim = 2, pca = FALSE)
res reducedDim(pbmc, "TSNE") <- res[["Y"]]
Show the clustering results.
<- "Reyes 2020 (scran)"
title <- colData(pbmc)$scran_clusters
labels <- as.data.frame(reducedDim(pbmc, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_fill_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0130.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3) ggplot2
Performs pairwiseTTests()
and
combineMarkers()
for finding cluster markers.
metadata(pbmc)$pwtt <- scran::pairwiseTTests(
x = as.matrix(assay(pbmc, "logcounts")),
groups = colData(pbmc)$scran_clusters, direction = "up")
metadata(pbmc)$cmb <- scran::combineMarkers(
de.lists = metadata(pbmc)$pwtt$statistics,
pairs = metadata(pbmc)$pwtt$pairs, pval.type = "all")
# Create a result table.
<- list()
res for(i in seq_along(metadata(pbmc)$cmb@listData)){
<- metadata(pbmc)$cmb@listData[[i]]@rownames
g <- metadata(pbmc)$cmb@listData[[i]]@listData[["p.value"]]
p <- metadata(pbmc)$cmb@listData[[i]]@listData[["FDR"]]
fd <- metadata(pbmc)$cmb@listData[[i]]@listData[["summary.logFC"]]
fc <- data.frame(label = i, gene = g, pval = p, FDR = fd, logFC = fc)
res[[i]]
}<- c()
tmp for(i in seq_along(res)){
<- rbind(tmp, res[[i]])
tmp
}metadata(pbmc)$stat <- tmp
View(metadata(pbmc)$stat[metadata(pbmc)$stat$FDR < 10^(-100),])
Defining significant genes as genes with FDR<1e-100, infer cell types using GeneCards.
1: T cell # TRAC (FDR ~0), TRAT1 (FDR ~e-288)
2: Monocyte??? # No significant genes are detected.
# (IER2 (FDR ~e-77), FCN1 (FDR ~e-72))
3: Monocyte # FCER1G (FDR ~0), AIF1 (FDR ~0)
4: Monocyte # S100A9 (FDR ~0), S100A12 (FDR ~0), S100A8 (FDR ~0)
5: NK/NKT # CD160 (FDR ~0), FCRL6 (FDR ~0), GNLY (FDR ~0)
6: B cell # BANK1 (FDR ~0), MS4A1 (FDR ~0), CD79B (FDR ~0), CD79A (FDR ~0)
7: B cell # BCL11A (FDR ~0), MZB1 (FDR ~0)
8: Dendritic cell # HLA-DRA (FDR ~0), HLA-DRB1 (FDR ~0), HLA-DQA1 (FDR ~0)
colData(pbmc)$cell_type <- as.integer(as.character(colData(pbmc)$scran_clusters))
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 1] <- "T"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 2] <- "Unspecified"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 3] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 4] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 5] <- "NK/NKT"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 6] <- "B"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 7] <- "B"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 8] <- "DC"
colData(pbmc)$cell_state <- as.integer(as.character(colData(pbmc)$scran_clusters))
colData(pbmc)$cell_state[colData(pbmc)$cell_state == 1] <- "T"
colData(pbmc)$cell_state[colData(pbmc)$cell_state == 2] <- "Unspecified"
colData(pbmc)$cell_state[colData(pbmc)$cell_state == 3] <- "Mono-1"
colData(pbmc)$cell_state[colData(pbmc)$cell_state == 4] <- "Mono-2"
colData(pbmc)$cell_state[colData(pbmc)$cell_state == 5] <- "NK/NKT"
colData(pbmc)$cell_state[colData(pbmc)$cell_state == 6] <- "B-1"
colData(pbmc)$cell_state[colData(pbmc)$cell_state == 7] <- "B-2"
colData(pbmc)$cell_state[colData(pbmc)$cell_state == 8] <- "DC"
Show the annotation results in low-dimensional spaces.
<- "Reyes 2020 (scran)"
title <- c("T", "Mono", "NK/NKT", "B", "DC", "Unspecified")
lvs <- factor(colData(pbmc)$cell_type, levels = lvs)
labels <- as.data.frame(reducedDim(pbmc, "TSNE"))
df <- scales::hue_pal()(5)
mycolor <- c("T" = mycolor[1], "Mono" = mycolor[2], "NK/NKT" = mycolor[3],
mycolor "B" = mycolor[4], "DC" = mycolor[5], "Unspecified" = "grey80")
<- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "Cell state") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4))) +
ggplot2::scale_color_manual(values = mycolor)
ggplot2<- "figures/figure_06_0140.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 6.1, height = 4.3) ggplot2
# Save data.
saveRDS(pbmc, file = "backup/06_011_pbmc130k_scran.rds")
# Load data.
<- readRDS("backup/06_011_pbmc130k_scran.rds") pbmc
Load the normalized data (see here) and keep the sample information.
<- readRDS("backup/06_003_pbmc130k_normalized.rds")
pbmc <- as.data.frame(colData(pbmc)) info
Create Seurat objects.
<- Seurat::CreateSeuratObject(counts = as.matrix(assay(pbmc, "counts")),
pbmc project = "PBMC", meta.data = info[, seq_len(5)])
According to the Seurat protocol, normalize data, perform variance stabilizing transform by setting the number of variable feature, scale data, and reduce dimension using principal component analysis.
# Normalization
<- Seurat::NormalizeData(pbmc, normalization.method = "LogNormalize")
pbmc # Variance stabilizing transform
<- round(0.5 * dim(pbmc)[1])
n <- Seurat::FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = n)
pbmc # Scale data
<- Seurat::ScaleData(pbmc)
pbmc # Principal component analysis
<- Seurat::RunPCA(pbmc, features = Seurat::VariableFeatures(pbmc)) pbmc
Compute the cumulative sum of variances, which is used for determining the number of the principal components (PCs).
<- which(cumsum(pbmc@reductions[["pca"]]@stdev) /
pc sum(pbmc@reductions[["pca"]]@stdev) > 0.3)[1]
Perform cell clustering.
# Create k-nearest neighbor graph.
<- Seurat::FindNeighbors(pbmc, reduction = "pca", dim = seq_len(pc))
pbmc # Cluster cells.
<- Seurat::FindClusters(pbmc, resolution = 0.1)
pbmc # Run t-SNE.
<- Seurat::RunTSNE(pbmc, dims.use = seq_len(2), reduction = "pca",
pbmc dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
Show the clustering results.
<- "Reyes 2020 (Seurat)"
title <- pbmc$seurat_clusters
labels <- pbmc@reductions[["tsne"]]@cell.embeddings
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_fill_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0230.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3) ggplot2
Check the reported sample information (Reyes et al., 2020) in low-dimensional spaces.
<- "Reyes 2020 (Seurat)"
title <- pbmc$Sort
labels <- pbmc@reductions[["tsne"]]@cell.embeddings
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "Sort") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_fill_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0235.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.5, height = 4.3) ggplot2
Find differentially expressed genes.
@misc$stat <- Seurat::FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25,
pbmclogfc.threshold = 0.25)
View(pbmc@misc$stat[which(pbmc@misc$stat$p_val_adj < 10^(-100)), ])
Defining significant genes as genes with FDR<1e-100, infer cell types using GeneCards.
0: Monocyte # S100A8 (p_val_adj ~0), S100A9 (p_val_adj ~0)
1: NK/NKT # GNLY (p_val_adj ~0), NKG7 (p_val_adj ~0)
2: T cell # TRAC (p_val_adj ~0), CD3D (p_val_adj ~0)
3: Monocyte # CD68 (p_val_adj ~0), LYN (p_val_adj ~0)
4: B cell # CD79A (p_val_adj ~0), CD79B (p_val_adj ~0)
5: Dendritic cell # LILRA4 (p_val_adj ~0), PTPRS (p_val_adj ~0),
# HLA-DMA (p_val_adj ~0)
6: Dendritic cell # HLA-DQA1 (p_val_adj ~0), HLA-DQB1 (p_val_adj ~0),
<- as.integer(as.character(pbmc$seurat_clusters))
tmp $cell_type <- tmp
pbmc$cell_type[pbmc$cell_type == 0] <- "Mono"
pbmc$cell_type[pbmc$cell_type == 1] <- "NK/NKT"
pbmc$cell_type[pbmc$cell_type == 2] <- "T"
pbmc$cell_type[pbmc$cell_type == 3] <- "Mono"
pbmc$cell_type[pbmc$cell_type == 4] <- "B"
pbmc$cell_type[pbmc$cell_type == 5] <- "DC"
pbmc$cell_type[pbmc$cell_type == 6] <- "DC"
pbmc
<- as.integer(as.character(pbmc$seurat_clusters))
tmp $cell_state <- tmp
pbmc$cell_state[pbmc$cell_state == 0] <- "Mono-1"
pbmc$cell_state[pbmc$cell_state == 1] <- "NK/NKT"
pbmc$cell_state[pbmc$cell_state == 2] <- "T"
pbmc$cell_state[pbmc$cell_state == 3] <- "Mono-2"
pbmc$cell_state[pbmc$cell_state == 4] <- "B"
pbmc$cell_state[pbmc$cell_state == 5] <- "DC-1"
pbmc$cell_state[pbmc$cell_state == 6] <- "DC-2" pbmc
Show the annotation results in low-dimensional spaces.
<- "Reyes 2020 (Seurat)"
title <- c("Mono", "NK/NKT", "T", "B", "DC")
lvs <- factor(pbmc$cell_type, levels = lvs)
labels <- pbmc@reductions[["tsne"]]@cell.embeddings
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "Cell type") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_fill_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0240.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.8, height = 4.3)
ggplot2
<- c("Mono-1", "NK/NKT", "T", "Mono-2", "B", "DC-1", "DC-2")
lvs <- factor(pbmc$cell_state, levels = lvs)
labels <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "Cell state") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_fill_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0245.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.8, height = 4.3) ggplot2
Show normalized gene expression levels.
<- "Reyes 2020 (Seurat)"
title <- "CD79A"
gene <- as.data.frame(pbmc@reductions[["tsne"]]@cell.embeddings)
df $expr <- as.matrix(Seurat::GetAssayData(object = pbmc, slot = "data"))[gene, ]
df<- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = df[, 3]),
ggplot2size = 0.1, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = gene) +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_gradient(low = "grey90", high = "red")
ggplot2<- "figures/figure_06_0250.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.5, height = 4.3) ggplot2
Focusing on monocytes, investigate population ratios across cohorts.
<- c("Patient", "Cohort", "cell_type")
x <- pbmc[[]][, which(colnames(pbmc[[]]) %in% x)]
df <- df[!grepl("DC", df$cell_type),]
df <- c("Control", "Leuk-UTI", "Int-URO", "URO", "Bac-SEP", "ICU-SEP", "ICU-NoSEP")
lvs <- sort(factor(unique(df$Cohort), levels = lvs))
cohorts <- c()
res for(i in seq_along(cohorts)){
<- df[which(df$Cohort == cohorts[i]), ]
df_cohorts <- dplyr::group_by(df_cohorts, Patient, cell_type, Cohort)
df_cohorts <- dplyr::summarise(df_cohorts, n = dplyr::n())
df_cohorts <- unique(df_cohorts$Patient)
patients <- c()
res_cohorts for(j in seq_along(patients)){
<- df_cohorts[which(df_cohorts$Patient == patients[j]), ]
tmp $ratio <- tmp$n / sum(tmp$n)
tmp<- rbind(res_cohorts, tmp)
res_cohorts
}<- rbind(res, res_cohorts)
res
}<- res[which(res$cell_type == "Mono"), ]
res
<- "PBMC: Monocyte"
title $Cohort <- factor(res$Cohort, levels = lvs)
res<- res[order(res$Cohort), ]
res <- dplyr::group_by(res, Cohort)
res <- ggplot2::ggplot(res) +
p ::geom_boxplot(ggplot2::aes(x = Cohort, y = ratio),
ggplot2fill = "grey90", lwd = 1) +
::labs(title = title, x = "", y = "Population ratio (w/o DC)") +
ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(angle = 45)) +
ggplot2::theme_classic(base_size = 25) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 25),
ggplot2legend.position = "none")
<- "figures/figure_06_0260.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 7, height = 6.5) ggplot2
# Save data.
saveRDS(pbmc, file = "backup/06_021_pbmc130k_seurat.rds")
# Load data.
<- readRDS("backup/06_021_pbmc130k_seurat.rds") pbmc
Check the package version.
packageVersion("scCATCH")
[1] ‘3.0’
Load Seurat computational results.
<- readRDS("backup/06_021_pbmc130k_seurat.rds") pbmc
Create scCATCH objects.
<- as.matrix(Seurat::GetAssayData(object = pbmc, slot = "data"))
mat <- scCATCH::rev_gene(data = mat, data_type = "data", species = "Human",
mat geneinfo = scCATCH::geneinfo)
<- as.character(pbmc$seurat_clusters)
labels <- scCATCH::createscCATCH(data = mat, cluster = labels) scc
Perform findmarkergenes()
and
findcelltype()
.
<- scCATCH::findmarkergene(object = scc, species = "Human",
scc marker = scCATCH::cellmatch,
tissue = "Peripheral blood", cancer = "Normal",
cell_min_pct = 0.25, logfc = 0.25, pvalue = 0.05)
<- scCATCH::findcelltype(object = scc)
scc View(scc@celltype)
Annotate cells.
<- pbmc[[]]
tmp which(tmp$seurat_clusters == 0), ]$cell_type <- "Mono"
tmp[which(tmp$seurat_clusters == 1), ]$cell_type <- "NK/NKT"
tmp[which(tmp$seurat_clusters == 2), ]$cell_type <- "Mono"
tmp[which(tmp$seurat_clusters == 3), ]$cell_type <- "Mono"
tmp[which(tmp$seurat_clusters == 4), ]$cell_type <- "B"
tmp[which(tmp$seurat_clusters == 5), ]$cell_type <- "T"
tmp[which(tmp$seurat_clusters == 6), ]$cell_type <- "DC"
tmp[<- Seurat::AddMetaData(pbmc, tmp) pbmc
Show the annotation results in low-dimensional spaces.
<- "Reyes 2020 (Seurat + scCatch)"
title <- factor(pbmc[[]]$cell_type, levels = c("Mono", "NK/NKT", "B", "T", "DC"))
labels <- pbmc@reductions[["tsne"]]@cell.embeddings
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.2, alpha = 1) +
::labs(title = title, x = "tSNE_1", y = "tSNE_2", color = "Cell type") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_colour_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0270.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.8, height = 4.3) ggplot2
# Save data.
saveRDS(pbmc, file = "backup/06_022_pbmc130k_seurat_sccatch.rds")
# Load data.
<- readRDS("backup/06_022_pbmc130k_seurat_sccatch.rds") pbmc
Load the Seurat annotation results.
<- readRDS("backup/06_021_pbmc130k_seurat.rds") pbmc
Check the package version.
packageVersion("escape")
[1] ‘1.0.1’
Perform getGeneSets()
with an argument
library = "C8"
(“cell type signature gene sets” in MSigDB).
@misc[["getGeneSets"]] <- escape::getGeneSets(species = "Homo sapiens",
pbmclibrary = "C8")
Perform enrichIt()
, estimating ssGSEA scores, in which
the arguments are the same with those in the vignettes in escape
package.
<- escape::enrichIt(obj = pbmc, gene.sets = pbmc@misc[["getGeneSets"]],
ES groups = 1000, cores = 4)
<- Seurat::AddMetaData(pbmc, ES)
pbmc @misc[["enrichIt"]] <- ES pbmc
[1] "Using sets of 1000 cells. Running 78 times."
Setting parallel calculations through a SnowParam back-end
with workers=4 and tasks=100.
Estimating ssGSEA scores for 650 gene sets.
Setting parallel calculations through a SnowParam back-end
with workers=4 and tasks=100.
Estimating ssGSEA scores for 650 gene sets.
...
Setting parallel calculations through a SnowParam back-end
with workers=4 and tasks=100.
Estimating ssGSEA scores for 650 gene sets.
Perform t-distributed stochastic neighbor embedding.
set.seed(1)
<- pbmc@misc[["enrichIt"]]
mat <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
res @reductions[["tsne_ssgsea"]] <- res[["Y"]] pbmc
Perform unsupervised clustering of cells.
<- Seurat::CreateSeuratObject(counts = t(pbmc@misc[["enrichIt"]]),
surt project = "PBMC")
<- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(40))
surt <- Seurat::FindClusters(surt, resolution = 0.15)
surt <- Seurat::AddMetaData(pbmc, metadata = surt[[]]$seurat_clusters,
pbmc col.name = "ssgsea_clusters")
Show the clustering results in low-dimensional spaces.
<- pbmc$ssgsea_clusters
labels <- as.data.frame(pbmc@reductions[["tsne_ssgsea"]])
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.2, alpha = 1) +
::labs(title = "Reyes 2020 (ssGSEA)", x = "tSNE_1", y = "tSNE_2",
ggplot2color = "") +
::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4),
ggplot2ncol = 1))
<- "figures/figure_06_0280.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3) ggplot2
# Save data.
saveRDS(pbmc, file = "backup/06_023_pbmc130k_ssGSEA_msigdb.rds")
# Load data.
<- readRDS("backup/06_023_pbmc130k_ssGSEA_msigdb.rds") pbmc
Load ssGSEA results.
<- readRDS("backup/06_023_pbmc130k_ssGSEA_msigdb.rds") pbmc
Investigate “significant” modules for Seurat clustering results.
set.seed(1)
<- Seurat::CreateSeuratObject(counts = t(pbmc@misc[["enrichIt"]]))
surt <- Seurat::AddMetaData(surt, metadata = pbmc$cell_type,
surt col.name = "seurat_cell_type")
<- Seurat::SetIdent(surt, value = "seurat_cell_type")
surt <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
res min.pct = 0.15, logfc.threshold = 0.15)
rownames(res) <- seq_len(nrow(res))
@misc[["ssGSEA_stat"]] <- res pbmc
Show violin plots for ssGSEA score distributions for Seurat annotation results.
<- c("T", "Mono", "NK/NKT", "B", "Megakaryocyte")
lvs <- factor(pbmc$cell_type, levels = lvs)
labels <- c(gsub("-", "_", "DURANTE-ADULT-OLFACTORY-NEUROEPITHELIUM-MACROPHAGES"),
descriptions gsub("-", "_", "TRAVAGLINI-LUNG-NATURAL-KILLER-T-CELL"),
gsub("-", "_", "DURANTE-ADULT-OLFACTORY-NEUROEPITHELIUM-B-CELLS"),
gsub("-", "_", "TRAVAGLINI-LUNG-PLATELET-MEGAKARYOCYTE-CELL"))
<- c(paste0("MSigDB: ssGSEA score for\n", gsub("_", "-", descriptions[1])),
titles paste0("MSigDB: ssGSEA score for\n", gsub("_", "-", descriptions[2])),
paste0("MSigDB: ssGSEA score for\n", gsub("_", "-", descriptions[3])),
paste0("MSigDB: ssGSEA score for\n", gsub("_", "-", descriptions[4])))
<- pbmc@misc[["enrichIt"]]
mat <- mat[, which(colnames(mat) %in% descriptions)]
mat for(i in seq_along(descriptions)){
<- data.frame(label = labels,
df ssGSEA = mat[, which(colnames(mat) == descriptions[i])])
# df <- tidyr::pivot_longer(df, cols = c("Sign", "ssGSEA"))
<- ggplot2::ggplot() +
p ::geom_violin(ggplot2::aes(x = as.factor(df$label), y = df$ssGSEA),
ggplot2fill = "grey80", trim = FALSE, size = 1) +
::labs(title = titles[i], x = "Seurat annotations",
ggplot2y = "ssGSEA scores") +
::scale_x_discrete(guide = ggplot2::guide_axis(angle = 45)) +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 15))
ggplot2<- sprintf("figures/figure_06_%04d.png", 289 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 6, height = 5)
ggplot2 }
# Save data.
saveRDS(pbmc, file = "backup/06_024_pbmc130k_seurat_ssGSEA.rds")
# Load data.
<- readRDS("backup/06_024_pbmc130k_seurat_ssGSEA.rds") pbmc
Load the normalized data (see here) and correlation matrix.
<- readRDS("backup/06_003_pbmc130k_normalized.rds")
pbmc <- readRDS("backup/06_003_pbmc130k_cormat.rds") cormat
Load the MSigDB.
<- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
urlpath load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=TRUE"))) # MSigDB
Perform ASURAT protocols.
# Create a sign-by-sample matrix.
<- list(CB = pbmc)
pbmcs metadata(pbmcs$CB) <- list(sign = human_MSigDB$cell)
$CB <- remove_signs(sce = pbmcs$CB, min_ngenes = 2, max_ngenes = 1000)
pbmcsset.seed(1)
$CB <- cluster_genesets(sce = pbmcs$CB, cormat = cormat,
pbmcsth_posi = 0.28, th_nega = -0.32)
$CB <- create_signs(sce = pbmcs$CB, min_cnt_strg = 3, min_cnt_vari = 3)
pbmcs$CB <- makeSignMatrix(sce = pbmcs$CB, weight_strg = 0.5, weight_vari = 0.5)
pbmcs# Perform dimension reduction.
set.seed(1)
<- t(as.matrix(assay(pbmcs$CB, "counts")))
mat <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 50)
res reducedDim(pbmcs$CB, "TSNE") <- res[["Y"]]
# Perform a cell clustering.
<- Seurat::as.Seurat(pbmcs$CB, counts = "counts", data = "counts")
surt <- as.matrix(assay(pbmcs$CB, "counts"))
mat "SSM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "SSM"
Seurat<- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = seq_len(40))
surt <- Seurat::FindClusters(surt, resolution = 0.15)
surt <- Seurat::as.SingleCellExperiment(surt)
temp colData(pbmcs$CB)$seurat_clusters <- colData(temp)$seurat_clusters
# Show the clustering results in low-dimensional spaces.
<- colData(pbmcs$CB)$seurat_clusters
labels <- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.2, alpha = 1) +
::labs(title = "Reyes 2020 (ASURAT using MSigDB)",
ggplot2x = "tSNE_1", y = "tSNE_2", color = "") +
::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0299.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
ggplot2# Compute separation indices for each cluster against the others.
set.seed(1)
<- colData(pbmcs$CB)$seurat_clusters
labels $CB <- compute_sepI_all(sce = pbmcs$CB, labels = labels,
pbmcsnrand_samples = 20000)
# Compute differentially expressed genes.
set.seed(1)
<- Seurat::as.Seurat(pbmcs$CB, counts = "counts", data = "counts")
surt <- as.matrix(assay(altExp(pbmcs$CB), "counts"))
mat "GEM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "GEM"
Seurat<- Seurat::SetIdent(surt, value = "seurat_clusters")
surt <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
res min.pct = 0.25, logfc.threshold = 0.25)
metadata(pbmcs$CB)$marker_genes$all <- res
# Save data.
saveRDS(pbmcs, file = "backup/06_025_pbmc130k_asurat_msigdb.rds")
# Load data.
<- readRDS("backup/06_025_pbmc130k_asurat_msigdb.rds") pbmcs
Load the normalized data (see here).
<- readRDS("backup/06_003_pbmc130k_normalized.rds") pbmc
Create Monocle 3 objects (cell_data_set (CDS)).
<- data.frame(gene_short_name = rowData(pbmc)$gene)
gene_metadata rownames(gene_metadata) <- rowData(pbmc)$gene
<- monocle3::new_cell_data_set(
pbmc expression_data = as.matrix(assay(pbmc, "counts")),
cell_metadata = colData(pbmc),
gene_metadata = gene_metadata)
Preprocess the data under the default settings of Monocle 3 (version
1.0.0), e.g., num_dim = 50
and
norm_method = "log"
.
<- monocle3::preprocess_cds(pbmc) pbmc
Perform log-normalization with a pseudo count.
assay(pbmc, "logcounts") <- monocle3::normalized_counts(pbmc,
norm_method = "log",
pseudocount = 1)
Perform dimensionality reduction.
#pbmc <- monocle3::reduce_dimension(pbmc, reduction_method = "tSNE",
# preprocess_method = "PCA")
<- monocle3::reduce_dimension(pbmc, reduction_method = "UMAP",
pbmc preprocess_method = "PCA")
Group cells into several clusters by using a community detection algorithm.
#pbmc <- monocle3::cluster_cells(pbmc, resolution = 5e-4, reduction_method = "tSNE")
<- monocle3::cluster_cells(pbmc, resolution = 1e-6, reduction_method = "UMAP") pbmc
Show the clustering results.
<- "PBMC 130k (Monocle 3)"
title <- pbmc@clusters@listData[["UMAP"]][["clusters"]]
labels <- reducedDim(pbmc, "UMAP")
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.2, alpha = 1) +
::labs(title = title, x = "UMAP_1", y = "UMAP_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_colour_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0330.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.2, height = 4.3) ggplot2
Find differentially expressed genes.
set.seed(1)
metadata(pbmc)$markers <- monocle3::top_markers(pbmc, group_cells_by = "cluster",
reference_cells = 1000, cores = 2)
<- metadata(pbmc)$markers
markers <- markers[order(markers$cell_group, markers$marker_test_q_value), ]
markers View(markers[which(markers$marker_test_q_value < 10^(-100)), ])
Defining significant genes as genes with marker_teset_q_value<1e-100, infer cell types using GeneCards.
1: Monocyte # FCN1 (marker_teset_q_value ~e-239),
# LYZ (marker_teset_q_value ~e-235)
2: T cell # TRAC (marker_teset_q_value ~e-273),
# CD3D (marker_teset_q_value ~e-264)
3: Monocyte # FCN1 (marker_teset_q_value ~0),
# S100A9 (marker_teset_q_value ~e-313)
4: B cell # MS4A1 (marker_teset_q_value ~0),
# CD79B (marker_teset_q_value ~0)
5: Dendritic cell # HLA-DRA (marker_teset_q_value ~0),
# HLA-DRB1 (marker_teset_q_value ~0)
6: NK/NKT cell # GZMB (marker_teset_q_value ~0),
# TCF4 (marker_teset_q_value ~e-298)
# UGCG (marker_teset_q_value ~e-274)
# CLEC4C (marker_teset_q_value ~e-245)
7: Monocyte # TYROBP (marker_teset_q_value ~0)
# FCN1 (marker_teset_q_value ~e-172)
# LYZ (marker_teset_q_value ~e-169)
8: NK/NKT cell # GZMB (marker_teset_q_value ~e-292)
# PTPRS (marker_teset_q_value ~e-113)
9: Monocyte # S100A9 (marker_teset_q_value ~e-289)
# S100A8 (marker_teset_q_value ~e-268)
10: NK/NKT cell # EEF1A1 (marker_teset_q_value ~e-214)
# GZMB (marker_teset_q_value ~e-101)
11: Dendritic cell # HLA-DRB1 (marker_teset_q_value ~e-138)
# HLA-DQB2 (marker_teset_q_value ~e-136)
# HLA-DRA (marker_teset_q_value ~e-125)
12: Monocyte # S100A8 (marker_teset_q_value ~e-168)
# S100A9 (marker_teset_q_value ~e-131)
13: Unspecified # No significant genes are detected.
14: Unspecified # No significant genes are detected.
<- as.integer(as.character(pbmc@clusters@listData[["UMAP"]][["clusters"]]))
tmp colData(pbmc)$cell_type <- tmp
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 1] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 2] <- "T"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 3] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 4] <- "B"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 5] <- "DC"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 6] <- "NK/NKT"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 7] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 8] <- "NK/NKT"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 9] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 10] <- "NK/NKT"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 11] <- "DC"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 12] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 13] <- "Unspecified"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 14] <- "Unspecified"
Show the annotation results in low-dimensional spaces.
<- "PBMC 130k (Monocle 3)"
title <- factor(colData(pbmc)$cell_type,
labels levels = c("Mono", "T", "B", "DC", "NK/NKT", "Unspecified"))
<- reducedDim(pbmc, "UMAP")
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 0.2, alpha = 1) +
::labs(title = title, x = "UMAP_1", y = "UMAP_2", color = "Cell type") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_colour_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_06_0340.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 6.1, height = 4.3) ggplot2
# Save data.
saveRDS(pbmc, file = "backup/06_031_pbmc130k_monocle3.rds")
# Load data.
<- readRDS("backup/06_031_pbmc130k_monocle3.rds") pbmc
Load the normalized data (see here).
<- readRDS("backup/06_003_pbmc130k_normalized.rds") pbmc
Prepare a SingleCellExperiment object.
<- as.matrix(assay(pbmc, "counts"))
mat <- SingleCellExperiment(assays = list(counts = as.matrix(mat),
pbmc logcounts = log2(mat + 1)),
rowData = data.frame(gene = rownames(mat)),
colData = data.frame(sample = colnames(mat)))
rowData(pbmc)$feature_symbol <- rownames(pbmc)
Run sc3()
, in which parameter ks
is
subjectively determined by considering biological backgrounds.
set.seed(1)
<- SC3::sc3(pbmc, ks = 5:15, biology = TRUE) pbmc
Plot stability indices across ks
for investigating
“optimal” number of clusters.
<- as.integer(names(metadata(pbmc)$sc3$consensus))
kss for(i in seq_along(kss)){
<- SC3::sc3_plot_cluster_stability(pbmc, k = kss[i])
p <- p + ggplot2::labs(title = paste0("Reyes 2020 (SC3) k = ", kss[i])) +
p ::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18))
ggplot2<- sprintf("figures/figure_06_%04d.png", 409 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 4)
ggplot2 }
Setting the optimal number of clusters, find differentially expressed genes.
<- rowData(pbmc)
row_data <- as.data.frame(row_data[ , grep("gene|sc3_7", colnames(row_data))])
markers <- markers[order(markers$sc3_7_markers_clusts, markers$sc3_7_de_padj), ]
markers View(markers[which(markers$sc3_7_de_padj < 10^(-100)), ])
Defining significant genes as genes with sc3_7_de_padj<1e-80, infer cell types using GeneCards.
1: Monocyte # S100A9 (sc3_7_de_padj ~0)
# S100A8 (sc3_7_de_padj ~0)
2: Dendritic cell # HLA-DRA (sc3_7_de_padj ~0)
# HLA-DQA1 (sc3_7_de_padj ~0)
# HLA-DQB1 (sc3_7_de_padj ~0)
3: NK/NKT cell # CD2 (sc3_7_de_padj ~0)
# GZMA (sc3_7_de_padj ~0)
# NKG7 (sc3_7_de_padj ~0)
4: B cell # CD79B (sc3_7_de_padj ~e-292)
# IGHM (sc3_7_de_padj ~e-256)
5: T cell # CD3E (sc3_7_de_padj ~0)
# CD3D (sc3_7_de_padj ~0)
# TRAC (sc3_7_de_padj ~0)
6: Monocyte # FCN1 (sc3_7_de_padj ~0)
# MS4A7 (sc3_7_de_padj ~0)
# CD68 (sc3_7_de_padj ~e-315)
7: Dendritic cell # SNX3 (sc3_7_de_padj ~e-176)
<- colData(pbmc)
col_data <- col_data[ , grep("sc3_7_clusters", colnames(col_data))]
clusters colData(pbmc)$cell_type <- as.integer(as.character(clusters))
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 1] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 2] <- "DC"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 3] <- "NK/NKT"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 4] <- "B"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 5] <- "T"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 6] <- "Mono"
colData(pbmc)$cell_type[colData(pbmc)$cell_type == 7] <- "DC"
colData(pbmc)$cell_type[is.na(colData(pbmc)$cell_type)] <- "Unspecified"
# Save data.
saveRDS(pbmc, file = "backup/06_041_pbmc130k_sc3.rds")
# Load data.
<- readRDS("backup/06_041_pbmc130k_sc3.rds") pbmc