MacBook Pro (Big Sur, 16-inch, 2019), Processor (2.4 GHz 8-Core Intel Core i9), Memory (64 GB 2667 MHz DDR4).
Attach necessary libraries:
library(ASURAT)
library(SingleCellExperiment)
library(SummarizedExperiment)
In this vignette, we analyze single-cell RNA sequencing (scRNA-seq) data obtained from small cell lung cancer (SCLC) patients with cisplatin treatment (Stewart et al., Nat. Cancer 1, 2020).
The data can be loaded by the following code:
<- readRDS(url("https://figshare.com/ndownloader/files/34112474")) sclc
The data are stored in DOI:10.6084/m9.figshare.19200254 and the generating process is described below.
The data were obtained from NCBI repository with accession number
GSE138474: GSM4104164.
The following functions read_matrix_10xdata()
and
read_gene_10xdata()
process the scRNA-seq data into a raw
count matrix and gene dataframe. Here, make.unique()
is
applied for naming gene symbols, which appends a sequential number with
a period delimiter for every repeat name encountered.
<- function(path_dir){
read_matrix_10xdata <- paste0(path_dir, "barcodes.tsv.gz")
barcode.path <- paste0(path_dir, "features.tsv.gz")
feature.path <- paste0(path_dir, "matrix.mtx.gz")
matrix.path <- as.matrix(Matrix::readMM(file = matrix.path))
mat <- read.delim(feature.path, header = FALSE, stringsAsFactors = FALSE)
genes <- read.delim(barcode.path, header = FALSE, stringsAsFactors = FALSE)
barcodes rownames(mat) <- make.unique(as.character(genes$V2))
colnames(mat) <- make.unique(barcodes$V1)
return(mat)
}<- function(path_dir){
read_gene_10xdata <- paste0(path_dir, "features.tsv.gz")
feature.path <- read.delim(feature.path, header = FALSE, stringsAsFactors = FALSE)
genes return(genes)
}
Create a SingleCellExperiment object by inputting a raw read count table.
<- "rawdata/2020_001_stewart/sc68_cisp/SRR10211593_count/"
path_dir <- paste0(path_dir, "filtered_feature_bc_matrix/")
path_dir <- read_matrix_10xdata(path_dir = path_dir)
sclc <- SingleCellExperiment(assays = list(counts = sclc),
sclc rowData = data.frame(gene = rownames(sclc)),
colData = data.frame(cell = colnames(sclc)))
dim(sclc)
[1] 33538 3433
Remove variables (genes) and samples (cells) with low quality, by processing the following three steps:
First of all, add metadata for both variables and samples using
ASURAT function add_metadata()
.
<- add_metadata(sce = sclc, mitochondria_symbol = "^MT-") sclc
Examine the expression levels of known SCLC marker genes (Ireland, et al., 2020), namely ASCL1, NEUROD1, YAP1, and POU2F3.
<- c("ASCL1", "NEUROD1", "YAP1", "POU2F3")
genes <- sclc[, which(colData(sclc)$nReads > 2000)]
sce set.seed(1)
<- sample(ncol(sce), size = 1000, replace = FALSE)
inds <- sce[genes, inds]
subsce <- log(as.matrix(assay(subsce, "counts")) + 1)
mat set.seed(1)
<- "figures/figure_01_0005.png"
filename png(file = filename, height = 200, width = 520, res = 100)
#png(file = filename, height = 580, width = 1600, res = 300)
<- ComplexHeatmap::Heatmap(mat, column_title = "SCLC",
p name = "Log1p\nexpression", cluster_rows = FALSE,
show_row_names = TRUE, row_names_side = "right",
show_row_dend = FALSE, show_column_names = FALSE,
column_dend_side = "top", show_parent_dend_line = FALSE)
pdev.off()
# mtx <- t(colData(subsce)$nReads) ; rownames(mtx) <- "nReads"
# q <- ComplexHeatmap::Heatmap(mtx, name = "nReads", show_row_names = TRUE,
# row_names_side = "right", show_row_dend = FALSE,
# show_column_names = FALSE, show_column_dend = FALSE,
# col = circlize::colorRamp2(c(min(mtx), max(mtx)),
# c("cyan", "magenta")))
# p <- p %v% q
# p
# dev.off()
ASURAT function remove_variables()
removes variable
(gene) data such that the numbers of non-zero expressing samples (cells)
are less than min_nsamples
.
<- remove_variables(sce = sclc, min_nsamples = 10) sclc
Qualities of sample (cell) data are confirmed based on proper
visualization of colData(sce)
.
<- "SCLC"
title <- data.frame(x = colData(sclc)$nReads, y = colData(sclc)$nGenes)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Number of reads", y = "Number of genes") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
ggplot2<- "figures/figure_01_0010.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5)
ggplot2
<- data.frame(x = colData(sclc)$nReads, y = colData(sclc)$percMT)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Number of reads", y = "Perc of MT reads") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
ggplot2<- "figures/figure_01_0011.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
ASURAT function remove_samples()
removes sample (cell)
data by setting cutoff values for the metadata.
<- remove_samples(sce = sclc, min_nReads = 1400, max_nReads = 40000,
sclc min_nGenes = 1150, max_nGenes = 1e+10,
min_percMT = 0, max_percMT = 15)
Qualities of variable (gene) data are confirmed based on proper
visualization of rowData(sce)
.
<- "SCLC"
title <- apply(as.matrix(assay(sclc, "counts")), 1, mean)
aveexp <- data.frame(x = seq_len(nrow(rowData(sclc))),
df y = sort(aveexp, decreasing = TRUE))
<- ggplot2::ggplot() + ggplot2::scale_y_log10() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Rank of genes", y = "Mean read counts") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
ggplot2<- "figures/figure_01_0015.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
ASURAT function remove_variables_second()
removes
variable (gene) data such that the mean read counts across samples are
less than min_meannReads
.
<- remove_variables_second(sce = sclc, min_meannReads = 0.20) sclc
dim(sclc)
[1] 6346 2283
Perform bayNorm()
(Tang et al., Bioinformatics, 2020)
for attenuating technical biases with respect to zero inflation and
variation of capture efficiencies between samples (cells).
<- as.matrix(assay(sclc, "counts"))
mat <- bayNorm::BetaFun(Data = mat, MeanBETA = 0.06)
BETA <- bayNorm::bayNorm(mat, BETA_vec = BETA[["BETA"]], mode_version = TRUE)
bayout assay(sclc, "normalized") <- bayout$Bay_out
Perform log-normalization with a pseudo count.
assay(sclc, "logcounts") <- log(assay(sclc, "normalized") + 1)
Center row data.
<- assay(sclc, "logcounts")
mat assay(sclc, "centered") <- sweep(mat, 1, apply(mat, 1, mean), FUN = "-")
Set gene expression data into altExp(sce)
.
<- "logcounts"
sname altExp(sclc, sname) <- SummarizedExperiment(list(counts = assay(sclc, sname)))
Add ENTREZ Gene IDs to rowData(sce)
.
<- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
dictionary key = rownames(sclc),
columns = "ENTREZID", keytype = "SYMBOL")
<- dictionary[!duplicated(dictionary$SYMBOL), ]
dictionary rowData(sclc)$geneID <- dictionary$ENTREZID
Infer cell or disease types, biological functions, and signaling pathway activity at the single-cell level by inputting related databases.
ASURAT transforms centered read count tables to functional feature matrices, termed sign-by-sample matrices (SSMs). Using SSMs, perform unsupervised clustering of samples (cells).
Prepare correlation matrices of gene expressions.
<- t(as.matrix(assay(sclc, "centered")))
mat <- cor(mat, method = "spearman") cormat
Load databases.
<- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
urlpath load(url(paste0(urlpath, "20201213_human_DO.rda?raw=TRUE"))) # DO
load(url(paste0(urlpath, "20201213_human_GO_red.rda?raw=TRUE"))) # GO
load(url(paste0(urlpath, "20201213_human_KEGG.rda?raw=TRUE"))) # KEGG
The reformatted knowledge-based data were available from the following repositories:
Add formatted databases to metadata(sce)$sign
.
<- list(DO = sclc, GO = sclc, KG = sclc)
sclcs metadata(sclcs$DO) <- list(sign = human_DO[["disease"]])
metadata(sclcs$GO) <- list(sign = human_GO[["BP"]])
metadata(sclcs$KG) <- list(sign = human_KEGG[["pathway"]])
ASURAT function remove_signs()
redefines functional gene
sets for the input database by removing genes, which are not included in
rownames(sce)
, and further removes biological terms
including too few or too many genes.
$DO <- remove_signs(sce = sclcs$DO, min_ngenes = 2, max_ngenes = 1000)
sclcs$GO <- remove_signs(sce = sclcs$GO, min_ngenes = 2, max_ngenes = 1000)
sclcs$KG <- remove_signs(sce = sclcs$KG, min_ngenes = 2, max_ngenes = 1000) sclcs
ASURAT function cluster_genes()
clusters functional gene
sets using a correlation graph-based decomposition method, producing
strongly, variably, and weakly correlated gene sets (SCG, VCG, and WCG,
respectively).
set.seed(1)
$DO <- cluster_genesets(sce = sclcs$DO, cormat = cormat,
sclcsth_posi = 0.28, th_nega = -0.22)
set.seed(1)
$GO <- cluster_genesets(sce = sclcs$GO, cormat = cormat,
sclcsth_posi = 0.20, th_nega = -0.20)
set.seed(1)
$KG <- cluster_genesets(sce = sclcs$KG, cormat = cormat,
sclcsth_posi = 0.17, th_nega = -0.16)
ASURAT function create_signs()
creates signs by the
following criteria:
min_cnt_strg
(the
default value is 2) andmin_cnt_vari
(the
default value is 2),which are independently applied to SCGs and VCGs, respectively.
$DO <- create_signs(sce = sclcs$DO, min_cnt_strg = 2, min_cnt_vari = 2)
sclcs$GO <- create_signs(sce = sclcs$GO, min_cnt_strg = 3, min_cnt_vari = 3)
sclcs$KG <- create_signs(sce = sclcs$KG, min_cnt_strg = 3, min_cnt_vari = 3) sclcs
If signs have semantic similarity information, one can use ASURAT
function remove_signs_redundant()
for removing redundant
sings using the semantic similarity matrices.
<- human_DO$similarity_matrix$disease
simmat $DO <- remove_signs_redundant(sce = sclcs$DO, similarity_matrix = simmat,
sclcsthreshold = 0.82, keep_rareID = TRUE)
<- human_GO$similarity_matrix$BP
simmat $GO <- remove_signs_redundant(sce = sclcs$GO, similarity_matrix = simmat,
sclcsthreshold = 0.80, keep_rareID = TRUE)
ASURAT function remove_signs_manually()
removes signs by
specifying IDs (e.g., GOID:XXX
) or descriptions (e.g.,
metabolic
) using grepl()
.
<- "Covid|COVID"
keywords $KG <- remove_signs_manually(sce = sclcs$KG, keywords = keywords) sclcs
ASURAT function create_sce_signmatrix()
creates a new
SingleCellExperiment object new_sce
, consisting of the
following information:
assayNames(new_sce)
: counts (SSM whose entries are
termed sign scores),names(colData(new_sce))
: nReads, nGenes, percMT,names(rowData(new_sce))
: ParentSignID, Description,
CorrGene, etc.,names(metadata(new_sce))
: sign_SCG, sign_VCG,
etc.,altExpNames(new_sce)
: something if there is data in
altExp(sce)
.$DO <- makeSignMatrix(sce = sclcs$DO, weight_strg = 0.5, weight_vari = 0.5)
sclcs$GO <- makeSignMatrix(sce = sclcs$GO, weight_strg = 0.5, weight_vari = 0.5)
sclcs$KG <- makeSignMatrix(sce = sclcs$KG, weight_strg = 0.5, weight_vari = 0.5) sclcs
Perform diffusion map for the SSM for disease.
set.seed(1)
<- destiny::DiffusionMap(t(assay(sclcs$DO, "counts")))
res reducedDim(sclcs$DO, "DMAP") <- res@eigenvectors
Perform t-distributed stochastic neighbor embedding for the SSMs for biological process and signaling pathway.
<- c("GO", "KG")
dbs for(i in seq_along(dbs)){
set.seed(1)
<- t(as.matrix(assay(sclcs[[dbs[i]]], "counts")))
mat <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 100)
res reducedDim(sclcs[[dbs[i]]], "TSNE") <- res[["Y"]]
}
Show the results of dimensional reduction in low-dimensional spaces.
Use ASURAT function plot_dataframe3D()
for plotting
three-dimensional data. See ?plot_dataframe3D
for
details.
# DO
<- as.data.frame(reducedDim(sclcs$DO, "DMAP"))[, seq_len(3)]
df <- "figures/figure_01_0020.png"
filename png(file = filename, height = 250, width = 250, res = 50)
plot_dataframe3D(dataframe3D = df, theta = -45, phi = 220, title = "SCLC (disease)",
xlabel = "DC_1", ylabel = "DC_2", zlabel = "DC_3")
dev.off()
# GO
<- as.data.frame(reducedDim(sclcs$GO, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
ggplot2color = "black", size = 1, alpha = 1) +
::labs(title = "SCLC (function)", x = "tSNE_1", y = "tSNE_2") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18))
ggplot2<- "figures/figure_01_0021.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 4.1, height = 4.3)
ggplot2
# KEGG
<- as.data.frame(reducedDim(sclcs$KG, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
ggplot2color = "black", size = 1, alpha = 1) +
::labs(title = "SCLC (pathway)", x = "tSNE_1", y = "tSNE_2") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18))
ggplot2<- "figures/figure_01_0022.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 4.1, height = 4.3) ggplot2
# Load customized plot functions.
source("../R/plot_additional.R")
MERLoT is a useful package detecting a tree-like topology in data space. Using MERLoT, one can cluster cells by allocating individual cells to the branches of the data manifold and define pseudotimes along the branches.
# Preparation
<- list()
res <- reducedDims(sclcs$DO)$DMAP[, seq_len(3)]
dmap <- t(as.matrix(assay(sclcs$DO, "counts")))
mat # ScaffoldTree
1]] <- merlot::CalculateScaffoldTree(CellCoordinates = dmap,
res[[NEndpoints = 3, random_seed = 1)
# ElasticTree
2]] <- merlot::CalculateElasticTree(ScaffoldTree = res[[1]], N_yk = 30)
res[[# SignsSpaceEmbedding
3]] <- merlot::GenesSpaceEmbedding(ExpressionMatrix = mat,
res[[ElasticTree = res[[2]], NCores = 3)
# Pseudotime in high dimensional space
4]] <- merlot::CalculatePseudotimes(InputTree = res[[2]], T0 = 1)
res[[# Cluster cells based on tree branches embedded in high dimensional space.
<- res[[3]]$Cells2Branches
labels # Store the results
<- length(unique(sort(labels)))
ngroups colData(sclcs$DO)$merlot_clusters <- factor(labels, levels = seq_len(ngroups))
names(res) <- c("ScaffoldTree", "ElasticTree", "SignsSpaceEmbedding",
"Pseudotimes_highdim")
metadata(sclcs$DO)$merlot <- res
Show elastic trees and pseudotimes, embedded in the original sign score space, in diffusion map spaces.
# Elastic tree
<- colData(sclcs$DO)$merlot_clusters
labels <- scales::hue_pal()(length(unique(labels)))[labels]
colors <- "figures/figure_01_0025.png"
filename png(file = filename, height = 250, width = 250, res = 50)
plot_elasticTree3D(ElasticTree = metadata(sclcs$DO)$merlot$ElasticTree,
labels = labels, colors = colors, theta = -45, phi = 220,
title = "SCLC (disease)", xlabel = "DC_1", ylabel = "DC_2",
zlabel = "DC_3")
dev.off()
# Pseudotime
<- "figures/figure_01_0026.png"
filename png(file = filename, height = 250, width = 250, res = 50)
plot_pseudotime3D(ElasticTree = metadata(sclcs$DO)$merlot$ElasticTree,
Pseudotimes = metadata(sclcs$DO)$merlot$Pseudotimes_highdim,
labels = labels, colors = colors, theta = -45, phi = 220,
title = "SCLC (disease)", xlabel = "DC_1", ylabel = "DC_2",
zlabel = "DC_3")
dev.off()
Show clustering results in low-dimensional spaces.
<- colData(sclcs$DO)$merlot_clusters
labels <- as.data.frame(reducedDim(sclcs$DO, "DMAP"))
df <- "figures/figure_01_0030.png"
filename png(file = filename, height = 250, width = 250, res = 50)
plot_dataframe3D(dataframe3D = df, labels = labels,
theta = -45, phi = 220, title = "SCLC (disease)",
xlabel = "DC_1", ylabel = "DC_2", zlabel = "DC_3")
dev.off()
To date (December, 2021), one of the most useful clustering methods in scRNA-seq data analysis is a combination of a community detection algorithm and graph-based unsupervised clustering, developed in Seurat package.
Here, our strategy is as follows:
rowData()
and colData()
must have data),ScaleData()
, RunPCA()
,
FindNeighbors()
, and FindClusters()
,temp
,colData(temp)$seurat_clusters
into
colData(sce)$seurat_clusters
.<- c(0.10, 0.20)
resolutions <- list(seq_len(40), seq_len(30))
dims <- c("GO", "KG")
dbs for(i in seq_along(dbs)){
<- Seurat::as.Seurat(sclcs[[dbs[i]]], counts = "counts", data = "counts")
surt <- as.matrix(assay(sclcs[[dbs[i]]], "counts"))
mat "SSM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "SSM"
Seurat<- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = dims[[i]])
surt <- Seurat::FindClusters(surt, resolution = resolutions[i])
surt <- Seurat::as.SingleCellExperiment(surt)
temp colData(sclcs[[dbs[i]]])$seurat_clusters <- colData(temp)$seurat_clusters
}
Show the clustering results in low-dimensional spaces.
<- c("SCLC (function)", "SCLC (pathway)")
titles <- c("GO", "KG")
dbs for(i in seq_along(dbs)){
<- colData(sclcs[[dbs[i]]])$seurat_clusters
labels <- as.data.frame(reducedDim(sclcs[[dbs[i]]], "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = titles[i], x = "tSNE_1", y = "tSNE_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2if(i == 1){
<- p + ggplot2::scale_colour_brewer(palette = "Set1")
p else if(i == 2){
}<- p + ggplot2::scale_colour_brewer(palette = "Set2")
p
}<- sprintf("figures/figure_01_%04d.png", 30 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
ggplot2 }
If there is gene expression data in altExp(sce)
, we can
easily infer cell cycle phases by using Seurat functions in the similar
manner as above.
<- Seurat::as.Seurat(sclcs$DO, counts = "counts", data = "counts")
surt <- as.matrix(assay(altExp(sclcs$DO), "counts"))
mat "GEM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "GEM"
Seurat<- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::CellCycleScoring(surt, s.features = Seurat::cc.genes$s.genes,
surt g2m.features = Seurat::cc.genes$g2m.genes)
<- Seurat::as.SingleCellExperiment(surt)
temp colData(sclcs$DO)$Phase <- colData(temp)$Phase
Show cell cycle phases in low-dimensional spaces.
<- factor(colData(sclcs$DO)$Phase, levels = unique(colData(sclcs$DO)$Phase))
labels <- colData(sclcs$DO)$Phase
colors which(colors == "G1")] <- rainbow(3)[3]
colors[which(colors == "S")] <- rainbow(3)[2]
colors[which(colors == "G2M")] <- rainbow(3)[1]
colors[<- as.data.frame(reducedDim(sclcs$DO, "DMAP"))
df <- "figures/figure_01_0035.png"
filename png(file = filename, height = 250, width = 250, res = 50)
plot_dataframe3D(dataframe3D = df, labels = labels, colors = colors,
theta = -45, phi = 220, title = "SCLC (disease)",
xlabel = "DC_1", ylabel = "DC_2", zlabel = "DC_3")
dev.off()
# df$label <- labels ; df$color <- colors ; df <- df[order(df$label), ]
# filename <- "figures/figure_01_0036.png"
# png(file = filename, height = 1500, width = 1500, res = 300)
# scatter3D(df[, 1], df[, 2], df[, 3], main = title, xlab = xlabel,
# ylab = ylabel, zlab = zlabel, box = F, bty = "b2", axes = F,
# nticks = 0, theta = theta, phi = phi, pch = 16, cex = 0.5,
# alpha = 1.0, col = df$color, colvar = NA, colkey = FALSE)
# graphics::legend("bottomright", legend=unique(df$label), pch = 16,
# col = unique(df$color), cex = 1.2, inset = c(0.02))
# dev.off()
Significant signs are analogous to differentially expressed genes but bear biological meanings. Note that naïve usages of statistical tests should be avoided because the row vectors of SSMs are centered.
Instead, ASURAT function compute_sepI_all()
computes
separation indices for each cluster against the others. Briefly, a
separation index “sepI”, ranging from -1 to 1, is a nonparametric
measure of significance of a given sign score for a given subpopulation.
The larger (resp. smaller) sepI is, the more reliable the sign is as a
positive (resp. negative) marker for the cluster.
for(i in seq_along(sclcs)){
set.seed(1)
if(i == 1){
<- colData(sclcs[[i]])$merlot_clusters
labels else{
}<- colData(sclcs[[i]])$seurat_clusters
labels
}<- compute_sepI_all(sce = sclcs[[i]], labels = labels,
sclcs[[i]] nrand_samples = NULL)
}
<- sclcs$GO
sclcs_LabelDO_SignGO metadata(sclcs_LabelDO_SignGO)$marker_signs <- NULL
set.seed(1)
<- compute_sepI_all(sce = sclcs_LabelDO_SignGO,
sclcs_LabelDO_SignGO labels = colData(sclcs$DO)$merlot_clusters,
nrand_samples = NULL)
<- sclcs$KG
sclcs_LabelDO_SignKG metadata(sclcs_LabelDO_SignKG)$marker_signs <- NULL
set.seed(1)
<- compute_sepI_all(sce = sclcs_LabelDO_SignKG,
sclcs_LabelDO_SignKG labels = colData(sclcs$DO)$merlot_clusters,
nrand_samples = NULL)
To date (December, 2021), one of the most useful methods of multiple
statistical tests in scRNA-seq data analysis is to use a Seurat function
FindAllMarkers()
.
If there is gene expression data in altExp(sce)
, one can
investigate differentially expressed genes by using Seurat functions in
the similar manner as described before.
set.seed(1)
<- Seurat::as.Seurat(sclcs$DO, counts = "counts", data = "counts")
surt <- as.matrix(assay(altExp(sclcs$DO), "counts"))
mat "GEM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "GEM"
Seurat<- Seurat::SetIdent(surt, value = "merlot_clusters")
surt <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
res min.pct = 0.25, logfc.threshold = 0.25)
metadata(sclcs$DO)$marker_genes$all <- res
Simultaneously analyze multiple sign-by-sample matrices, which helps us characterize individual samples (cells) from multiple biological aspects.
ASURAT function plot_multiheatmaps()
shows heatmaps
(ComplexHeatmap object) of sign scores and gene expression levels (if
there are), where rows and columns stand for sign (or gene) and sample
(cell), respectively.
First, remove unrelated signs by setting keywords, followed by selecting top significant signs and genes for the clustering results with respect to separation index and p-value, respectively.
# Significant signs
<- list()
marker_signs <- "cervical|Oocyte|cycle"
keys for(i in seq_along(sclcs)){
if(i == 1){
<- metadata(sclcs[[i]])$marker_signs$all
marker_signs[[i]] else if(i == 2){
}<- metadata(sclcs_LabelDO_SignGO)$marker_signs$all
marker_signs[[i]] else if(i == 3){
}<- metadata(sclcs_LabelDO_SignKG)$marker_signs$all
marker_signs[[i]]
}<- marker_signs[[i]][!grepl(keys, marker_signs[[i]]$Description), ]
marker_signs[[i]] <- dplyr::group_by(marker_signs[[i]], Ident_1)
marker_signs[[i]] <- dplyr::slice_max(marker_signs[[i]], sepI, n = 1)
marker_signs[[i]] <- dplyr::slice_min(marker_signs[[i]], Rank, n = 1)
marker_signs[[i]]
}# Significant genes
<- metadata(sclcs$DO)$marker_genes$all
marker_genes_DO <- dplyr::group_by(marker_genes_DO, cluster)
marker_genes_DO <- dplyr::slice_min(marker_genes_DO, p_val_adj, n = 5)
marker_genes_DO <- dplyr::slice_max(marker_genes_DO, avg_log2FC, n = 5) marker_genes_DO
Then, prepare arguments.
# ssm_list
<- list() ; ssm_list <- list()
sces_sub for(i in seq_along(sclcs)){
<- sclcs[[i]][rownames(sclcs[[i]]) %in% marker_signs[[i]]$SignID, ]
sces_sub[[i]] <- assay(sces_sub[[i]], "counts")
ssm_list[[i]]
}names(ssm_list) <- c("SSM_disease", "SSM_function", "SSM_pathway")
# gem_list
<- altExp(sclcs$DO, "logcounts")
expr_sub <- expr_sub[rownames(expr_sub) %in% marker_genes_DO$gene]
expr_sub <- list(x = t(scale(t(as.matrix(assay(expr_sub, "counts"))))))
gem_list names(gem_list) <- "Scaled\nLogExpr"
# ssmlabel_list
<- list() ; ssmlabel_list <- list()
labels <- colData(sces_sub[[1]])$merlot_clusters
tmp 1]] <- data.frame(label = colData(sces_sub[[1]])$merlot_clusters)
labels[[<- length(unique(tmp))
n_groups 1]]$color <- scales::hue_pal()(n_groups)[tmp]
labels[[1]] <- labels[[1]]
ssmlabel_list[[2]] <- data.frame(label = NA, color = NA)
ssmlabel_list[[3]] <- data.frame(label = NA, color = NA)
ssmlabel_list[[names(ssmlabel_list) <- c("Label_disease", NA, NA)
# gemlabel_list
<- colData(sclcs$DO)$Phase
mycolor == "G1"] <- 3
mycolor[mycolor == "S"] <- 2
mycolor[mycolor == "G2M"] <- 1
mycolor[mycolor <- data.frame(label = colData(sclcs$DO)$Phase,
label_CC color = rainbow(3)[as.integer(mycolor)])
$label <- factor(label_CC$label, levels = c("G1", "S", "G2M"))
label_CC<- list(CellCycle = label_CC) gemlabel_list
Tips: If one would like to omit some color labels (e.g., labels[[3]]), set the argument as follows:
2]] <- data.frame(label = NA, color = NA)
ssmlabel_list[[3]] <- data.frame(label = NA, color = NA) ssmlabel_list[[
Finally, plot heatmaps for the selected signs and genes.
<- "figures/figure_01_0040.png"
filename #png(file = filename, height = 1600, width = 1500, res = 300)
png(file = filename, height = 300, width = 300, res = 60)
set.seed(1)
<- "SCLC"
title plot_multiheatmaps(ssm_list = ssm_list, gem_list = gem_list,
ssmlabel_list = ssmlabel_list, gemlabel_list = gemlabel_list,
nrand_samples = 500, show_row_names = TRUE, title = title)
dev.off()
Show violin plots for the sign score distributions across cell type-related clusters.
<- colData(sclcs$DO)$merlot_clusters
labels <- list(c("DO", "DOID:74-S",
vlist "hematopoietic system disease\n(CD24, MIF, ...)"),
c("KG", "path:hsa03010-S",
"Ribosome\n(RPL22, UBA52, ...)"),
c("KG", "path:hsa01524-S",
"Platinum drug resistance\n(TOP2A, BIRC5, ...)"),
c("KG", "path:hsa05222-V",
"Small cell lung cancer\n(TP53, CDKN1A, ...)"),
c("KG", "path:hsa05235-S",
"PD-L1 expression and PD-1 checkpoint...\n(JUN, NFKBIA, ...)"),
c("GE", "CD24", ""))
<- "Cluster (disease)"
xlabel for(i in seq_along(vlist)){
if(vlist[[i]][1] == "GE"){
<- which(rownames(altExp(sclcs$DO, "logcounts")) == vlist[[i]][2])
ind <- altExp(sclcs$DO, "logcounts")[ind, ]
subsce <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df <- "Gene expression"
ylabel else{
}<- which(rownames(sclcs[[vlist[[i]][1]]]) == vlist[[i]][2])
ind <- sclcs[[vlist[[i]][1]]][ind, ]
subsce <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df <- "Sign score"
ylabel
}<- ggplot2::ggplot() +
p ::geom_violin(ggplot2::aes(x = as.factor(labels), y = df[, 1],
ggplot2fill = labels), trim = FALSE, size = 0.5) +
::geom_boxplot(ggplot2::aes(x = as.factor(labels), y = df[, 1]),
ggplot2width = 0.15, alpha = 0.6) +
::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
ggplot2x = xlabel, y = ylabel, fill = "Cluster") +
::theme_classic(base_size = 25, base_family = "Helvetica") +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
ggplot2legend.position = "none") +
::scale_fill_hue()
ggplot2<- sprintf("figures/figure_01_%04d.png", 49 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 4)
ggplot2 }
Show pseudotime course plots with elastic tree for sign score distributions.
<- list(c("DO", "DOID:74-S",
vlist "hematopoietic system disease\n(CD24, MIF, ...)"),
c("DO", "DOID:5409-V",
"lung small cell carcinoma\n(BIRC5, MKI67, ...)"),
c("DO", "DOID:654-S",
"overnutrition\n(ATF3, DUSP1, ...)"))
<- colData(sclcs$DO)$merlot_clusters
labels <- length(unique(labels))
n_groups for(i in seq_along(vlist)){
<- plot_pseudotimecourse_wTree(
p sce = sclcs[[vlist[[i]][1]]], signID = vlist[[i]][2],
ElasticTree = metadata(sclcs$DO)$merlot$ElasticTree,
SignsSpaceEmbedding = metadata(sclcs$DO)$merlot$SignsSpaceEmbedding,
Pseudotimes = metadata(sclcs$DO)$merlot$Pseudotimes_highdim,
labels = labels, range_y = "cells")
<- p + ggplot2::scale_color_manual(values = scales::hue_pal()(n_groups)) +
p ::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
ggplot2x = "Pseudotime (disease)", y = "Sign score", color = "") +
::theme_classic(base_size = 25, base_family = "Helvetica") +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
ggplot2legend.position = "none")
<- sprintf("figures/figure_01_%04d.png", 59 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 6.6, height = 4.5)
ggplot2 }
Show pseudotime course plots without elastic tree for sign score distributions.
<- list(c("DO", "DOID:74-S",
vlist "hematopoietic system disease\n(CD24, MIF, ...)"),
c("KG", "path:hsa01524-S",
"Platinum drug resistance\n(TOP2A, BIRC5, ...)"),
c("KG", "path:hsa05222-V",
"Small cell lung cancer\n(TP53, CDKN1A, ...)"),
c("KG", "path:hsa05235-S",
"PD-L1 expression and PD-1 checkpoint...\n(JUN, NFKBIA, ...)"))
for(i in seq_along(vlist)){
<- plot_pseudotimecourse_woTree(
p sce = sclcs[[vlist[[i]][1]]], signID = vlist[[i]][2],
Pseudotimes = metadata(sclcs$DO)$merlot$Pseudotimes_highdim,
range_y = "cells")
<- p + ggplot2::scale_colour_hue() +
p ::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
ggplot2x = "Pseudotime (disease)", y = "Sign score", fill = "") +
::theme_classic(base_size = 25, base_family = "Helvetica") +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
ggplot2legend.position = "none")
<- sprintf("figures/figure_01_%04d.png", 69 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 6, height = 4.5)
ggplot2 }
<- c("SCLC (Ribosome active)", "SCLC (Platinum resistance)",
cell_state "SCLC (PD-L1 expression)")
colData(sclcs$DO)$cell_state <- as.character(colData(sclcs$DO)$merlot_clusters)
colData(sclcs$DO)$cell_type[colData(sclcs$DO)$cell_state == 1] <- cell_state[1]
colData(sclcs$DO)$cell_type[colData(sclcs$DO)$cell_state == 2] <- cell_state[2]
colData(sclcs$DO)$cell_type[colData(sclcs$DO)$cell_state == 3] <- cell_state[3]
Show the annotation results in low-dimensional spaces.
<- factor(colData(sclcs$DO)$cell_type, levels = cell_state)
labels <- as.data.frame(reducedDim(sclcs$DO, "DMAP"))
df <- "figures/figure_01_0080.png"
filename png(file = filename, height = 250, width = 250, res = 50)
plot_dataframe3D(dataframe3D = df, labels = labels,
theta = -45, phi = 220, title = "SCLC (disease)",
xlabel = "DC_1", ylabel = "DC_2", zlabel = "DC_3")
dev.off()
Load the data (see here).
<- readRDS("backup/01_003_sclc_normalized.rds") sclc
Create Seurat objects.
<- Seurat::CreateSeuratObject(counts = as.matrix(assay(sclc, "counts")),
sclc project = "SCLC")
According to the Seurat protocol, normalize data, perform variance stabilizing transform by setting the number of variable feature, scale data, and reduce dimension using principal component analysis.
# Normalization
<- Seurat::NormalizeData(sclc, normalization.method = "LogNormalize")
sclc # Variance stabilizing transform
<- round(0.2 * ncol(sclc))
n <- Seurat::FindVariableFeatures(sclc, selection.method = "vst", nfeatures = n)
sclc # Scale data
<- Seurat::ScaleData(sclc)
sclc # Principal component analysis
<- Seurat::RunPCA(sclc, features = Seurat::VariableFeatures(sclc)) sclc
Compute the cumulative sum of variances, which is used for determining the number of the principal components (PCs).
<- which(cumsum(sclc@reductions[["pca"]]@stdev) /
pc sum(sclc@reductions[["pca"]]@stdev) > 0.9)[1]
Perform cell clustering.
# Create k-nearest neighbor graph.
<- Seurat::FindNeighbors(sclc, reduction = "pca", dim = seq_len(pc))
sclc # Cluster cells.
<- Seurat::FindClusters(sclc, resolution = 0.1)
sclc # Run t-SNE.
<- Seurat::RunTSNE(sclc, dims.use = seq_len(2), reduction = "pca",
sclc dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Run UMAP.
<- Seurat::RunUMAP(sclc, dims = seq_len(pc)) sclc
Show the clustering results.
<- "SCLC (Seurat)"
title <- sclc@meta.data[["seurat_clusters"]]
labels <- scales::brewer_pal(palette = "Set2")(4)
mycolor <- c("0" = mycolor[1], "1" = mycolor[2], "2" = mycolor[4])
mycolor <- sclc@reductions[["umap"]]@cell.embeddings
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = title, x = "UMAP_1", y = "UMAP_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_manual(values = mycolor) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_01_0230.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.3, height = 4.5) ggplot2
Find differentially expressed genes.
@misc$stat <- Seurat::FindAllMarkers(sclc, only.pos = TRUE, min.pct = 0.25,
sclclogfc.threshold = 0.25)
View(sclc@misc$stat[which(sclc@misc$stat$p_val_adj < 10^(-100)), ])
Assign each cell a cell cycle score using
CellCycleScoring()
.
obj@meta.data[["Phase"]]
.
<- Seurat::cc.genes$s.genes
s.genes <- Seurat::cc.genes$g2m.genes
g2m.genes <- Seurat::CellCycleScoring(sclc, s.features = Seurat::cc.genes$s.genes,
sclc g2m.features = Seurat::cc.genes$g2m.genes)
Show the inferred cell cycle phases in low-dimensional spaces.
<- "SCLC (Seurat)"
title <- factor(sclc@meta.data[["Phase"]], levels = c("G1", "S", "G2M"))
labels <- c(rainbow(3)[3], rainbow(3)[2], rainbow(3)[1])
mycolor <- sclc@reductions[["umap"]]@cell.embeddings
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = title, x = "UMAP_1", y = "UMAP_2", color = "Phase") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_manual(values = mycolor) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_01_0235.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.7, height = 4.5) ggplot2
Perform GO and KEGG enrichment analyses using differentially
expressed genes, whose adjusted p-values are<=
padj_cutoff
.
= 0.01 padj_cutoff
Prepares a list of genes, which is used for an input of
compareCluster()
in clusterProfiler package.
<- length(unique(sclc@meta.data[["seurat_clusters"]]))
n_groups <- sclc@misc[["stat"]]
tmp <- as.character(unique(tmp$cluster))
cluster_names <- list() ; glist <- list() ; label_names <- c()
g for(i in seq_len(n_groups)){
<- i - 1
im1 <- tmp[which(tmp$cluster == im1), ]
df <- df[which(df$p_val_adj <= padj_cutoff), ]$gene
g[[i]] <- clusterProfiler::bitr(g[[i]], fromType = "SYMBOL",
geneID toType = "ENTREZID",
OrgDb = org.Hs.eg.db::org.Hs.eg.db)$ENTREZID
<- geneID
glist[[i]] <- c(label_names, paste("Group_", im1, sep = ""))
label_names
}names(glist) <- label_names
Performs compareCluster()
, which easily compares
enriched biological terms across clusters.
@misc[["compareCluster_GO"]] <- clusterProfiler::compareCluster(
sclcfun = "enrichGO", OrgDb = org.Hs.eg.db::org.Hs.eg.db, ont = "BP",
glist, pAdjustMethod = "BH", pvalueCutoff = padj_cutoff)
@misc[["compareCluster_KEGG"]] <- clusterProfiler::compareCluster(
sclcfun = "enrichKEGG", organism = "hsa", keyType = "kegg",
glist, pAdjustMethod = "BH", pvalueCutoff = padj_cutoff)
# minGSSize = 10, maxGSSize = 500) # min/max size of genes annotated for testing
Show the results of the enrichment analyses.
<- enrichplot::dotplot(sclc@misc[["compareCluster_GO"]], showCategory = 5) +
p ::theme(panel.grid.major = ggplot2::element_line(size = 0.5,
ggplot2color = "grey85"),
panel.border = ggplot2::element_rect(color = "black", fill = NA,
size = 1.5))
<- "figures/figure_01_0250.png"
filename ::ggsave(file = filename, plot = p, dpi = 80, width = 9.5, height = 4)
ggplot2
<- enrichplot::dotplot(sclc@misc[["compareCluster_KEGG"]], showCategory = 5) +
p ::theme(panel.grid.major = ggplot2::element_line(size = 0.5,
ggplot2color = "grey85"),
panel.border = ggplot2::element_rect(color = "black", fill = NA,
size = 1.5))
<- "figures/figure_01_0251.png"
filename ::ggsave(file = filename, plot = p, dpi = 80, width = 6.5, height = 3.5) ggplot2
# Regress out the cell cycle effects.
<- Seurat::ScaleData(sclc, vars.to.regress = c("S.Score", "G2M.Score"),
sclc features = rownames(sclc))
# Run principal component analysis.
<- Seurat::RunPCA(sclc, features = Seurat::VariableFeatures(sclc))
sclc # Compute the cumulative sum of variances.
<- which(cumsum(sclc@reductions[["pca"]]@stdev) /
pc sum(sclc@reductions[["pca"]]@stdev) > 0.9)[1]
# Create k-nearest neighbor graph.
<- Seurat::FindNeighbors(sclc, reduction = "pca", dim = seq_len(pc))
sclc # Cluster cells.
<- Seurat::FindClusters(sclc, resolution = 0.1)
sclc # Run t-SNE.
<- Seurat::RunTSNE(sclc, dims.use = seq_len(2), reduction = "pca",
sclc dims = seq_len(pc), do.fast = FALSE, perplexity = 30)
# Run UMAP.
<- Seurat::RunUMAP(sclc, dims = seq_len(pc))
sclc # Show the clustering results.
<- "SCLC (Seurat wo CC)"
title <- sclc@meta.data[["seurat_clusters"]]
labels <- sclc@reductions[["umap"]]@cell.embeddings
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = title, x = "UMAP_1", y = "UMAP_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_brewer(palette = "Set2") +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_01_0260.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.3, height = 4.5)
ggplot2
<- factor(sclc@meta.data[["Phase"]], levels = c("G1", "S", "G2M"))
labels <- c(rainbow(3)[3], rainbow(3)[2], rainbow(3)[1])
mycolor <- sclc@reductions[["umap"]]@cell.embeddings
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = title, x = "UMAP_1", y = "UMAP_2", color = "Phase") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_manual(values = mycolor) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_01_0261.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.7, height = 4.5) ggplot2