MacBook Pro (Big Sur, 16-inch, 2019), Processor (2.4 GHz 8-Core Intel Core i9), Memory (64 GB 2667 MHz DDR4).
Attach necessary libraries:
library(ASURAT)
library(SingleCellExperiment)
library(SummarizedExperiment)
In this vignette, we analyze single-cell RNA sequencing (scRNA-seq) and spatial transcriptome (ST) data obtained from primary tumors of pancreatic ductal adenocarcinoma (PDAC) patients (Moncada et al., Nat. Biotechnol. 38, 2020).
The data can be loaded by the following code:
<- readRDS(url("https://figshare.com/ndownloader/files/34112468")) pdacrna
The data are stored in DOI:10.6084/m9.figshare.19200254 and the generating process is described below.
From GSE111672, we downloaded inDrop data with sample accession numbers GSM3036909, GSM3036910, GSM3405527, GSM3405528, GSM3405529, and GSM3405530 (PDAC-A inDrop1-inDrop6).
<- c("rawdata/2020_001_Moncada/pdac_indrop/PDACA_1/results/gene_expression.tsv",
fn "rawdata/2020_001_Moncada/pdac_indrop/PDACA_2/results/gene_expression.tsv",
"rawdata/2020_001_Moncada/pdac_indrop/PDACA_3/results/gene_expression.tsv",
"rawdata/2020_001_Moncada/pdac_indrop/PDACA_4/results/gene_expression.tsv",
"rawdata/2020_001_Moncada/pdac_indrop/PDACA_5/results/gene_expression.tsv",
"rawdata/2020_001_Moncada/pdac_indrop/PDACA_6/results/gene_expression.tsv")
<- list()
pdacrna for(i in seq_along(fn)){
<- read.table(fn[i], header = TRUE, stringsAsFactors = FALSE, row.names = 1)
d colnames(d) <- paste0("PDAC-A-inDrop", i, "-", colnames(d))
<- SingleCellExperiment(assays = list(counts = as.matrix(d)),
pdacrna[[i]] rowData = data.frame(gene = rownames(d)),
colData = data.frame(sample = colnames(d)))
}
rbind(dim(pdacrna[[1]]), dim(pdacrna[[2]]), dim(pdacrna[[3]]),
dim(pdacrna[[4]]), dim(pdacrna[[5]]), dim(pdacrna[[6]]))
[,1] [,2]
[1,] 19811 10000
[2,] 19811 10000
[3,] 19811 10000
[4,] 19811 10000
[5,] 19811 10000
[6,] 19811 10000
Add metadata for both variables and samples using ASURAT function
add_metadata()
.
for(i in seq_along(pdacrna)){
<- add_metadata(sce = pdacrna[[i]], mitochondria_symbol = "^MT-")
pdacrna[[i]] }
Qualities of sample (cell) data are confirmed based on proper
visualization of colData(sce)
.
for(i in seq_along(pdacrna)){
<- data.frame(x = colData(pdacrna[[i]])$nReads,
df y = colData(pdacrna[[i]])$nGenes)
<- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = paste0("PDAC-A inDrop ", i),
ggplot2x = "Number of reads", y = "Number of genes") +
::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20)) +
ggplot2::scale_x_log10(limits = c(1, 20000)) +
ggplot2::scale_y_log10(limits = c(1, 10000))
ggplot2<- ggExtra::ggMarginal(p, type = "histogram", margins = "both", size = 5,
p col = "black", fill = "gray")
<- paste0("figures/figure_08_0005_", i, ".png")
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5)
ggplot2 }
Confirming that the data qualities are comparable among experimental batches, concatenate all the objects horizontally.
# Take intersection of genes.
<- Reduce(intersect, list(rownames(pdacrna[[1]]), rownames(pdacrna[[2]]),
genes rownames(pdacrna[[3]]), rownames(pdacrna[[4]]),
rownames(pdacrna[[5]]), rownames(pdacrna[[6]])))
for(i in seq_along(pdacrna)){
<- pdacrna[[i]][genes, ]
pdacrna[[i]] rowData(pdacrna[[i]])$nSamples <- NULL
}# Horizontally concatenate SingleCellExperiment objects.
<- cbind(pdacrna[[1]], pdacrna[[2]], pdacrna[[3]],
pdacrna 4]], pdacrna[[5]], pdacrna[[6]])
pdacrna[[# Add metadata again.
<- add_metadata(sce = pdacrna, mitochondria_symbol = "^MT-") pdacrna
dim(pdacrna)
[1] 19811 60000
The data can be loaded by the following code:
<- readRDS(url("https://figshare.com/ndownloader/files/34112471")) pdacst
The data are stored in DOI:10.6084/m9.figshare.19200254 and the generating process is described below.
Load a raw read count table, convert Ensembl IDs into gene symbols, and change the column names.
<- "rawdata/2020_001_Moncada/pdac_st/SRR6825057_stdata.tsv"
fn <- read.table(fn, header = TRUE, stringsAsFactors = FALSE, row.names = 1)
pdacst <- t(pdacst)
pdacst <- rownames(pdacst)
ensembl <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db, key = ensembl,
dictionary columns = c("SYMBOL", "ENTREZID"),
keytype = "ENSEMBL")
<- dictionary[!duplicated(dictionary$ENSEMBL), ]
dictionary which(is.na(dictionary$SYMBOL)),]$SYMBOL <- as.character("NA")
dictionary[rownames(pdacst) <- make.unique(as.character(dictionary$SYMBOL))
colnames(pdacst) <- paste0("PDAC-A-ST1_", colnames(pdacst))
Create a SingleCellExperiment object by inputting the read count table.
<- SingleCellExperiment(assays = list(counts = as.matrix(pdacst)),
pdacst rowData = data.frame(gene = rownames(pdacst)),
colData = data.frame(sample = colnames(pdacst)))
A Seurat object, including PDAC tissue images, was obtained from the authors of DOI:10.1038/s41587-019-0392-8 and DOI:10.1093/nar/gkab043, and set the tissue image data into the metadata slot.
<- "rawdata/2020_001_Moncada/pdac_st/PDAC-A_ST_list.RDS"
fn <- readRDS(file = fn)
pdacst_surt <- pdacst_surt$GSM3036911
pdacst_surt metadata(pdacst)$images <- pdacst_surt@images
Since the above SingleCellExperiment object includes spatial coordinates outside of tissues, remove such spots.
<- pdacst[, colnames(pdacst_surt)]
pdacst identical(colnames(pdacst), colnames(pdacst_surt))
[1] TRUE
dim(pdacst)
[1] 25807 428
Add metadata for both variables and samples using ASURAT function
add_metadata()
.
<- add_metadata(sce = pdacst, mitochondria_symbol = "^MT-") pdacst
Qualities of spot data are confirmed based on proper visualization of
colData(sce)
.
<- data.frame(x = colData(pdacst)$nReads, y = colData(pdacst)$nGenes)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = "PDAC-A ST", x = "Number of reads", y = "Number of genes") +
ggplot2::theme_classic(base_size = 18) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20)) +
ggplot2::scale_x_log10(limits = c(-NA, NA)) +
ggplot2::scale_y_log10(limits = c(-NA, NA))
ggplot2<- ggExtra::ggMarginal(p, type = "histogram", margins = "both", size = 5,
p col = "black", fill = "gray")
<- "figures/figure_09_0005.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
Remove variables (genes) and samples (cells) with low quality, by processing the following three steps:
ASURAT function remove_variables()
removes variable
(gene) data such that the numbers of non-zero expressing samples (cells)
are less than min_nsamples
.
<- remove_variables(sce = pdacrna, min_nsamples = 10)
pdacrna <- remove_variables(sce = pdacst, min_nsamples = 10) pdacst
Qualities of sample (cell) data are confirmed based on proper
visualization of colData(sce)
. ASURAT function
plot_dataframe2D()
shows scatter plots of two-dimensional
data (see here for details).
<- "PDAC-A inDrop"
title <- data.frame(x = colData(pdacrna)$nReads, y = colData(pdacrna)$nGenes)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Number of reads", y = "Number of genes") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20))
ggplot2<- "figures/figure_08_0010.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
<- data.frame(x = colData(pdacrna)$nReads, y = colData(pdacrna)$percMT)
df <- "PDAC-A inDrop"
title <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Number of reads", y = "Perc of MT reads") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
ggplot2<- "figures/figure_08_0011.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
ASURAT function remove_samples()
removes sample (cell)
data by setting cutoff values for the metadata.
<- remove_samples(sce = pdacrna, min_nReads = 1000, max_nReads = 10000,
pdacrna min_nGenes = 100, max_nGenes = 1e+10,
min_percMT = 0, max_percMT = 20)
<- remove_samples(sce = pdacst, min_nReads = 0, max_nReads = 1e+10,
pdacst min_nGenes = 0, max_nGenes = 1e+10,
min_percMT = NULL, max_percMT = NULL)
Qualities of variable (gene) data are confirmed based on proper
visualization of rowData(sce)
. ASURAT function
plot_dataframe2D()
shows scatter plots of two-dimensional
data.
<- "PDAC-A inDrop"
title <- apply(as.matrix(assay(pdacrna, "counts")), 1, mean)
aveexp <- data.frame(x = seq_len(nrow(rowData(pdacrna))),
df y = sort(aveexp, decreasing = TRUE))
<- ggplot2::ggplot() + ggplot2::scale_y_log10() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Rank of genes", y = "Mean read counts") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
ggplot2<- "figures/figure_08_0015.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
<- "PDAC-A ST"
title <- apply(as.matrix(assay(pdacst, "counts")), 1, mean)
aveexp <- data.frame(x = seq_len(nrow(rowData(pdacst))),
df y = sort(aveexp, decreasing = TRUE))
<- ggplot2::ggplot() + ggplot2::scale_y_log10() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = title, x = "Rank of genes", y = "Mean read counts") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
ggplot2<- "figures/figure_09_0015.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
ASURAT function remove_variables_second()
removes
variable (gene) data such that the mean read counts across samples are
less than min_meannReads
.
<- remove_variables_second(sce = pdacrna, min_meannReads = 0.01)
pdacrna <- remove_variables_second(sce = pdacst, min_meannReads = 0.01) pdacst
rbind(dim(pdacrna), dim(pdacst))
[1,] 12248 2034
[2,] 10364 428
Check the number of genes, which commonly exist in both the datasets.
length(intersect(rownames(pdacrna), rownames(pdacst)))
[1] 7923
Qualities of spot data are confirmed based on proper visualization of
colData(sce)
.
# scRNA-seq
<- data.frame(x = colData(pdacrna)$nReads, y = colData(pdacrna)$nGenes)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = "PDAC-A inDrop",
ggplot2x = "Number of reads", y = "Number of genes") +
::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20)) +
ggplot2::scale_x_log10(limits = c(1, 20000)) +
ggplot2::scale_y_log10(limits = c(1, 10000))
ggplot2<- ggExtra::ggMarginal(p, type = "histogram", margins = "both", size = 5,
p col = "black", fill = "gray")
<- "figures/figure_10_0005.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5)
ggplot2# ST
<- data.frame(x = colData(pdacst)$nReads, y = colData(pdacst)$nGenes)
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
ggplot2::labs(title = "PDAC-A ST", x = "Number of reads", y = "Number of genes") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20)) +
ggplot2::scale_x_log10(limits = c(1, 40000)) +
ggplot2::scale_y_log10(limits = c(1, 10000))
ggplot2<- ggExtra::ggMarginal(p, type = "histogram", margins = "both", size = 5,
p col = "black", fill = "gray")
<- "figures/figure_10_0006.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 5) ggplot2
Perform bayNorm()
(Tang et al., Bioinformatics, 2020)
for attenuating technical biases with respect to zero inflation and
variation of capture efficiencies between samples (cells).
# pdacrna
<- bayNorm::bayNorm(Data = assay(pdacrna, "counts"), mode_version = TRUE)
bayout assay(pdacrna, "baynorm") <- bayout$Bay_out
# pdacst
<- bayNorm::bayNorm(Data = assay(pdacst, "counts"), mode_version = TRUE)
bayout assay(pdacst, "baynorm") <- bayout$Bay_out
Normalize the data using canonical correlation analysis-based method using Seurat functions (Butler Nat. Biotechnol., 2018).
<- list()
surt 1]] <- Seurat::as.Seurat(pdacrna, counts = "baynorm", data = "baynorm")
surt[[2]] <- Seurat::as.Seurat(pdacst, counts = "baynorm", data = "baynorm")
surt[[names(surt) <- c("inDrop", "ST")
for(i in seq_along(surt)){
<- Seurat::NormalizeData(surt[[i]])
surt[[i]] <- Seurat::FindVariableFeatures(surt[[i]], selection.method = "vst",
surt[[i]] nfeatures = 7500)
}<- Seurat::SelectIntegrationFeatures(object.list = surt, nfeatures = 7500)
genes <- Seurat::FindIntegrationAnchors(object.list = surt,
anchors anchor.features = genes)
<- Seurat::IntegrateData(anchorset = anchors,
surt normalization.method = "LogNormalize")
::DefaultAssay(surt) <- "integrated"
Seurat<- Seurat::as.SingleCellExperiment(surt)
pdac rowData(pdac) <- rownames(pdac)
Keep the tissue coordinate information.
@metadata <- pdacst@metadata pdac
dim(pdac)
[1] 6761 2462
Center row data.
<- assay(pdac, "logcounts")
mat assay(pdac, "centered") <- sweep(mat, 1, apply(mat, 1, mean), FUN = "-")
Set gene expression data into altExp(sce)
.
altExps(pdac) <- NULL # For safely using Seurat function as.Seurat() later,
# avoid using the same slot names in assayNames and altExpNames.
<- "logcounts"
sname altExp(pdac, sname) <- SummarizedExperiment(list(counts = assay(pdac, sname)))
Add ENTREZ Gene IDs to rowData(sce)
.
<- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
dictionary key = rownames(pdac),
columns = "ENTREZID", keytype = "SYMBOL")
<- dictionary[!duplicated(dictionary$SYMBOL), ]
dictionary rowData(pdac)$geneID <- dictionary$ENTREZID
Infer cell or disease types, biological functions, and signaling pathway activity at the single-cell level by inputting related databases.
ASURAT transforms centered read count tables to functional feature matrices, termed sign-by-sample matrices (SSMs). Using SSMs, perform unsupervised clustering of samples (cells).
Prepare correlation matrices of gene expressions.
<- t(as.matrix(assay(pdac, "centered")))
mat <- cor(mat, method = "spearman") cormat
Load databases.
<- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
urlpath load(url(paste0(urlpath, "20201213_human_DO.rda?raw=TRUE"))) # DO
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=TRUE"))) # CO
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=TRUE"))) # MSigDB
load(url(paste0(urlpath, "20220308_human_CellMarker.rda?raw=TRUE"))) # CellMarker
load(url(paste0(urlpath, "20201213_human_GO_red.rda?raw=TRUE"))) # GO
load(url(paste0(urlpath, "20201213_human_KEGG.rda?raw=TRUE"))) # KEGG
The reformatted knowledge-based data were available from the following repositories:
Create a custom-built cell type-related databases by combining different databases for analyzing human single-cell transcriptome data.
<- list(human_DO[["disease"]], human_CO[["cell"]], human_MSigDB[["cell"]],
d "cell"]])
human_CellMarker[[<- list(cell = do.call("rbind", d)) human_CB
Add formatted databases to metadata(sce)$sign
.
<- list(CB = pdac, GO = pdac, KG = pdac)
pdacs metadata(pdacs$CB) <- list(sign = human_CB[["cell"]])
metadata(pdacs$GO) <- list(sign = human_GO[["BP"]])
metadata(pdacs$KG) <- list(sign = human_KEGG[["pathway"]])
ASURAT function remove_signs()
redefines functional gene
sets for the input database by removing genes, which are not included in
rownames(sce)
, and further removes biological terms
including too few or too many genes.
$CB <- remove_signs(sce = pdacs$CB, min_ngenes = 2, max_ngenes = 1000)
pdacs$GO <- remove_signs(sce = pdacs$GO, min_ngenes = 2, max_ngenes = 1000)
pdacs$KG <- remove_signs(sce = pdacs$KG, min_ngenes = 2, max_ngenes = 1000) pdacs
ASURAT function cluster_genes()
clusters functional gene
sets using a correlation graph-based decomposition method, which
produces strongly, variably, and weakly correlated gene sets (SCG, VCG,
and WCG, respectively).
set.seed(1)
$CB <- cluster_genesets(sce = pdacs$CB, cormat = cormat,
pdacsth_posi = 0.22, th_nega = -0.32)
set.seed(1)
$GO <- cluster_genesets(sce = pdacs$GO, cormat = cormat,
pdacsth_posi = 0.22, th_nega = -0.22)
set.seed(1)
$KG <- cluster_genesets(sce = pdacs$KG, cormat = cormat,
pdacsth_posi = 0.18, th_nega = -0.23)
ASURAT function create_signs()
creates signs by the
following criteria:
min_cnt_strg
(the
default value is 2) andmin_cnt_vari
(the
default value is 2),which are independently applied to SCGs and VCGs, respectively.
$CB <- create_signs(sce = pdacs$CB, min_cnt_strg = 2, min_cnt_vari = 2)
pdacs$GO <- create_signs(sce = pdacs$GO, min_cnt_strg = 3, min_cnt_vari = 3)
pdacs$KG <- create_signs(sce = pdacs$KG, min_cnt_strg = 3, min_cnt_vari = 3) pdacs
If signs have semantic similarity information, one can use ASURAT
function remove_signs_redundant()
for removing redundant
sings using the semantic similarity matrices.
<- human_GO$similarity_matrix$BP
simmat $GO <- remove_signs_redundant(sce = pdacs$GO, similarity_matrix = simmat,
pdacsthreshold = 0.85, keep_rareID = TRUE)
ASURAT function remove_signs_manually()
removes signs by
specifying IDs (e.g., GOID:XXX
) or descriptions (e.g.,
metabolic
) using grepl()
.
<- "Covid|COVID"
keywords $KG <- remove_signs_manually(sce = pdacs$KG, keywords = keywords) pdacs
ASURAT function create_sce_signmatrix()
creates a new
SingleCellExperiment object new_sce
, consisting of the
following information:
assayNames(new_sce)
: counts (SSM whose entries are
termed sign scores),names(colData(new_sce))
: nReads, nGenes, percMT,names(rowData(new_sce))
: ParentSignID, Description,
CorrGene, etc.,names(metadata(new_sce))
: sign_SCG, sign_VCG,
etc.,altExpNames(new_sce)
: something if there is data in
altExp(sce)
.$CB <- makeSignMatrix(sce = pdacs$CB, weight_strg = 0.5, weight_vari = 0.5)
pdacs$GO <- makeSignMatrix(sce = pdacs$GO, weight_strg = 0.5, weight_vari = 0.5)
pdacs$KG <- makeSignMatrix(sce = pdacs$KG, weight_strg = 0.5, weight_vari = 0.5) pdacs
Perform t-distributed stochastic neighbor embedding.
for(i in seq_along(pdacs)){
set.seed(1)
<- t(as.matrix(assay(pdacs[[i]], "counts")))
mat <- Rtsne::Rtsne(mat, dim = 2, pca = TRUE, initial_dims = 100)
res reducedDim(pdacs[[i]], "TSNE") <- res[["Y"]]
}
Show the results of dimensional reduction in t-SNE spaces.
<- c("PDAC-A (cell type & disease)", "PDAC-A (function)",
titles "PDAC-A (pathway)")
for(i in seq_along(titles)){
<- as.data.frame(reducedDim(pdacs[[i]], "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
ggplot2color = "black", size = 1, alpha = 1) +
::labs(title = titles[i], x = "tSNE_1", y = "tSNE_2") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18))
ggplot2<- sprintf("figures/figure_10_%04d.png", 19 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 4.1, height = 4.3)
ggplot2 }
To date (December, 2021), one of the most useful clustering methods in scRNA-seq data analysis is a combination of a community detection algorithm and graph-based unsupervised clustering, developed in Seurat package.
Here, our strategy is as follows:
rowData()
and colData()
must have data),ScaleData()
, RunPCA()
,
FindNeighbors()
, and FindClusters()
,temp
,colData(temp)$seurat_clusters
into
colData(sce)$seurat_clusters
.<- c(0.20, 0.18, 0.15)
resolutions <- list(seq_len(10), seq_len(30), seq_len(20))
dims for(i in seq_along(pdacs)){
<- Seurat::as.Seurat(pdacs[[i]], counts = "counts", data = "counts")
surt <- as.matrix(assay(pdacs[[i]], "counts"))
mat "SSM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "SSM"
Seurat<- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = dims[[i]])
surt <- Seurat::FindClusters(surt, resolution = resolutions[i])
surt <- Seurat::as.SingleCellExperiment(surt)
temp colData(pdacs[[i]])$seurat_clusters <- colData(temp)$seurat_clusters
}
Show the clustering results in t-SNE spaces.
<- c("PDAC-A (cell type & disease)", "PDAC-A (function)",
titles "PDAC-A (pathway)")
for(i in seq_along(titles)){
<- colData(pdacs[[i]])$seurat_clusters
labels <- as.data.frame(reducedDim(pdacs[[i]], "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = titles[i], x = "tSNE_1", y = "tSNE_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2if(i == 1){
<- c("0" = "limegreen", "1" = "deepskyblue1", "2" = "grey20",
mycolor "3" = "red", "4" = "orange")
<- p + ggplot2::scale_color_manual(values = mycolor)
p else if(i == 2){
}<- c("0" = "limegreen", "1" = "grey20", "2" = "red",
mycolor "3" = "orange", "4" = "magenta")
<- p + ggplot2::scale_color_manual(values = mycolor)
p else if(i == 3){
}<- p + ggplot2::scale_color_brewer(palette = "Set2")
p
}<- sprintf("figures/figure_10_%04d.png", 29 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.1, height = 4.3)
ggplot2 }
Show labels of experimental batches in the reduced sign space.
<- c("PDAC-A (cell type & disease)", "PDAC-A (function)",
titles "PDAC-A (pathway)")
for(i in seq_along(titles)){
<- as.data.frame(colData(pdacs[[i]])) ; batch$batch <- NA
batch grepl("ST", batch$orig.ident), ]$batch <- "ST"
batch[!grepl("ST", batch$orig.ident), ]$batch <- "inDrop"
batch[<- batch$batch
labels <- as.data.frame(reducedDim(pdacs[[i]], "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = titles[i], x = "tSNE_1", y = "tSNE_2", color = "") +
ggplot2::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- sprintf("figures/figure_10_%04d.png", 34 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5.6, height = 4.3)
ggplot2 }
Show labels of experimental batches of inDrop and ST datasets in the reduced sign space.
# Cell type and disease
<- as.character(colData(pdacs$CB)$orig.ident)
batch <- factor(batch, levels = unique(batch))
labels <- as.data.frame(reducedDim(pdacs$CB, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = "PDAC-A (cell type & disease)",
ggplot2x = "tSNE_1", y = "tSNE_2", color = "") +
::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_10_0038_a.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 7.1, height = 4.3)
ggplot2
# Biological process
<- as.character(colData(pdacs$GO)$orig.ident)
batch <- factor(batch, levels = unique(batch))
labels <- as.data.frame(reducedDim(pdacs$GO, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
ggplot2size = 1, alpha = 1) +
::labs(title = "PDAC-A (function)",
ggplot2x = "tSNE_1", y = "tSNE_2", color = "") +
::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_hue() +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))
ggplot2<- "figures/figure_10_0038_b.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 7.1, height = 4.3) ggplot2
Show the number of reads in the reduced sign space.
# Cell type and disease
<- log(as.numeric(colData(pdacs$CB)$nReads) + 1)
nreads <- as.data.frame(reducedDim(pdacs$CB, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = nreads),
ggplot2size = 1, alpha = 1) +
::labs(title = "PDAC-A (cell type & disease)",
ggplot2x = "tSNE_1", y = "tSNE_2", color = "log(nReads + 1)") +
::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_continuous()
ggplot2<- "figures/figure_10_0039_a.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 6.6, height = 4.3)
ggplot2
# Biological process
<- log(as.numeric(colData(pdacs$GO)$nReads) + 1)
nreads <- as.data.frame(reducedDim(pdacs$GO, "TSNE"))
df <- ggplot2::ggplot() +
p ::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = nreads),
ggplot2size = 1, alpha = 1) +
::labs(title = "PDAC-A (function)",
ggplot2x = "tSNE_1", y = "tSNE_2", color = "log(nReads + 1)") +
::theme_classic(base_size = 20) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) +
ggplot2::scale_color_continuous()
ggplot2<- "figures/figure_10_0039_b.png"
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 6.6, height = 4.3) ggplot2
Significant signs are analogous to differentially expressed genes but bear biological meanings. Note that naïve usages of statistical tests should be avoided because the row vectors of SSMs are centered.
Instead, ASURAT function compute_sepI_all()
computes
separation indices for each cluster against the others. Briefly, a
separation index “sepI”, ranging from -1 to 1, is a nonparametric
measure of significance of a given sign score for a given subpopulation.
The larger (resp. smaller) sepI is, the more reliable the sign is as a
positive (resp. negative) marker for the cluster.
for(i in seq_along(pdacs)){
set.seed(1)
<- colData(pdacs[[i]])$seurat_clusters
labels <- compute_sepI_all(sce = pdacs[[i]], labels = labels,
pdacs[[i]] nrand_samples = NULL)
}
Perform compute_sepI_all()
for investigating significant
signs for the clustering results of biological process.
<- colData(pdacs$GO)$seurat_clusters
labels
<- pdacs$CB
pdacs_LabelGO_SignCB metadata(pdacs_LabelGO_SignCB)$marker_signs <- NULL
set.seed(1)
<- compute_sepI_all(sce = pdacs_LabelGO_SignCB,
pdacs_LabelGO_SignCB labels = labels, nrand_samples = NULL)
<- pdacs$KG
pdacs_LabelGO_SignKG metadata(pdacs_LabelGO_SignKG)$marker_signs <- NULL
set.seed(1)
<- compute_sepI_all(sce = pdacs_LabelGO_SignKG,
pdacs_LabelGO_SignKG labels = labels, nrand_samples = NULL)
To date (December, 2021), one of the most useful methods of multiple
statistical tests in scRNA-seq data analysis is to use a Seurat function
FindAllMarkers()
.
If there is gene expression data in altExp(sce)
, one can
investigate differentially expressed genes by using Seurat functions in
the similar manner as described before.
set.seed(1)
<- Seurat::as.Seurat(pdacs$CB, counts = "counts", data = "counts")
surt <- as.matrix(assay(altExp(pdacs$CB), "counts"))
mat "GEM"]] <- Seurat::CreateAssayObject(counts = mat)
surt[[::DefaultAssay(surt) <- "GEM"
Seurat<- Seurat::SetIdent(surt, value = "seurat_clusters")
surt <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
res min.pct = 0.25, logfc.threshold = 0.25)
metadata(pdacs$CB)$marker_genes$all <- res
Simultaneously analyze multiple sign-by-sample matrices, which helps us characterize individual samples (cells) from multiple biological aspects.
ASURAT function plot_multiheatmaps()
shows heatmaps
(ComplexHeatmap object) of sign scores and gene expression levels (if
there are), where rows and columns stand for sign (or gene) and sample
(cell), respectively.
First, remove unrelated signs by setting keywords, followed by selecting top significant signs and genes for the clustering results with respect to separation index and p-value, respectively.
# Significant signs
<- list()
marker_signs <- "foofoo|hogehoge"
keys for(i in seq_along(pdacs)){
<- metadata(pdacs[[i]])$marker_signs$all
marker_signs[[i]] <- marker_signs[[i]][!grepl(keys, marker_signs[[i]]$Description), ]
marker_signs[[i]] <- dplyr::group_by(marker_signs[[i]], Ident_1)
marker_signs[[i]] <- dplyr::slice_max(marker_signs[[i]], sepI, n = 2)
marker_signs[[i]] <- dplyr::slice_min(marker_signs[[i]], Rank, n = 1)
marker_signs[[i]]
}# Significant genes
<- metadata(pdacs$CB)$marker_genes$all
marker_genes_CB <- dplyr::group_by(marker_genes_CB, cluster)
marker_genes_CB <- dplyr::slice_min(marker_genes_CB, p_val_adj, n = 2)
marker_genes_CB <- dplyr::slice_max(marker_genes_CB, avg_log2FC, n = 2) marker_genes_CB
Then, prepare arguments.
# ssm_list
<- list() ; ssm_list <- list()
sces_sub for(i in seq_along(pdacs)){
<- pdacs[[i]][rownames(pdacs[[i]]) %in% marker_signs[[i]]$SignID, ]
sces_sub[[i]] <- assay(sces_sub[[i]], "counts")
ssm_list[[i]]
}names(ssm_list) <- c("SSM_cell_disease", "SSM_function", "SSM_pathway")
# gem_list
<- altExp(pdacs$CB, "logcounts")
expr_sub <- expr_sub[rownames(expr_sub) %in% marker_genes_CB$gene]
expr_sub <- list(x = t(scale(t(as.matrix(assay(expr_sub, "counts"))))))
gem_list names(gem_list) <- "Scaled\nLogExpr"
# ssmlabel_list
<- list() ; ssmlabel_list <- list()
labels for(i in seq_along(pdacs)){
<- colData(sces_sub[[i]])$seurat_clusters
tmp <- data.frame(label = tmp)
labels[[i]] <- length(unique(tmp))
n_groups if(i == 1){
<- c("0" = "limegreen", "1" = "deepskyblue1", "2" = "grey20",
mycolor "3" = "red", "4" = "orange")
$color <- mycolor[tmp]
labels[[i]]else if(i == 2){
}<- c("0" = "limegreen", "1" = "grey20", "2" = "red",
mycolor "3" = "orange", "4" = "magenta")
$color <- mycolor[tmp]
labels[[i]]else if(i == 3){
}$color <- scales::brewer_pal(palette = "Set2")(n_groups)[tmp]
labels[[i]]
}<- labels[[i]]
ssmlabel_list[[i]]
}names(ssmlabel_list) <- c("Label_cell_disease", "Label_function", "Label_pathway")
Finally, plot heatmaps for the selected signs and genes.
<- "figures/figure_10_0040.png"
filename # png(file = filename, height = 1200, width = 1300, res = 200)
png(file = filename, height = 350, width = 370, res = 60)
set.seed(1)
<- "PDAC-A"
title plot_multiheatmaps(ssm_list = ssm_list, gem_list = gem_list,
ssmlabel_list = ssmlabel_list, gemlabel_list = NULL,
nrand_samples = 500, show_row_names = TRUE, title = title)
dev.off()
Show violin plots for sign score and gene expression distributions across cell type-related clusters.
<- colData(pdacs$CB)$seurat_clusters
labels <- list(c("GE", "REG1A", "(Pancreatic cell)\n"),
vlist c("GE", "CPA5", "(Pancreatic cell)\n"),
c("CB", "MSigDBID:252-V",
"...PANCREAS_DUCTAL_CELL\n(SOX9, APOL1, ...)"),
c("CB", "DOID:3498-S",
"pancreatic ductal adenocarcinoma\n(S100P, MMP9, ...)"),
c("CB", "MSigDBID:69-S", "...NK_CELLS\n(GZMB, HCST, ...)"),
c("CB", "MSigDBID:41-S", "...MACROPHAGE\n(TYROBP, MS4A7, ...)"))
<- list(GE = "Expression level", CB = "Sign score")
ylabels <- c("0" = "limegreen", "1" = "deepskyblue1", "2" = "grey20", "3" = "red",
mycolor "4" = "orange")
for(i in seq_along(vlist)){
if(vlist[[i]][1] == "GE"){
<- which(rownames(altExp(pdacs$CB, "logcounts")) == vlist[[i]][2])
ind <- altExp(pdacs$CB, "logcounts")[ind, ]
subsce <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df else{
}<- which(rownames(pdacs[[vlist[[i]][1]]]) == vlist[[i]][2])
ind <- pdacs[[vlist[[i]][1]]][ind, ]
subsce <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
df
}<- ggplot2::ggplot() +
p ::geom_violin(ggplot2::aes(x = as.factor(labels), y = df[, 1],
ggplot2fill = labels), trim = FALSE, size = 0.5) +
::geom_boxplot(ggplot2::aes(x = as.factor(labels), y = df[, 1]),
ggplot2width = 0.15, alpha = 0.6) +
::labs(title = paste0(vlist[[i]][2], "\n", vlist[[i]][3]),
ggplot2x = "Cluster (cell type & disease)",
y = ylabels[[vlist[[i]][1]]]) +
::theme_classic(base_size = 25) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
ggplot2legend.position = "none") +
::scale_fill_manual(values = mycolor)
ggplot2<- sprintf("figures/figure_10_%04d.png", 49 + i)
filename ::ggsave(file = filename, plot = p, dpi = 50, width = 5, height = 4)
ggplot2 }