This vignette covers the main function and workflow of SCAVENGE. The standard processed input data including fine-mapped variants and single-cell epigenomic profiles. For fine-mapped variants of the trait of interest, we typically need information of genomic locations of variants and their corresponding posterior propability of causality. A peak-by-cell matrix of scATAC-seq profiles is needed. To walk through the workflow of SCAVENGE, we provided a blood cell trait of monocyte count and a 10X PBMC dataset as an example.
library(SCAVENGE)
library(chromVAR)
library(gchromVAR)
library(BuenColors)
library(SummarizedExperiment)
library(data.table)
library(dplyr)
library(BiocParallel)
library(BSgenome.Hsapiens.UCSC.hg19)
library(igraph)
set.seed(9527)
The PBMC data was processed using ArchR package. The peak-by-cell count matrix and corresponding meta data were extracted and stored in a RangedSummarizedExperiment object (for more details please follow our paper).
trait_import <- example_data(name="mono.PP001.bed")
SE_pbmc5k <- example_data(name="pbmc5k_SE.rda")
SE_pbmc5k <- addGCBias(object = SE_pbmc5k,
genome = BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19)
SE_pbmc5k_bg <- getBackgroundPeaks(object = SE_pbmc5k,
niterations = 200)
SE_pbmc5k_DEV <- computeWeightedDeviations(object = SE_pbmc5k,
weights = trait_import,
background_peaks = SE_pbmc5k_bg)
Reformat results
z_score_mat <- data.frame(SummarizedExperiment::colData(SE_pbmc5k),
z_score=t(SummarizedExperiment::assays(SE_pbmc5k_DEV)[["z"]]) |> c())
head(z_score_mat)
## names x y color
## input1#GTCACGGAGCTCGGCT-1 input1#GTCACGGAGCTCGGCT-1 11.71388 1.903179 Mono-2
## input1#CTGAATGAGCAGAATT-1 input1#CTGAATGAGCAGAATT-1 -13.86186 -4.616170 T-1
## input1#CCTGCTACAATGGCAG-1 input1#CCTGCTACAATGGCAG-1 10.90323 1.913244 Mono-2
## input1#TCAGGTAAGAGCAGCT-1 input1#TCAGGTAAGAGCAGCT-1 -13.64482 -4.757390 T-1
## input1#GAGTGAGTCGGTCTCT-1 input1#GAGTGAGTCGGTCTCT-1 10.77266 1.872978 Mono-2
## input1#AGGCCCAAGTCTGCTA-1 input1#AGGCCCAAGTCTGCTA-1 -13.88653 -4.610587 T-1
## color2 sample cell_cluster z_score
## input1#GTCACGGAGCTCGGCT-1 C5 input1 C5 0.3950389
## input1#CTGAATGAGCAGAATT-1 C1 input1 C1 0.0984394
## input1#CCTGCTACAATGGCAG-1 C5 input1 C5 0.3504030
## input1#TCAGGTAAGAGCAGCT-1 C1 input1 C1 -2.7724179
## input1#GAGTGAGTCGGTCTCT-1 C5 input1 C5 -0.4360599
## input1#AGGCCCAAGTCTGCTA-1 C1 input1 C1 -2.1425049
seed_idx <- seedindex(z_score = z_score_mat$z_score,
percent_cut = 0.05)
## Cells with enriched P < 0.05: 612
## Percent: 13.42%
## The top 5% of cells (N=228) were selected as seed cells
calculate scale factor
scale_factor <- cal_scalefactor(z_score = z_score_mat$z_score,
percent_cut = 0.01)
## Scale factor is calculating from most enriched 1% of cells.
Calculate tfidf-mat
peak_by_cell_mat <- SummarizedExperiment::assay(SE_pbmc5k)
tfidf_mat <- tfidf(bmat=peak_by_cell_mat,
mat_binary=TRUE,
TF=TRUE,
log_TF=TRUE)
## [info] binarize matrix
## [info] calculate tf
## [info] calculate idf
## [info] fast log tf-idf
Calculate lsi-mat
lsi_mat <- do_lsi(mat = tfidf_mat,
dims = 30)
## SVD analysis of TF-IDF matrix
Please be sure that there is no potential batch effects for
cell-to-cell graph construction. If the cells are from different samples
or different conditions etc., please consider using Harmony analysis
(HarmonyMatrix
from Harmony
package). Typically you could take the lsi_mat as the input with
parameter do_pca = FALSE
and provide meta data describing
extra data such as sample and batch for each cell. Finally, a
harmony-fixed LSI matrix can be used as input for the following
analysis.
Calculate m-knn graph
mutualknn30 <- getmutualknn(lsimat = lsi_mat,
num_k = 30)
np_score <- randomWalk_sparse(intM = mutualknn30,
queryCells = rownames(mutualknn30)[seed_idx],
gamma = 0.05)
Trait relevant score (TRS) with scaled and
normalized
A few cells are singletons are removed from further analysis, this will
lead very few cells be removed for the following analysis. You can
always recover those cells with a unified score of 0 and it will not
impact the following analysis.
omit_idx <- np_score==0
sum(omit_idx)
## [1] 23
mutualknn30 <- mutualknn30[!omit_idx, !omit_idx]
np_score <- np_score[!omit_idx]
TRS <- capOutlierQuantile(x = np_score,
q_ceiling = 0.95) |> max_min_scale()
TRS <- TRS * scale_factor
mono_mat <- data.frame(z_score_mat[!omit_idx, ],
seed_idx[!omit_idx],
np_score,
TRS)
head(mono_mat)
## names x y color
## input1#GTCACGGAGCTCGGCT-1 input1#GTCACGGAGCTCGGCT-1 11.71388 1.903179 Mono-2
## input1#CTGAATGAGCAGAATT-1 input1#CTGAATGAGCAGAATT-1 -13.86186 -4.616170 T-1
## input1#CCTGCTACAATGGCAG-1 input1#CCTGCTACAATGGCAG-1 10.90323 1.913244 Mono-2
## input1#TCAGGTAAGAGCAGCT-1 input1#TCAGGTAAGAGCAGCT-1 -13.64482 -4.757390 T-1
## input1#GAGTGAGTCGGTCTCT-1 input1#GAGTGAGTCGGTCTCT-1 10.77266 1.872978 Mono-2
## input1#AGGCCCAAGTCTGCTA-1 input1#AGGCCCAAGTCTGCTA-1 -13.88653 -4.610587 T-1
## color2 sample cell_cluster z_score
## input1#GTCACGGAGCTCGGCT-1 C5 input1 C5 0.3950389
## input1#CTGAATGAGCAGAATT-1 C1 input1 C1 0.0984394
## input1#CCTGCTACAATGGCAG-1 C5 input1 C5 0.3504030
## input1#TCAGGTAAGAGCAGCT-1 C1 input1 C1 -2.7724179
## input1#GAGTGAGTCGGTCTCT-1 C5 input1 C5 -0.4360599
## input1#AGGCCCAAGTCTGCTA-1 C1 input1 C1 -2.1425049
## seed_idx..omit_idx. np_score TRS
## input1#GTCACGGAGCTCGGCT-1 FALSE 3.804691e-05 0.213939514
## input1#CTGAATGAGCAGAATT-1 FALSE 2.209024e-07 0.001187911
## input1#CCTGCTACAATGGCAG-1 FALSE 6.088393e-05 0.342385858
## input1#TCAGGTAAGAGCAGCT-1 FALSE 2.220132e-07 0.001194159
## input1#GAGTGAGTCGGTCTCT-1 FALSE 4.785297e-05 0.269093513
## input1#AGGCCCAAGTCTGCTA-1 FALSE 2.572135e-07 0.001392142
Cell type annotation
p <- ggplot(data=mono_mat, aes(x, y, color=color)) +
geom_point(size=1, na.rm = TRUE) +
pretty_plot() +
theme(legend.title = element_blank()) +
labs(x="UMAP 1",y="UMAP 2")
p
Visualize cell-to-cell graph if you have low-dimensional coordinates such as UMAP1 and UMAP2
mutualknn30_graph <- graph_from_adjacency_matrix(adjmatrix = mutualknn30,
mode = "undirected",
diag = FALSE)
igraph::plot.igraph(x = mutualknn30_graph,
vertex.size=0.8,
vertex.label=NA,
vertex.color=adjustcolor("#c7ce3d", alpha.f = 1),
vertex.frame.color=NA,
edge.color=adjustcolor("#443dce", alpha.f = 1),
edge.width=0.3, edge.curved=.5,
layout=as.matrix(data.frame(mono_mat$x, mono_mat$y)))
p1 <- ggplot(data=mono_mat, aes(x, y, color=z_score)) +
geom_point(size=1, na.rm = TRUE, alpha = 0.6) +
scale_color_viridis_c() +
scale_alpha() +
pretty_plot() +
theme(legend.title = element_blank()) +
labs(x="UMAP 1", y="UMAP 2")
p1
Bar plot
pp1 <- ggplot(data=mono_mat, aes(x=color, y=z_score)) +
geom_boxplot(aes(fill=color, color=color), outlier.shape=NA) +
guides(fill=FALSE) +
pretty_plot(fontsize = 10) +
stat_summary(geom = "crossbar", width=0.65, fatten=0, color="black",
fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) }) + theme(legend.position = "none")
pp1
p2 <- ggplot(data=mono_mat, aes(x, y, color=TRS)) +
geom_point(size=1, na.rm = TRUE, alpha = 0.6) +
scale_color_viridis_c() +
scale_alpha() + pretty_plot() +
theme(legend.title = element_blank()) +
labs(x="UMAP 1", y="UMAP 2")
p2
Bar plot
pp2 <- ggplot(data=mono_mat, aes(x=color, y=TRS)) +
geom_boxplot(aes(fill=color, color=color), outlier.shape=NA) +
guides(fill=FALSE) +
pretty_plot(fontsize = 10) +
stat_summary(geom = "crossbar", width=0.65, fatten=0, color="black", fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) }) + theme(legend.position = "none")
pp2
About 2 mins
please set @mycores >= 1 and @permutation_times >= 1,000 in the real
setting
mono_permu <- get_sigcell_simple(knn_sparse_mat=mutualknn30,
seed_idx=mono_mat$seed_idx,
topseed_npscore=mono_mat$np_score,
permutation_times=100, # Increase to >=1000 in practice
true_cell_significance=0.05,
rda_output=FALSE,
# mycores=8,# Increase in practice
rw_gamma=0.05)
mono_mat2 <- data.frame(mono_mat, mono_permu)
Enriched cells
mono_mat2 |>
dplyr::group_by(color) |>
dplyr::summarise(enriched_cell=sum(true_cell_top_idx)) |>
ggplot(aes(x=color, y=enriched_cell, fill=color)) +
geom_bar(stat="identity") +
theme_classic()
Depleted cells
utils::sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] igraph_1.5.1 BSgenome.Hsapiens.UCSC.hg19_1.4.3
## [3] BSgenome_1.69.0 rtracklayer_1.61.1
## [5] Biostrings_2.69.2 XVector_0.41.1
## [7] BiocParallel_1.35.4 dplyr_1.1.2
## [9] data.table_1.14.8 SummarizedExperiment_1.31.1
## [11] Biobase_2.61.0 GenomicRanges_1.53.1
## [13] GenomeInfoDb_1.37.3 IRanges_2.35.2
## [15] S4Vectors_0.39.1 BiocGenerics_0.47.0
## [17] MatrixGenerics_1.13.1 matrixStats_1.0.0
## [19] BuenColors_0.5.6 ggplot2_3.4.3
## [21] MASS_7.3-60 gchromVAR_0.3.2
## [23] chromVAR_1.23.0 SCAVENGE_1.0.2
## [25] BiocStyle_2.29.1
##
## loaded via a namespace (and not attached):
## [1] jsonlite_1.8.7 magrittr_2.0.3
## [3] farver_2.1.1 rmarkdown_2.24
## [5] fs_1.6.3 BiocIO_1.11.0
## [7] zlibbioc_1.47.0 ragg_1.2.5
## [9] vctrs_0.6.3 memoise_2.0.1
## [11] Rsamtools_2.17.0 RCurl_1.98-1.12
## [13] htmltools_0.5.6 S4Arrays_1.1.6
## [15] CNEr_1.37.0 SparseArray_1.1.12
## [17] sass_0.4.7 pracma_2.4.2
## [19] bslib_0.5.1 htmlwidgets_1.6.2
## [21] desc_1.4.2 plyr_1.8.8
## [23] plotly_4.10.2 cachem_1.0.8
## [25] GenomicAlignments_1.37.0 mime_0.12
## [27] lifecycle_1.0.3 pkgconfig_2.0.3
## [29] Matrix_1.6-1 R6_2.5.1
## [31] fastmap_1.1.1 GenomeInfoDbData_1.2.10
## [33] shiny_1.7.5 digest_0.6.33
## [35] colorspace_2.1-0 TFMPvalue_0.0.9
## [37] AnnotationDbi_1.63.2 rprojroot_2.0.3
## [39] irlba_2.3.5.1 textshaping_0.3.6
## [41] RSQLite_2.3.1 labeling_0.4.3
## [43] seqLogo_1.67.0 fansi_1.0.4
## [45] httr_1.4.7 abind_1.4-5
## [47] compiler_4.3.1 withr_2.5.0
## [49] bit64_4.0.5 DBI_1.1.3
## [51] highr_0.10 R.utils_2.12.2
## [53] poweRlaw_0.70.6 DelayedArray_0.27.10
## [55] rjson_0.2.21 gtools_3.9.4
## [57] caTools_1.18.2 tools_4.3.1
## [59] httpuv_1.6.11 R.oo_1.25.0
## [61] glue_1.6.2 restfulr_0.0.15
## [63] promises_1.2.1 grid_4.3.1
## [65] reshape2_1.4.4 TFBSTools_1.39.0
## [67] generics_0.1.3 gtable_0.3.4
## [69] tzdb_0.4.0 R.methodsS3_1.8.2
## [71] nabor_0.5.0 tidyr_1.3.0
## [73] hms_1.1.3 utf8_1.2.3
## [75] RANN_2.6.1 pillar_1.9.0
## [77] stringr_1.5.0 later_1.3.1
## [79] lattice_0.21-8 bit_4.0.5
## [81] annotate_1.79.0 tidyselect_1.2.0
## [83] DirichletMultinomial_1.43.0 GO.db_3.17.0
## [85] miniUI_0.1.1.1 knitr_1.43
## [87] bookdown_0.35 xfun_0.40
## [89] DT_0.29 stringi_1.7.12
## [91] lazyeval_0.2.2 yaml_2.3.7
## [93] evaluate_0.21 codetools_0.2-19
## [95] tibble_3.2.1 BiocManager_1.30.22
## [97] cli_3.6.1 xtable_1.8-4
## [99] systemfonts_1.0.4 munsell_0.5.0
## [101] jquerylib_0.1.4 Rcpp_1.0.11
## [103] png_0.1-8 XML_3.99-0.14
## [105] parallel_4.3.1 ellipsis_0.3.2
## [107] pkgdown_2.0.7 readr_2.1.4
## [109] blob_1.2.4 bitops_1.0-7
## [111] viridisLite_0.4.2 scales_1.2.1
## [113] purrr_1.0.2 crayon_1.5.2
## [115] rlang_1.1.1 KEGGREST_1.41.0