vignettes/ingest_data.Rmd
ingest_data.Rmd
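ingest_data()
runs three main steps:
1. Read the input object or file (read_data()).
2. Convert it to the requested output format (to_<format>).
3. Save the converted object (save_data()).
SingleCellExperiment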
# Load the bundled example Seurat object and ingest its RNA count matrix
# using the default output type.
data("example_seurat")
sce <- ingest_data(
  obj = example_seurat@assays$RNA@counts
)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Matrix ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
Seurat
# Same count matrix as above, but request a Seurat object as output.
seurat <- ingest_data(
  obj = example_seurat@assays$RNA@counts,
  output_type = "Seurat"
)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Matrix ==> Seurat
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
scKirby can ingest a named list (i.e. list(exp=..., annot=...)) with the following items:
exp: Expression matrix with genes as rows and cells as columns. Can be any of a variety of matrix classes, dense or sparse.
annot: Cell annotation data.frame with one cell per row. rownames(annot) should be the same as colnames(exp).
This happens to be the format that the example data in EWCE
uses, but any user-supplied data will work.
SingleCellExperiment
# Load the bundled example list (exp + annot) and ingest it with the default
# output type.
data("example_EWCElist")
sce <- ingest_data(obj = example_EWCElist)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + EWCElist ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
Seurat
# Ingest the same example list, requesting a Seurat object as output.
seurat <- ingest_data(
  obj = example_EWCElist,
  output_type = "Seurat"
)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + EWCElist ==> Seurat
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
SingleCellExperiment
In-memory
# Ingest an in-memory Seurat object with the default output type.
data("example_seurat")
sce <- ingest_data(obj = example_seurat)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Seurat ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
On-disk
library(SeuratDisk)
#> Registered S3 method overwritten by 'cli':
#> method from
#> print.boxx spatstat.geom
#> Registered S3 method overwritten by 'SeuratDisk':
#> method from
#> as.sparse.H5Group Seurat
# Load the bundled example Seurat object and write it to an h5Seurat file in
# the working directory, overwriting any copy left by an earlier run.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
data("example_seurat")
SaveH5Seurat(example_seurat, filename = "./pbmc_small.h5Seurat", overwrite = TRUE)
#> Creating h5Seurat file for version 3.1.5.9900
#> Adding counts for RNA
#> Adding data for RNA
#> Adding scale.data for RNA
#> Adding variable features for RNA
#> Adding feature-level metadata for RNA
#> Adding cell embeddings for pca
#> Adding loadings for pca
#> Adding projected loadings for pca
#> Adding standard deviations for pca
#> Adding JackStraw information for pca
#> Adding cell embeddings for tsne
#> No loadings for tsne
#> No projected loadings for tsne
#> No standard deviations for tsne
#> No JackStraw data for tsne
# Ingest the h5Seurat file by path rather than passing an in-memory object.
# NOTE(review): no output_type is given and the recorded output reports a
# Seurat ==> SingleCellExperiment conversion, so the name `seurat` looks
# misleading here — confirm and consider renaming.
seurat <- ingest_data(
  obj = "./pbmc_small.h5Seurat"
)
#> + Reading from disk...
#> + h5Seurat format (.h5Seurat) detected. Importing as Seurat object...
#> Validating h5Seurat file
#> Initializing RNA with data
#> Adding counts for RNA
#> Adding scale.data for RNA
#> Adding feature-level metadata for RNA
#> Adding variable feature information for RNA
#> Adding reduction pca
#> Adding cell embeddings for pca
#> Adding feature loadings for pca
#> Adding projected loadings for pca
#> Adding miscellaneous information for pca
#> Loading JackStraw data for pca
#> Adding reduction tsne
#> Adding cell embeddings for tsne
#> Adding miscellaneous information for tsne
#> Adding graph RNA_snn
#> Adding command information
#> Adding cell-level metadata
#> Adding miscellaneous information
#> Adding tool-specific results
#> Adding data that was not associated with an assay
#> Warning: Adding a command log without an assay associated with it
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Seurat ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
SingleCellExperiment
# Load the bundled example SingleCellExperiment and save it as an HDF5-backed
# object under ./pbmc_small_h5; the value returned by the save call is kept
# as `sce`. `TRUE` is spelled out because `T` can be reassigned.
data("example_sce")
sce <- HDF5Array::saveHDF5SummarizedExperiment(
  example_sce,
  dir = "./pbmc_small_h5",
  replace = TRUE
)
#> Start writing assay 1/2 to HDF5 file:
#> ./pbmc_small_h5/assays.h5
#> / Reading and realizing block 1/1 ... OK
#> \ Writing it ... OK
#> Finished writing assay 1/2 to HDF5 file:
#> ./pbmc_small_h5/assays.h5
#>
#> Start writing assay 2/2 to HDF5 file:
#> ./pbmc_small_h5/assays.h5
#> / Reading and realizing block 1/1 ... OK
#> \ Writing it ... OK
#> Finished writing assay 2/2 to HDF5 file:
#> ./pbmc_small_h5/assays.h5
#>
#> Serialize SingleCellExperiment object to RDS file:
#> ./pbmc_small_h5/se.rds
## Pass the in-memory sce object straight to ingest_data()
sce <- ingest_data(obj = sce)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Object already in SingleCellExperiment format. Returning as-is.
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
## Ingest from disk: pass the directory that holds the saved object
## (the parent directory of the path reported by sce_filepath()).
sce_dir <- dirname(
  sce_filepath(sce)
)
sce <- ingest_data(obj = sce_dir)
#> + Reading from disk...
#> + HDF5Array format (.h5) detected. Importing as SingleCellExperiment object...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Object already in SingleCellExperiment format. Returning as-is.
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
Seurat
# Re-save the example SingleCellExperiment as an HDF5-backed object under
# ./pbmc_small_h5 and keep the returned object as `sce`.
# `TRUE` is spelled out because `T` can be reassigned.
sce <- HDF5Array::saveHDF5SummarizedExperiment(
  example_sce,
  dir = "./pbmc_small_h5",
  replace = TRUE
)
#> Start writing assay 1/2 to HDF5 file:
#> ./pbmc_small_h5/assays.h5
#> / Reading and realizing block 1/1 ... OK
#> \ Writing it ... OK
#> Finished writing assay 1/2 to HDF5 file:
#> ./pbmc_small_h5/assays.h5
#>
#> Start writing assay 2/2 to HDF5 file:
#> ./pbmc_small_h5/assays.h5
#> / Reading and realizing block 1/1 ... OK
#> \ Writing it ... OK
#> Finished writing assay 2/2 to HDF5 file:
#> ./pbmc_small_h5/assays.h5
#>
#> Serialize SingleCellExperiment object to RDS file:
#> ./pbmc_small_h5/se.rds
## Pass the in-memory sce object straight to ingest_data(), asking for Seurat
seurat <- ingest_data(
  obj = sce,
  output_type = "Seurat"
)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + SingleCellExperiment ==> Seurat
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
For info on setting up anndata (e.g. with conda), see the installation instructions for the anndata R package.
Note that some example objects need to be loaded via functions (e.g. example_anndata() and example_loom()) instead of data(<name>). This is because file types like loom and anndata must be stored on-disk.
SingleCellExperiment
### Point reticulate at a conda environment you have already created;
### replace "echoR" with the name of your own environment.
reticulate::use_condaenv(condaenv = "echoR")
# Build the example AnnData object via its helper function (the recorded
# output shows it creating a pbmc_small.h5ad file).
adata <- example_anndata()
#> [1] "+ Creating new anndata object: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/pbmc_small.h5ad"
## In memory: pass the AnnData object itself
sce <- ingest_data(
  obj = adata
)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + AnnData ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
## On disk: first write the AnnData object to an .h5ad file in the working
## directory so that it can be ingested by path below.
adata$write_h5ad(filename = "./pbmc_small.h5ad")
#> None
# Ingest the .h5ad file by path rather than passing an in-memory object.
sce <- ingest_data(obj = "./pbmc_small.h5ad")
#> + Reading from disk...
#> + AnnData format (.h5ad) detected. Importing as AnnData object...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + AnnData ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
Seurat
# Ingest the in-memory AnnData object, requesting a Seurat object as output.
seurat <- ingest_data(
  obj = adata,
  output_type = "Seurat"
)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + AnnData ==> Seurat
#> X -> counts
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
From loomR package
SingleCellExperiment
# Build the example loom object via its helper function (the recorded output
# shows it being written to a pbmc_small.loom file).
loom <- example_loom()
#> Attaching SeuratObject
#> Saving data from RNA as /matrix
#> Adding slot counts for assay RNA
#> Adding layer counts
#> Adding col attribute CellID
#> Adding col attribute orig.ident
#> Adding col attribute nCount_RNA
#> Adding col attribute nFeature_RNA
#> Adding col attribute RNA_snn_res.0.8
#> Adding col attribute letter.idents
#> Adding col attribute groups
#> Adding col attribute RNA_snn_res.1
#> Adding row attribute Gene
## In memory: inspect the loom object before ingesting it
print(loom)
#> Class: loom
#> Filename: /private/var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T/RtmpwUJgNi/pbmc_small.loom
#> Access type: H5F_ACC_RDWR
#> Listing:
#> name obj_type dataset.dims dataset.type_class
#> attrs H5I_GROUP <NA> <NA>
#> col_attrs H5I_GROUP <NA> <NA>
#> col_graphs H5I_GROUP <NA> <NA>
#> layers H5I_GROUP <NA> <NA>
#> matrix H5I_DATASET 80 x 230 H5T_FLOAT
#> row_attrs H5I_GROUP <NA> <NA>
#> row_graphs H5I_GROUP <NA> <NA>
# Ingest the in-memory loom object with the default output type.
sce <- ingest_data(
  obj = loom
)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + loom ==> SingleCellExperiment
#> Reading in /matrix
#> Storing /matrix as counts
#> Saving /matrix to assay 'RNA'
#> Loading graph RNA_snn
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
## From disk: show the path of the file backing the loom object
print(loom$filename)
#> [1] "/var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/pbmc_small.loom"
# Ingest the loom file by path rather than passing the in-memory object.
sce <- ingest_data(
  obj = loom$filename
)
#> + Reading from disk...
#> + Loom format (.loom) detected. Importing as SingleCellLoomExperiment object...
#> Reading in /matrix
#> Storing /matrix as counts
#> Saving /matrix to assay 'RNA'
#> Loading graph RNA_snn
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Seurat ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
Seurat
# Rebuild the example loom object (the recorded output warns that the
# previous pbmc_small.loom file is overwritten).
loom <- example_loom()
#> Warning: Overwriting previous file /var/folders/zq/
#> h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/pbmc_small.loom
#> Saving data from RNA as /matrix
#> Adding slot counts for assay RNA
#> Adding layer counts
#> Adding col attribute CellID
#> Adding col attribute orig.ident
#> Adding col attribute nCount_RNA
#> Adding col attribute nFeature_RNA
#> Adding col attribute RNA_snn_res.0.8
#> Adding col attribute letter.idents
#> Adding col attribute groups
#> Adding col attribute RNA_snn_res.1
#> Adding row attribute Gene
## In memory: pass the loom object itself and ask for a Seurat object
seurat <- ingest_data(
  obj = loom,
  output_type = "Seurat"
)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + loom ==> Seurat
#> Reading in /matrix
#> Storing /matrix as counts
#> Saving /matrix to assay 'RNA'
#> Loading graph RNA_snn
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
## From disk: show the path of the file backing the loom object
print(loom$filename)
#> [1] "/var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/pbmc_small.loom"
# Ingest the loom file by path, requesting a Seurat object as output.
seurat <- ingest_data(
  obj = loom$filename,
  output_type = "Seurat"
)
#> + Reading from disk...
#> + Loom format (.loom) detected. Importing as SingleCellLoomExperiment object...
#> Reading in /matrix
#> Storing /matrix as counts
#> Saving /matrix to assay 'RNA'
#> Loading graph RNA_snn
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> [1] "+ Object already in Seurat format. Returning as-is."
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds
Remove example files
# Delete the h5Seurat file written earlier. file.remove() has no
# `showWarnings` argument: every argument is treated as a file name, so the
# former `showWarnings=F` made it also try to delete a file called "FALSE"
# (hence the extra FALSE in the old result).
try({ file.remove("./pbmc_small.h5Seurat") })
#> [1] TRUE
# Remove the HDF5-backed SingleCellExperiment directory (recursively) and the
# AnnData file written earlier. `TRUE` is spelled out because `T` can be
# reassigned.
try({ unlink("./pbmc_small_h5/", recursive = TRUE) })
try({ file.remove("./pbmc_small.h5ad") })
#> [1] TRUE
# NOTE(review): the recorded result for this call is FALSE, and the
# loom$filename printed earlier points into a temporary directory rather
# than "./", so this path presumably never exists — confirm and point the
# cleanup at the real file.
try({ file.remove("./pbmc_small.loom") })
#> [1] FALSE
# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] SeuratObject_4.0.2 Seurat_4.0.3 SeuratDisk_0.0.0.9019
#> [4] scKirby_0.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] systemfonts_1.0.2 plyr_1.8.6
#> [3] igraph_1.2.6 lazyeval_0.2.2
#> [5] splines_4.1.0 BiocParallel_1.26.0
#> [7] listenv_0.8.0 scattermore_0.7
#> [9] GenomeInfoDb_1.28.0 ggplot2_3.3.4
#> [11] digest_0.6.27 htmltools_0.5.1.1
#> [13] fansi_0.5.0 magrittr_2.0.1
#> [15] memoise_2.0.0 tensor_1.5
#> [17] cluster_2.1.2 ROCR_1.0-11
#> [19] globals_0.14.0 matrixStats_0.59.0
#> [21] pkgdown_1.6.1 spatstat.sparse_2.0-0
#> [23] colorspace_2.0-1 ggrepel_0.9.1
#> [25] textshaping_0.3.5 xfun_0.24
#> [27] dplyr_1.0.6 crayon_1.4.1
#> [29] RCurl_1.98-1.3 jsonlite_1.7.2
#> [31] Exact_2.1 spatstat.data_2.1-0
#> [33] survival_3.2-11 zoo_1.8-9
#> [35] glue_1.4.2 polyclip_1.10-0
#> [37] gtable_0.3.0 zlibbioc_1.38.0
#> [39] XVector_0.32.0 leiden_0.3.8
#> [41] DelayedArray_0.18.0 Rhdf5lib_1.14.1
#> [43] future.apply_1.7.0 SingleCellExperiment_1.14.1
#> [45] HDF5Array_1.20.0 BiocGenerics_0.38.0
#> [47] SparseM_1.81 abind_1.4-5
#> [49] scales_1.1.1 mvtnorm_1.1-2
#> [51] DBI_1.1.1 miniUI_0.1.1.1
#> [53] Rcpp_1.0.6 viridisLite_0.4.0
#> [55] xtable_1.8-4 reticulate_1.20
#> [57] spatstat.core_2.2-0 bit_4.0.4
#> [59] proxy_0.4-26 stats4_4.1.0
#> [61] htmlwidgets_1.5.3 httr_1.4.2
#> [63] anndata_0.7.5.2 RColorBrewer_1.1-2
#> [65] ellipsis_0.3.2 ica_1.0-2
#> [67] pkgconfig_2.0.3 uwot_0.1.10
#> [69] deldir_0.2-10 sass_0.4.0
#> [71] utf8_1.2.1 tidyselect_1.1.1
#> [73] rlang_0.4.11 reshape2_1.4.4
#> [75] later_1.2.0 munsell_0.5.0
#> [77] tools_4.1.0 cachem_1.0.5
#> [79] cli_2.5.0 generics_0.1.0
#> [81] ggridges_0.5.3 evaluate_0.14
#> [83] stringr_1.4.0 fastmap_1.1.0
#> [85] goftest_1.2-2 yaml_2.2.1
#> [87] ragg_1.1.3 bit64_4.0.5
#> [89] knitr_1.33 fs_1.5.0
#> [91] fitdistrplus_1.1-5 purrr_0.3.4
#> [93] RANN_2.6.1 rootSolve_1.8.2.1
#> [95] nlme_3.1-152 pbapply_1.4-3
#> [97] future_1.21.0 mime_0.10
#> [99] hdf5r_1.3.3 compiler_4.1.0
#> [101] rstudioapi_0.13 plotly_4.9.4
#> [103] png_0.1-7 e1071_1.7-7
#> [105] spatstat.utils_2.2-0 tibble_3.1.2
#> [107] bslib_0.2.5.1 DescTools_0.99.42
#> [109] stringi_1.6.2 desc_1.3.0
#> [111] lattice_0.20-44 Matrix_1.3-4
#> [113] vctrs_0.3.8 rhdf5filters_1.4.0
#> [115] pillar_1.6.1 lifecycle_1.0.0
#> [117] spatstat.geom_2.2-0 lmtest_0.9-38
#> [119] jquerylib_0.1.4 RcppAnnoy_0.0.18
#> [121] data.table_1.14.0 cowplot_1.1.1
#> [123] bitops_1.0-7 irlba_2.3.3
#> [125] lmom_2.8 httpuv_1.6.1
#> [127] patchwork_1.1.1 GenomicRanges_1.44.0
#> [129] R6_2.5.0 promises_1.2.0.1
#> [131] KernSmooth_2.23-20 gridExtra_2.3
#> [133] IRanges_2.26.0 parallelly_1.26.0
#> [135] gld_2.6.2 codetools_0.2-18
#> [137] boot_1.3-28 MASS_7.3-54
#> [139] assertthat_0.2.1 rhdf5_2.36.0
#> [141] SummarizedExperiment_1.22.0 rprojroot_2.0.2
#> [143] withr_2.4.2 sctransform_0.3.2
#> [145] S4Vectors_0.30.0 GenomeInfoDbData_1.2.6
#> [147] mgcv_1.8-36 expm_0.999-6
#> [149] parallel_4.1.0 rpart_4.1-15
#> [151] grid_4.1.0 tidyr_1.1.3
#> [153] class_7.3-19 rmarkdown_2.9
#> [155] MatrixGenerics_1.4.0 Rtsne_0.15
#> [157] Biobase_2.52.0 shiny_1.6.0