Intro

ingest_data() runs three main steps:

  1. Read: Automatically infers the file/object type and loads it (sub-function: read_data()).
  2. Convert: Converts it to the desired file/object type (sub-function: to_<format>).
  3. Save: Saves the converted file/object (sub-function: save_data()).

Examples

Ingest expression matrix

As SingleCellExperiment

data("example_seurat")

sce <- ingest_data(obj=example_seurat@assays$RNA@counts)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Matrix ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

As Seurat

seurat <- ingest_data(obj=example_seurat@assays$RNA@counts, 
                      output_type = "Seurat") 
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Matrix ==> Seurat
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

Ingest list

scKirby can ingest a named list (i.e. list(exp=..., annot=...)) with the following items:

  • exp: Expression matrix with genes as rows and cells as columns. Can be any of a variety of matrix classes, dense or sparse.

  • annot: Cell annotation data.frame with one cell per row. rownames(annot) should be the same as colnames(exp).

This happens to be the format used by the example data in EWCE, but any user-supplied data in this list format will work.

As SingleCellExperiment

data("example_EWCElist")

sce <- ingest_data(obj=example_EWCElist)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + EWCElist ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

As Seurat

seurat <- ingest_data(obj=example_EWCElist, 
                      output_type = "Seurat")
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + EWCElist ==> Seurat
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

Ingest Seurat

As SingleCellExperiment

In-memory

data("example_seurat")

sce <- ingest_data(obj=example_seurat)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Seurat ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

Ingest H5Seurat

On-disk

library(SeuratDisk)
#> Registered S3 method overwritten by 'cli':
#>   method     from         
#>   print.boxx spatstat.geom
#> Registered S3 method overwritten by 'SeuratDisk':
#>   method            from  
#>   as.sparse.H5Group Seurat
data("example_seurat")

SaveH5Seurat(example_seurat, filename = "./pbmc_small.h5Seurat", overwrite = T)
#> Creating h5Seurat file for version 3.1.5.9900
#> Adding counts for RNA
#> Adding data for RNA
#> Adding scale.data for RNA
#> Adding variable features for RNA
#> Adding feature-level metadata for RNA
#> Adding cell embeddings for pca
#> Adding loadings for pca
#> Adding projected loadings for pca
#> Adding standard deviations for pca
#> Adding JackStraw information for pca
#> Adding cell embeddings for tsne
#> No loadings for tsne
#> No projected loadings for tsne
#> No standard deviations for tsne
#> No JackStraw data for tsne
seurat <- ingest_data(obj="./pbmc_small.h5Seurat")
#> + Reading from disk...
#> + h5Seurat format (.h5Seurat) detected. Importing as Seurat object...
#> Validating h5Seurat file
#> Initializing RNA with data
#> Adding counts for RNA
#> Adding scale.data for RNA
#> Adding feature-level metadata for RNA
#> Adding variable feature information for RNA
#> Adding reduction pca
#> Adding cell embeddings for pca
#> Adding feature loadings for pca
#> Adding projected loadings for pca
#> Adding miscellaneous information for pca
#> Loading JackStraw data for pca
#> Adding reduction tsne
#> Adding cell embeddings for tsne
#> Adding miscellaneous information for tsne
#> Adding graph RNA_snn
#> Adding command information
#> Adding cell-level metadata
#> Adding miscellaneous information
#> Adding tool-specific results
#> Adding data that was not associated with an assay
#> Warning: Adding a command log without an assay associated with it
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Seurat ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

Ingest HDF5 SingleCellExperiment

As SingleCellExperiment

data("example_sce")

sce <- HDF5Array::saveHDF5SummarizedExperiment(example_sce, dir = "./pbmc_small_h5", replace=T)
#> Start writing assay 1/2 to HDF5 file:
#>   ./pbmc_small_h5/assays.h5
#> / Reading and realizing block 1/1 ... OK
#> \ Writing it ... OK
#> Finished writing assay 1/2 to HDF5 file:
#>   ./pbmc_small_h5/assays.h5
#> 
#> Start writing assay 2/2 to HDF5 file:
#>   ./pbmc_small_h5/assays.h5
#> / Reading and realizing block 1/1 ... OK
#> \ Writing it ... OK
#> Finished writing assay 2/2 to HDF5 file:
#>   ./pbmc_small_h5/assays.h5
#> 
#> Serialize SingleCellExperiment object to RDS file:
#>   ./pbmc_small_h5/se.rds
## Read in the sce object directly
sce <- ingest_data(obj=sce)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Object already in SingleCellExperiment format. Returning as-is.
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

## Read it from disk
sce_dir <- dirname(sce_filepath(sce))
sce <- ingest_data(obj=sce_dir)
#> + Reading from disk...
#> + HDF5Array format (.h5) detected. Importing as SingleCellExperiment object...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Object already in SingleCellExperiment format. Returning as-is.
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

As Seurat

sce <- HDF5Array::saveHDF5SummarizedExperiment(example_sce, dir = "./pbmc_small_h5", replace=T)
#> Start writing assay 1/2 to HDF5 file:
#>   ./pbmc_small_h5/assays.h5
#> / Reading and realizing block 1/1 ... OK
#> \ Writing it ... OK
#> Finished writing assay 1/2 to HDF5 file:
#>   ./pbmc_small_h5/assays.h5
#> 
#> Start writing assay 2/2 to HDF5 file:
#>   ./pbmc_small_h5/assays.h5
#> / Reading and realizing block 1/1 ... OK
#> \ Writing it ... OK
#> Finished writing assay 2/2 to HDF5 file:
#>   ./pbmc_small_h5/assays.h5
#> 
#> Serialize SingleCellExperiment object to RDS file:
#>   ./pbmc_small_h5/se.rds
## Read in the sce object directly
seurat <- ingest_data(obj=sce,
                      output_type = "Seurat") 
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + SingleCellExperiment ==> Seurat
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

Ingest AnnData

For info on setting up anndata (e.g. with conda), see the documentation of the anndata R package.

Note that some example objects must be created by calling a function (e.g. example_anndata() and example_loom()) rather than loaded with data(<name>). This is because file types like loom and AnnData must be stored on disk.

As SingleCellExperiment

### Set condaenv= to the name of a conda env you've made 
reticulate::use_condaenv(condaenv = "echoR")

# Convert Seurat object to AnnData for example data
adata <- example_anndata()
#> [1] "+ Creating new anndata object: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/pbmc_small.h5ad"
## In memory
sce <- ingest_data(obj=adata)
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + AnnData ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

## On disk
adata$write_h5ad(filename = "./pbmc_small.h5ad")
#> None
sce <- ingest_data(obj = "./pbmc_small.h5ad")
#> + Reading from disk...
#> + AnnData format (.h5ad) detected. Importing as AnnData object...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + AnnData ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

As Seurat

seurat <- ingest_data(obj=adata, 
                      output_type = "Seurat")
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + AnnData ==> Seurat
#> X -> counts
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

Ingest loom

From loomR package

As SingleCellExperiment

loom <- example_loom()
#> Attaching SeuratObject
#> Saving data from RNA as /matrix
#> Adding slot counts for assay RNA
#> Adding layer counts
#> Adding col attribute CellID
#> Adding col attribute orig.ident
#> Adding col attribute nCount_RNA
#> Adding col attribute nFeature_RNA
#> Adding col attribute RNA_snn_res.0.8
#> Adding col attribute letter.idents
#> Adding col attribute groups
#> Adding col attribute RNA_snn_res.1
#> Adding row attribute Gene
## In memory
print(loom)
#> Class: loom
#> Filename: /private/var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T/RtmpwUJgNi/pbmc_small.loom
#> Access type: H5F_ACC_RDWR
#> Listing:
#>        name    obj_type dataset.dims dataset.type_class
#>       attrs   H5I_GROUP         <NA>               <NA>
#>   col_attrs   H5I_GROUP         <NA>               <NA>
#>  col_graphs   H5I_GROUP         <NA>               <NA>
#>      layers   H5I_GROUP         <NA>               <NA>
#>      matrix H5I_DATASET     80 x 230          H5T_FLOAT
#>   row_attrs   H5I_GROUP         <NA>               <NA>
#>  row_graphs   H5I_GROUP         <NA>               <NA>
sce <- ingest_data(obj=loom) 
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + loom ==> SingleCellExperiment
#> Reading in /matrix
#> Storing /matrix as counts
#> Saving /matrix to assay 'RNA'
#> Loading graph RNA_snn
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

## From disk
print(loom$filename)
#> [1] "/var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/pbmc_small.loom"
sce <- ingest_data(obj=loom$filename)
#> + Reading from disk...
#> + Loom format (.loom) detected. Importing as SingleCellLoomExperiment object...
#> Reading in /matrix
#> Storing /matrix as counts
#> Saving /matrix to assay 'RNA'
#> Loading graph RNA_snn
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + Seurat ==> SingleCellExperiment
#> [1] "+ Checking SCE rownames."
#> + Saving SingleCellExperiment: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

As Seurat

loom <- example_loom()
#> Warning: Overwriting previous file /var/folders/zq/
#> h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/pbmc_small.loom
#> Saving data from RNA as /matrix
#> Adding slot counts for assay RNA
#> Adding layer counts
#> Adding col attribute CellID
#> Adding col attribute orig.ident
#> Adding col attribute nCount_RNA
#> Adding col attribute nFeature_RNA
#> Adding col attribute RNA_snn_res.0.8
#> Adding col attribute letter.idents
#> Adding col attribute groups
#> Adding col attribute RNA_snn_res.1
#> Adding row attribute Gene
## In memory 
seurat <- ingest_data(obj=loom, 
                      output_type = "Seurat") 
#> + Returning object directly...
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> + loom ==> Seurat
#> Reading in /matrix
#> Storing /matrix as counts
#> Saving /matrix to assay 'RNA'
#> Loading graph RNA_snn
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

## From disk
print(loom$filename)
#> [1] "/var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/pbmc_small.loom"
seurat <- ingest_data(obj=loom$filename,
                      output_type = "Seurat")
#> + Reading from disk...
#> + Loom format (.loom) detected. Importing as SingleCellLoomExperiment object...
#> Reading in /matrix
#> Storing /matrix as counts
#> Saving /matrix to assay 'RNA'
#> Loading graph RNA_snn
#> Converting formats:
#> + 10 core(s) assigned as workers (2 reserved).
#> [1] "+ Object already in Seurat format. Returning as-is."
#> + Saving Seurat: /var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmpwUJgNi/scKirby_output.rds

Cleanup

Remove example files

try({ file.remove("./pbmc_small.h5Seurat", showWarnings=F) })
#> [1]  TRUE FALSE
try({ unlink("./pbmc_small_h5/", recursive = T) })
try({ file.remove("./pbmc_small.h5ad") })
#> [1] TRUE
try({ file.remove("./pbmc_small.loom")  })
#> [1] FALSE

Session Info

utils::sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] SeuratObject_4.0.2    Seurat_4.0.3          SeuratDisk_0.0.0.9019
#> [4] scKirby_0.1.0        
#> 
#> loaded via a namespace (and not attached):
#>   [1] systemfonts_1.0.2           plyr_1.8.6                 
#>   [3] igraph_1.2.6                lazyeval_0.2.2             
#>   [5] splines_4.1.0               BiocParallel_1.26.0        
#>   [7] listenv_0.8.0               scattermore_0.7            
#>   [9] GenomeInfoDb_1.28.0         ggplot2_3.3.4              
#>  [11] digest_0.6.27               htmltools_0.5.1.1          
#>  [13] fansi_0.5.0                 magrittr_2.0.1             
#>  [15] memoise_2.0.0               tensor_1.5                 
#>  [17] cluster_2.1.2               ROCR_1.0-11                
#>  [19] globals_0.14.0              matrixStats_0.59.0         
#>  [21] pkgdown_1.6.1               spatstat.sparse_2.0-0      
#>  [23] colorspace_2.0-1            ggrepel_0.9.1              
#>  [25] textshaping_0.3.5           xfun_0.24                  
#>  [27] dplyr_1.0.6                 crayon_1.4.1               
#>  [29] RCurl_1.98-1.3              jsonlite_1.7.2             
#>  [31] Exact_2.1                   spatstat.data_2.1-0        
#>  [33] survival_3.2-11             zoo_1.8-9                  
#>  [35] glue_1.4.2                  polyclip_1.10-0            
#>  [37] gtable_0.3.0                zlibbioc_1.38.0            
#>  [39] XVector_0.32.0              leiden_0.3.8               
#>  [41] DelayedArray_0.18.0         Rhdf5lib_1.14.1            
#>  [43] future.apply_1.7.0          SingleCellExperiment_1.14.1
#>  [45] HDF5Array_1.20.0            BiocGenerics_0.38.0        
#>  [47] SparseM_1.81                abind_1.4-5                
#>  [49] scales_1.1.1                mvtnorm_1.1-2              
#>  [51] DBI_1.1.1                   miniUI_0.1.1.1             
#>  [53] Rcpp_1.0.6                  viridisLite_0.4.0          
#>  [55] xtable_1.8-4                reticulate_1.20            
#>  [57] spatstat.core_2.2-0         bit_4.0.4                  
#>  [59] proxy_0.4-26                stats4_4.1.0               
#>  [61] htmlwidgets_1.5.3           httr_1.4.2                 
#>  [63] anndata_0.7.5.2             RColorBrewer_1.1-2         
#>  [65] ellipsis_0.3.2              ica_1.0-2                  
#>  [67] pkgconfig_2.0.3             uwot_0.1.10                
#>  [69] deldir_0.2-10               sass_0.4.0                 
#>  [71] utf8_1.2.1                  tidyselect_1.1.1           
#>  [73] rlang_0.4.11                reshape2_1.4.4             
#>  [75] later_1.2.0                 munsell_0.5.0              
#>  [77] tools_4.1.0                 cachem_1.0.5               
#>  [79] cli_2.5.0                   generics_0.1.0             
#>  [81] ggridges_0.5.3              evaluate_0.14              
#>  [83] stringr_1.4.0               fastmap_1.1.0              
#>  [85] goftest_1.2-2               yaml_2.2.1                 
#>  [87] ragg_1.1.3                  bit64_4.0.5                
#>  [89] knitr_1.33                  fs_1.5.0                   
#>  [91] fitdistrplus_1.1-5          purrr_0.3.4                
#>  [93] RANN_2.6.1                  rootSolve_1.8.2.1          
#>  [95] nlme_3.1-152                pbapply_1.4-3              
#>  [97] future_1.21.0               mime_0.10                  
#>  [99] hdf5r_1.3.3                 compiler_4.1.0             
#> [101] rstudioapi_0.13             plotly_4.9.4               
#> [103] png_0.1-7                   e1071_1.7-7                
#> [105] spatstat.utils_2.2-0        tibble_3.1.2               
#> [107] bslib_0.2.5.1               DescTools_0.99.42          
#> [109] stringi_1.6.2               desc_1.3.0                 
#> [111] lattice_0.20-44             Matrix_1.3-4               
#> [113] vctrs_0.3.8                 rhdf5filters_1.4.0         
#> [115] pillar_1.6.1                lifecycle_1.0.0            
#> [117] spatstat.geom_2.2-0         lmtest_0.9-38              
#> [119] jquerylib_0.1.4             RcppAnnoy_0.0.18           
#> [121] data.table_1.14.0           cowplot_1.1.1              
#> [123] bitops_1.0-7                irlba_2.3.3                
#> [125] lmom_2.8                    httpuv_1.6.1               
#> [127] patchwork_1.1.1             GenomicRanges_1.44.0       
#> [129] R6_2.5.0                    promises_1.2.0.1           
#> [131] KernSmooth_2.23-20          gridExtra_2.3              
#> [133] IRanges_2.26.0              parallelly_1.26.0          
#> [135] gld_2.6.2                   codetools_0.2-18           
#> [137] boot_1.3-28                 MASS_7.3-54                
#> [139] assertthat_0.2.1            rhdf5_2.36.0               
#> [141] SummarizedExperiment_1.22.0 rprojroot_2.0.2            
#> [143] withr_2.4.2                 sctransform_0.3.2          
#> [145] S4Vectors_0.30.0            GenomeInfoDbData_1.2.6     
#> [147] mgcv_1.8-36                 expm_0.999-6               
#> [149] parallel_4.1.0              rpart_4.1-15               
#> [151] grid_4.1.0                  tidyr_1.1.3                
#> [153] class_7.3-19                rmarkdown_2.9              
#> [155] MatrixGenerics_1.4.0        Rtsne_0.15                 
#> [157] Biobase_2.52.0              shiny_1.6.0