library(poweranalysis)
library(SingleCellExperiment)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#> 
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#> 
#>     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#>     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#>     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#>     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#>     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#>     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#>     colWeightedMeans, colWeightedMedians, colWeightedSds,
#>     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#>     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#>     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#>     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#>     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#>     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#>     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#>     rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: generics
#> 
#> Attaching package: 'generics'
#> The following objects are masked from 'package:base':
#> 
#>     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
#>     setequal, union
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
#>     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
#>     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
#>     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#>     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
#>     unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:utils':
#> 
#>     findMatches
#> The following objects are masked from 'package:base':
#> 
#>     expand.grid, I, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> 
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#> 
#>     rowMedians
#> The following objects are masked from 'package:matrixStats':
#> 
#>     anyMissing, rowMedians

Assessing Effect Size Concordance via Correlation Analysis

The correlation_analysis function quantifies agreement in effect sizes (log₂ fold-changes) of differentially expressed genes (DEGs) across and within scRNA-seq datasets by computing Spearman’s rank correlations. A user-specified reference dataset is used to define significant DEGs, which are then compared across other studies, independently sampled subsets, and random permutations. Comparing these patterns to correlations from random permutations helps distinguish true biological signals from noise.

Below is an example demonstrating how to apply this analysis using a list of SingleCellExperiment objects:

# Load SCE objects (replace with actual file paths or data)
tsai_path <- system.file("extdata", "Tsai_Micro.qs", package="poweranalysis")
tsai_micro <- qs::qread(tsai_path)
zhou_path <- system.file("extdata", "Zhou_Micro.qs", package="poweranalysis")
zhou_micro <- qs::qread(zhou_path)

# List of SCE datasets
SCEs <- list(tsai_micro, zhou_micro)

# Dataset names (used in plots and output files)
dataset_names <- c("Tsai", "Zhou")

# Cell type mapping
celltype_corr <- list(Micro=c("Micro", "Micro"))

# Metadata column names per SCE
celltypeIDs <- c("cluster_celltype","cluster_celltype")
sampleIDs <- c("sample_id","sample_id")

# list of P-value thresholds
pvals <- c(0.05,0.01,0.001)

# Output path (must be a clean directory with no files or subdirectories)
output_corr <- file.path(getwd(), "output_corr")
if (!dir.exists(output_corr)) {
  dir.create(output_corr, recursive = TRUE)
}


# Run correlation analysis between single-cell datasets
correlation_analysis(
  main_dataset = "Tsai",    # Name of the dataset used to select significant DEGs from 
  SCEs = SCEs,
  dataset_names = dataset_names,
  celltype_corr = celltype_corr,
  celltypeIDs = celltypeIDs,
  sampleIDs = sampleIDs,
  pvals = pvals,
  N_randperms = 3,
  N_subsets = 3,
  output_path = output_corr
)

The correlation_analysis function generates two key outputs:

  • Symmetric heatmaps show pairwise Spearman’s rank correlations of log₂ fold-changes for DEGs identified under different P-value thresholds in the single-cell datasets.
    P-value = 0.05:

P-value = 0.01:

P-value = 0.001:

  • Box plots summarises correlations for effect sizes of DEGs across datasets, by cell type, as well as the overall mean. High between-study correlation indicates reproducibility of DEGs, while high within-study correlation may suggest technical variation such as batch effects.
    P-value = 0.05:

P-value = 0.01:

P-value = 0.001: