library(poweranalysis)
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: generics
## 
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

Validating Power Analysis using Bulk Data

This section introduces a validation strategy for scRNA-seq power analysis using high-confidence, sex-biased DEGs derived from bulk GTEx data. Specifically, it assesses how many “PTP DEGs” (genes sex-biased in ≥90% of GTEx tissues) are recovered in single-cell differential expression analysis results under various down-sampling levels. Because these DEGs are identified across a broad range of tissues, they provide a robust benchmark for evaluating sex-bias detection.

The bulk_power_analysis function estimates the power to detect these DEGs across one or more SCE objects, with user-supplied metadata indicating how to identify samples and cell types. Below is an example that demonstrates this analysis using two scRNA-seq datasets, based on downsampling individuals.

# Load SCE objects (replace with actual file paths or data)
tsai_path <- system.file("extdata", "Tsai_Micro.qs", package="poweranalysis")
tsai_micro <- qs::qread(tsai_path)
zhou_path <- system.file("extdata", "Zhou_Astro_subset.qs", package="poweranalysis")
zhou_astro <- qs::qread(zhou_path)

# List of SCE datasets
SCEs <- list(tsai_micro, zhou_astro)

# Dataset names (used in plots and output files)
dataset_names <- c("Tsai_Micro", "Zhou_Astro")

# Cell type mapping
celltype_corr <- list(Micro=c("Micro", NA),
                      Astro=c(NA,"Astro"))

# Metadata column names per SCE
celltypeIDs <- c("cluster_celltype","cluster_celltype")
sampleIDs <- c("sample_id","sample_id")


# Output path (must be a clean directory with no files or subdirectories)
output_sample <- file.path(getwd(), "bulk_sample")
if (!dir.exists(output_sample)) {
  dir.create(output_sample, recursive = TRUE)
}

# Load GTEx bulk DEGs
bulk_path <- system.file("extdata", "LFSR.tsv", package="poweranalysis")
bulkDE <- read.table(bulk_path, sep = "\t", header = TRUE)

# Run bulk power analysis for down-sampling individuals
bulk_power_analysis(
  SCEs = SCEs,
  dataset_names = dataset_names,
  celltype_corr = celltype_corr,
  celltypeIDs = celltypeIDs,
  sampleIDs = sampleIDs,
  bulkDE = bulkDE,
  sampled = "individuals",
  output_path = output_sample,
  Nperms = 3 # Nperms=3 for speed in example
)

To assess recovery by down-sampling cells instead of individuals, simply set sampled = "cells":

# Output path (must be a clean directory)
output_cell <- file.path(getwd(), "bulk_cell")
if (!dir.exists(output_cell)) {
  dir.create(output_cell, recursive = TRUE)
}

# Run bulk power analysis for down-sampling cells
bulk_power_analysis(
  SCEs = SCEs,
  dataset_names = dataset_names,
  celltype_corr = celltype_corr,
  celltypeIDs = celltypeIDs,
  sampleIDs = sampleIDs,
  bulkDE = bulkDE,
  sampled = "cells",  # down-sampling cells
  output_path = output_cell,
  Nperms = 3
)

The output displays the percentage of PTP DEGs from the bulk data that are detected across all cell types at different down-sampling levels of the scRNA-seq datasets.

Down-sampling individuals:

Down-sampling cells: