knitr::opts_chunk$set(echo = FALSE)
### Create Output Directory ###
# if save_output = TRUE
if(params$save_output){
outfile_dir <- file.path(params$output_dir,"EpiCompare_file")
if(!dir.exists(outfile_dir)){
dir.create(outfile_dir, showWarnings = FALSE, recursive = TRUE)
}
}
#### ------ Prepare genome builds ------ ####
# e.g. genome_build <- list(reference="hg19",peakfiles="hg38",blacklist="hg19")
# or... genome_build <- "hg19"
builds <- prepare_genome_builds(genome_build = params$genome_build)
## Standardise all data to hg19 build
output_build <- prepare_output_build(params$genome_build_output)
#### ------ Prepare peaklist(s) ------ ####
# and check that the list is named, if not default filenames are used
peaklist <- prepare_peaklist(peaklist = params$peakfile)
peaklist <- liftover_grlist(grlist = peaklist,
input_build = builds$peaklist,
output_build = output_build)
#### ------ Prepare reference(s) ------ ####
reference <- prepare_reference(reference = params$reference)
reference <- liftover_grlist(grlist = reference,
input_build = builds$reference,
output_build = output_build)
#### ------ Prepare blacklist ------ ####
blacklist <- liftover_grlist(grlist = params$blacklist,
input_build = builds$blacklist,
output_build = output_build)
### Standardise peaklist(s) ###
peaklist <- tidy_peakfile(peaklist = peaklist,
blacklist = blacklist)
peaklist_tidy <- peaklist
### Standardise reference(s) ###
# and include in peaklist
reference_tidy <- reference
if (!is.null(reference)){
reference_tidy <- tidy_peakfile(peaklist = reference,
blacklist = blacklist)
peaklist_tidy <- c(peaklist_tidy, reference_tidy)
}
### Obtain Genome Annotation ###
txdb <- check_genome_build(genome_build = output_build)
### Dynamic Figure Height ###
fig_height <- fig_length(default_size = 7,
number_of_items = length(peaklist_tidy),
max_items = 10)
EpiCompare compares epigenomic datasets for quality control and benchmarking purposes. The report consists of three sections:
## [1] "File1: ATAC_ENCFF558BLC"
## [1] "File2: Dnase_ENCFF274YGF"
## [1] "File3: ChIP_ENCFF038DDS"
## [1] "File4: Dnase_ENCFF185XRG_reference"
The function call used to create the report.
EpiCompare(peakfiles = list(ATAC_ENCFF558BLC, Dnase_ENCFF274YGF, ChIP_ENCFF038DDS, Dnase_ENCFF185XRG_reference),
genome_build = hg38,
genome_build_output = hg38,
blacklist = blacklist,
picard_files = list(),
reference = Dnase_ENCFF185XRG_reference,
upset_plot = TRUE,
stat_plot = TRUE,
chromHMM_plot = TRUE,
chromHMM_annotation = "K562",
chipseeker_plot = TRUE,
enrichment_plot = TRUE,
tss_plot = TRUE,
precision_recall_plot = TRUE,
n_threshold = 20,
corr_plot=TRUE,
interact = TRUE,
save_output = TRUE,
output_dir = "/Users/serachoi/Desktop",
workers = 1)
Column Description:
BRGenomics::tidyChromosomes()
.NB: EpiCompare uses filtered peakfiles (i.e. datasets after removing peaks in blacklisted regions and non-standard chromosomes)
PeakN Before Tidy | Blacklisted Peaks Removed (%) | Non-standard Peaks Removed (%) | PeakN After Tidy | |
---|---|---|---|---|
ATAC_ENCFF558BLC | 107082 | 0 | 0 | 107082 |
Dnase_ENCFF274YGF | 118721 | 0 | 0 | 118721 |
ChIP_ENCFF038DDS | 52301 | 0 | 0 | 52301 |
Metrics on fragments is shown only if Picard summary is provided. See manual for help.
Column Description:
Distribution of peak widths in samples
Percentage of overlapping peaks between samples. Hover over heatmap for percentage values.
NOTES: How to interpret heatmap: [Samples in x-axis
of heatmap] peaks in [Samples in y-axis of heatmap] peaks. In other
words:
- Recall: y-axis: Percent of peaks in y-axis samples that
overlap with peaks in x-axis samples. in the x-axis samples.
- Precision: x-axis: Percent of peaks in x-axis samples that
overlap with peaks in y-axis samples.
Depending on the format of the reference file, EpiCompare outputs different plots:
Reference peakfile: Dnase_ENCFF185XRG_reference
Keys:
The first plot shows the balance between precision and recall across multiple peak calling stringency thresholds.
The second plot shows F1 score (a score that combines precision and recall) across the different peak calling stringency thresholds.
2*(precision*recall) / (precision+recall)
The correlation plot shows the correlation between the quantiles when the genome is binned at a set size. These quantiles are based on the intensity of the peak, dependent on the peak caller used (q-value for MACS2):
ChromHMM annotates and characterises peaks into different chromatin states. ChromHMM annotations used in EpiCompare were obtained from here.
ChromHMM annotation
ChromHMM annotation of individual samples.
Percentage of Sample peaks found in Reference peaks (Reference peakfile: Dnase_ENCFF185XRG_reference)
query | subject | overlap | total | Percentage | type | peaklist1 | peaklist2 |
---|---|---|---|---|---|---|---|
ATAC_ENCFF558BLC | Dnase_ENCFF185XRG_reference | 78230 | 107082 | 73.1 | precision | ATAC_ENCFF558BLC | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF274YGF | Dnase_ENCFF185XRG_reference | 99657 | 118721 | 83.9 | precision | Dnase_ENCFF274YGF | Dnase_ENCFF185XRG_reference |
ChIP_ENCFF038DDS | Dnase_ENCFF185XRG_reference | 36415 | 52301 | 69.6 | precision | ChIP_ENCFF038DDS | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference | 159277 | 159277 | 100.0 | precision | Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | ATAC_ENCFF558BLC | 99136 | 159277 | 62.2 | recall | ATAC_ENCFF558BLC | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF274YGF | 99154 | 159277 | 62.3 | recall | Dnase_ENCFF274YGF | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | ChIP_ENCFF038DDS | 56387 | 159277 | 35.4 | recall | ChIP_ENCFF038DDS | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference | 159277 | 159277 | 100.0 | recall | Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference |
ChromHMM annotation of sample peaks found in reference peaks.
Percentage of Reference peaks found in Sample peaks (Reference peakfile: Dnase_ENCFF185XRG_reference)
query | subject | overlap | total | Percentage | type | peaklist1 | peaklist2 |
---|---|---|---|---|---|---|---|
Dnase_ENCFF185XRG_reference | ATAC_ENCFF558BLC | 99136 | 159277 | 62.2 | precision | Dnase_ENCFF185XRG_reference | ATAC_ENCFF558BLC |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF274YGF | 99154 | 159277 | 62.3 | precision | Dnase_ENCFF185XRG_reference | Dnase_ENCFF274YGF |
Dnase_ENCFF185XRG_reference | ChIP_ENCFF038DDS | 56387 | 159277 | 35.4 | precision | Dnase_ENCFF185XRG_reference | ChIP_ENCFF038DDS |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference | 159277 | 159277 | 100.0 | precision | Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference |
ATAC_ENCFF558BLC | Dnase_ENCFF185XRG_reference | 78230 | 107082 | 73.1 | recall | Dnase_ENCFF185XRG_reference | ATAC_ENCFF558BLC |
Dnase_ENCFF274YGF | Dnase_ENCFF185XRG_reference | 99657 | 118721 | 83.9 | recall | Dnase_ENCFF185XRG_reference | Dnase_ENCFF274YGF |
ChIP_ENCFF038DDS | Dnase_ENCFF185XRG_reference | 36415 | 52301 | 69.6 | recall | Dnase_ENCFF185XRG_reference | ChIP_ENCFF038DDS |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference | 159277 | 159277 | 100.0 | recall | Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference |
ChromHMM annotation of reference peaks found in sample peaks.
Percentage of sample peaks not found in reference peaks (Reference peakfile: Dnase_ENCFF185XRG_reference)
query | subject | overlap | total | Percentage | type | peaklist1 | peaklist2 |
---|---|---|---|---|---|---|---|
ATAC_ENCFF558BLC | Dnase_ENCFF185XRG_reference | 28852 | 107082 | 26.9 | precision | ATAC_ENCFF558BLC | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF274YGF | Dnase_ENCFF185XRG_reference | 19064 | 118721 | 16.1 | precision | Dnase_ENCFF274YGF | Dnase_ENCFF185XRG_reference |
ChIP_ENCFF038DDS | Dnase_ENCFF185XRG_reference | 15886 | 52301 | 30.4 | precision | ChIP_ENCFF038DDS | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference | 0 | 159277 | 0.0 | precision | Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | ATAC_ENCFF558BLC | 60141 | 159277 | 37.8 | recall | ATAC_ENCFF558BLC | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF274YGF | 60123 | 159277 | 37.7 | recall | Dnase_ENCFF274YGF | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | ChIP_ENCFF038DDS | 102890 | 159277 | 64.6 | recall | ChIP_ENCFF038DDS | Dnase_ENCFF185XRG_reference |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference | 0 | 159277 | 0.0 | recall | Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference |
ChromHMM annotation of sample peaks not found in reference peaks.
Percentage of reference peaks not found in sample peaks (Reference peakfile: Dnase_ENCFF185XRG_reference)
query | subject | overlap | total | Percentage | type | peaklist1 | peaklist2 |
---|---|---|---|---|---|---|---|
Dnase_ENCFF185XRG_reference | ATAC_ENCFF558BLC | 60141 | 159277 | 37.8 | precision | Dnase_ENCFF185XRG_reference | ATAC_ENCFF558BLC |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF274YGF | 60123 | 159277 | 37.7 | precision | Dnase_ENCFF185XRG_reference | Dnase_ENCFF274YGF |
Dnase_ENCFF185XRG_reference | ChIP_ENCFF038DDS | 102890 | 159277 | 64.6 | precision | Dnase_ENCFF185XRG_reference | ChIP_ENCFF038DDS |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference | 0 | 159277 | 0.0 | precision | Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference |
ATAC_ENCFF558BLC | Dnase_ENCFF185XRG_reference | 28852 | 107082 | 26.9 | recall | Dnase_ENCFF185XRG_reference | ATAC_ENCFF558BLC |
Dnase_ENCFF274YGF | Dnase_ENCFF185XRG_reference | 19064 | 118721 | 16.1 | recall | Dnase_ENCFF185XRG_reference | Dnase_ENCFF274YGF |
ChIP_ENCFF038DDS | Dnase_ENCFF185XRG_reference | 15886 | 52301 | 30.4 | recall | Dnase_ENCFF185XRG_reference | ChIP_ENCFF038DDS |
Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference | 0 | 159277 | 0.0 | recall | Dnase_ENCFF185XRG_reference | Dnase_ENCFF185XRG_reference |
ChromHMM annotation of reference peaks not found in sample peaks.
EpiCompare uses ChIPseeker::annotatePeak()
to annotate
peaks with the nearest gene and genomic regions where the peak is
located. The peaks are annotated with genes taken from human genome
annotations (hg19 or hg38) provided by Bioconductor.
EpiCompare performs KEGG pathway and GO enrichment analysis using
clusterProfiler
. ChIPseeker::annotatePeak()
is
first used to assign peaks to nearest genes. Biological themes amongst
the genes are identified using ontologies (KEGG and GO). The peaks are
annotated with genes taken from annotations of human genome (hg19 or
hg38) provided by Bioconductor.
GeneRatio is the number of observed genes divided by the number of expected genes from each GO category. GO terms can be useful when assessing different biological themes present in each epigenomic dataset.
This plots peaks that are mapping to transcriptional start sites (TSS). TSS regions are defined as the flanking sequence of the TSS sites. The frequency of peaks upstream (-3000bp) and downstream (+3000bp) of TSS is plotted. Faint color line around the main frequency line represents the 95% confidence interval estimated by bootstrap method.
## >> Running bootstrapping for tag matrix... 2023-02-08 21:43:21
## >> Running bootstrapping for tag matrix... 2023-02-08 21:50:12
## >> Running bootstrapping for tag matrix... 2023-02-08 21:55:03
## >> Running bootstrapping for tag matrix... 2023-02-08 22:02:31
## [[1]]
##
## [[1]]
##
## [[1]]
##
## [[1]]
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 11.2
##
## Matrix products: default
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] EpiCompare_1.3.2 rtracklayer_1.56.0 GenomicRanges_1.48.0 GenomeInfoDb_1.32.2
## [5] IRanges_2.30.0 S4Vectors_0.34.0 BiocGenerics_0.42.0 testthat_3.1.4
## [9] devtools_2.4.3 usethis_2.1.6
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 tidyr_1.2.0
## [3] ggplot2_3.3.6 bit64_4.0.5
## [5] knitr_1.39 DelayedArray_0.22.0
## [7] data.table_1.14.2 KEGGREST_1.36.0
## [9] RCurl_1.98-1.7 generics_0.1.2
## [11] GenomicFeatures_1.48.3 callr_3.7.0
## [13] RSQLite_2.2.14 shadowtext_0.1.2
## [15] bit_4.0.4 tzdb_0.3.0
## [17] enrichplot_1.16.1 BiocStyle_2.24.0
## [19] webshot_0.5.4 xml2_1.3.3
## [21] httpuv_1.6.5 SummarizedExperiment_1.26.1
## [23] assertthat_0.2.1 viridis_0.6.2
## [25] xfun_0.31 jquerylib_0.1.4
## [27] hms_1.1.1 TSP_1.2-2
## [29] evaluate_0.15 promises_1.2.0.1
## [31] fansi_1.0.3 restfulr_0.0.14
## [33] progress_1.2.2 dendextend_1.16.0
## [35] caTools_1.18.2 dbplyr_2.2.0
## [37] igraph_1.3.1 DBI_1.1.2
## [39] geneplotter_1.74.0 htmlwidgets_1.5.4
## [41] purrr_0.3.4 ellipsis_0.3.2
## [43] crosstalk_1.2.0 dplyr_1.0.9
## [45] annotate_1.74.0 gridBase_0.4-7
## [47] biomaRt_2.52.0 MatrixGenerics_1.8.0
## [49] vctrs_0.4.1 Biobase_2.56.0
## [51] remotes_2.4.2 cachem_1.0.6
## [53] withr_2.5.0 ggforce_0.3.3
## [55] BSgenome.Hsapiens.UCSC.hg38_1.4.4 BSgenome_1.64.0
## [57] genomation_1.28.0 vroom_1.5.7
## [59] GenomicAlignments_1.32.0 treeio_1.20.0
## [61] prettyunits_1.1.1 DOSE_3.22.0
## [63] ape_5.6-2 lazyeval_0.2.2
## [65] crayon_1.5.1 genefilter_1.78.0
## [67] pkgconfig_2.0.3 labeling_0.4.2
## [69] tweenr_1.0.2 seriation_1.4.1
## [71] nlme_3.1-157 pkgload_1.2.4
## [73] rlang_1.0.2 lifecycle_1.0.1
## [75] downloader_0.4 registry_0.5-1
## [77] filelock_1.0.2 BiocFileCache_2.4.0
## [79] seqPattern_1.28.0 AnnotationHub_3.4.0
## [81] rprojroot_2.0.3 polyclip_1.10-0
## [83] matrixStats_0.62.0 Matrix_1.4-1
## [85] aplot_0.1.6 boot_1.3-28
## [87] processx_3.5.3 png_0.1-7
## [89] viridisLite_0.4.0 rjson_0.2.21
## [91] ca_0.71.1 bitops_1.0-7
## [93] rcmdcheck_1.4.0 KernSmooth_2.23-20
## [95] Biostrings_2.64.0 blob_1.2.3
## [97] stringr_1.4.0 qvalue_2.28.0
## [99] readr_2.1.2 gridGraphics_0.5-1
## [101] scales_1.2.0 memoise_2.0.1
## [103] magrittr_2.0.3 plyr_1.8.7
## [105] gplots_3.1.3 zlibbioc_1.42.0
## [107] compiler_4.2.0 scatterpie_0.1.7
## [109] TxDb.Hsapiens.UCSC.hg38.knownGene_3.15.0 BiocIO_1.6.0
## [111] RColorBrewer_1.1-3 plotrix_3.8-2
## [113] DESeq2_1.36.0 Rsamtools_2.12.0
## [115] cli_3.3.0 XVector_0.36.0
## [117] patchwork_1.1.1 ps_1.7.0
## [119] MASS_7.3-57 tidyselect_1.1.2
## [121] stringi_1.7.6 highr_0.9
## [123] yaml_2.3.5 GOSemSim_2.22.0
## [125] locfit_1.5-9.5 ggrepel_0.9.1
## [127] grid_4.2.0 sass_0.4.1
## [129] fastmatch_1.1-3 tools_4.2.0
## [131] parallel_4.2.0 xopen_1.0.0
## [133] rstudioapi_0.13 foreach_1.5.2
## [135] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 gridExtra_2.3
## [137] BRGenomics_1.8.0 plyranges_1.16.0
## [139] farver_2.1.0 ComplexUpset_1.3.3
## [141] ggraph_2.0.5 digest_0.6.29
## [143] BiocManager_1.30.18 shiny_1.7.1
## [145] Rcpp_1.0.8.3 BiocVersion_3.15.2
## [147] later_1.3.0 org.Hs.eg.db_3.15.0
## [149] httr_1.4.3 AnnotationDbi_1.58.0
## [151] colorspace_2.0-3 brio_1.1.3
## [153] XML_3.99-0.9 fs_1.5.2
## [155] splines_4.2.0 yulab.utils_0.0.4
## [157] tidytree_0.3.9 graphlayouts_0.8.0
## [159] ggplotify_0.1.0 plotly_4.10.0
## [161] sessioninfo_1.2.2 xtable_1.8-4
## [163] jsonlite_1.8.0 ggtree_3.4.0
## [165] heatmaply_1.4.2 tidygraph_1.2.1
## [167] UpSetR_1.4.0 ggfun_0.0.6
## [169] consensusSeekeR_1.24.0 R6_2.5.1
## [171] pillar_1.7.0 htmltools_0.5.2
## [173] mime_0.12 glue_1.6.2
## [175] fastmap_1.1.0 clusterProfiler_4.4.2
## [177] BiocParallel_1.30.3 interactiveDisplayBase_1.34.0
## [179] codetools_0.2-18 ChIPseeker_1.32.0
## [181] fgsea_1.22.0 pkgbuild_1.3.1
## [183] utf8_1.2.2 bslib_0.3.1
## [185] lattice_0.20-45 tibble_3.1.7
## [187] curl_4.3.2 gtools_3.9.2.1
## [189] GO.db_3.15.0 survival_3.3-1
## [191] roxygen2_7.2.0 rmarkdown_2.14
## [193] desc_1.4.1 munsell_0.5.0
## [195] DO.db_2.9 GenomeInfoDbData_1.2.8
## [197] iterators_1.0.14 impute_1.70.0
## [199] reshape2_1.4.4 gtable_0.3.0