library(data.table)
All HPO annotations come directly from the HPO site: https://hpo.jax.org/app/data/annotations
Detailed column descriptions: https://hpo-annotation-qc.readthedocs.io/en/latest/annotationFormat.html#phenotype-hpoa-format
# Load the annotation table; per the import message below, file 3 is
# phenotype.hpoa.
d <- HPOExplorer::load_phenotype_to_genes(3)
## Importing existing file: ... phenotype.hpoa
{
  # DiseaseDB = the part of DatabaseID before the first ":".
  # strsplit() is called once on the whole column and vapply() guarantees a
  # character result, instead of invoking an R closure per row through
  # sapply() (slow on a table of this size and not type-stable).
  d[, DiseaseDB := vapply(
    strsplit(DatabaseID, ":", fixed = TRUE),
    function(parts) parts[1L],
    character(1)
  )]
  # Add the remaining annotation columns; the call order is kept as in the
  # original because later helpers may rely on columns added by earlier ones.
  d <- HPOExplorer::add_death(d, agg_by = "DatabaseID")
  d <- HPOExplorer::add_ndisease(d)
  d <- HPOExplorer::add_ancestor(d)
  d <- HPOExplorer::add_pheno_frequency(d)
  d <- HPOExplorer::add_ont_lvl(d)
  d <- HPOExplorer::add_onset(d)
  d <- HPOExplorer::add_severity(d)
  d <- HPOExplorer::add_tier(d)
}
## Annotating phenos with AgeOfDeath
## Annotating phenos with n_diseases
## Importing existing file: ... phenotype_to_genes.txt
## Importing existing file: ... genes_to_phenotype.txt
## Importing existing file: ... phenotype.hpoa
## Adding level-3 ancestor to each HPO ID.
## Annotating phenotype frequencies.
## Getting absolute ontology level for 10,550 HPO IDs.
## Annotating phenos with Onset.
## Annotating phenos with Modifiers
## Annotating phenos with Tiers.
# Metadata columns of `d` to summarise.
all_vars <- c(
  "Qualifier", "Evidence", "Sex", "Aspect", "Biocuration",
  "DiseaseDB", "ancestor_name",
  # AgeOfDeath_* columns
  "AgeOfDeath_names", "AgeOfDeath_earliest", "AgeOfDeath_latest",
  "AgeOfDeath_score_min", "AgeOfDeath_score_max", "AgeOfDeath_score_mean",
  # pheno_freq_* columns
  "pheno_freq_name", "pheno_freq_min", "pheno_freq_max",
  "Onset_name", "Modifier_name", "Severity_score",
  # tier columns
  "tier", "tier_auto", "tier_merge"
)
#### Only select variables with <N unique values ####
# Named list: column name -> number of distinct values in that column.
# unlist() flattens the column first, so nested values are counted
# individually.
n_uniq <- lapply(
  stats::setNames(nm = all_vars),
  function(col_name) {
    flat_values <- unlist(d[[col_name]])
    length(unique(flat_values))
  }
)
print(n_uniq)
## $Qualifier
## [1] 2
##
## $Evidence
## [1] 3
##
## $Sex
## [1] 5
##
## $Aspect
## [1] 4
##
## $Biocuration
## [1] 8267
##
## $DiseaseDB
## [1] 3
##
## $ancestor_name
## [1] 34
##
## $AgeOfDeath_names
## [1] 10
##
## $AgeOfDeath_earliest
## [1] 8
##
## $AgeOfDeath_latest
## [1] 7
##
## $AgeOfDeath_score_min
## [1] 8
##
## $AgeOfDeath_score_max
## [1] 7
##
## $AgeOfDeath_score_mean
## [1] 13
##
## $pheno_freq_name
## [1] 8
##
## $pheno_freq_min
## [1] 1143
##
## $pheno_freq_max
## [1] 1145
##
## $Onset_name
## [1] 12
##
## $Modifier_name
## [1] 40
##
## $Severity_score
## [1] 5
##
## $tier
## [1] 5
##
## $tier_auto
## [1] 5
##
## $tier_merge
## [1] 5
#### Filter variables ####
## Must contain less than 20 unique values
vars <- all_vars[unlist(n_uniq) < 20]
# Separate the AgeOfDeath columns so they can be plotted on their own.
aod_vars <- vars[grepl("AgeOfDeath", vars)]
vars <- setdiff(vars, aod_vars)
# Exclude these two AgeOfDeath columns from the AgeOfDeath panels.
aod_vars <- setdiff(aod_vars, c("AgeOfDeath_names", "AgeOfDeath_counts"))
#' Plot the value distribution of selected columns as stacked bars
#'
#' For every column named in `vars`, draws a single stacked bar whose
#' segments are the counts of each distinct value in that column.
#'
#' @param d A data.table (or data.frame) containing the columns in `vars`.
#' @param vars Character vector of column names to plot.
#' @param drop_na If `TRUE`, missing values (including empty strings, which
#'   are treated as missing) are removed before plotting.
#' @return A list of ggplot objects, named by `vars`.
plot_proportions <- function(d,
                             vars,
                             drop_na = FALSE) {
  lapply(stats::setNames(vars, vars), function(v) {
    values <- d[[v]]
    # Empty strings mean "not annotated": recode them to NA. The is.na()
    # guard keeps the logical index free of NA.
    values[!is.na(values) & values == ""] <- NA
    if (isTRUE(drop_na)) {
      values <- values[!is.na(values)]
    }
    # One-column table named after the variable, with the values as a factor
    # so the fill scale is discrete.
    dat <- data.table::data.table(factor(values))
    data.table::setnames(dat, v)
    # aes() with the .data pronoun replaces the deprecated aes_string();
    # x is a constant so every row falls into the same single bar.
    ggplot2::ggplot(
      dat,
      ggplot2::aes(x = 1, fill = .data[[v]])
    ) +
      ggplot2::geom_bar(
        stat = "count",
        position = ggplot2::position_stack()
      ) +
      # Set the legend title explicitly so it stays the column name.
      ggplot2::labs(x = NULL, fill = v, title = v) +
      ggplot2::theme_minimal() +
      ggplot2::theme(axis.text.x = ggplot2::element_blank())
  })
}
# One stacked-bar panel per non-AgeOfDeath variable (missing values kept).
plts <- plot_proportions(d, vars)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation ideoms with `aes()`
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Arrange all panels in a single patchwork layout and draw it.
pw <- patchwork::wrap_plots(plts)
print(pw)
Plot these separately simply to avoid overplotting.
# Panels for the AgeOfDeath variables, missing values included.
aod_plts <- plot_proportions(d, aod_vars)
aod_pw <- patchwork::wrap_plots(aod_plts)
print(aod_pw)
Plot again, but without the NA values so we can better see the proportions for diseases that are annotated.
# Same AgeOfDeath panels, this time with missing values removed.
aod_plts <- plot_proportions(d, aod_vars, drop_na = TRUE)
aod_pw <- patchwork::wrap_plots(aod_plts)
print(aod_pw)
Show the top 20 most frequent values for each of the metadata attributes.
# For every metadata column: frequency table of its values (NA included),
# sorted from most to least common and truncated to the first 20 entries.
counts <- lapply(
  stats::setNames(nm = all_vars),
  function(col_name) {
    dat <- d[, ..col_name]
    # Recode empty strings as missing before tabulating.
    dat[get(names(dat)[1]) == "", ] <- NA
    freq <- table(unlist(dat[[1]]), useNA = "always")
    ranked <- sort(freq, decreasing = TRUE)
    head(ranked, 20)
  }
)
print(counts)
## $Qualifier
##
## <NA> NOT
## 252320 1452
##
## $Evidence
##
## TAS PCS IEA <NA>
## 141758 71619 40395 0
##
## $Sex
##
## <NA> MALE FEMALE male female
## 253149 335 165 90 33
##
## $Aspect
##
## P I C M <NA>
## 239020 8833 5743 176 0
##
## $Biocuration
##
## ORPHA:orphadata[2022-12-20] HPO:iea[2009-02-17]
## 112897 26965
## HPO:skoehler[2017-07-13] HPO:skoehler[2012-10-17]
## 4264 4038
## HPO:skoehler[2018-10-08] HPO:probinson[2009-02-17]
## 3701 3277
## HPO:skoehler[2019-04-18] HPO:skoehler[2010-06-20]
## 1722 1675
## HPO:skoehler[2015-12-30] HPO:skoehler[2019-09-07]
## 1434 1404
## HPO:skoehler[2010-06-19] HPO:skoehler[2009-02-17]
## 1346 932
## HPO:skoehler[2010-06-18] HPO:skoehler[2019-02-15]
## 823 796
## HPO:skoehler[2012-11-18] HPO:iea[2012-04-24]
## 597 535
## HPO:probinson[2017-06-17] HPO:skoehler[2014-11-26]
## 533 507
## HPO:skoehler[2014-01-28] HPO:probinson[2022-05-08]
## 446 404
##
## $DiseaseDB
##
## OMIM ORPHA DECIPHER <NA>
## 140579 112897 296 0
##
## $ancestor_name
##
## Abnormality of the nervous system
## 51932
## Abnormality of the musculoskeletal system
## 33225
## Abnormality of head or neck
## 31255
## Abnormality of the eye
## 18445
## Abnormality of the integument
## 13162
## Abnormality of the cardiovascular system
## 11712
## Abnormality of the digestive system
## 11350
## Abnormality of the genitourinary system
## 10810
## Abnormality of metabolism/homeostasis
## 10003
## Abnormality of limbs
## 9948
## Mendelian inheritance
## 8247
## Growth abnormality
## 6827
## Abnormality of the ear
## 6390
## Abnormality of blood and blood-forming tissues
## 6059
## Abnormality of the respiratory system
## 5901
## Clinical course
## 5743
## Abnormality of the immune system
## 3687
## Abnormality of the endocrine system
## 3529
## Constitutional symptom
## 1543
## Abnormality of prenatal development or birth
## 1305
##
## $AgeOfDeath_names
##
## Stillbirth Prenatal death Death in infancy
## 794 787 702
## Death in adulthood Neonatal death Death in early adulthood
## 665 657 645
## Death in adolescence Miscarriage Death in childhood
## 600 587 519
## Death in middle age <NA>
## 20 0
##
## $AgeOfDeath_earliest
##
## <NA> Miscarriage Death in early adulthood
## 249575 1381 633
## Neonatal death Death in infancy Death in adolescence
## 605 576 552
## Death in childhood Death in middle age
## 430 20
##
## $AgeOfDeath_latest
##
## <NA> Miscarriage Death in middle age
## 249575 1275 665
## Death in infancy Death in adolescence Neonatal death
## 613 588 585
## Death in childhood
## 471
##
## $AgeOfDeath_score_min
##
## <NA> 1 6 2 3 5 4 7
## 249575 1381 633 605 576 552 430 20
##
## $AgeOfDeath_score_max
##
## <NA> 1 7 3 5 2 4
## 249575 1275 665 613 588 585 471
##
## $AgeOfDeath_score_mean
##
## <NA> 1 6.5 5
## 249575 1275 633 540
## 3 2 4 1.66666666666667
## 537 533 456 54
## 1.33333333333333 3.5 2.5 7
## 52 49 36 20
## 6
## 12
##
## $pheno_freq_name
##
## <NA> ratio Occasional (29-5%)
## 78058 58714 44519
## Frequent (79-30%) Very frequent (99-80%) Very rare (<4-1%)
## 38245 26230 7150
## Obligate (100%) percentage
## 667 189
##
## $pheno_freq_min
##
## <NA> 5 30 80
## 78058 44580 38428 26772
## 100 1 50 33.3333333333333
## 23382 7152 5340 3560
## 25 20 66.6666666666667 16.6666666666667
## 2540 1912 1716 1377
## 14.2857142857143 40 75 12.5
## 1157 1002 900 877
## 0 60 11.1111111111111 8.33333333333333
## 668 648 614 561
##
## $pheno_freq_max
##
## <NA> 29 79 99
## 78058 44519 38247 26230
## 100 4 50 33.3333333333333
## 23382 7172 5340 3560
## 25 20 66.6666666666667 16.6666666666667
## 2540 1912 1716 1377
## 14.2857142857143 40 75 12.5
## 1157 1002 900 877
## 0 60 11.1111111111111 8.33333333333333
## 668 648 614 561
##
## $Onset_name
##
## <NA> Congenital onset Infantile onset Neonatal onset
## 249948 1100 743 480
## Childhood onset Juvenile onset Fetal onset Adult onset
## 444 289 265 185
## Young adult onset Antenatal onset Middle age onset Late onset
## 105 92 63 58
##
## $Modifier_name
##
## <NA> Mild
## 252690 372
## Severe Triggered by cold
## 201 66
## Progressive Recurrent
## 63 58
## Episodic Unilateral
## 49 39
## Bilateral Profound
## 33 32
## Refractory Triggered by febrile illness
## 20 18
## Chronic Moderate
## 16 15
## Triggered by exertion Triggered by EBV infection
## 15 12
## Triggered by anesthetics Distal
## 10 9
## Generalized Proximal
## 8 7
##
## $Severity_score
##
## <NA> 4 2 1 3
## 253165 372 201 32 2
##
## $tier
##
## <NA> 2 1 3 4
## 234019 8706 6645 4294 108
##
## $tier_auto
##
## <NA> 3 1 2 4
## 210507 24828 11161 6968 308
##
## $tier_merge
##
## <NA> 3 2 1 4
## 197518 25591 15464 14891 308
Count the number of phenotypes per disease and the number of diseases per phenotype:
# Load the annotation table again (file 3; see the import message below).
annot <- HPOExplorer::load_phenotype_to_genes(3)
## Importing existing file: ... phenotype.hpoa
# Number of distinct HPO terms per DatabaseID.
counts1 <- annot[, .(n_phenotypes = data.table::uniqueN(HPO_ID)),
                 by = DatabaseID]
hist(counts1$n_phenotypes)
summary(counts1$n_phenotypes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 15.00 19.97 27.00 188.00
# Number of distinct DatabaseIDs per HPO term.
counts2 <- annot[, .(n_diseases = data.table::uniqueN(DatabaseID)),
                 by = HPO_ID]
hist(counts2$n_diseases, breaks = 50)
summary(counts2$n_diseases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 4.00 23.91 14.00 4125.00
http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt provides a link between genes and HPO terms. All phenotype terms associated with any disease that is associated with variants in a gene are assigned to that gene in this file. Other files are available on our Jenkins server that filter terms according to provenance of the annotation and frequency of the features in the disease.
# NOTE(review): per the import message below, file 1 is
# phenotype_to_genes.txt, whereas the paragraph above describes
# genes_to_phenotype.txt -- confirm which file this section is meant to use.
annot <- HPOExplorer::load_phenotype_to_genes(1)
## Importing existing file: ... phenotype_to_genes.txt
# Rename LinkID so the grouping column matches the name used elsewhere.
data.table::setnames(annot, old = "LinkID", new = "DatabaseID")
# Per DatabaseID: distinct HPO terms and distinct genes.
counts <- annot[, .(n_phenotypes = data.table::uniqueN(HPO_ID),
                    n_genes = data.table::uniqueN(Gene)),
                by = DatabaseID]
hist(counts$n_phenotypes)
summary(counts$n_phenotypes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 30.50 63.00 79.33 110.00 568.00
hist(counts$n_genes)
summary(counts$n_genes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.415 1.000 82.000
# Per HPO term: distinct DatabaseIDs and distinct genes.
counts <- annot[, .(n_diseases = data.table::uniqueN(DatabaseID),
                    n_genes = data.table::uniqueN(Gene)),
                by = HPO_ID]
hist(counts$n_diseases)
summary(counts$n_diseases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 6.00 64.27 25.00 5182.00
hist(counts$n_genes)
summary(counts$n_genes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 7.00 67.74 34.00 3484.00
http://purl.obolibrary.org/obo/hp/hpoa/phenotype_to_genes.txt is analogous, but instead provides links from HPO terms to genes.
# NOTE(review): per the import message below, file 2 is
# genes_to_phenotype.txt, whereas the paragraph above describes
# phenotype_to_genes.txt -- confirm which file this section is meant to use.
annot <- HPOExplorer::load_phenotype_to_genes(2)
## Importing existing file: ... genes_to_phenotype.txt
# Rename LinkID so the grouping column matches the name used elsewhere.
data.table::setnames(annot, old = "LinkID", new = "DatabaseID")
# Per DatabaseID: distinct HPO terms and distinct genes.
counts <- annot[, .(n_phenotypes = data.table::uniqueN(HPO_ID),
                    n_genes = data.table::uniqueN(Gene)),
                by = DatabaseID]
hist(counts$n_phenotypes)
summary(counts$n_phenotypes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 8.00 17.00 20.78 28.00 172.00
hist(counts$n_genes)
summary(counts$n_genes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.399 1.000 84.000
# Per HPO term: distinct DatabaseIDs and distinct genes.
counts <- annot[, .(n_diseases = data.table::uniqueN(DatabaseID),
                    n_genes = data.table::uniqueN(Gene)),
                by = HPO_ID]
hist(counts$n_diseases)
summary(counts$n_diseases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 4.00 18.91 11.00 3305.00
hist(counts$n_genes)
summary(counts$n_genes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 5.00 23.59 17.00 2924.00
# Print the R version, platform settings and package versions used for this
# run, so the results above can be reproduced.
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.2.1 (2022-06-23)
## os macOS Big Sur ... 10.16
## system x86_64, darwin17.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz Europe/London
## date 2023-03-22
## pandoc 3.1 @ /usr/local/bin/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## bslib 0.4.2 2022-12-16 [1] CRAN (R 4.2.1)
## cachem 1.0.7 2023-02-24 [1] CRAN (R 4.2.0)
## cli 3.6.0 2023-01-09 [1] CRAN (R 4.2.0)
## coda 0.19-4 2020-09-30 [1] CRAN (R 4.2.0)
## colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.1)
## crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.0)
## data.table * 1.14.8 2023-02-17 [1] CRAN (R 4.2.0)
## digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.0)
## dplyr 1.1.0 2023-01-29 [1] CRAN (R 4.2.1)
## evaluate 0.20 2023-01-17 [1] CRAN (R 4.2.1)
## fansi 1.0.4 2023-01-22 [1] CRAN (R 4.2.1)
## farver 2.1.1 2022-07-06 [1] CRAN (R 4.2.0)
## fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.2.0)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0)
## ggnetwork 0.5.12 2023-03-06 [1] CRAN (R 4.2.1)
## ggplot2 3.4.1 2023-02-10 [1] CRAN (R 4.2.0)
## glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0)
## gtable 0.3.1 2022-09-01 [1] CRAN (R 4.2.0)
## highr 0.10 2022-12-22 [1] CRAN (R 4.2.1)
## HPOExplorer 0.99.7 2023-03-22 [1] Bioconductor
## htmltools 0.5.4 2022-12-07 [1] CRAN (R 4.2.0)
## htmlwidgets 1.6.1 2023-01-07 [1] CRAN (R 4.2.0)
## httr 1.4.5 2023-02-24 [1] CRAN (R 4.2.0)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.2.0)
## jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.2.0)
## knitr 1.42 2023-01-25 [1] CRAN (R 4.2.1)
## labeling 0.4.2 2020-10-20 [1] CRAN (R 4.2.0)
## lattice 0.20-45 2021-09-22 [1] CRAN (R 4.2.1)
## lazyeval 0.2.2 2019-03-15 [1] CRAN (R 4.2.0)
## lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0)
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
## minidown 0.4.0 2022-02-08 [1] CRAN (R 4.2.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0)
## network 1.18.1 2023-01-24 [1] CRAN (R 4.2.1)
## ontologyIndex 2.10 2022-08-24 [1] CRAN (R 4.2.0)
## patchwork 1.1.2 2022-08-19 [1] CRAN (R 4.2.0)
## pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0)
## plotly 4.10.1 2022-11-07 [1] CRAN (R 4.2.1)
## purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.0)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0)
## rlang 1.1.0 2023-03-14 [1] CRAN (R 4.2.0)
## rmarkdown 2.20.1 2023-02-16 [1] Github (rstudio/rmarkdown@a75dc37)
## rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.0)
## sass 0.4.5 2023-01-24 [1] CRAN (R 4.2.0)
## scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0)
## statnet.common 4.8.0 2023-01-24 [1] CRAN (R 4.2.1)
## stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.0)
## stringr 1.5.0 2022-12-02 [1] CRAN (R 4.2.0)
## tibble 3.2.0 2023-03-08 [1] CRAN (R 4.2.0)
## tidyr 1.3.0 2023-01-24 [1] CRAN (R 4.2.1)
## tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0)
## utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.1)
## vctrs 0.6.0 2023-03-16 [1] CRAN (R 4.2.1)
## viridisLite 0.4.1 2022-08-22 [1] CRAN (R 4.2.0)
## withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0)
## xfun 0.37 2023-01-31 [1] CRAN (R 4.2.1)
## yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.1)
##
## [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
##
## ──────────────────────────────────────────────────────────────────────────────