Source
library(data.table)

All HPO annotations come directly from the HPO site: https://hpo.jax.org/app/data/annotations

Detailed column descriptions: https://hpo-annotation-qc.readthedocs.io/en/latest/annotationFormat.html#phenotype-hpoa-format

Count metadata annotations

Parse metadata

Source
## Load the annotation table; the message emitted by this call reports
## that index 3 imports the phenotype.hpoa file.
d <- HPOExplorer::load_phenotype_to_genes(3)
Message
## Importing existing file: ... phenotype.hpoa
Source
{
  #### Derive the source database from the disease ID prefix ####
  ## Vectorised: keep the text before the first ":" in DatabaseID,
  ## rather than calling strsplit() once per row through sapply().
  d[, DiseaseDB := sub(":.*$", "", DatabaseID)]
  #### Enrich d with the HPOExplorer add_* annotation helpers ####
  ## Each call returns the updated table, which is reassigned to d.
  d <- HPOExplorer::add_death(d, agg_by = "DatabaseID")
  d <- HPOExplorer::add_ndisease(d)
  d <- HPOExplorer::add_ancestor(d)
  d <- HPOExplorer::add_pheno_frequency(d)
  d <- HPOExplorer::add_ont_lvl(d)
  d <- HPOExplorer::add_onset(d)
  d <- HPOExplorer::add_severity(d)
  d <- HPOExplorer::add_tier(d)
}
Message
## Annotating phenos with AgeOfDeath
Message
## Annotating phenos with n_diseases
Message
## Importing existing file: ... phenotype_to_genes.txt
Message
## Importing existing file: ... genes_to_phenotype.txt
Message
## Importing existing file: ... phenotype.hpoa
Message
## Adding level-3 ancestor to each HPO ID.
Message
## Annotating phenotype frequencies.
Message
## Getting absolute ontology level for 10,550 HPO IDs.
Message
## Annotating phenos with Onset.
Message
## Annotating phenos with Modifiers
Message
## Annotating phenos with Tiers.

Unique values per attribute

Source
#### Metadata attributes to summarise ####
all_vars <- c(
  "Qualifier", "Evidence", "Sex", "Aspect", "Biocuration",
  "DiseaseDB", "ancestor_name",
  "AgeOfDeath_names", "AgeOfDeath_earliest", "AgeOfDeath_latest",
  "AgeOfDeath_score_min", "AgeOfDeath_score_max", "AgeOfDeath_score_mean",
  "pheno_freq_name", "pheno_freq_min", "pheno_freq_max",
  "Onset_name", "Modifier_name", "Severity_score",
  "tier", "tier_auto", "tier_merge"
)
#### Count the unique values of every attribute ####
## unlist() flattens any list-valued column before counting.
## The result is a list named by attribute, used later to filter variables.
n_uniq <- lapply(all_vars, function(col_name) {
  flat_values <- unlist(d[[col_name]])
  length(unique(flat_values))
})
names(n_uniq) <- all_vars
print(n_uniq)
Output
## $Qualifier
## [1] 2
## 
## $Evidence
## [1] 3
## 
## $Sex
## [1] 5
## 
## $Aspect
## [1] 4
## 
## $Biocuration
## [1] 8267
## 
## $DiseaseDB
## [1] 3
## 
## $ancestor_name
## [1] 34
## 
## $AgeOfDeath_names
## [1] 10
## 
## $AgeOfDeath_earliest
## [1] 8
## 
## $AgeOfDeath_latest
## [1] 7
## 
## $AgeOfDeath_score_min
## [1] 8
## 
## $AgeOfDeath_score_max
## [1] 7
## 
## $AgeOfDeath_score_mean
## [1] 13
## 
## $pheno_freq_name
## [1] 8
## 
## $pheno_freq_min
## [1] 1143
## 
## $pheno_freq_max
## [1] 1145
## 
## $Onset_name
## [1] 12
## 
## $Modifier_name
## [1] 40
## 
## $Severity_score
## [1] 5
## 
## $tier
## [1] 5
## 
## $tier_auto
## [1] 5
## 
## $tier_merge
## [1] 5
Source
#### Filter variables ####
## Keep only attributes with fewer than 20 unique values
vars <- all_vars[unlist(n_uniq) < 20]

## Move the AgeOfDeath attributes into their own vector
aod_vars <- vars[grepl("AgeOfDeath", vars)]
vars <- setdiff(vars, aod_vars)
aod_vars <- setdiff(aod_vars, c("AgeOfDeath_names","AgeOfDeath_counts"))

Plot proportions

Source
#### plot_proportions ####
## Build one stacked bar chart per attribute, where a single bar is split
## by the values of that attribute.
##
## d:       data.table containing the columns named in vars.
## vars:    character vector of column names in d to plot.
## drop_na: if TRUE, rows where the attribute is missing (including empty
##          strings, which are first converted to NA) are removed.
##
## Returns a list of ggplot objects, named by vars.
plot_proportions <- function(d,
                             vars,
                             drop_na=FALSE){
  lapply(stats::setNames(vars,
                         vars),
         function(v){
    dat <- d[,..v]
    #### Treat empty strings as missing values ####
    dat[get(names(dat)[1])=="",] <- NA
    if(isTRUE(drop_na)){
      dat <- dat[!is.na(get(names(dat)[1])),]
    }
    #### Convert to a factor so the fill scale is discrete ####
    dat[[1]] <- factor(dat[[1]])
    ## aes_string() is deprecated, so the column is mapped through the
    ## .data pronoun instead. x is a constant so that all values stack
    ## into a single bar; the legend title is set explicitly to the
    ## column name.
    ggplot2::ggplot(dat,
                    ggplot2::aes(x=1, fill=.data[[v]])) +
      ggplot2::geom_bar(stat = "count",
                        position = ggplot2::position_stack()) +
      ggplot2::labs(x=NULL, fill=v, title=v) +
      ggplot2::theme_minimal() +
      ggplot2::theme(axis.text.x = ggplot2::element_blank())
  })
}

Most attributes

Source
## One plot per attribute kept in vars (NA values are retained by default)
plts <- plot_proportions(d = d, 
                         vars = vars)
Warning
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation ideoms with `aes()`
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Source
## Combine the per-attribute plots into a single figure and draw it
pw <- patchwork::wrap_plots(plts)
print(pw)

AgeOfDeath attributes

Plot these separately simply to avoid overplotting.

Source
## Same plots for the AgeOfDeath attributes, with NA values retained
aod_plts <- plot_proportions(d = d, 
                             vars = aod_vars)
aod_pw <- patchwork::wrap_plots(aod_plts)
print(aod_pw)

Plot again, but without the NA values so we can better see the proportions for diseases that are annotated.

Source
## Repeat for the AgeOfDeath attributes, this time dropping NA values
aod_plts <- plot_proportions(d = d, 
                             vars = aod_vars, 
                             drop_na = TRUE)
aod_pw <- patchwork::wrap_plots(aod_plts)
print(aod_pw)

Count frequencies

Show the top 20 most frequent values for each of the metadata attributes.

Source
#### Tabulate the 20 most frequent values of every attribute ####
counts <- lapply(stats::setNames(nm = all_vars), function(attr_name){
  one_col <- d[, ..attr_name]
  ## Recode empty strings as NA so they are tallied as missing
  one_col[get(names(one_col)[1]) == "", ] <- NA
  ## The NA count is always reported, even when it is zero
  ranked <- sort(table(unlist(one_col[[1]]), useNA = "always"),
                 decreasing = TRUE)
  head(ranked, 20)
})
print(counts)
Output
## $Qualifier
## 
##   <NA>    NOT 
## 252320   1452 
## 
## $Evidence
## 
##    TAS    PCS    IEA   <NA> 
## 141758  71619  40395      0 
## 
## $Sex
## 
##   <NA>   MALE FEMALE   male female 
## 253149    335    165     90     33 
## 
## $Aspect
## 
##      P      I      C      M   <NA> 
## 239020   8833   5743    176      0 
## 
## $Biocuration
## 
## ORPHA:orphadata[2022-12-20]         HPO:iea[2009-02-17] 
##                      112897                       26965 
##    HPO:skoehler[2017-07-13]    HPO:skoehler[2012-10-17] 
##                        4264                        4038 
##    HPO:skoehler[2018-10-08]   HPO:probinson[2009-02-17] 
##                        3701                        3277 
##    HPO:skoehler[2019-04-18]    HPO:skoehler[2010-06-20] 
##                        1722                        1675 
##    HPO:skoehler[2015-12-30]    HPO:skoehler[2019-09-07] 
##                        1434                        1404 
##    HPO:skoehler[2010-06-19]    HPO:skoehler[2009-02-17] 
##                        1346                         932 
##    HPO:skoehler[2010-06-18]    HPO:skoehler[2019-02-15] 
##                         823                         796 
##    HPO:skoehler[2012-11-18]         HPO:iea[2012-04-24] 
##                         597                         535 
##   HPO:probinson[2017-06-17]    HPO:skoehler[2014-11-26] 
##                         533                         507 
##    HPO:skoehler[2014-01-28]   HPO:probinson[2022-05-08] 
##                         446                         404 
## 
## $DiseaseDB
## 
##     OMIM    ORPHA DECIPHER     <NA> 
##   140579   112897      296        0 
## 
## $ancestor_name
## 
##              Abnormality of the nervous system 
##                                          51932 
##      Abnormality of the musculoskeletal system 
##                                          33225 
##                    Abnormality of head or neck 
##                                          31255 
##                         Abnormality of the eye 
##                                          18445 
##                  Abnormality of the integument 
##                                          13162 
##       Abnormality of the cardiovascular system 
##                                          11712 
##            Abnormality of the digestive system 
##                                          11350 
##        Abnormality of the genitourinary system 
##                                          10810 
##          Abnormality of metabolism/homeostasis 
##                                          10003 
##                           Abnormality of limbs 
##                                           9948 
##                          Mendelian inheritance 
##                                           8247 
##                             Growth abnormality 
##                                           6827 
##                         Abnormality of the ear 
##                                           6390 
## Abnormality of blood and blood-forming tissues 
##                                           6059 
##          Abnormality of the respiratory system 
##                                           5901 
##                                Clinical course 
##                                           5743 
##               Abnormality of the immune system 
##                                           3687 
##            Abnormality of the endocrine system 
##                                           3529 
##                         Constitutional symptom 
##                                           1543 
##   Abnormality of prenatal development or birth 
##                                           1305 
## 
## $AgeOfDeath_names
## 
##               Stillbirth           Prenatal death         Death in infancy 
##                      794                      787                      702 
##       Death in adulthood           Neonatal death Death in early adulthood 
##                      665                      657                      645 
##     Death in adolescence              Miscarriage       Death in childhood 
##                      600                      587                      519 
##      Death in middle age                     <NA> 
##                       20                        0 
## 
## $AgeOfDeath_earliest
## 
##                     <NA>              Miscarriage Death in early adulthood 
##                   249575                     1381                      633 
##           Neonatal death         Death in infancy     Death in adolescence 
##                      605                      576                      552 
##       Death in childhood      Death in middle age 
##                      430                       20 
## 
## $AgeOfDeath_latest
## 
##                 <NA>          Miscarriage  Death in middle age 
##               249575                 1275                  665 
##     Death in infancy Death in adolescence       Neonatal death 
##                  613                  588                  585 
##   Death in childhood 
##                  471 
## 
## $AgeOfDeath_score_min
## 
##   <NA>      1      6      2      3      5      4      7 
## 249575   1381    633    605    576    552    430     20 
## 
## $AgeOfDeath_score_max
## 
##   <NA>      1      7      3      5      2      4 
## 249575   1275    665    613    588    585    471 
## 
## $AgeOfDeath_score_mean
## 
##             <NA>                1              6.5                5 
##           249575             1275              633              540 
##                3                2                4 1.66666666666667 
##              537              533              456               54 
## 1.33333333333333              3.5              2.5                7 
##               52               49               36               20 
##                6 
##               12 
## 
## $pheno_freq_name
## 
##                   <NA>                  ratio     Occasional (29-5%) 
##                  78058                  58714                  44519 
##      Frequent (79-30%) Very frequent (99-80%)      Very rare (<4-1%) 
##                  38245                  26230                   7150 
##        Obligate (100%)             percentage 
##                    667                    189 
## 
## $pheno_freq_min
## 
##             <NA>                5               30               80 
##            78058            44580            38428            26772 
##              100                1               50 33.3333333333333 
##            23382             7152             5340             3560 
##               25               20 66.6666666666667 16.6666666666667 
##             2540             1912             1716             1377 
## 14.2857142857143               40               75             12.5 
##             1157             1002              900              877 
##                0               60 11.1111111111111 8.33333333333333 
##              668              648              614              561 
## 
## $pheno_freq_max
## 
##             <NA>               29               79               99 
##            78058            44519            38247            26230 
##              100                4               50 33.3333333333333 
##            23382             7172             5340             3560 
##               25               20 66.6666666666667 16.6666666666667 
##             2540             1912             1716             1377 
## 14.2857142857143               40               75             12.5 
##             1157             1002              900              877 
##                0               60 11.1111111111111 8.33333333333333 
##              668              648              614              561 
## 
## $Onset_name
## 
##              <NA>  Congenital onset   Infantile onset    Neonatal onset 
##            249948              1100               743               480 
##   Childhood onset    Juvenile onset       Fetal onset       Adult onset 
##               444               289               265               185 
## Young adult onset   Antenatal onset  Middle age onset        Late onset 
##               105                92                63                58 
## 
## $Modifier_name
## 
##                         <NA>                         Mild 
##                       252690                          372 
##                       Severe            Triggered by cold 
##                          201                           66 
##                  Progressive                    Recurrent 
##                           63                           58 
##                     Episodic                   Unilateral 
##                           49                           39 
##                    Bilateral                     Profound 
##                           33                           32 
##                   Refractory Triggered by febrile illness 
##                           20                           18 
##                      Chronic                     Moderate 
##                           16                           15 
##        Triggered by exertion   Triggered by EBV infection 
##                           15                           12 
##     Triggered by anesthetics                       Distal 
##                           10                            9 
##                  Generalized                     Proximal 
##                            8                            7 
## 
## $Severity_score
## 
##   <NA>      4      2      1      3 
## 253165    372    201     32      2 
## 
## $tier
## 
##   <NA>      2      1      3      4 
## 234019   8706   6645   4294    108 
## 
## $tier_auto
## 
##   <NA>      3      1      2      4 
## 210507  24828  11161   6968    308 
## 
## $tier_merge
## 
##   <NA>      3      2      1      4 
## 197518  25591  15464  14891    308

Count X per Y

Count:

  • diseases per phenotype
  • phenotypes per disease
  • genes per phenotype
  • genes per disease

phenotype.hpoa file

Source
## Load the annotation table; the message emitted by this call reports
## that index 3 imports the phenotype.hpoa file.
annot <- HPOExplorer::load_phenotype_to_genes(3)
Message
## Importing existing file: ... phenotype.hpoa

Number of phenotypes/disease

Source
## Number of distinct HPO terms annotated to each disease ID
counts1 <- annot[, .(n_phenotypes = data.table::uniqueN(HPO_ID)),
                 by = DatabaseID]
hist(counts1$n_phenotypes)

Source
## Distribution summary of phenotypes per disease
summary(counts1[["n_phenotypes"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   15.00   19.97   27.00  188.00

Number of diseases/phenotype

Source
## Number of distinct disease IDs annotated with each HPO term
counts2 <- annot[, .(n_diseases = data.table::uniqueN(DatabaseID)),
                 by = HPO_ID]
hist(counts2$n_diseases, 50)

Source
## Distribution summary of diseases per phenotype
summary(counts2[["n_diseases"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    4.00   23.91   14.00 4125.00

genes_to_phenotype.txt file

http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt provides a link between genes and HPO terms. All phenotype terms associated with any disease that is associated with variants in a gene are assigned to that gene in this file. Other files are available on our Jenkins server that filter terms according to provenance of the annotation and frequency of the features in the disease.

Source
## NOTE(review): the message emitted by this call reports importing
## phenotype_to_genes.txt, whereas this section is titled
## "genes_to_phenotype.txt file" -- confirm index 1 selects the intended file.
annot <- HPOExplorer::load_phenotype_to_genes(1)
Message
## Importing existing file: ... phenotype_to_genes.txt
Source
## Rename LinkID to DatabaseID (modifies annot by reference)
data.table::setnames(annot, old = "LinkID", new = "DatabaseID")

Number of phenotypes/disease

Source
## Distinct HPO terms and distinct genes for each DatabaseID
counts <- annot[, .(n_phenotypes = data.table::uniqueN(HPO_ID),
                    n_genes = data.table::uniqueN(Gene)),
                by = DatabaseID]
hist(counts$n_phenotypes)

Source
## Distribution summary of phenotypes per DatabaseID
summary(counts[["n_phenotypes"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   30.50   63.00   79.33  110.00  568.00
Source
## Histogram of the number of distinct genes per DatabaseID
hist(counts$n_genes)

Source
## Distribution summary of genes per DatabaseID
summary(counts[["n_genes"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.415   1.000  82.000

Number of diseases/phenotype

Source
## Distinct DatabaseIDs and distinct genes for each HPO term
counts <- annot[, .(n_diseases = data.table::uniqueN(DatabaseID),
                    n_genes = data.table::uniqueN(Gene)),
                by = HPO_ID]
hist(counts$n_diseases)

Source
## Distribution summary of DatabaseIDs per HPO term
summary(counts[["n_diseases"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    6.00   64.27   25.00 5182.00
Source
## Histogram of the number of distinct genes per HPO term
hist(counts$n_genes)

Source
## Distribution summary of genes per HPO term
summary(counts[["n_genes"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    7.00   67.74   34.00 3484.00

phenotype_to_genes.txt file

http://purl.obolibrary.org/obo/hp/hpoa/phenotype_to_genes.txt is analogous, but instead provides links from HPO terms to genes.

Source
## NOTE(review): the message emitted by this call reports importing
## genes_to_phenotype.txt, whereas this section is titled
## "phenotype_to_genes.txt file" -- confirm index 2 selects the intended file.
annot <- HPOExplorer::load_phenotype_to_genes(2)
Message
## Importing existing file: ... genes_to_phenotype.txt
Source
## Rename LinkID to DatabaseID (modifies annot by reference)
data.table::setnames(annot, old = "LinkID", new = "DatabaseID")

Number of phenotypes/disease

Source
## Distinct HPO terms and distinct genes for each DatabaseID
counts <- annot[, .(n_phenotypes = data.table::uniqueN(HPO_ID),
                    n_genes = data.table::uniqueN(Gene)),
                by = DatabaseID]
hist(counts$n_phenotypes)

Source
## Distribution summary of phenotypes per DatabaseID
summary(counts[["n_phenotypes"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    8.00   17.00   20.78   28.00  172.00
Source
## Histogram of the number of distinct genes per DatabaseID
hist(counts$n_genes)

Source
## Distribution summary of genes per DatabaseID
summary(counts[["n_genes"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.399   1.000  84.000

Number of diseases/phenotype

Source
## Distinct DatabaseIDs and distinct genes for each HPO term
counts <- annot[, .(n_diseases = data.table::uniqueN(DatabaseID),
                    n_genes = data.table::uniqueN(Gene)),
                by = HPO_ID]
hist(counts$n_diseases)

Source
## Distribution summary of DatabaseIDs per HPO term
summary(counts[["n_diseases"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    4.00   18.91   11.00 3305.00
Source
## Histogram of the number of distinct genes per HPO term
hist(counts$n_genes)

Source
## Distribution summary of genes per HPO term
summary(counts[["n_genes"]])
Output
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    5.00   23.59   17.00 2924.00

Session info

Source
## Record the R version, platform and package versions used for this run
sessioninfo::session_info()
Output
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.2.1 (2022-06-23)
##  os       macOS Big Sur ... 10.16
##  system   x86_64, darwin17.0
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       Europe/London
##  date     2023-03-22
##  pandoc   3.1 @ /usr/local/bin/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package        * version date (UTC) lib source
##  bslib            0.4.2   2022-12-16 [1] CRAN (R 4.2.1)
##  cachem           1.0.7   2023-02-24 [1] CRAN (R 4.2.0)
##  cli              3.6.0   2023-01-09 [1] CRAN (R 4.2.0)
##  coda             0.19-4  2020-09-30 [1] CRAN (R 4.2.0)
##  colorspace       2.1-0   2023-01-23 [1] CRAN (R 4.2.1)
##  crayon           1.5.2   2022-09-29 [1] CRAN (R 4.2.0)
##  data.table     * 1.14.8  2023-02-17 [1] CRAN (R 4.2.0)
##  digest           0.6.31  2022-12-11 [1] CRAN (R 4.2.0)
##  dplyr            1.1.0   2023-01-29 [1] CRAN (R 4.2.1)
##  evaluate         0.20    2023-01-17 [1] CRAN (R 4.2.1)
##  fansi            1.0.4   2023-01-22 [1] CRAN (R 4.2.1)
##  farver           2.1.1   2022-07-06 [1] CRAN (R 4.2.0)
##  fastmap          1.1.1   2023-02-24 [1] CRAN (R 4.2.0)
##  generics         0.1.3   2022-07-05 [1] CRAN (R 4.2.0)
##  ggnetwork        0.5.12  2023-03-06 [1] CRAN (R 4.2.1)
##  ggplot2          3.4.1   2023-02-10 [1] CRAN (R 4.2.0)
##  glue             1.6.2   2022-02-24 [1] CRAN (R 4.2.0)
##  gtable           0.3.1   2022-09-01 [1] CRAN (R 4.2.0)
##  highr            0.10    2022-12-22 [1] CRAN (R 4.2.1)
##  HPOExplorer      0.99.7  2023-03-22 [1] Bioconductor
##  htmltools        0.5.4   2022-12-07 [1] CRAN (R 4.2.0)
##  htmlwidgets      1.6.1   2023-01-07 [1] CRAN (R 4.2.0)
##  httr             1.4.5   2023-02-24 [1] CRAN (R 4.2.0)
##  jquerylib        0.1.4   2021-04-26 [1] CRAN (R 4.2.0)
##  jsonlite         1.8.4   2022-12-06 [1] CRAN (R 4.2.0)
##  knitr            1.42    2023-01-25 [1] CRAN (R 4.2.1)
##  labeling         0.4.2   2020-10-20 [1] CRAN (R 4.2.0)
##  lattice          0.20-45 2021-09-22 [1] CRAN (R 4.2.1)
##  lazyeval         0.2.2   2019-03-15 [1] CRAN (R 4.2.0)
##  lifecycle        1.0.3   2022-10-07 [1] CRAN (R 4.2.0)
##  magrittr         2.0.3   2022-03-30 [1] CRAN (R 4.2.0)
##  minidown         0.4.0   2022-02-08 [1] CRAN (R 4.2.0)
##  munsell          0.5.0   2018-06-12 [1] CRAN (R 4.2.0)
##  network          1.18.1  2023-01-24 [1] CRAN (R 4.2.1)
##  ontologyIndex    2.10    2022-08-24 [1] CRAN (R 4.2.0)
##  patchwork        1.1.2   2022-08-19 [1] CRAN (R 4.2.0)
##  pillar           1.8.1   2022-08-19 [1] CRAN (R 4.2.0)
##  pkgconfig        2.0.3   2019-09-22 [1] CRAN (R 4.2.0)
##  plotly           4.10.1  2022-11-07 [1] CRAN (R 4.2.1)
##  purrr            1.0.1   2023-01-10 [1] CRAN (R 4.2.0)
##  R6               2.5.1   2021-08-19 [1] CRAN (R 4.2.0)
##  rlang            1.1.0   2023-03-14 [1] CRAN (R 4.2.0)
##  rmarkdown        2.20.1  2023-02-16 [1] Github (rstudio/rmarkdown@a75dc37)
##  rstudioapi       0.14    2022-08-22 [1] CRAN (R 4.2.0)
##  sass             0.4.5   2023-01-24 [1] CRAN (R 4.2.0)
##  scales           1.2.1   2022-08-20 [1] CRAN (R 4.2.0)
##  sessioninfo      1.2.2   2021-12-06 [1] CRAN (R 4.2.0)
##  statnet.common   4.8.0   2023-01-24 [1] CRAN (R 4.2.1)
##  stringi          1.7.12  2023-01-11 [1] CRAN (R 4.2.0)
##  stringr          1.5.0   2022-12-02 [1] CRAN (R 4.2.0)
##  tibble           3.2.0   2023-03-08 [1] CRAN (R 4.2.0)
##  tidyr            1.3.0   2023-01-24 [1] CRAN (R 4.2.1)
##  tidyselect       1.2.0   2022-10-10 [1] CRAN (R 4.2.0)
##  utf8             1.2.3   2023-01-31 [1] CRAN (R 4.2.1)
##  vctrs            0.6.0   2023-03-16 [1] CRAN (R 4.2.1)
##  viridisLite      0.4.1   2022-08-22 [1] CRAN (R 4.2.0)
##  withr            2.5.0   2022-03-03 [1] CRAN (R 4.2.0)
##  xfun             0.37    2023-01-31 [1] CRAN (R 4.2.1)
##  yaml             2.3.7   2023-01-23 [1] CRAN (R 4.2.1)
## 
##  [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
## 
## ──────────────────────────────────────────────────────────────────────────────