Functions to add metadata to data.table objects.

add_ancestor(
  phenos,
  lvl = 2,
  hpo = get_hpo(),
  keep_descendants = NULL,
  remove_descendants = NULL,
  force_new = FALSE
)

add_death(
  phenos,
  keep_deaths = NULL,
  all.x = TRUE,
  allow.cartesian = FALSE,
  agg_by = NULL
)

add_disease(
  phenos,
  extra_cols = NULL,
  all.x = TRUE,
  allow.cartesian = FALSE,
  add_definitions = FALSE
)

add_disease_genes(phenos, all.x = TRUE, verbose = TRUE)

add_evidence(
  phenos,
  evidence_score_threshold = NULL,
  evidence_score_threshold_metric = "evidence_score_sum",
  all.x = TRUE,
  allow.cartesian = FALSE,
  agg_by = c("disease_id", "gene_symbol"),
  default_score = 1,
  ...
)

add_gene_frequency(
  phenotype_to_genes = load_phenotype_to_genes(),
  gene_frequency_threshold = NULL,
  all.x = TRUE,
  allow.cartesian = FALSE,
  verbose = TRUE
)

add_genes(
  phenos = NULL,
  phenotype_to_genes = load_phenotype_to_genes(),
  hpo = get_hpo(),
  by = c("hpo_id", "disease_id"),
  gene_col = "gene_symbol",
  all.x = FALSE,
  allow.cartesian = FALSE
)

add_gpt_annotations(
  phenos,
  annot = gpt_annot_codify(reset_weights_dict = TRUE)$annot_weighted,
  annot_cols = names(annot)[!names(annot) %in% c("hpo_id", "hpo_name")],
  gpt_filters = `names<-`(rep(list(NULL), length(annot_cols)), annot_cols),
  force_new = FALSE
)

add_hpo_definition(
  phenos,
  hpo = get_hpo(),
  line_length = FALSE,
  use_api = FALSE,
  verbose = TRUE
)

add_hpo_id(phenos, hpo = get_hpo(), ignore_case = TRUE)

add_hpo_name(phenos, hpo = get_hpo())

add_info_content(phenos, hpo = get_hpo())

add_mondo(phenos, input_col = "disease_id", map_to = "hpo", ...)

add_ndisease(
  phenos,
  pheno_ndiseases_threshold = NULL,
  all.x = TRUE,
  allow.cartesian = FALSE,
  verbose = TRUE
)

add_omop(
  phenos,
  input_col = "hpo_id",
  all.x = TRUE,
  allow.cartesian = FALSE,
  force_new = FALSE,
  verbose = TRUE
)

add_onset(
  phenos,
  keep_onsets = NULL,
  agg_by = NULL,
  all.x = TRUE,
  allow.cartesian = FALSE
)

add_ont_lvl(
  phenos,
  hpo = get_hpo(),
  absolute = TRUE,
  keep_ont_levels = NULL,
  ...
)

add_pheno_frequency(
  phenos,
  pheno_frequency_threshold = NULL,
  all.x = TRUE,
  allow.cartesian = FALSE
)

add_prevalence(
  phenos,
  input_col = "disease_id",
  drop_na = TRUE,
  method = "orphanet"
)

add_severity(
  phenos,
  hpo = get_hpo(),
  all.x = TRUE,
  allow.cartesian = FALSE,
  severity_threshold = NULL
)

add_tier(
  phenos,
  all.x = TRUE,
  include_disease_characteristics = TRUE,
  auto_assign = TRUE,
  hpo = get_hpo(),
  keep_tiers = NULL,
  verbose = TRUE
)

Arguments

phenos

A data.table containing HPO IDs and other metadata.

lvl

How many levels deep into the ontology to get ancestors from. For example:

  • 1: "All"

  • 2: "Phenotypic abnormality"

  • 3: "Abnormality of the nervous system"

  • 4: "Abnormality of nervous system physiology"

  • 5: "Neurodevelopmental abnormality" or "Behavioral abnormality"

hpo

Human Phenotype Ontology object, loaded from get_ontology.

keep_descendants

Terms whose descendants should be kept (including themselves). Set to NULL (default) to skip this filtering step.

remove_descendants

Terms whose descendants should be removed (including themselves). Set to NULL (default) to skip this filtering step.

force_new

Force a new query to the OARD API instead of using pre-downloaded data.

keep_deaths

The age of death associated with each HPO ID to keep. If >1 age of death is associated with the term, only the earliest age is considered. See add_death for details.

all.x

logical; if TRUE, rows from x which have no matching row in y are included. These rows will have 'NA's in the columns that are usually filled with values from y. If FALSE, only rows with data from both x and y are included in the output. Note that most functions documented here default to all.x = TRUE; add_genes defaults to all.x = FALSE.

allow.cartesian

See allow.cartesian in [.data.table.

agg_by

Column to aggregate age of onset metadata by.

extra_cols

Extra metadata columns from the "phenotype.hpoa" annotations file to include. See here for column descriptions.

add_definitions

Add disease definitions using add_mondo.

verbose

Print messages.

evidence_score_threshold

The minimum threshold of mean evidence scores of each gene-phenotype association to keep.

evidence_score_threshold_metric

The metric to use for filtering with evidence_score_threshold.

default_score

Default evidence score to apply to gene-disease associations that are present in the HPO annotations but don't have evidence scores in the GenCC annotations.

...

Arguments passed on to KGExplorer::get_gencc, KGExplorer::map_mondo, KGExplorer::get_ontology_levels

save_dir

Directory to save a file to.

dict

A named vector of evidence score mappings. See here for more information.

dat

data.table with genes.

output_col

Column name of output IDs.

to

Character vector of database(s) to map IDs to. When not "mondo", can supply multiple alternative databases to map to (e.g. c("OMIM","Orphanet","DECIPHER")).

map_types

Mapping types to include.

top_n

Top number of mappings to return per top_by grouping. Set to NULL to skip this step.

add_name

Logical, if TRUE, add mondo name column.

ont

An ontology of class ontology_DAG.

terms

A vector of ontology term IDs.

remove_terms

Character vector of term IDs to exclude.

reverse

If TRUE, ontology level numbers will be reversed such that the levels of the parent terms are larger than those of the child terms.

phenotype_to_genes

Output of load_phenotype_to_genes mapping phenotypes to gene annotations.

gene_frequency_threshold

Only keep genes with frequency above the set threshold. Frequency ranges from 0-100 where 100 is a gene that occurs 100% of the time in a given phenotype. Include NA if you wish to retain genes that do not have any frequency data. See add_gene_frequency for details.

by

A vector of shared column names in x and y to merge on. This defaults to the shared key columns between the two tables. If y has no key columns, this defaults to the key of x.

gene_col

Name of the gene column.

annot

GPT annotation data.

annot_cols

Columns to add.

gpt_filters

A named list of filters to apply to the GPT annotations.

line_length

The number of desired words per line (integer).

use_api

Get definitions from the HPO API, as opposed to a static local dataset.

ignore_case

Ignore case when mapping terms.

input_col

Name of the column containing the disease or phenotype IDs.

map_to

Mapping outputs to include (from Mondo IDs to another database's IDs).

pheno_ndiseases_threshold

Filter phenotypes by the maximum number of diseases they are associated with.

keep_onsets

The age of onset associated with each HPO ID to keep. If >1 age of onset is associated with the term, only the earliest age is considered. See add_onset for details.

absolute

Make the levels absolute in the sense that they consider the entire ontology (TRUE). Otherwise, levels will be relative to only the terms that are in the provided subset of terms AND are directly adjacent (connected) to a given cluster of terms (FALSE).

keep_ont_levels

Only keep phenotypes at certain absolute ontology levels. See add_ont_lvl for details.

pheno_frequency_threshold

Only keep phenotypes with frequency above the set threshold. Frequency ranges from 0-100 where 100 is a phenotype that occurs 100% of the time in all associated diseases. Include NA if you wish to retain phenotypes that do not have any frequency data. See add_pheno_frequency for details.

drop_na

Whether to drop rows with missing prevalence data.

method

One of "orphanet" or "oard".

severity_threshold

Only keep phenotypes with a mean severity score (averaged across multiple associated diseases) below the set threshold. The severity score ranges from 1-4 where 1 is the MOST severe. Include NA if you wish to retain phenotypes that do not have any severity score.

include_disease_characteristics

Include phenotypes that are also high-level disease characteristics.

auto_assign

Automatically assign HPO IDs to Tiers by conducting regex searches for keywords that appear in the term name, or the names of its descendants or ancestors.

keep_tiers

Tiers from hpo_tiers to keep. Include NA if you wish to retain phenotypes that do not have any Tier assignment.

Value

Annotated data.

phenos data.table with extra columns:

  • "AgeOfDeath": AgeOfDeath HPO IDs of disease phenotypes associated with the target hpo_id phenotype.

  • "AgeOfDeath_names": AgeOfDeath HPO names of disease phenotypes associated with the target hpo_id phenotype.

  • "AgeOfDeath_counts": The number of times each term in "AgeOfDeath_names" appears across associated disease phenotypes.

  • "AgeOfDeath_score_mean": Mean age of death score.

  • "AgeOfDeath_score_min": Minimum age of death score.

  • "AgeOfDeath_top": The most common age of death term.

  • "AgeOfDeath_earliest": The earliest age of death.

  • "AgeOfDeath_latest": The latest age of death.

phenos data.table with extra columns:

  • "evidence_score_min": Minimum evidence score.

  • "evidence_score_max": Maximum evidence score.

  • "evidence_score_mean": Mean evidence score.

phenos data.table with extra column

A named vector of HPO term descriptions.

phenos data.table with extra column

phenos data.table with extra columns.

phenos data.table with extra columns

phenos data.table with extra columns:

  • "onset": onset HPO IDs of disease phenotypes associated with the target hpo_id phenotype.

  • "onset_names": onset HPO names of disease phenotypes associated with the target hpo_id phenotype.

  • "onset_counts": The number of times each term in "onset_names" appears across associated disease phenotypes.

  • "onset_score_mean": Mean onset score.

  • "onset_score_min": Minimum onset score.

  • "onset_top": The most common onset term.

  • "onset_earliest": The earliest age of onset.

  • "onset_latest": The latest age of onset.

phenos data.table with extra column

phenos data.table with extra column

phenos data.table with extra columns

phenos data.table with extra column

Functions

  • add_ancestor(): add_ Add ancestor

    Assign each HPO ID to the higher-order ancestral term that it is part of.

  • add_death(): add_ Add age of death

    Add age of death for each HPO ID. AgeOfDeath IDs and assigned "AgeOfDeath_score" values:

    • HP:0005268 "Miscarriage" (AgeOfDeath_score=1)

    • HP:0003826 "Stillbirth" (AgeOfDeath_score=1)

    • HP:0034241 "Prenatal death" (AgeOfDeath_score=1)

    • HP:0003811 "Neonatal death" (AgeOfDeath_score=2)

    • HP:0001522 "Death in infancy" (AgeOfDeath_score=3)

    • HP:0003819 "Death in childhood" (AgeOfDeath_score=4)

    • HP:0011421 "Death in adolescence" (AgeOfDeath_score=5)

    • HP:0100613 "Death in early adulthood" (AgeOfDeath_score=6)

    • HP:0033764 "Death in middle age" (AgeOfDeath_score=7)

    • HP:0033763 "Death in adulthood" (AgeOfDeath_score=7)

    • HP:0033765 "Death in late adulthood" (AgeOfDeath_score=8)

  • add_disease(): add_ Add diseases

    Annotate each HPO term with diseases that they are associated with.

  • add_disease_genes(): add_ Add disease genes

    Add genes that overlap between an HPO ID and an associated phenotype.

  • add_evidence(): add_ Add evidence

    Add the strength of evidence supporting each gene-disease association. Delphi survey evidence classification IDs and assigned "evidence_score" values:

    • GENCC:100001 "Definitive" (evidence_score=6)

    • GENCC:100002 "Strong" (evidence_score=5)

    • GENCC:100003 "Moderate" (evidence_score=4)

    • GENCC:100009 "Supportive" (evidence_score=3)

    • GENCC:100004 "Limited" (evidence_score=2)

    • GENCC:100005 "Disputed Evidence" (evidence_score=1)

    • GENCC:100006 "Refuted Evidence" (evidence_score=0)

    • GENCC:100008 "No Known Disease Relationship" (evidence_score=0)

  • add_gene_frequency(): add_ Add gene frequency

    Add gene-level frequency, i.e. how often mutations in a given gene are associated with a given phenotype. Numeric frequency columns are on a 0-100% scale.

  • add_genes(): add_ Add genes

    Add genes associated with each phenotype (in the context of a particular disease).

  • add_gpt_annotations(): add_ Add GPT annotations

    Add annotations generated with a Large Language Model.

  • add_hpo_definition(): add_ Get term definition

    This function accesses the HPO API to get a description/definition of an HPO term. If a line_length > 0 is passed to the function, it will add newlines every nth word. This can be useful when displaying the description in plots with limited space.

  • add_hpo_id(): add_ Add HPO ID column to dataframe

    Adds the HPO term ID column "hpo_id".

  • add_hpo_name(): add_ Add HPO name column to dataframe

    Adds the HPO term name column "hpo_name".

  • add_info_content(): add_ Add information content

    Add a column containing the information content score for each HPO ID.

  • add_mondo(): add_ Add Mondo metadata

    Add Mondo metadata (MONDO ID mappings, names, and definitions) for diseases using files from their respective databases: e.g. OMIM, DECIPHER, Orphanet.

  • add_ndisease(): add_ Add N diseases

    Annotate each HPO term with the total number of diseases they are associated with.

  • add_omop(): add_ Add OMOP

    Add metadata from OMOP, including:

    • OMOP_ID: OMOP concept ID.

    • OMOP_NAME: OMOP concept name.

  • add_onset(): add_ Add age of onset

    Add age of onset for each HPO ID. onset IDs and assigned "onset_score" values:

    • HP:0011461 "Fetal onset" (onset_score=1)

    • HP:0030674 "Antenatal onset" (onset_score=2)

    • HP:0003577 "Congenital onset" (onset_score=3)

    • HP:0003623 "Neonatal onset" (onset_score=4)

    • HP:0003593 "Infantile onset" (onset_score=5)

    • HP:0011463 "Childhood onset" (onset_score=6)

    • HP:0003621 "Juvenile onset" (onset_score=7)

    • HP:0011462 "Young adult onset" (onset_score=8)

    • HP:0003581 "Adult onset" (onset_score=9)

    • HP:0003596 "Middle age onset" (onset_score=10)

    • HP:0003584 "Late onset" (onset_score=11)

  • add_ont_lvl(): add_ Add ontology level

    Add the ontology level for each HPO ID (absolute by default; relative when absolute = FALSE).

  • add_pheno_frequency(): add_ Add phenotype frequency

    Add phenotype-level frequency, i.e. how often a phenotype occurs in a given disease.

  • add_prevalence(): add_ Add prevalence

    Add a column containing the prevalence score for each disease ("disease_id") or phenotype ("hpo_id").

  • add_severity(): add_ Add HPO modifiers

    Annotate each HPO with modifier terms, including (but not limited to) progression and severity ratings. In order of increasing severity:

    • HP:0012825 "Mild" (Severity_score=4)

    • HP:0012827 "Borderline" (Severity_score=3)

    • HP:0012828 "Severe" (Severity_score=2)

    • HP:0012829 "Profound" (Severity_score=1)

  • add_tier(): add_ Add severity Tiers

    Add severity Tier for each HPO ID, in accordance with the rating system provided by Lazarin et al (2014). In order of increasing severity:

    • Tier 4 Reduced fertility

    • Tier 3 Sensory impairment: vision, Immunodeficiency/cancer, Sensory impairment: hearing, Sensory impairment: touch, other (including pain), Mental illness, Dysmorphic features

    • Tier 2 Shortened life span: premature adulthood, Impaired mobility, Internal physical malformation

    • Tier 1 Shortened life span: infancy, Shortened life span: childhood/adolescence, Intellectual disability

Examples

phenos <- example_phenos()
phenos2 <- add_ancestor(phenos = phenos, lvl=5)
#> Adding level-5 ancestor to each HPO ID.
#> Adding ancestor metadata.
#> Ancestor metadata already present. Use force_new=TRUE to overwrite.
#> 10 associations remain after filtering.
phenos <- example_phenos()
phenos2 <- add_death(phenos = phenos)
#> Annotating phenos with AgeOfDeath.
#> Annotating phenos with Disease
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
phenos <- example_phenos()
phenos2 <- add_disease(phenos = phenos)
#> Annotating phenos with Disease
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
if (FALSE) {
phenos <- load_phenotype_to_genes()
phenos2 <- add_severity(phenos = phenos)
}
phenos <- load_phenotype_to_genes()
#> Reading cached RDS file: phenotype_to_genes.txt
#> + Version: v2024-04-26
phenos2 <- add_evidence(phenos = phenos)
#> Annotating gene-disease associations with Evidence Score
#> Annotating phenos with Disease
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
#> Gathering data from GenCC.
#> Importing cached file.
#> Evidence scores for: 
#>  - 10509 diseases 
#>  - 5165 genes
#> + Version: 2024-05-22
phenotype_to_genes <- load_phenotype_to_genes()[seq(1000),]
#> Reading cached RDS file: phenotype_to_genes.txt
#> + Version: v2024-04-26
phenos2 <- add_gene_frequency(phenotype_to_genes = phenotype_to_genes)
#> Annotating gene frequencies.
#> Reading cached RDS file: genes_to_phenotype.txt
#> + Version: v2024-04-26
phenos <- example_phenos()
phenos2 <- add_genes(phenos = phenos)
#> Reading cached RDS file: phenotype_to_genes.txt
#> + Version: v2024-04-26
#> Annotating phenos with Disease
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
phenos <- example_phenos()
phenos2 <- add_gpt_annotations(phenos)
#> Loading required namespace: piggyback
#> Translating ontology terms to ids.
#> Reading cached RDS file: phenotype_to_genes.txt
#> + Version: v2024-04-26
#> 256 phenotypes do not have matching HPO IDs.
#> Reading in GPT annotations for 16,879 phenotypes.
phenos <- example_phenos()
phenos2 <- add_hpo_definition(phenos = phenos)
#> Adding term definitions.
phenotype_to_genes <- load_phenotype_to_genes()
#> Reading cached RDS file: phenotype_to_genes.txt
#> + Version: v2024-04-26
phenos <- unique(phenotype_to_genes[,c("hpo_id","hpo_name")])
phenos2 <- add_hpo_id(phenos=phenos)
phenos <- example_phenos()
phenos2 <- add_hpo_name(phenos=phenos)
#> Adding HPO names.
#> Translating ontology terms to names.
phenos <- example_phenos()
phenos2 <- add_info_content(phenos = phenos)
#> Adding information_content scores.
phenos <- load_phenotype_to_genes(3)[seq(1000)]
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
phenos2 <- add_mondo(phenos = phenos)
#> Loading required namespace: downloadR
#> Loading required namespace: echogithub
#> Mapping disease_id --> mondo_id
#> Loading cached ontology: /github/home/.cache/R/KGExplorer/mondo.rds
#> 0 / 60 (0%) mondo_id missing.
#> 0 / 60 (0%) mondo_name missing.
#> 20 / 60 (33.33%) mondo_def missing.
phenos <- example_phenos()
phenos2 <- add_ndisease(phenos = phenos)
#> Annotating phenos with n_diseases
#> Reading cached RDS file: phenotype_to_genes.txt
#> + Version: v2024-04-26
#> Reading cached RDS file: genes_to_phenotype.txt
#> + Version: v2024-04-26
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
phenos <- example_phenos()
phenos2 <- add_omop(phenos = phenos)
#> Annotating phenos with OMOP metadata.
#> 0 / 10 (0%) OMOP_ID missing.
#> 0 / 10 (0%) OMOP_NAME missing.
phenos <- example_phenos()
phenos2 <- add_onset(phenos = phenos)
#> Annotating phenos with onset.
#> Annotating phenos with Disease
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
phenos <- make_phenos_dataframe(ancestor = "Neurodevelopmental delay")
#> Reading cached RDS file: phenotype_to_genes.txt
#> + Version: v2024-04-26
#> Extracting data for 23 descendents.
#> Computing gene counts.
#> Adding term definitions.
#> Adding level-2 ancestor to each HPO ID.
#> Adding ancestor metadata.
#> Ancestor metadata already present. Use force_new=TRUE to overwrite.
#> 23 associations remain after filtering.
#> Getting absolute ontology level for 18,536 IDs.
#> Computing ontology level / gene count ratio.
phenos2 <- add_ont_lvl(phenos = phenos)
phenos <- example_phenos()
phenos2 <- add_pheno_frequency(phenos = phenos)
#> Annotating phenotype frequencies.
#> Annotating phenos with Disease
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
phenos <- example_phenos()
phenos2 <- add_prevalence(phenos = phenos)
#> Annotating phenos with Disease
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
#> Mapping disease_id --> mondo_id
#> Loading cached ontology: /github/home/.cache/R/KGExplorer/mondo.rds
#> 15 / 7,613 (0.2%) mondo_id missing.
#> 15 / 7,613 (0.2%) mondo_name missing.
#> 3,169 / 7,613 (41.63%) mondo_def missing.
#> Mapping disease_id --> mondo_id
#> Loading cached ontology: /github/home/.cache/R/KGExplorer/mondo.rds
#> 24 / 6,089 (0.39%) mondo_id missing.
#> 24 / 6,089 (0.39%) mondo_name missing.
#> 1,099 / 6,089 (18.05%) mondo_def missing.
#> Prevalence added for 0 / 7,613 disease_id IDs (0%)
#> Prevalence added for 0 / 10 hpo_id IDs (0%)
#> Prevalence added for 0 / 7,517 mondo_id IDs (0%)
phenos <- example_phenos()
phenos2 <- add_severity(phenos = phenos)
#> Annotating phenos with modifiers
#> Annotating phenos with Disease
#> Reading cached RDS file: phenotype.hpoa
#> + Version: v2024-04-26
phenos <- example_phenos()
phenos2 <- add_tier(phenos = phenos)
#> Annotating phenos with Tiers.