In [1]:
%config IPCompleter.use_jedi = False
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
import os
import plotly.express as px
import seaborn as sns

from functions import *

os.chdir("/Users/schilder/Desktop/AD_CVD_genetics/")
intro()
In [3]:
DEGAS = np.load("./raw_data/DEGAS/dev_allNonMHC_z_center_p0001_100PCs_20180129.npz", allow_pickle=True)
In [4]:
# list(DEGAS.keys())

for x in list(DEGAS.keys()):
    print(x)
    try:
        print(DEGAS[x].shape)
    except:
        print(len(DEGAS[x]))
contribution_phe
(2138, 100)
contribution_var
(235907, 100)
cos_phe
(2138, 100)
eigen_var
(235907, 100)
eigen_phe
(2138, 100)
eigen_v
(100,)
label_phe_stackedbar
(316,)
stackedbar_gene
(73, 100)
metadata
(4,)
cos_var
(235907, 100)
contribution_gene
(107345, 100)
label_gene
(107345,)
stackedbar_phe
(316, 100)
factor_phe
(2138, 100)
loading_phe
(2138, 100)
label_phe_code
(2138,)
label_phe
(2138,)
label_var
(235907,)
label_gene_stackedbar
(73,)
total_inertia
(1,)
factor_var
(235907, 100)
loading_var
(235907, 100)
In [5]:
# "loading_phe","contribution_phe","eigen_phe","cos_phe"
loading_phe = pd.DataFrame(data=DEGAS["contribution_phe"],  
columns=["PC"+str(x+1) for x in range(0,DEGAS["contribution_phe"].shape[1])],
index=DEGAS["label_phe"])  
loading_phe
Out[5]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 ... PC91 PC92 PC93 PC94 PC95 PC96 PC97 PC98 PC99 PC100
breast cysts 0.000005 1.186378e-06 0.000003 0.000005 2.035159e-06 7.562966e-07 1.338280e-06 6.545312e-07 8.990043e-07 1.580968e-07 ... 1.517194e-06 1.540360e-05 4.016826e-07 2.819625e-05 3.942034e-05 2.701953e-05 5.498834e-08 8.833819e-06 1.367510e-06 6.571907e-08
bone disorder 0.000005 1.163787e-06 0.000002 0.000003 2.809106e-09 2.767192e-07 1.476105e-06 2.802009e-07 3.095311e-07 9.608308e-07 ... 1.069943e-07 1.586188e-09 5.903842e-06 1.997779e-05 5.630284e-08 2.447236e-06 7.318879e-06 4.327306e-07 1.717049e-06 2.375211e-06
anxiety/panic attacks 0.000005 9.426096e-07 0.000002 0.000005 9.508454e-09 1.410584e-06 1.556532e-06 6.755476e-07 1.018647e-08 7.780577e-07 ... 1.303051e-05 5.794538e-06 1.205579e-06 7.580794e-07 1.905491e-06 2.565167e-05 4.083603e-05 7.386000e-05 2.250169e-07 2.570055e-05
connective tissue disorder 0.000005 2.727648e-07 0.000002 0.000003 3.070556e-09 6.242487e-07 1.630909e-06 2.726991e-07 5.444983e-07 3.488909e-07 ... 2.521400e-06 5.039999e-05 9.321211e-07 4.541067e-05 4.082073e-05 5.767555e-07 4.188419e-05 1.698208e-06 6.018270e-05 3.442434e-08
housemaid's knee (prepatellar bursitis) 0.000004 9.153760e-07 0.000001 0.000006 7.542438e-08 8.842300e-08 1.666502e-06 1.513856e-09 4.905633e-09 3.783048e-09 ... 7.256107e-08 2.349348e-06 7.088202e-06 3.440427e-06 7.699747e-06 1.164741e-05 1.782329e-05 2.257302e-06 9.444018e-07 1.492720e-05
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Particulate matter air pollution 2.5-10um; 2010 0.000004 6.667194e-07 0.000002 0.000004 4.145418e-08 1.766194e-07 3.250524e-06 6.043835e-07 2.248262e-06 1.646005e-07 ... 6.887772e-07 1.570322e-05 4.058734e-07 4.033574e-05 2.561271e-06 5.528692e-05 2.382160e-04 3.860690e-05 3.748469e-04 2.301656e-08
Arm fat-free mass (right) 0.050005 4.076553e-03 0.004779 0.000002 8.943901e-03 3.860991e-04 1.843923e-05 1.271482e-04 4.387143e-02 7.321188e-06 ... 6.243377e-04 3.545923e-03 8.615398e-04 1.386767e-03 7.488123e-03 6.471627e-03 6.452772e-05 1.936609e-03 6.113287e-03 4.549085e-06
Trunk fat mass 0.010582 4.416343e-02 0.020901 0.000009 2.977132e-07 1.101556e-04 2.240712e-04 3.235084e-05 3.410710e-04 1.569382e-06 ... 2.803052e-04 2.843188e-03 2.456978e-03 3.903862e-03 2.605272e-03 3.344793e-04 8.242964e-06 2.384700e-03 7.433331e-03 8.946398e-04
Trunk fat-free mass 0.059108 1.680808e-02 0.001528 0.000044 6.179815e-03 3.835175e-04 2.491919e-03 8.667685e-07 3.437652e-02 3.277293e-06 ... 9.714820e-05 2.832675e-04 1.339791e-03 1.527432e-04 8.842225e-04 7.154613e-06 1.025064e-04 8.573282e-05 6.857318e-04 3.304525e-05
cannabis use frequency 0.000007 6.796395e-06 0.000002 0.000006 1.136185e-06 1.316573e-06 2.449321e-08 1.852046e-09 5.964184e-07 1.330250e-07 ... 9.145058e-05 4.709951e-04 2.050217e-05 5.328953e-04 6.311615e-07 4.247831e-10 5.816937e-04 1.282400e-05 7.294808e-04 3.917194e-04

2138 rows × 100 columns

1.2  Import metdata

In [6]:
meta = pd.read_excel("raw_data/DEGAS/41467_2019_11953_MOESM5_ESM.xlsx")

meta = meta.merge(pd.DataFrame({"label_phe":DEGAS["label_phe"], "label_phe_code":DEGAS["label_phe_code"]}), on="label_phe_code")

## Assign AD/CVD status
meta["CVD"] = meta["label_phe"].str.lower().str.contains("cardiom|cardia|heart|vascular|atrial|atrium")
meta["AD"] = meta["label_phe"].str.lower().str.contains("alzheimer")
conditions = [
    (meta['AD'] == True),
    (meta['CVD'] == True)
]
values = ['AD','CVD'] 
meta['AD_CVD'] = np.select(conditions, values)
AD_terms = meta.loc[meta["AD"],:].label_phe.tolist()
CVD_terms = meta.loc[meta["CVD"],:].label_phe.tolist()
AD_CVD_terms = AD_terms + CVD_terms

# Sample size
meta["N_cases"] = np.log1p(meta["Number of cases"])

meta.loc[meta["AD_CVD"].isin(["AD","CVD"]),:]
Out[6]:
Category Phenotype name Number of cases All Coding PTVs Global Biobank Engine phenotype page (URL) label_phe_code label_phe CVD AD AD_CVD N_cases
21 Disease outcome atrial fibrillation 12665 Y Y Y https://biobankengine.stanford.edu/coding/HC281 HC281 atrial fibrillation True False CVD 9.446677
23 Disease outcome atrial flutter 12186 Y Y Y https://biobankengine.stanford.edu/coding/HC440 HC440 atrial flutter True False CVD 9.408125
25 Disease outcome heart attack / myocardial infarction 12138 Y Y Y https://biobankengine.stanford.edu/coding/HC326 HC326 heart attack/myocardial infarction True False CVD 9.404179
61 Disease outcome heart failure / pulmonary odema 4657 Y Y Y https://biobankengine.stanford.edu/coding/HC299 HC299 heart failure/pulmonary odema True False CVD 8.446341
77 Disease outcome peripheral vascular disease 3517 Y Y Y https://biobankengine.stanford.edu/coding/HC385 HC385 peripheral vascular disease True False CVD 8.165648
96 Disease outcome svt / supraventricular tachycardia 2653 Y Y Y https://biobankengine.stanford.edu/coding/HC131 HC131 svt/supraventricular tachycardia True False CVD 7.883823
108 Disease outcome heart valve problem / heart murmur 2326 Y Y N https://biobankengine.stanford.edu/coding/HC231 HC231 heart valve problem/heart murmur True False CVD 7.752335
117 Disease outcome heart arrhythmia 1884 Y Y Y https://biobankengine.stanford.edu/coding/HC193 HC193 heart arrhythmia True False CVD 7.541683
148 Disease outcome cardiomyopathy 1309 Y Y N https://biobankengine.stanford.edu/coding/HC414 HC414 cardiomyopathy True False CVD 7.177782
159 Disease outcome heart / cardiac problem 1148 Y Y N https://biobankengine.stanford.edu/coding/HC88 HC88 heart/cardiac problem True False CVD 7.046647
165 Disease outcome hypertrophic cardiomyopathy (hcm / hocm) 1096 Y Y N https://biobankengine.stanford.edu/coding/HC308 HC308 hypertrophic cardiomyopathy (hcm/hocm) True False CVD 7.000334
202 Disease outcome irregular heart beat 689 Y Y N https://biobankengine.stanford.edu/coding/HC113 HC113 irregular heart beat True False CVD 6.536692
225 Disease outcome pericardial effusion 531 Y Y N https://biobankengine.stanford.edu/coding/HC114 HC114 pericardial effusion True False CVD 6.276643
336 Disease outcome pericardial problem 179 Y Y N https://biobankengine.stanford.edu/coding/HC213 HC213 pericardial problem True False CVD 5.192957
416 Family History Alzheimer's disease / dementia 51794 Y Y Y https://biobankengine.stanford.edu/coding/FH1263 FH1263 Alzheimer's disease/dementia False True AD 10.855049
1819 Imaging Cardiac output 3374 Y Y Y https://biobankengine.stanford.edu/coding/INI2... INI22424 Cardiac output True False CVD 8.124151
1820 Imaging Cardiac index 3374 Y Y N https://biobankengine.stanford.edu/coding/INI2... INI22425 Cardiac index True False CVD 8.124151
1821 Imaging Average heart rate 3374 Y Y N https://biobankengine.stanford.edu/coding/INI2... INI22426 Average heart rate True False CVD 8.124151
2067 Questionnaire (quantitative) Age heart attack diagnosed 7942 Y Y N https://biobankengine.stanford.edu/coding/INI3894 INI3894 Age heart attack diagnosed True False CVD 8.980046

1.3  PCA

The top 2 PCs don't really generate identifiable clusters beyond Physical Measurement (mostly related to adiposity/BMI) and others.

In [7]:
pca_annot = loading_phe.merge(meta, left_index=True, right_on="label_phe")

fig  = px.scatter(pca_annot, x="PC1", y="PC2", #height=400,
                     log_x=True, log_y=True, 
                     opacity=.8, color="Category",#"AD_CVD",
              hover_data=["label_phe","AD_CVD","Number of cases"])
fig.update_traces(marker=dict(size=10))
fig.show()
10n100n10μ100μ0.0010.010.11p100p10n100μ0.01
CategoryDisease outcomeFamily HistoryCancerImagingQuestionnaire (quantitative)Physical MeasurementAssayMiscellaneous (quantitative)MedicationMiscellaneous (binary)Questionnaire (binary)PC1PC2

1.4  UMAP

UMAP (using the PCs as input) gives a much better visual representation of the data. But it's unclear how many of the 100 PCs to use. So we iterate running UMAP with an increasing number of PCs.

In [8]:
umap_df_list = [get_umap(loading_phe, meta, n_dims=x,  merging_var="label_phe") for x in [2,3,4,10,20,40,60,80,100]]
umap_df = pd.concat(umap_df_list) 
umap_df
Using 2 dimensions as input.
Rescaling from -1 to 1
Done.
Using 3 dimensions as input.
Rescaling from -1 to 1
Done.
Using 4 dimensions as input.
Rescaling from -1 to 1
Done.
Using 10 dimensions as input.
Rescaling from -1 to 1
Done.
Using 20 dimensions as input.
Rescaling from -1 to 1
Done.
Using 40 dimensions as input.
Rescaling from -1 to 1
Done.
Using 60 dimensions as input.
Rescaling from -1 to 1
Done.
Using 80 dimensions as input.
Rescaling from -1 to 1
Done.
Using 100 dimensions as input.
Rescaling from -1 to 1
Done.
Out[8]:
label_phe UMAP1 UMAP2 UMAP3 Category Phenotype name Number of cases All Coding PTVs Global Biobank Engine phenotype page (URL) label_phe_code CVD AD AD_CVD N_cases n_dims
0 breast cysts 0.363005 -0.623906 0.337725 Disease outcome breast cysts 1398 Y Y N https://biobankengine.stanford.edu/coding/HC139 HC139 False False 0 7.243513 2
1 bone disorder 0.218490 -0.641017 -0.640685 Disease outcome bone disorder 1459 Y Y N https://biobankengine.stanford.edu/coding/HC134 HC134 False False 0 7.286192 2
2 anxiety/panic attacks 0.550190 0.245177 -0.302014 Disease outcome anxiety / panic attacks 4986 Y Y Y https://biobankengine.stanford.edu/coding/HC135 HC135 False False 0 8.514590 2
3 connective tissue disorder -0.340144 -0.294241 0.954099 Disease outcome connective tissue disorder 2515 Y Y Y https://biobankengine.stanford.edu/coding/HC136 HC136 False False 0 7.830426 2
4 housemaid's knee (prepatellar bursitis) -0.977615 0.074180 0.135035 Disease outcome housemaid's knee (prepatellar bursitis) 194 Y Y N https://biobankengine.stanford.edu/coding/HC130 HC130 False False 0 5.273000 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2181 Nitrogen dioxide air pollution; 2007 0.734835 0.388312 -0.202410 Miscellaneous (quantitative) Nitrogen dioxide air pollution; 2007 332633 Y Y Y https://biobankengine.stanford.edu/coding/INI2... INI24018 False False 0 12.714798 100
2182 Particulate matter air pollution 2.5-10um; 2010 0.769730 0.393652 -0.148391 Miscellaneous (quantitative) Particulate matter air pollution 2.5-10um; 2010 308977 Y Y Y https://biobankengine.stanford.edu/coding/INI2... INI24008 False False 0 12.641025 100
2183 Arm fat-free mass (right) 0.782294 -0.041789 -0.260538 Physical Measurement Arm fat-free mass (right) 331418 Y Y Y https://biobankengine.stanford.edu/coding/INI2... INI23121 False False 0 12.711139 100
2184 Trunk fat-free mass 0.778969 -0.047271 -0.277467 Physical Measurement Trunk fat-free mass 331234 Y Y Y https://biobankengine.stanford.edu/coding/INI2... INI23129 False False 0 12.710583 100
2185 cannabis use frequency 0.786820 0.321295 -0.237221 Miscellaneous (quantitative) cannabis use frequency 110201 Y Y Y https://biobankengine.stanford.edu/coding/INI2... INI20453 False False 0 11.610070 100

19674 rows × 17 columns

1.4.1  2D UMAP

In [9]:
def animated_scatter(umap_df,
                     threeD=True,
                     x="UMAP1", y="UMAP2", z="UMAP3",
                    animation_frame="n_dims",
                    animation_group="label_phe",
                    size="N_cases", size_max=10, height=600,
                    opacity=.8, color="Category",
                    hover_data=["label_phe","AD_CVD","Number of cases"],
                    duration=2000,
                    renderer="notebook",
                    show_plot=True):
    import plotly.express as px

    if threeD:
        fig  = px.scatter_3d(umap_df, x=x, y=y, z=z, 
                             animation_frame=animation_frame, 
                             animation_group=animation_group,
                             size=size, size_max=size_max,height=height, 
                             title=f'{umap_df[animation_frame].unique()[0]} input dimensions',
                             opacity=opacity, color=color,
                             hover_data=hover_data)
    else:
        fig  = px.scatter(umap_df, x=x, y=y, 
                             animation_frame=animation_frame, 
                             animation_group=animation_group,
                             size=size, size_max=size_max,height=height, 
                             title=f'{umap_df[animation_frame].unique()[0]} input dimensions',
                             opacity=opacity, color=color,
                             hover_data=hover_data)
    for button in fig.layout.updatemenus[0].buttons:
        button['args'][1]['frame']['redraw'] = True
    for k in range(len(fig.frames)):
        n_dims = umap_df["n_dims"].unique()[k]
        fig.frames[k]['layout'].update(title_text=f'{n_dims} input dimensions')
    fig.update_traces(hoverlabel = dict(font=dict(color='white')))
    fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = duration
    # Hover modes: https://plotly.com/python/hover-text-and-formatting/
    # fig.update_traces(hovertemplate=None)
    # fig.update_layout(hovermode="x unified")
    if show_plot:
        fig.show(renderer="notebook")
    return fig

fig  = animated_scatter(umap_df, x="UMAP1", y="UMAP2", 
                        threeD=False,
                        animation_frame="n_dims", animation_group="label_phe",
                        size="N_cases", size_max=5,height=600, 
                        opacity=.8, color="Category", 
                        hover_data=["label_phe","AD_CVD","Number of cases"])
−1−0.500.51−1−0.500.51
CategoryDisease outcomeFamily HistoryCancerImagingQuestionnaire (quantitative)Physical MeasurementAssayMiscellaneous (quantitative)MedicationMiscellaneous (binary)Questionnaire (binary)n_dims=223410204060801002 input dimensionsUMAP1UMAP2

1.4.2  3D UMAP

In [10]:
fig  = animated_scatter(umap_df, x="UMAP1", y="UMAP2",  
                        animation_frame="n_dims", animation_group="label_phe",
                        size="N_cases", size_max=15,height=600, 
                        opacity=.8, color="Category", 
                        hover_data=["label_phe","AD_CVD","Number of cases"]) 
CategoryDisease outcomeFamily HistoryCancerImagingQuestionnaire (quantitative)Physical MeasurementAssayMiscellaneous (quantitative)MedicationMiscellaneous (binary)Questionnaire (binary)n_dims=223410204060801002 input dimensions

2  Identify overlapping AD-CVD component(s)

2.1  Get top PCs

  • Get the PCs with the top N highest or lowest mean loadings for each of the phenotype groups (AD and CVD).
  • Then, find the intersect between each of the highest/lowest PC pairs across groups. This high/low pairing ensures the phenotype groups aren't loaded in the opposite directions.
In [11]:
top_PCs = get_topPCs(loading_phe, meta, terms1=AD_terms, terms2=CVD_terms, top_n=10)
top_PCs
             PC1           PC2       PC3       PC4           PC5       PC6  \
AD_CVD                                                                       
AD      0.000021  3.436702e-05  0.000019  0.000007  3.193369e-06  0.000001   
CVD     0.000005  6.460530e-07  0.000003  0.000005  3.053245e-07  0.000005   

             PC7           PC8           PC9      PC10  ...      PC90  \
AD_CVD                                                  ...             
AD      0.000014  1.649632e-04  5.702882e-05  0.000235  ...  0.004542   
CVD     0.000003  5.464630e-07  7.562668e-07  0.000006  ...  0.000264   

            PC91      PC92      PC93      PC94      PC95      PC96      PC97  \
AD_CVD                                                                         
AD      0.000593  0.027282  0.001323  0.001207  0.000215  0.000263  0.000852   
CVD     0.000067  0.000351  0.000341  0.000032  0.000299  0.000198  0.000103   

            PC98      PC99  
AD_CVD                      
AD      0.015618  0.000360  
CVD     0.000047  0.000242  

[2 rows x 99 columns]
0 high_PCs
2 low_PCs
Out[11]:
{'PC28', 'PC5'}

2.2  Heatmaps

2.2.1  All phenotypes x all PCs

In [12]:
fig  = sns.clustermap(np.log10(loading_phe), figsize=(12,8)) 

2.2.2  Phenotypes x phenotypes

In [13]:
## ALL phenotypes x all phenotypes 
## Takes a very long time to run
# corr_mat = loading_phe.T.corr()
# fig  = sns.clustermap(corr_mat, figsize=(20,20)) 
In [14]:
 corr_mat = loading_phe.loc[AD_CVD_terms,:].T.corr() 
In [15]:
# corr_mat.to_csv("processed_data/DEGAS/all_traits.corr.csv.gz")
corr_mat.to_csv("processed_data/DEGAS/AD_CVD.corr.csv.gz")
In [16]:
# https://docs.bokeh.org/en/latest/docs/user_guide/jupyter.html#userguide-jupyter-notebook
from bokehheat import heat
from bokeh.palettes import Reds9, RdBu11, YlGn8, Colorblind8
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
output_notebook()

s_filename="processed_data/DEGAS/AD_CVD.clustergram.html"
o_clustermap, ls_xaxis, ls_yaxis = heat.clustermap(
    df_matrix = corr_mat,
    ls_color_palette = Reds9,
    r_low = 0,
    r_high = 1,
    # s_z = "log2", 
    b_ydendo = True,
    b_xdendo = True, 
    s_filetitel="Phenotype x phenotype correlation",
    s_filename=s_filename
)
BokehJS 2.3.0 successfully loaded.
ds_xcolor: {}
ds_ycolor: {}
In [17]:
# show(o_clustermap)
In [18]:
from IPython.display import Markdown as md
md("[Clustergram link]({})".format("https://neurogenomics.github.io/AD_CVD_genetics/"+s_filename))
Out[18]:

2.2.3  All PCs vs. all PCs

In [19]:
corr_mat_PC = loading_phe.corr()
fig  = sns.clustermap(corr_mat_PC, figsize=(12,12)) 

2.2.4  AD/CVD phenotypes x top PCS

In [20]:
# https://plotly.com/python-api-reference/generated/plotly.express.imshow
px.imshow(loading_phe.loc[AD_terms+CVD_terms, top_PCs], aspect=.5) 
PC28PC5Age heart attack diagnosedAverage heart rateCardiac indexCardiac outputpericardial problempericardial effusionirregular heart beathypertrophic cardiomyopathy (hcm/hocm)heart/cardiac problemcardiomyopathyheart arrhythmiaheart valve problem/heart murmursvt/supraventricular tachycardiaperipheral vascular diseaseheart failure/pulmonary odemaheart attack/myocardial infarctionatrial flutteratrial fibrillationAlzheimer's disease/dementia
00.5μ1.5μ2.5μ3.5μ
In [21]:
sns.clustermap(loading_phe.loc[AD_terms+CVD_terms,top_PCs], figsize=(10,8))
Out[21]:
<seaborn.matrix.ClusterGrid at 0x7fcf8895a100>

2.3  Identify variants

In [22]:
variants = pd.DataFrame(DEGAS["contribution_var"], 
                        index=DEGAS["label_var"],
                        columns=["PC"+str(x+1) for x in range(0,DEGAS["contribution_var"].shape[1])])
print(variants.shape)
variants.head()
(235907, 100)
Out[22]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 ... PC91 PC92 PC93 PC94 PC95 PC96 PC97 PC98 PC99 PC100
1-768448 6.573156e-11 3.092439e-11 9.456376e-11 6.875357e-10 2.179480e-11 4.109375e-11 2.230303e-10 1.037994e-10 2.827198e-11 3.474831e-12 ... 6.254151e-08 4.105932e-08 1.997445e-08 3.240216e-08 8.507489e-08 6.155163e-08 2.384276e-08 1.900036e-09 3.657842e-09 9.608528e-09
1-779322 5.451681e-11 2.328294e-11 1.087348e-10 3.784154e-10 1.406952e-12 3.439214e-11 3.793652e-10 1.957522e-11 4.383288e-11 4.964640e-11 ... 6.706287e-08 9.872628e-09 5.321880e-08 1.994167e-09 5.297409e-08 5.011643e-08 1.735938e-08 3.024225e-08 6.453343e-09 4.666550e-08
1-801536 3.222878e-11 3.630352e-11 8.586395e-11 1.975290e-10 3.905643e-10 4.160173e-10 4.893696e-10 7.373002e-11 5.356124e-10 2.192709e-10 ... 6.181041e-08 1.492233e-06 2.462514e-08 1.095120e-06 1.101642e-05 1.185268e-06 1.193834e-05 4.711148e-07 3.123312e-07 3.400447e-06
1-834830 2.070836e-10 4.021792e-11 3.083290e-10 1.241294e-09 1.533956e-10 4.019168e-11 3.915927e-10 4.232564e-10 4.063007e-10 2.425424e-11 ... 2.416501e-08 2.556066e-08 1.562400e-08 2.026922e-08 3.366161e-08 5.405308e-09 2.207275e-10 9.119161e-09 2.048419e-08 4.274474e-08
1-838555 9.782194e-10 1.058574e-08 9.669674e-11 8.206221e-08 2.632052e-08 3.857839e-07 4.470772e-08 1.178918e-06 9.459870e-08 8.375471e-07 ... 6.936745e-06 1.281196e-05 9.884078e-07 5.406462e-06 6.958246e-07 7.719977e-07 2.613169e-06 1.916951e-06 5.602594e-08 2.401105e-06

5 rows × 100 columns

In [23]:
sorted_variants = get_top_loadings(loadings=variants, top_PCs=top_PCs, top_n=10, var_name="variant")
sorted_variants
Out[23]:
index variant value abs_value
390313 PC5 16-53813367 0.001670 0.001670
120551 PC5 4-73541684 0.001120 0.001120
410853 PC5 17-61908556 0.001104 0.001104
255441 PC5 9-98231008 0.001096 0.001096
456645 PC5 20-62328375 0.001095 0.001095
280571 PC5 10-100017453 0.001094 0.001094
125219 PC5 4-106828795 0.001063 0.001063
407973 PC5 17-43798308 0.001053 0.001053
312171 PC5 12-3351169 0.000931 0.000931
445535 PC5 20-6621685 0.000904 0.000904
399186 PC28 16-89986117 0.038287 0.038287
399146 PC28 16-89818732 0.035850 0.035850
399114 PC28 16-89755903 0.026439 0.026439
166972 PC28 6-421281 0.025326 0.025326
399056 PC28 16-89507330 0.022765 0.022765
449890 PC28 20-33171772 0.012982 0.012982
449976 PC28 20-33889677 0.009735 0.009735
302450 PC28 11-89005253 0.008553 0.008553
449936 PC28 20-33642396 0.007781 0.007781
166960 PC28 6-384970 0.006131 0.006131
In [24]:
save_table(df=variants.loc[:,top_PCs], df_path= "processed_data/DEGAS/AD_CVD.all_variants.csv.gz")
save_table(df=sorted_variants, df_path= "processed_data/DEGAS/AD_CVD.variants.csv.gz")

2.4  Identify genes

In [25]:
genes = pd.DataFrame(DEGAS["contribution_gene"], 
                     index=DEGAS["label_gene"], 
                     columns=["PC"+str(x+1) for x in range(0,DEGAS["contribution_gene"].shape[1])])
# Remove variants that are in this dataframe for some reason
genes = genes.loc[~genes.index.isin(variants.index),:]
print(genes.shape)
genes.head()
(30084, 100)
Out[25]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 ... PC91 PC92 PC93 PC94 PC95 PC96 PC97 PC98 PC99 PC100
A1CF 7.800001e-06 1.711743e-05 2.850361e-05 3.162356e-06 3.544850e-05 3.964877e-06 1.460415e-04 1.086146e-05 7.899508e-06 6.540217e-07 ... 1.670194e-06 2.811636e-06 1.239760e-05 3.260172e-06 5.115543e-06 8.824516e-06 1.538888e-06 4.649342e-07 6.945523e-06 4.671483e-06
A2M 1.104163e-10 5.306701e-11 2.143011e-10 3.051558e-10 7.210139e-11 2.106757e-10 1.886690e-10 1.743034e-10 1.577621e-10 6.543393e-11 ... 2.572362e-07 5.292908e-09 4.242549e-08 3.805009e-07 3.284277e-07 8.658985e-07 3.502422e-08 6.082331e-07 2.191440e-07 1.981446e-08
A2ML1 1.740716e-09 5.578239e-08 4.763233e-08 1.926809e-05 7.736715e-07 1.032013e-04 4.003432e-06 3.129451e-05 8.190236e-07 3.554642e-05 ... 1.755029e-05 2.267757e-05 1.123582e-05 2.305262e-05 1.095830e-05 1.443458e-05 2.900872e-05 1.459127e-05 8.960912e-06 2.107984e-05
A2ML1-AS1 7.995986e-11 5.989858e-08 3.514103e-08 1.521358e-05 5.702767e-07 8.047480e-05 9.763509e-07 4.072305e-06 3.783574e-07 3.905134e-07 ... 6.777451e-06 5.150641e-05 1.105268e-05 1.416616e-05 2.243055e-05 1.447252e-05 1.189793e-05 9.211398e-06 1.276655e-05 1.780519e-05
A2MP1 9.373051e-10 9.507045e-10 1.272066e-09 6.965735e-09 1.553929e-10 3.108601e-09 1.588853e-09 2.774902e-10 1.660875e-09 8.044117e-10 ... 1.045674e-06 2.447230e-06 1.123731e-06 4.625240e-06 2.342033e-06 5.069790e-07 5.545430e-06 5.737541e-06 1.016004e-06 5.918024e-06

5 rows × 100 columns

In [26]:
sorted_genes = get_top_loadings(loadings=genes, top_PCs=top_PCs, top_n=10, var_name="gene")
sorted_genes
Out[26]:
index gene value abs_value
7525 PC5 ADAMTSL3 0.003688 0.003688
32529 PC5 FTO 0.003094 0.003094
47887 PC5 PTCH1 0.001891 0.001891
57099 PC5 TSPAN9 0.001868 0.001868
42319 PC5 MSRA 0.001837 0.001837
35947 PC5 JAZF1 0.001615 0.001615
2909 PC5 AC022784.6 0.001602 0.001602
41089 PC5 MECOM 0.001592 0.001592
18531 PC5 DOCK3 0.001420 0.001420
58611 PC5 WWP2 0.001420 0.001420
31700 PC28 FANCA 0.040987 0.040987
24010 PC28 ENSG00000198211,ENSG00000258839 0.040021 0.040021
15466 PC28 CDK10 0.026441 0.026441
10824 PC28 ANKRD11 0.024918 0.024918
46334 PC28 PIGU 0.013988 0.013988
57416 PC28 TYR 0.010455 0.010455
57824 PC28 UQCC1 0.009785 0.009785
53872 PC28 SPATA33 0.009363 0.009363
56970 PC28 TRPC4AP 0.007796 0.007796
15500 PC28 CDK5RAP1 0.006129 0.006129
In [27]:
"IRX3" in sorted_genes.gene
Out[27]:
False

Genes of particular interest

  • FTO: FTO affects obesity through regulation of hypothalamic neurons (and thus eatin behaviour), though this is achieved through its regulation of IRX3, which is a less direct mechanism than once thought. In relaton to AD:

    "Recent studies revealed that carriers of common FTO gene polymorphisms show both a reduction in frontal lobe volume of the brain[39] and an impaired verbal fluency performance.[40] Fittingly, a population-based study from Sweden found that carriers of the FTO rs9939609 A allele have an increased risk for incident Alzheimer disease.[41]" -Wikipedia

  • FANCA: gene named after Fanconi anemia, a rare blood disorder.

In [28]:
save_table(df=genes.loc[:,top_PCs], df_path= "processed_data/DEGAS/AD_CVD.all_genes.csv.gz")
save_table(df=sorted_genes, df_path= "processed_data/DEGAS/AD_CVD.genes.csv.gz")

2.4.1  Determine outlier genes

  • Find the genes that signficantly more associated with these PCs than other PCs.
  • sklearn has a number of outlier/anomaly detection tools to do this. However these methods are all designed to make predictions from a matrix of the same size as the one you trained on, unless you fit_predict on the several PCs of interest directly. At which point, you're basically just fiddling with the hyperparameters to get the desired number of genes (very similar to the simple table sorting method).
  • gene-outlier-detection is explicitly designed for genes.
In [29]:
from sklearn.neighbors import LocalOutlierFactor

X = genes.loc[:,~genes.columns.isin(top_PCs)]
y = genes.loc[:,top_PCs]

# use fit_predict to compute the predicted labels of the training samples
# (when LOF is used for outlier detection, the estimator has no predict,
# decision_function and score_samples methods)
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.001)
y_pred = clf.fit_predict(y) 
y.index[y_pred==-1]
Out[29]:
Index(['AC012485.2', 'ADAMTSL3', 'ANKRD11', 'CD82', 'CDK10', 'DPP4', 'EGFLAM',
       'ENSG00000198211,ENSG00000258839', 'ENSG00000230499,ENSG00000172965',
       'ENSG00000237813,ENSG00000105971', 'ENSG00000262304,ENSG00000196689',
       'FANCA', 'FTO', 'HMGXB3', 'IARS2', 'IFNG-AS1', 'KDELR3', 'LDHAL6B',
       'MSRA', 'PHAX', 'PIGU', 'PRSS55', 'PTCH1', 'RNU5E-4P', 'SPATA33',
       'TRPC4AP', 'TSPAN9', 'TYR', 'UMPS', 'UQCC1', 'ZNF597'],
      dtype='object')
In [30]:
### NOTE: have to add --ExecutePreprocessor.timeout=180 to avoid timeout.

# !jupyter nbconvert --to html_toc --ExecutePreprocessor.timeout=180 --execute code/DeGAs.ipynb
In [ ]: