DeGAs¶

Decomposition of Genetic Associations (DeGAs)

%config IPCompleter.use_jedi = False
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300

import pandas as pd
import numpy as np
from scipy import sparse
import os
import plotly.express as px
import seaborn as sns

from functions import *

os.chdir("/Users/schilder/Desktop/AD_CVD_genetics/")
intro()

Data¶

Global Biobank Engine

DEGAS = np.load("./raw_data/DEGAS/dev_allNonMHC_z_center_p0001_100PCs_20180129.npz", allow_pickle=True)

# list(DEGAS.keys())

for x in list(DEGAS.keys()):
    print(x)
    try:
        print(DEGAS[x].shape)
    except:
        print(len(DEGAS[x]))

contribution_phe
(2138, 100)
contribution_var
(235907, 100)
cos_phe
(2138, 100)
eigen_var
(235907, 100)
eigen_phe
(2138, 100)
eigen_v
(100,)
label_phe_stackedbar
(316,)
stackedbar_gene
(73, 100)
metadata
(4,)
cos_var
(235907, 100)
contribution_gene
(107345, 100)
label_gene
(107345,)
stackedbar_phe
(316, 100)
factor_phe
(2138, 100)
loading_phe
(2138, 100)
label_phe_code
(2138,)
label_phe
(2138,)
label_var
(235907,)
label_gene_stackedbar
(73,)
total_inertia
(1,)
factor_var
(235907, 100)
loading_var
(235907, 100)

# "loading_phe","contribution_phe","eigen_phe","cos_phe"
loading_phe = pd.DataFrame(data=DEGAS["contribution_phe"],  
columns=["PC"+str(x+1) for x in range(0,DEGAS["contribution_phe"].shape[1])],
index=DEGAS["label_phe"])  
loading_phe

Import metdata¶

meta = pd.read_excel("raw_data/DEGAS/41467_2019_11953_MOESM5_ESM.xlsx")

meta = meta.merge(pd.DataFrame({"label_phe":DEGAS["label_phe"], "label_phe_code":DEGAS["label_phe_code"]}), on="label_phe_code")

## Assign AD/CVD status
meta["CVD"] = meta["label_phe"].str.lower().str.contains("cardiom|cardia|heart|vascular|atrial|atrium")
meta["AD"] = meta["label_phe"].str.lower().str.contains("alzheimer")
conditions = [
    (meta['AD'] == True),
    (meta['CVD'] == True)
]
values = ['AD','CVD'] 
meta['AD_CVD'] = np.select(conditions, values)
AD_terms = meta.loc[meta["AD"],:].label_phe.tolist()
CVD_terms = meta.loc[meta["CVD"],:].label_phe.tolist()
AD_CVD_terms = AD_terms + CVD_terms

# Sample size
meta["N_cases"] = np.log1p(meta["Number of cases"])

meta.loc[meta["AD_CVD"].isin(["AD","CVD"]),:]

PCA¶

The top 2 PCs don't really generate identifiable clusters beyond Physical Measurement (mostly related to adiposity/BMI) and others.

pca_annot = loading_phe.merge(meta, left_index=True, right_on="label_phe")

fig  = px.scatter(pca_annot, x="PC1", y="PC2", #height=400,
                     log_x=True, log_y=True, 
                     opacity=.8, color="Category",#"AD_CVD",
              hover_data=["label_phe","AD_CVD","Number of cases"])
fig.update_traces(marker=dict(size=10))
fig.show()

UMAP¶

UMAP (using the PCs as input) gives a much better visual representation of the data. But it's unclear how many of the 100 PCs to use. So we iterate running UMAP with an increasing number of PCs.

umap_df_list = [get_umap(loading_phe, meta, n_dims=x,  merging_var="label_phe") for x in [2,3,4,10,20,40,60,80,100]]
umap_df = pd.concat(umap_df_list) 
umap_df

Using 2 dimensions as input.
Rescaling from -1 to 1
Done.
Using 3 dimensions as input.
Rescaling from -1 to 1
Done.
Using 4 dimensions as input.
Rescaling from -1 to 1
Done.
Using 10 dimensions as input.
Rescaling from -1 to 1
Done.
Using 20 dimensions as input.
Rescaling from -1 to 1
Done.
Using 40 dimensions as input.
Rescaling from -1 to 1
Done.
Using 60 dimensions as input.
Rescaling from -1 to 1
Done.
Using 80 dimensions as input.
Rescaling from -1 to 1
Done.
Using 100 dimensions as input.
Rescaling from -1 to 1
Done.

2D UMAP¶

def animated_scatter(umap_df,
                     threeD=True,
                     x="UMAP1", y="UMAP2", z="UMAP3",
                    animation_frame="n_dims",
                    animation_group="label_phe",
                    size="N_cases", size_max=10, height=600,
                    opacity=.8, color="Category",
                    hover_data=["label_phe","AD_CVD","Number of cases"],
                    duration=2000,
                    renderer="notebook",
                    show_plot=True):
    import plotly.express as px

    if threeD:
        fig  = px.scatter_3d(umap_df, x=x, y=y, z=z, 
                             animation_frame=animation_frame, 
                             animation_group=animation_group,
                             size=size, size_max=size_max,height=height, 
                             title=f'{umap_df[animation_frame].unique()[0]} input dimensions',
                             opacity=opacity, color=color,
                             hover_data=hover_data)
    else:
        fig  = px.scatter(umap_df, x=x, y=y, 
                             animation_frame=animation_frame, 
                             animation_group=animation_group,
                             size=size, size_max=size_max,height=height, 
                             title=f'{umap_df[animation_frame].unique()[0]} input dimensions',
                             opacity=opacity, color=color,
                             hover_data=hover_data)
    for button in fig.layout.updatemenus[0].buttons:
        button['args'][1]['frame']['redraw'] = True
    for k in range(len(fig.frames)):
        n_dims = umap_df["n_dims"].unique()[k]
        fig.frames[k]['layout'].update(title_text=f'{n_dims} input dimensions')
    fig.update_traces(hoverlabel = dict(font=dict(color='white')))
    fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = duration
    # Hover modes: https://plotly.com/python/hover-text-and-formatting/
    # fig.update_traces(hovertemplate=None)
    # fig.update_layout(hovermode="x unified")
    if show_plot:
        fig.show(renderer="notebook")
    return fig

fig  = animated_scatter(umap_df, x="UMAP1", y="UMAP2", 
                        threeD=False,
                        animation_frame="n_dims", animation_group="label_phe",
                        size="N_cases", size_max=5,height=600, 
                        opacity=.8, color="Category", 
                        hover_data=["label_phe","AD_CVD","Number of cases"])

3D UMAP¶

fig  = animated_scatter(umap_df, x="UMAP1", y="UMAP2",  
                        animation_frame="n_dims", animation_group="label_phe",
                        size="N_cases", size_max=15,height=600, 
                        opacity=.8, color="Category", 
                        hover_data=["label_phe","AD_CVD","Number of cases"])

Identify overlapping AD-CVD component(s)¶

Get top PCs¶

Get the PCs with the top N highest or lowest mean loadings for each of the phenotype groups (AD and CVD).
Then, find the intersect between each of the highest/lowest PC pairs across groups. This high/low pairing ensures the phenotype groups aren't loaded in the opposite directions.

top_PCs = get_topPCs(loading_phe, meta, terms1=AD_terms, terms2=CVD_terms, top_n=10)
top_PCs

             PC1           PC2       PC3       PC4           PC5       PC6  \
AD_CVD                                                                       
AD      0.000021  3.436702e-05  0.000019  0.000007  3.193369e-06  0.000001   
CVD     0.000005  6.460530e-07  0.000003  0.000005  3.053245e-07  0.000005   

             PC7           PC8           PC9      PC10  ...      PC90  \
AD_CVD                                                  ...             
AD      0.000014  1.649632e-04  5.702882e-05  0.000235  ...  0.004542   
CVD     0.000003  5.464630e-07  7.562668e-07  0.000006  ...  0.000264   

            PC91      PC92      PC93      PC94      PC95      PC96      PC97  \
AD_CVD                                                                         
AD      0.000593  0.027282  0.001323  0.001207  0.000215  0.000263  0.000852   
CVD     0.000067  0.000351  0.000341  0.000032  0.000299  0.000198  0.000103   

            PC98      PC99  
AD_CVD                      
AD      0.015618  0.000360  
CVD     0.000047  0.000242  

[2 rows x 99 columns]
0 high_PCs
2 low_PCs

{'PC28', 'PC5'}

Heatmaps¶

All phenotypes x all PCs¶

fig  = sns.clustermap(np.log10(loading_phe), figsize=(12,8))

Phenotypes x phenotypes¶

## ALL phenotypes x all phenotypes 
## Takes a very long time to run
# corr_mat = loading_phe.T.corr()
# fig  = sns.clustermap(corr_mat, figsize=(20,20))

 corr_mat = loading_phe.loc[AD_CVD_terms,:].T.corr()

# corr_mat.to_csv("processed_data/DEGAS/all_traits.corr.csv.gz")
corr_mat.to_csv("processed_data/DEGAS/AD_CVD.corr.csv.gz")

# https://docs.bokeh.org/en/latest/docs/user_guide/jupyter.html#userguide-jupyter-notebook
from bokehheat import heat
from bokeh.palettes import Reds9, RdBu11, YlGn8, Colorblind8
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
output_notebook()

s_filename="processed_data/DEGAS/AD_CVD.clustergram.html"
o_clustermap, ls_xaxis, ls_yaxis = heat.clustermap(
    df_matrix = corr_mat,
    ls_color_palette = Reds9,
    r_low = 0,
    r_high = 1,
    # s_z = "log2", 
    b_ydendo = True,
    b_xdendo = True, 
    s_filetitel="Phenotype x phenotype correlation",
    s_filename=s_filename
)

ds_xcolor: {}
ds_ycolor: {}

# show(o_clustermap)

from IPython.display import Markdown as md
md("[Clustergram link]({})".format("https://neurogenomics.github.io/AD_CVD_genetics/"+s_filename))

All PCs vs. all PCs¶

corr_mat_PC = loading_phe.corr()
fig  = sns.clustermap(corr_mat_PC, figsize=(12,12))

AD/CVD phenotypes x top PCS¶

# https://plotly.com/python-api-reference/generated/plotly.express.imshow
px.imshow(loading_phe.loc[AD_terms+CVD_terms, top_PCs], aspect=.5)

sns.clustermap(loading_phe.loc[AD_terms+CVD_terms,top_PCs], figsize=(10,8))

<seaborn.matrix.ClusterGrid at 0x7fcf8895a100>

Identify variants¶

variants = pd.DataFrame(DEGAS["contribution_var"], 
                        index=DEGAS["label_var"],
                        columns=["PC"+str(x+1) for x in range(0,DEGAS["contribution_var"].shape[1])])
print(variants.shape)
variants.head()

(235907, 100)

sorted_variants = get_top_loadings(loadings=variants, top_PCs=top_PCs, top_n=10, var_name="variant")
sorted_variants

save_table(df=variants.loc[:,top_PCs], df_path= "processed_data/DEGAS/AD_CVD.all_variants.csv.gz")
save_table(df=sorted_variants, df_path= "processed_data/DEGAS/AD_CVD.variants.csv.gz")

Saving file...

Saving file...

Identify genes¶

genes = pd.DataFrame(DEGAS["contribution_gene"], 
                     index=DEGAS["label_gene"], 
                     columns=["PC"+str(x+1) for x in range(0,DEGAS["contribution_gene"].shape[1])])
# Remove variants that are in this dataframe for some reason
genes = genes.loc[~genes.index.isin(variants.index),:]
print(genes.shape)
genes.head()

(30084, 100)

sorted_genes = get_top_loadings(loadings=genes, top_PCs=top_PCs, top_n=10, var_name="gene")
sorted_genes

"IRX3" in sorted_genes.gene

False

Genes of particular interest

FTO: FTO affects obesity through regulation of hypothalamic neurons (and thus eatin behaviour), though this is achieved through its regulation of IRX3, which is a less direct mechanism than once thought. In relaton to AD:

"Recent studies revealed that carriers of common FTO gene polymorphisms show both a reduction in frontal lobe volume of the brain[39] and an impaired verbal fluency performance.[40] Fittingly, a population-based study from Sweden found that carriers of the FTO rs9939609 A allele have an increased risk for incident Alzheimer disease.[41]" -Wikipedia
FANCA: gene named after Fanconi anemia, a rare blood disorder.

save_table(df=genes.loc[:,top_PCs], df_path= "processed_data/DEGAS/AD_CVD.all_genes.csv.gz")
save_table(df=sorted_genes, df_path= "processed_data/DEGAS/AD_CVD.genes.csv.gz")

Saving file...

Saving file...

Determine outlier genes¶

Find the genes that signficantly more associated with these PCs than other PCs.
sklearn has a number of outlier/anomaly detection tools to do this. However these methods are all designed to make predictions from a matrix of the same size as the one you trained on, unless you fit_predict on the several PCs of interest directly. At which point, you're basically just fiddling with the hyperparameters to get the desired number of genes (very similar to the simple table sorting method).
gene-outlier-detection is explicitly designed for genes.

from sklearn.neighbors import LocalOutlierFactor

X = genes.loc[:,~genes.columns.isin(top_PCs)]
y = genes.loc[:,top_PCs]

# use fit_predict to compute the predicted labels of the training samples
# (when LOF is used for outlier detection, the estimator has no predict,
# decision_function and score_samples methods)
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.001)
y_pred = clf.fit_predict(y) 
y.index[y_pred==-1]

Index(['AC012485.2', 'ADAMTSL3', 'ANKRD11', 'CD82', 'CDK10', 'DPP4', 'EGFLAM',
       'ENSG00000198211,ENSG00000258839', 'ENSG00000230499,ENSG00000172965',
       'ENSG00000237813,ENSG00000105971', 'ENSG00000262304,ENSG00000196689',
       'FANCA', 'FTO', 'HMGXB3', 'IARS2', 'IFNG-AS1', 'KDELR3', 'LDHAL6B',
       'MSRA', 'PHAX', 'PIGU', 'PRSS55', 'PTCH1', 'RNU5E-4P', 'SPATA33',
       'TRPC4AP', 'TSPAN9', 'TYR', 'UMPS', 'UQCC1', 'ZNF597'],
      dtype='object')

### NOTE: have to add --ExecutePreprocessor.timeout=180 to avoid timeout.

# !jupyter nbconvert --to html_toc --ExecutePreprocessor.timeout=180 --execute code/DeGAs.ipynb

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC91	PC92	PC93	PC94	PC95	PC96	PC97	PC98	PC99	PC100
breast cysts	0.000005	1.186378e-06	0.000003	0.000005	2.035159e-06	7.562966e-07	1.338280e-06	6.545312e-07	8.990043e-07	1.580968e-07	...	1.517194e-06	1.540360e-05	4.016826e-07	2.819625e-05	3.942034e-05	2.701953e-05	5.498834e-08	8.833819e-06	1.367510e-06	6.571907e-08
bone disorder	0.000005	1.163787e-06	0.000002	0.000003	2.809106e-09	2.767192e-07	1.476105e-06	2.802009e-07	3.095311e-07	9.608308e-07	...	1.069943e-07	1.586188e-09	5.903842e-06	1.997779e-05	5.630284e-08	2.447236e-06	7.318879e-06	4.327306e-07	1.717049e-06	2.375211e-06
anxiety/panic attacks	0.000005	9.426096e-07	0.000002	0.000005	9.508454e-09	1.410584e-06	1.556532e-06	6.755476e-07	1.018647e-08	7.780577e-07	...	1.303051e-05	5.794538e-06	1.205579e-06	7.580794e-07	1.905491e-06	2.565167e-05	4.083603e-05	7.386000e-05	2.250169e-07	2.570055e-05
connective tissue disorder	0.000005	2.727648e-07	0.000002	0.000003	3.070556e-09	6.242487e-07	1.630909e-06	2.726991e-07	5.444983e-07	3.488909e-07	...	2.521400e-06	5.039999e-05	9.321211e-07	4.541067e-05	4.082073e-05	5.767555e-07	4.188419e-05	1.698208e-06	6.018270e-05	3.442434e-08
housemaid's knee (prepatellar bursitis)	0.000004	9.153760e-07	0.000001	0.000006	7.542438e-08	8.842300e-08	1.666502e-06	1.513856e-09	4.905633e-09	3.783048e-09	...	7.256107e-08	2.349348e-06	7.088202e-06	3.440427e-06	7.699747e-06	1.164741e-05	1.782329e-05	2.257302e-06	9.444018e-07	1.492720e-05
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Particulate matter air pollution 2.5-10um; 2010	0.000004	6.667194e-07	0.000002	0.000004	4.145418e-08	1.766194e-07	3.250524e-06	6.043835e-07	2.248262e-06	1.646005e-07	...	6.887772e-07	1.570322e-05	4.058734e-07	4.033574e-05	2.561271e-06	5.528692e-05	2.382160e-04	3.860690e-05	3.748469e-04	2.301656e-08
Arm fat-free mass (right)	0.050005	4.076553e-03	0.004779	0.000002	8.943901e-03	3.860991e-04	1.843923e-05	1.271482e-04	4.387143e-02	7.321188e-06	...	6.243377e-04	3.545923e-03	8.615398e-04	1.386767e-03	7.488123e-03	6.471627e-03	6.452772e-05	1.936609e-03	6.113287e-03	4.549085e-06
Trunk fat mass	0.010582	4.416343e-02	0.020901	0.000009	2.977132e-07	1.101556e-04	2.240712e-04	3.235084e-05	3.410710e-04	1.569382e-06	...	2.803052e-04	2.843188e-03	2.456978e-03	3.903862e-03	2.605272e-03	3.344793e-04	8.242964e-06	2.384700e-03	7.433331e-03	8.946398e-04
Trunk fat-free mass	0.059108	1.680808e-02	0.001528	0.000044	6.179815e-03	3.835175e-04	2.491919e-03	8.667685e-07	3.437652e-02	3.277293e-06	...	9.714820e-05	2.832675e-04	1.339791e-03	1.527432e-04	8.842225e-04	7.154613e-06	1.025064e-04	8.573282e-05	6.857318e-04	3.304525e-05
cannabis use frequency	0.000007	6.796395e-06	0.000002	0.000006	1.136185e-06	1.316573e-06	2.449321e-08	1.852046e-09	5.964184e-07	1.330250e-07	...	9.145058e-05	4.709951e-04	2.050217e-05	5.328953e-04	6.311615e-07	4.247831e-10	5.816937e-04	1.282400e-05	7.294808e-04	3.917194e-04

	Category	Phenotype name	Number of cases	All	Coding	PTVs	Global Biobank Engine phenotype page (URL)	label_phe_code	label_phe	CVD	AD	AD_CVD	N_cases
21	Disease outcome	atrial fibrillation	12665	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC281	HC281	atrial fibrillation	True	False	CVD	9.446677
23	Disease outcome	atrial flutter	12186	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC440	HC440	atrial flutter	True	False	CVD	9.408125
25	Disease outcome	heart attack / myocardial infarction	12138	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC326	HC326	heart attack/myocardial infarction	True	False	CVD	9.404179
61	Disease outcome	heart failure / pulmonary odema	4657	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC299	HC299	heart failure/pulmonary odema	True	False	CVD	8.446341
77	Disease outcome	peripheral vascular disease	3517	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC385	HC385	peripheral vascular disease	True	False	CVD	8.165648
96	Disease outcome	svt / supraventricular tachycardia	2653	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC131	HC131	svt/supraventricular tachycardia	True	False	CVD	7.883823
108	Disease outcome	heart valve problem / heart murmur	2326	Y	Y	N	https://biobankengine.stanford.edu/coding/HC231	HC231	heart valve problem/heart murmur	True	False	CVD	7.752335
117	Disease outcome	heart arrhythmia	1884	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC193	HC193	heart arrhythmia	True	False	CVD	7.541683
148	Disease outcome	cardiomyopathy	1309	Y	Y	N	https://biobankengine.stanford.edu/coding/HC414	HC414	cardiomyopathy	True	False	CVD	7.177782
159	Disease outcome	heart / cardiac problem	1148	Y	Y	N	https://biobankengine.stanford.edu/coding/HC88	HC88	heart/cardiac problem	True	False	CVD	7.046647
165	Disease outcome	hypertrophic cardiomyopathy (hcm / hocm)	1096	Y	Y	N	https://biobankengine.stanford.edu/coding/HC308	HC308	hypertrophic cardiomyopathy (hcm/hocm)	True	False	CVD	7.000334
202	Disease outcome	irregular heart beat	689	Y	Y	N	https://biobankengine.stanford.edu/coding/HC113	HC113	irregular heart beat	True	False	CVD	6.536692
225	Disease outcome	pericardial effusion	531	Y	Y	N	https://biobankengine.stanford.edu/coding/HC114	HC114	pericardial effusion	True	False	CVD	6.276643
336	Disease outcome	pericardial problem	179	Y	Y	N	https://biobankengine.stanford.edu/coding/HC213	HC213	pericardial problem	True	False	CVD	5.192957
416	Family History	Alzheimer's disease / dementia	51794	Y	Y	Y	https://biobankengine.stanford.edu/coding/FH1263	FH1263	Alzheimer's disease/dementia	False	True	AD	10.855049
1819	Imaging	Cardiac output	3374	Y	Y	Y	https://biobankengine.stanford.edu/coding/INI2...	INI22424	Cardiac output	True	False	CVD	8.124151
1820	Imaging	Cardiac index	3374	Y	Y	N	https://biobankengine.stanford.edu/coding/INI2...	INI22425	Cardiac index	True	False	CVD	8.124151
1821	Imaging	Average heart rate	3374	Y	Y	N	https://biobankengine.stanford.edu/coding/INI2...	INI22426	Average heart rate	True	False	CVD	8.124151
2067	Questionnaire (quantitative)	Age heart attack diagnosed	7942	Y	Y	N	https://biobankengine.stanford.edu/coding/INI3894	INI3894	Age heart attack diagnosed	True	False	CVD	8.980046

	label_phe	UMAP1	UMAP2	UMAP3	Category	Phenotype name	Number of cases	All	Coding	PTVs	Global Biobank Engine phenotype page (URL)	label_phe_code	CVD	AD	AD_CVD	N_cases	n_dims
0	breast cysts	0.363005	-0.623906	0.337725	Disease outcome	breast cysts	1398	Y	Y	N	https://biobankengine.stanford.edu/coding/HC139	HC139	False	False	0	7.243513	2
1	bone disorder	0.218490	-0.641017	-0.640685	Disease outcome	bone disorder	1459	Y	Y	N	https://biobankengine.stanford.edu/coding/HC134	HC134	False	False	0	7.286192	2
2	anxiety/panic attacks	0.550190	0.245177	-0.302014	Disease outcome	anxiety / panic attacks	4986	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC135	HC135	False	False	0	8.514590	2
3	connective tissue disorder	-0.340144	-0.294241	0.954099	Disease outcome	connective tissue disorder	2515	Y	Y	Y	https://biobankengine.stanford.edu/coding/HC136	HC136	False	False	0	7.830426	2
4	housemaid's knee (prepatellar bursitis)	-0.977615	0.074180	0.135035	Disease outcome	housemaid's knee (prepatellar bursitis)	194	Y	Y	N	https://biobankengine.stanford.edu/coding/HC130	HC130	False	False	0	5.273000	2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2181	Nitrogen dioxide air pollution; 2007	0.734835	0.388312	-0.202410	Miscellaneous (quantitative)	Nitrogen dioxide air pollution; 2007	332633	Y	Y	Y	https://biobankengine.stanford.edu/coding/INI2...	INI24018	False	False	0	12.714798	100
2182	Particulate matter air pollution 2.5-10um; 2010	0.769730	0.393652	-0.148391	Miscellaneous (quantitative)	Particulate matter air pollution 2.5-10um; 2010	308977	Y	Y	Y	https://biobankengine.stanford.edu/coding/INI2...	INI24008	False	False	0	12.641025	100
2183	Arm fat-free mass (right)	0.782294	-0.041789	-0.260538	Physical Measurement	Arm fat-free mass (right)	331418	Y	Y	Y	https://biobankengine.stanford.edu/coding/INI2...	INI23121	False	False	0	12.711139	100
2184	Trunk fat-free mass	0.778969	-0.047271	-0.277467	Physical Measurement	Trunk fat-free mass	331234	Y	Y	Y	https://biobankengine.stanford.edu/coding/INI2...	INI23129	False	False	0	12.710583	100
2185	cannabis use frequency	0.786820	0.321295	-0.237221	Miscellaneous (quantitative)	cannabis use frequency	110201	Y	Y	Y	https://biobankengine.stanford.edu/coding/INI2...	INI20453	False	False	0	11.610070	100

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC91	PC92	PC93	PC94	PC95	PC96	PC97	PC98	PC99	PC100
1-768448	6.573156e-11	3.092439e-11	9.456376e-11	6.875357e-10	2.179480e-11	4.109375e-11	2.230303e-10	1.037994e-10	2.827198e-11	3.474831e-12	...	6.254151e-08	4.105932e-08	1.997445e-08	3.240216e-08	8.507489e-08	6.155163e-08	2.384276e-08	1.900036e-09	3.657842e-09	9.608528e-09
1-779322	5.451681e-11	2.328294e-11	1.087348e-10	3.784154e-10	1.406952e-12	3.439214e-11	3.793652e-10	1.957522e-11	4.383288e-11	4.964640e-11	...	6.706287e-08	9.872628e-09	5.321880e-08	1.994167e-09	5.297409e-08	5.011643e-08	1.735938e-08	3.024225e-08	6.453343e-09	4.666550e-08
1-801536	3.222878e-11	3.630352e-11	8.586395e-11	1.975290e-10	3.905643e-10	4.160173e-10	4.893696e-10	7.373002e-11	5.356124e-10	2.192709e-10	...	6.181041e-08	1.492233e-06	2.462514e-08	1.095120e-06	1.101642e-05	1.185268e-06	1.193834e-05	4.711148e-07	3.123312e-07	3.400447e-06
1-834830	2.070836e-10	4.021792e-11	3.083290e-10	1.241294e-09	1.533956e-10	4.019168e-11	3.915927e-10	4.232564e-10	4.063007e-10	2.425424e-11	...	2.416501e-08	2.556066e-08	1.562400e-08	2.026922e-08	3.366161e-08	5.405308e-09	2.207275e-10	9.119161e-09	2.048419e-08	4.274474e-08
1-838555	9.782194e-10	1.058574e-08	9.669674e-11	8.206221e-08	2.632052e-08	3.857839e-07	4.470772e-08	1.178918e-06	9.459870e-08	8.375471e-07	...	6.936745e-06	1.281196e-05	9.884078e-07	5.406462e-06	6.958246e-07	7.719977e-07	2.613169e-06	1.916951e-06	5.602594e-08	2.401105e-06

	index	variant	value	abs_value
390313	PC5	16-53813367	0.001670	0.001670
120551	PC5	4-73541684	0.001120	0.001120
410853	PC5	17-61908556	0.001104	0.001104
255441	PC5	9-98231008	0.001096	0.001096
456645	PC5	20-62328375	0.001095	0.001095
280571	PC5	10-100017453	0.001094	0.001094
125219	PC5	4-106828795	0.001063	0.001063
407973	PC5	17-43798308	0.001053	0.001053
312171	PC5	12-3351169	0.000931	0.000931
445535	PC5	20-6621685	0.000904	0.000904
399186	PC28	16-89986117	0.038287	0.038287
399146	PC28	16-89818732	0.035850	0.035850
399114	PC28	16-89755903	0.026439	0.026439
166972	PC28	6-421281	0.025326	0.025326
399056	PC28	16-89507330	0.022765	0.022765
449890	PC28	20-33171772	0.012982	0.012982
449976	PC28	20-33889677	0.009735	0.009735
302450	PC28	11-89005253	0.008553	0.008553
449936	PC28	20-33642396	0.007781	0.007781
166960	PC28	6-384970	0.006131	0.006131

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC91	PC92	PC93	PC94	PC95	PC96	PC97	PC98	PC99	PC100
A1CF	7.800001e-06	1.711743e-05	2.850361e-05	3.162356e-06	3.544850e-05	3.964877e-06	1.460415e-04	1.086146e-05	7.899508e-06	6.540217e-07	...	1.670194e-06	2.811636e-06	1.239760e-05	3.260172e-06	5.115543e-06	8.824516e-06	1.538888e-06	4.649342e-07	6.945523e-06	4.671483e-06
A2M	1.104163e-10	5.306701e-11	2.143011e-10	3.051558e-10	7.210139e-11	2.106757e-10	1.886690e-10	1.743034e-10	1.577621e-10	6.543393e-11	...	2.572362e-07	5.292908e-09	4.242549e-08	3.805009e-07	3.284277e-07	8.658985e-07	3.502422e-08	6.082331e-07	2.191440e-07	1.981446e-08
A2ML1	1.740716e-09	5.578239e-08	4.763233e-08	1.926809e-05	7.736715e-07	1.032013e-04	4.003432e-06	3.129451e-05	8.190236e-07	3.554642e-05	...	1.755029e-05	2.267757e-05	1.123582e-05	2.305262e-05	1.095830e-05	1.443458e-05	2.900872e-05	1.459127e-05	8.960912e-06	2.107984e-05
A2ML1-AS1	7.995986e-11	5.989858e-08	3.514103e-08	1.521358e-05	5.702767e-07	8.047480e-05	9.763509e-07	4.072305e-06	3.783574e-07	3.905134e-07	...	6.777451e-06	5.150641e-05	1.105268e-05	1.416616e-05	2.243055e-05	1.447252e-05	1.189793e-05	9.211398e-06	1.276655e-05	1.780519e-05
A2MP1	9.373051e-10	9.507045e-10	1.272066e-09	6.965735e-09	1.553929e-10	3.108601e-09	1.588853e-09	2.774902e-10	1.660875e-09	8.044117e-10	...	1.045674e-06	2.447230e-06	1.123731e-06	4.625240e-06	2.342033e-06	5.069790e-07	5.545430e-06	5.737541e-06	1.016004e-06	5.918024e-06

	index	gene	value	abs_value
7525	PC5	ADAMTSL3	0.003688	0.003688
32529	PC5	FTO	0.003094	0.003094
47887	PC5	PTCH1	0.001891	0.001891
57099	PC5	TSPAN9	0.001868	0.001868
42319	PC5	MSRA	0.001837	0.001837
35947	PC5	JAZF1	0.001615	0.001615
2909	PC5	AC022784.6	0.001602	0.001602
41089	PC5	MECOM	0.001592	0.001592
18531	PC5	DOCK3	0.001420	0.001420
58611	PC5	WWP2	0.001420	0.001420
31700	PC28	FANCA	0.040987	0.040987
24010	PC28	ENSG00000198211,ENSG00000258839	0.040021	0.040021
15466	PC28	CDK10	0.026441	0.026441
10824	PC28	ANKRD11	0.024918	0.024918
46334	PC28	PIGU	0.013988	0.013988
57416	PC28	TYR	0.010455	0.010455
57824	PC28	UQCC1	0.009785	0.009785
53872	PC28	SPATA33	0.009363	0.009363
56970	PC28	TRPC4AP	0.007796	0.007796
15500	PC28	CDK5RAP1	0.006129	0.006129