library(here)
## here() starts at /Users/alanmurphy/Documents/ICL/R_packages/MAGMA_Files_Public
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

Background

Analysis of metadata from MungeSumstats quality control and standardisation of 714 GWAS summary statistics from IEU GWAS.

Import metadata

meta <- data.table::fread(here::here("metadata.csv"), drop = "V1")

knitr::kable(head(meta))
id trait author subcategory ncontrol mr unit consortium group_name population year ontology priority build category note ncase nsnp pmid sex sample_size sd N munged_path id_standard rows_start snps_start sig_snps_start chroms_start rows_end snps_end sig_snps_end chroms_end build_inferred snps_flipped snps_dropped_INFO snps_dropped_nonRef snps_dropped_nonA1A2 snps_dropped_duplicates snps_dropped_chrom log_path build_matches genes_raw_url genes_out_url
ebi-a-GCST002245 Alzheimer’s disease (late onset) Lambert JC NA 37154 1 logOR NA public European 2013 NA 0 HG19/GRCh37 NA NA 17008 7022150 24162737 NA 55134 NA 55134 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST002245/ebi-a-GCST002245.tsv.gz EBI-A-GCST002245 7006673 7005018 1053 24 6772393 6770843 855 22 GRCH37 70 NA 47581 109 4903 75 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST002245/logs/MungeSumstats_log_msg.txt TRUE https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST002245.tsv.gz.35UP.10DOWN/ebi-a-GCST002245.tsv.gz.35UP.10DOWN.genes.raw https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST002245.tsv.gz.35UP.10DOWN/ebi-a-GCST002245.tsv.gz.35UP.10DOWN.genes.out
ebi-a-GCST004692 Amyotrophic lateral sclerosis van Rheenen W NA 23475 1 logOR NA public European 2016 NA 0 HG19/GRCh37 NA NA 12577 8643689 27455348 NA 36052 NA 36052 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST004692/ebi-a-GCST004692.tsv.gz EBI-A-GCST004692 8623133 8623063 125 23 7677212 7677178 112 22 GRCH37 211 NA 728310 349 140 2 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST004692/logs/MungeSumstats_log_msg.txt TRUE https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST004692.tsv.gz.35UP.10DOWN/ebi-a-GCST004692.tsv.gz.35UP.10DOWN.genes.raw https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST004692.tsv.gz.35UP.10DOWN/ebi-a-GCST004692.tsv.gz.35UP.10DOWN.genes.out
ebi-a-GCST004901 Amyotrophic lateral sclerosis (sporadic) Benyamin B NA 2850 1 logOR NA public East Asian 2017 NA 0 HG19/GRCh37 NA NA 1234 4933403 28931804 NA 4084 NA 4084 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST004901/ebi-a-GCST004901.tsv.gz EBI-A-GCST004901 4921720 4921717 116 22 4773761 4773760 113 22 GRCH37 57 NA 13624 113 6 NA /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST004901/logs/MungeSumstats_log_msg.txt TRUE https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST004901.tsv.gz.35UP.10DOWN/ebi-a-GCST004901.tsv.gz.35UP.10DOWN.genes.raw https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST004901.tsv.gz.35UP.10DOWN/ebi-a-GCST004901.tsv.gz.35UP.10DOWN.genes.out
ebi-a-GCST005647 Amyotrophic lateral sclerosis Nicolas A NA 59804 1 logOR NA public European 2018 NA 0 HG19/GRCh37 NA NA 20806 39630630 29566793 NA 80610 NA 80610 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005647/ebi-a-GCST005647.tsv.gz EBI-A-GCST005647 9526646 9526199 178 22 214053 214051 0 22 GRCH37 202 8422400 881999 372 1590 NA /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005647/logs/MungeSumstats_log_msg.txt TRUE https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005647.tsv.gz.35UP.10DOWN/ebi-a-GCST005647.tsv.gz.35UP.10DOWN.genes.raw https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005647.tsv.gz.35UP.10DOWN/ebi-a-GCST005647.tsv.gz.35UP.10DOWN.genes.out
ebi-a-GCST005920 Paternal history of Alzheimer’s disease Marioni RE NA 245941 1 logOR NA public European 2018 NA 0 HG19/GRCh37 NA NA 14338 7776415 29777097 NA 260279 NA 260279 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005920/ebi-a-GCST005920.tsv.gz EBI-A-GCST005920 7759506 7759499 199 23 7539989 7539984 192 22 GRCH37 74 NA 37652 166 37 1 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005920/logs/MungeSumstats_log_msg.txt TRUE https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005920.tsv.gz.35UP.10DOWN/ebi-a-GCST005920.tsv.gz.35UP.10DOWN.genes.raw https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005920.tsv.gz.35UP.10DOWN/ebi-a-GCST005920.tsv.gz.35UP.10DOWN.genes.out
ebi-a-GCST005921 Family history of Alzheimer’s disease Marioni RE NA NA 1 NA NA public European 2018 NA 0 HG19/GRCh37 NA NA NA 7746640 29777097 NA 314278 NA 314278 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005921/ebi-a-GCST005921.tsv.gz EBI-A-GCST005921 7729799 7729792 632 23 7511067 7511062 610 22 GRCH37 74 NA 37542 166 37 1 /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005921/logs/MungeSumstats_log_msg.txt TRUE https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005921.tsv.gz.35UP.10DOWN/ebi-a-GCST005921.tsv.gz.35UP.10DOWN.genes.raw https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005921.tsv.gz.35UP.10DOWN/ebi-a-GCST005921.tsv.gz.35UP.10DOWN.genes.out

Plot

Categorical metadata

First, let’s get a sense of the characteristics of the different GWAS summary statistics:

variables <- c("category", "subcategory", "consortium", "population", "unit")
# palettes <- rownames(subset(RColorBrewer::brewer.pal.info, category=="div"))
palettes <- c(pals::stevens.pinkblue, 
              pals::arc.bluepink,
              pals::brewer.bugn,
              pals::brewer.seqseq2,
              pals::brewer.bupu)

gg_pies <- lapply(seq_len(length(variables)), function(i){
  v <- variables[i]
  pal <- palettes[i]
  n_colors <- dplyr::n_distinct(meta[[v]])
  message(v)
  #### Prepare data ####
  pie_data <- meta %>% 
  dplyr::mutate_at(.vars = dplyr::all_of(v), 
                   .funs = function(v){ifelse(v=="NA", NA, v)}) %>%
  dplyr::group_by_at(dplyr::all_of(v)) %>%
  dplyr::count() %>%
  data.table::data.table() %>% 
  dplyr::mutate_at(.vars = v,
                   .funs = stringr::str_trunc, 25)
  #### Plot ####                  
  ggplot(pie_data, 
         aes_string(x=1, y="n", fill=v)) + 
  geom_bar(width = 1, stat = "identity") +
  coord_polar(theta = "y") + 
  labs(title=v, x=NULL, y=NULL) + 
  theme_bw()+ 
  theme(aspect.ratio = 1, 
        legend.position = "bottom",
        legend.title = element_blank(), 
        axis.text.x = element_blank(),
        axis.text.y = element_blank()) +
  guides(fill = guide_legend(ncol = 2)) + 
  scale_fill_manual(values = palettes[[i]](n = n_colors))
}) %>% `names<-`(variables)
## category
## subcategory
## consortium
## population
## unit
patchwork::wrap_plots(gg_pies, nrow = 1)

Over time

variables <- c("nsnp","N","population")
time_data <- meta %>% 
  dplyr::select(id, year, all_of(variables)) %>%
  dplyr::mutate(year=factor(year,
                            levels = sort(unique(meta$year)),
                            ordered = TRUE), 
                population = stringr::str_trunc(population, 25))

gg_time <- lapply(seq_len(length(variables)), function(i){
  v <- variables[i]
  message(v)
  ggplot(time_data, 
         aes_string(x="year", y=v, color=v)) +
    geom_boxplot() + 
    geom_point(alpha=.5) +
    geom_jitter(height = 0) + 
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title=v)
}) %>% `names<-`(variables)
## nsnp
## N
## population
patchwork::wrap_plots(gg_time, ncol = 1)
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).

## Warning: Removed 6 rows containing missing values (geom_point).

Genome builds

All genome builds on IEU GWAS should be hg19/GRCh37, is this actually the case?:

ggplot(meta,aes(x=build_inferred,fill=build_inferred))+
  geom_bar()+
  geom_text(aes( label = paste0(format(100*..count../nrow(meta), digits=2, 
                                  drop0trailing=TRUE),"% - ",..count..),
                 y= ..count.. ), stat= "count", vjust = -.5)+
  scale_fill_manual(values=c("purple","orange"))+
  theme_bw()+
  labs(y= "Number of Studies", 
       x = "Inferred Genomic Build") +
  theme(strip.background=element_blank(),
        legend.position = "none")

So 25 studies of what we evaluated actually are GRCh38!

More detail:

library(ggalluvial) 
alluv_data <- meta %>% 
  dplyr::select(build, build_inferred, build_matches, category) %>%
  dplyr::mutate(n=1, 
                build = gsub("HG19/GRCh37","GRCH37",build))

# ggalluvial::is_alluvia_form(alluv_data)
ggplot(alluv_data, 
       aes(y = n, 
           axis1 = category,
           axis2 = build_inferred)) +
  ggalluvial::geom_alluvium(aes(fill=build_matches)) +
  ggalluvial::geom_stratum(width = 1/12, fill = "black", color = "grey") +
  geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("build", "build_inferred"), expand = c(.05, .05)) +
  scale_fill_brewer(type = "qual", palette = "Set1") +
  theme_bw() +
  labs(title="Genome builds: OpenGWAS metadata vs. Inferred")

SNP filtering

variables <- sort(grep("snps",colnames(meta), value = TRUE),
                  decreasing = TRUE)
filt_data <- meta[,..variables]

Quality Control of SNPs

Overview charts:

gg_hist <- lapply(seq_len(length(variables)), function(i){
  v <- variables[i]
  message(v)
  ggplot(filt_data, 
         aes_string(x = v)) +
  geom_histogram(bins = 75) + 
  theme_bw() 
}) %>% `names<-`(variables)
## snps_start
## snps_flipped
## snps_end
## snps_dropped_nonRef
## snps_dropped_nonA1A2
## snps_dropped_INFO
## snps_dropped_duplicates
## snps_dropped_chrom
## sig_snps_start
## sig_snps_end
patchwork::wrap_plots(gg_hist, ncol = 2)
## Warning: Removed 264 rows containing non-finite values (stat_bin).
## Warning: Removed 17 rows containing non-finite values (stat_bin).
## Warning: Removed 41 rows containing non-finite values (stat_bin).
## Warning: Removed 482 rows containing non-finite values (stat_bin).
## Warning: Removed 646 rows containing non-finite values (stat_bin).

How many SNPs fail quality control checks?:

meta[,snps_dropped_total:=nsnp-snps_end]
#add constand column for plot
meta[,sumstats:="sumstats"]
ggplot(meta,aes(x=sumstats,y=snps_dropped_total/nsnp,fill=sumstats))+
  geom_violin()+
  geom_boxplot(width=0.05)+
  theme_bw()+
  labs(y= "Proportion of SNPs failed quality control", x = "") +
  theme(strip.background=element_blank(),
        legend.position = "none")
## Warning: Removed 6 rows containing non-finite values (stat_ydensity).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

And why are these SNPs removed?:

snps_dropped <- melt(meta[,c("id","nsnp","snps_dropped_nonA1A2",
                             "snps_dropped_nonRef","snps_dropped_chrom",
                             "snps_dropped_INFO","snps_dropped_duplicates")],
                     id.vars=c("id","nsnp"))
#remove snps_dropped_
snps_dropped[,variable:=substring(variable, 14)]

ggplot(snps_dropped,aes(x=variable,y=value/nsnp,fill=variable))+
  geom_violin()+
  geom_boxplot(width=0.02)+
  theme_bw()+
  labs(y= "Proportion of SNPs failed quality control", 
       x = "Exclusion criteria") +
  theme(strip.background=element_blank(),
        legend.position = "none")
## Warning: Removed 1210 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1210 rows containing non-finite values (stat_boxplot).

INFO filtering is the biggest determinant, if we remove and replot?

ggplot(snps_dropped[variable!="INFO"],
        aes(x=variable,y=value/nsnp,fill=variable))+
  geom_violin()+
  geom_boxplot(width=0.02)+
  theme_bw()+
  labs(y= "Proportion of SNPs failed quality control", 
       x = "Exclusion criteria") +
  theme(strip.background=element_blank(),
        legend.position = "none")
## Warning: Removed 1163 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1163 rows containing non-finite values (stat_boxplot).

Further detail:

filt_data2 <- filt_data %>% data.table::melt.data.table()
## Warning in data.table::melt.data.table(.): id.vars and measure.vars are
## internally guessed when both are 'NULL'. All non-numeric/integer/logical type
## columns are considered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
gg_filt <- ggplot(filt_data2, aes(x=value, y=variable)) +
  geom_boxplot() +
  geom_point(alpha=.5) + 
  theme_bw() +
  theme(axis.text.x = element_text(angle=45, hjust=1))

gg_filt
## Warning: Removed 1450 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1450 rows containing missing values (geom_point).

Effect direction of SNPs

The effect direction or SNPs is also evaluated against a reference genome and in cases where it is incorrect, it is corrected. Let’s check how many are corrected:

ggplot(meta,aes(x=snps_flipped/nsnp,fill=sumstats))+
  geom_histogram(bins = 75)+
  theme_bw()+
  labs(x= "Proportion of SNPs flipped", x = "Number of studies") +
  theme(strip.background=element_blank(),
        legend.position = "none")
## Warning: Removed 270 rows containing non-finite values (stat_bin).

There are about 25/30 GWAS sumstats which contain SNPs with an incorrect direction (accounting for about 18% of all SNPs in those studies).

Session Info

utils::sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_IE.UTF-8/en_IE.UTF-8/en_IE.UTF-8/C/en_IE.UTF-8/en_IE.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggalluvial_0.12.3 data.table_1.14.2 dplyr_1.0.7       ggplot2_3.3.5    
## [5] here_1.0.1       
## 
## loaded via a namespace (and not attached):
##  [1] RColorBrewer_1.1-2 highr_0.9          bslib_0.3.1        compiler_4.1.1    
##  [5] pillar_1.6.4       jquerylib_0.1.4    pals_1.7           tools_4.1.1       
##  [9] digest_0.6.29      jsonlite_1.7.2     evaluate_0.14      lifecycle_1.0.1   
## [13] tibble_3.1.6       gtable_0.3.0       pkgconfig_2.0.3    rlang_0.4.12      
## [17] DBI_1.1.1          mapproj_1.2.7      patchwork_1.1.1    yaml_2.2.1        
## [21] xfun_0.27          fastmap_1.1.0      withr_2.4.3        stringr_1.4.0     
## [25] knitr_1.36         maps_3.4.0         generics_0.1.1     sass_0.4.0        
## [29] vctrs_0.3.8        rprojroot_2.0.2    grid_4.1.1         tidyselect_1.1.1  
## [33] glue_1.5.1         R6_2.5.1           fansi_0.5.0        rmarkdown_2.11    
## [37] tidyr_1.1.4        farver_2.1.0       purrr_0.3.4        magrittr_2.0.1    
## [41] scales_1.1.1       htmltools_0.5.2    ellipsis_0.3.2     dichromat_2.0-0   
## [45] assertthat_0.2.1   colorspace_2.0-2   labeling_0.4.2     utf8_1.2.2        
## [49] stringi_1.7.6      munsell_0.5.0      crayon_1.4.2