library(here)
## here() starts at /Users/alanmurphy/Documents/ICL/R_packages/MAGMA_Files_Public
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
Analysis of metadata from MungeSumstats quality control and standardisation of 714 GWAS summary statistics from IEU GWAS.
meta <- data.table::fread(here::here("metadata.csv"), drop = "V1")
knitr::kable(head(meta))
id | trait | author | subcategory | ncontrol | mr | unit | consortium | group_name | population | year | ontology | priority | build | category | note | ncase | nsnp | pmid | sex | sample_size | sd | N | munged_path | id_standard | rows_start | snps_start | sig_snps_start | chroms_start | rows_end | snps_end | sig_snps_end | chroms_end | build_inferred | snps_flipped | snps_dropped_INFO | snps_dropped_nonRef | snps_dropped_nonA1A2 | snps_dropped_duplicates | snps_dropped_chrom | log_path | build_matches | genes_raw_url | genes_out_url |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ebi-a-GCST002245 | Alzheimer’s disease (late onset) | Lambert JC | NA | 37154 | 1 | logOR | NA | public | European | 2013 | NA | 0 | HG19/GRCh37 | NA | NA | 17008 | 7022150 | 24162737 | NA | 55134 | NA | 55134 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST002245/ebi-a-GCST002245.tsv.gz | EBI-A-GCST002245 | 7006673 | 7005018 | 1053 | 24 | 6772393 | 6770843 | 855 | 22 | GRCH37 | 70 | NA | 47581 | 109 | 4903 | 75 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST002245/logs/MungeSumstats_log_msg.txt | TRUE | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST002245.tsv.gz.35UP.10DOWN/ebi-a-GCST002245.tsv.gz.35UP.10DOWN.genes.raw | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST002245.tsv.gz.35UP.10DOWN/ebi-a-GCST002245.tsv.gz.35UP.10DOWN.genes.out |
ebi-a-GCST004692 | Amyotrophic lateral sclerosis | van Rheenen W | NA | 23475 | 1 | logOR | NA | public | European | 2016 | NA | 0 | HG19/GRCh37 | NA | NA | 12577 | 8643689 | 27455348 | NA | 36052 | NA | 36052 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST004692/ebi-a-GCST004692.tsv.gz | EBI-A-GCST004692 | 8623133 | 8623063 | 125 | 23 | 7677212 | 7677178 | 112 | 22 | GRCH37 | 211 | NA | 728310 | 349 | 140 | 2 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST004692/logs/MungeSumstats_log_msg.txt | TRUE | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST004692.tsv.gz.35UP.10DOWN/ebi-a-GCST004692.tsv.gz.35UP.10DOWN.genes.raw | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST004692.tsv.gz.35UP.10DOWN/ebi-a-GCST004692.tsv.gz.35UP.10DOWN.genes.out |
ebi-a-GCST004901 | Amyotrophic lateral sclerosis (sporadic) | Benyamin B | NA | 2850 | 1 | logOR | NA | public | East Asian | 2017 | NA | 0 | HG19/GRCh37 | NA | NA | 1234 | 4933403 | 28931804 | NA | 4084 | NA | 4084 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST004901/ebi-a-GCST004901.tsv.gz | EBI-A-GCST004901 | 4921720 | 4921717 | 116 | 22 | 4773761 | 4773760 | 113 | 22 | GRCH37 | 57 | NA | 13624 | 113 | 6 | NA | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST004901/logs/MungeSumstats_log_msg.txt | TRUE | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST004901.tsv.gz.35UP.10DOWN/ebi-a-GCST004901.tsv.gz.35UP.10DOWN.genes.raw | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST004901.tsv.gz.35UP.10DOWN/ebi-a-GCST004901.tsv.gz.35UP.10DOWN.genes.out |
ebi-a-GCST005647 | Amyotrophic lateral sclerosis | Nicolas A | NA | 59804 | 1 | logOR | NA | public | European | 2018 | NA | 0 | HG19/GRCh37 | NA | NA | 20806 | 39630630 | 29566793 | NA | 80610 | NA | 80610 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005647/ebi-a-GCST005647.tsv.gz | EBI-A-GCST005647 | 9526646 | 9526199 | 178 | 22 | 214053 | 214051 | 0 | 22 | GRCH37 | 202 | 8422400 | 881999 | 372 | 1590 | NA | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005647/logs/MungeSumstats_log_msg.txt | TRUE | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005647.tsv.gz.35UP.10DOWN/ebi-a-GCST005647.tsv.gz.35UP.10DOWN.genes.raw | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005647.tsv.gz.35UP.10DOWN/ebi-a-GCST005647.tsv.gz.35UP.10DOWN.genes.out |
ebi-a-GCST005920 | Paternal history of Alzheimer’s disease | Marioni RE | NA | 245941 | 1 | logOR | NA | public | European | 2018 | NA | 0 | HG19/GRCh37 | NA | NA | 14338 | 7776415 | 29777097 | NA | 260279 | NA | 260279 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005920/ebi-a-GCST005920.tsv.gz | EBI-A-GCST005920 | 7759506 | 7759499 | 199 | 23 | 7539989 | 7539984 | 192 | 22 | GRCH37 | 74 | NA | 37652 | 166 | 37 | 1 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005920/logs/MungeSumstats_log_msg.txt | TRUE | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005920.tsv.gz.35UP.10DOWN/ebi-a-GCST005920.tsv.gz.35UP.10DOWN.genes.raw | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005920.tsv.gz.35UP.10DOWN/ebi-a-GCST005920.tsv.gz.35UP.10DOWN.genes.out |
ebi-a-GCST005921 | Family history of Alzheimer’s disease | Marioni RE | NA | NA | 1 | NA | NA | public | European | 2018 | NA | 0 | HG19/GRCh37 | NA | NA | NA | 7746640 | 29777097 | NA | 314278 | NA | 314278 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005921/ebi-a-GCST005921.tsv.gz | EBI-A-GCST005921 | 7729799 | 7729792 | 632 | 23 | 7511067 | 7511062 | 610 | 22 | GRCH37 | 74 | NA | 37542 | 166 | 37 | 1 | /home/rstudio/raid1/bschilder/projects/MAGMA_Files_Public/data/GWAS_sumstats/ebi-a-GCST005921/logs/MungeSumstats_log_msg.txt | TRUE | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005921.tsv.gz.35UP.10DOWN/ebi-a-GCST005921.tsv.gz.35UP.10DOWN.genes.raw | https://github.com/neurogenomics/MAGMA_Files_Public/raw/master/data/MAGMA_Files/ebi-a-GCST005921.tsv.gz.35UP.10DOWN/ebi-a-GCST005921.tsv.gz.35UP.10DOWN.genes.out |
First, let’s get a sense of the characteristics of the different GWAS summary statistics:
variables <- c("category", "subcategory", "consortium", "population", "unit")
# palettes <- rownames(subset(RColorBrewer::brewer.pal.info, category=="div"))
palettes <- c(pals::stevens.pinkblue,
pals::arc.bluepink,
pals::brewer.bugn,
pals::brewer.seqseq2,
pals::brewer.bupu)
gg_pies <- lapply(seq_len(length(variables)), function(i){
v <- variables[i]
pal <- palettes[i]
n_colors <- dplyr::n_distinct(meta[[v]])
message(v)
#### Prepare data ####
pie_data <- meta %>%
dplyr::mutate_at(.vars = dplyr::all_of(v),
.funs = function(v){ifelse(v=="NA", NA, v)}) %>%
dplyr::group_by_at(dplyr::all_of(v)) %>%
dplyr::count() %>%
data.table::data.table() %>%
dplyr::mutate_at(.vars = v,
.funs = stringr::str_trunc, 25)
#### Plot ####
ggplot(pie_data,
aes_string(x=1, y="n", fill=v)) +
geom_bar(width = 1, stat = "identity") +
coord_polar(theta = "y") +
labs(title=v, x=NULL, y=NULL) +
theme_bw()+
theme(aspect.ratio = 1,
legend.position = "bottom",
legend.title = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank()) +
guides(fill = guide_legend(ncol = 2)) +
scale_fill_manual(values = palettes[[i]](n = n_colors))
}) %>% `names<-`(variables)
## category
## subcategory
## consortium
## population
## unit
patchwork::wrap_plots(gg_pies, nrow = 1)
variables <- c("nsnp","N","population")
time_data <- meta %>%
dplyr::select(id, year, all_of(variables)) %>%
dplyr::mutate(year=factor(year,
levels = sort(unique(meta$year)),
ordered = TRUE),
population = stringr::str_trunc(population, 25))
gg_time <- lapply(seq_len(length(variables)), function(i){
v <- variables[i]
message(v)
ggplot(time_data,
aes_string(x="year", y=v, color=v)) +
geom_boxplot() +
geom_point(alpha=.5) +
geom_jitter(height = 0) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title=v)
}) %>% `names<-`(variables)
## nsnp
## N
## population
patchwork::wrap_plots(gg_time, ncol = 1)
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
All genome builds on IEU GWAS should be hg19/GRCh37, is this actually the case?:
ggplot(meta,aes(x=build_inferred,fill=build_inferred))+
geom_bar()+
geom_text(aes( label = paste0(format(100*..count../nrow(meta), digits=2,
drop0trailing=TRUE),"% - ",..count..),
y= ..count.. ), stat= "count", vjust = -.5)+
scale_fill_manual(values=c("purple","orange"))+
theme_bw()+
labs(y= "Number of Studies",
x = "Inferred Genomic Build") +
theme(strip.background=element_blank(),
legend.position = "none")
So 25 studies of what we evaluated actually are GRCh38!
More detail:
library(ggalluvial)
alluv_data <- meta %>%
dplyr::select(build, build_inferred, build_matches, category) %>%
dplyr::mutate(n=1,
build = gsub("HG19/GRCh37","GRCH37",build))
# ggalluvial::is_alluvia_form(alluv_data)
ggplot(alluv_data,
aes(y = n,
axis1 = category,
axis2 = build_inferred)) +
ggalluvial::geom_alluvium(aes(fill=build_matches)) +
ggalluvial::geom_stratum(width = 1/12, fill = "black", color = "grey") +
geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
scale_x_discrete(limits = c("build", "build_inferred"), expand = c(.05, .05)) +
scale_fill_brewer(type = "qual", palette = "Set1") +
theme_bw() +
labs(title="Genome builds: OpenGWAS metadata vs. Inferred")
variables <- sort(grep("snps",colnames(meta), value = TRUE),
decreasing = TRUE)
filt_data <- meta[,..variables]
Overview charts:
gg_hist <- lapply(seq_len(length(variables)), function(i){
v <- variables[i]
message(v)
ggplot(filt_data,
aes_string(x = v)) +
geom_histogram(bins = 75) +
theme_bw()
}) %>% `names<-`(variables)
## snps_start
## snps_flipped
## snps_end
## snps_dropped_nonRef
## snps_dropped_nonA1A2
## snps_dropped_INFO
## snps_dropped_duplicates
## snps_dropped_chrom
## sig_snps_start
## sig_snps_end
patchwork::wrap_plots(gg_hist, ncol = 2)
## Warning: Removed 264 rows containing non-finite values (stat_bin).
## Warning: Removed 17 rows containing non-finite values (stat_bin).
## Warning: Removed 41 rows containing non-finite values (stat_bin).
## Warning: Removed 482 rows containing non-finite values (stat_bin).
## Warning: Removed 646 rows containing non-finite values (stat_bin).
How many SNPs fail quality control checks?:
meta[,snps_dropped_total:=nsnp-snps_end]
#add constand column for plot
meta[,sumstats:="sumstats"]
ggplot(meta,aes(x=sumstats,y=snps_dropped_total/nsnp,fill=sumstats))+
geom_violin()+
geom_boxplot(width=0.05)+
theme_bw()+
labs(y= "Proportion of SNPs failed quality control", x = "") +
theme(strip.background=element_blank(),
legend.position = "none")
## Warning: Removed 6 rows containing non-finite values (stat_ydensity).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
And why are these SNPs removed?:
snps_dropped <- melt(meta[,c("id","nsnp","snps_dropped_nonA1A2",
"snps_dropped_nonRef","snps_dropped_chrom",
"snps_dropped_INFO","snps_dropped_duplicates")],
id.vars=c("id","nsnp"))
#remove snps_dropped_
snps_dropped[,variable:=substring(variable, 14)]
ggplot(snps_dropped,aes(x=variable,y=value/nsnp,fill=variable))+
geom_violin()+
geom_boxplot(width=0.02)+
theme_bw()+
labs(y= "Proportion of SNPs failed quality control",
x = "Exclusion criteria") +
theme(strip.background=element_blank(),
legend.position = "none")
## Warning: Removed 1210 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1210 rows containing non-finite values (stat_boxplot).
INFO filtering is the biggest determinant, if we remove and replot?
ggplot(snps_dropped[variable!="INFO"],
aes(x=variable,y=value/nsnp,fill=variable))+
geom_violin()+
geom_boxplot(width=0.02)+
theme_bw()+
labs(y= "Proportion of SNPs failed quality control",
x = "Exclusion criteria") +
theme(strip.background=element_blank(),
legend.position = "none")
## Warning: Removed 1163 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1163 rows containing non-finite values (stat_boxplot).
Further detail:
filt_data2 <- filt_data %>% data.table::melt.data.table()
## Warning in data.table::melt.data.table(.): id.vars and measure.vars are
## internally guessed when both are 'NULL'. All non-numeric/integer/logical type
## columns are considered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
gg_filt <- ggplot(filt_data2, aes(x=value, y=variable)) +
geom_boxplot() +
geom_point(alpha=.5) +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))
gg_filt
## Warning: Removed 1450 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1450 rows containing missing values (geom_point).
The effect direction or SNPs is also evaluated against a reference genome and in cases where it is incorrect, it is corrected. Let’s check how many are corrected:
ggplot(meta,aes(x=snps_flipped/nsnp,fill=sumstats))+
geom_histogram(bins = 75)+
theme_bw()+
labs(x= "Proportion of SNPs flipped", x = "Number of studies") +
theme(strip.background=element_blank(),
legend.position = "none")
## Warning: Removed 270 rows containing non-finite values (stat_bin).
There are about 25/30 GWAS sumstats which contain SNPs with an incorrect direction (accounting for about 18% of all SNPs in those studies).
utils::sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_IE.UTF-8/en_IE.UTF-8/en_IE.UTF-8/C/en_IE.UTF-8/en_IE.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggalluvial_0.12.3 data.table_1.14.2 dplyr_1.0.7 ggplot2_3.3.5
## [5] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-2 highr_0.9 bslib_0.3.1 compiler_4.1.1
## [5] pillar_1.6.4 jquerylib_0.1.4 pals_1.7 tools_4.1.1
## [9] digest_0.6.29 jsonlite_1.7.2 evaluate_0.14 lifecycle_1.0.1
## [13] tibble_3.1.6 gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.12
## [17] DBI_1.1.1 mapproj_1.2.7 patchwork_1.1.1 yaml_2.2.1
## [21] xfun_0.27 fastmap_1.1.0 withr_2.4.3 stringr_1.4.0
## [25] knitr_1.36 maps_3.4.0 generics_0.1.1 sass_0.4.0
## [29] vctrs_0.3.8 rprojroot_2.0.2 grid_4.1.1 tidyselect_1.1.1
## [33] glue_1.5.1 R6_2.5.1 fansi_0.5.0 rmarkdown_2.11
## [37] tidyr_1.1.4 farver_2.1.0 purrr_0.3.4 magrittr_2.0.1
## [41] scales_1.1.1 htmltools_0.5.2 ellipsis_0.3.2 dichromat_2.0-0
## [45] assertthat_0.2.1 colorspace_2.0-2 labeling_0.4.2 utf8_1.2.2
## [49] stringi_1.7.6 munsell_0.5.0 crayon_1.4.2