01_process_gwas_summary.Rmd

---
title: "Format GWAS data and extract instruments"
output: pdf_document
author: "Marina Vabistsevits"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(vroom)
library(dplyr)
library(TwoSampleMR)
```

```{r}
# Configure paths to the pre-calculated data, which lives outside the code repo.
# The working environment is either `local` or `remote` (reading data from RDSF).
currently_working_env <- "local"
source("set_paths.R")
set_paths(currently_working_env)

# Metadata lookup table; it is iteratively read in and updated
data_lookup_file <- "metadata/data_lookup.csv"
data_lookup <- read_csv(data_lookup_file)

# Load the shared helper functions
source("functions.R")
```


```{r}
# supplementary (helper) functions

# Read a raw GWAS summary statistics file and convert it into the
# TwoSampleMR "outcome" format using format_data().
#
# Arguments:
#   file_gwas     path to the raw GWAS file (read with vroom)
#   data_version  layout of the input file; one of "ieu_gwas_pipeline",
#                 "ieu_gwas_pipeline_v2", "shared_subset", "bcac_md",
#                 "ukb_neale"
#
# Returns:
#   the data frame returned by format_data(type = "outcome")
#
# Errors:
#   - if `data_version` is not one of the supported values
#   - for "ukb_neale", if the GWAS file and the variants file differ in
#     row count (the rsID join is then not attempted)
#
# Note: the "ukb_neale" branch reads `variants_rsid_only.tsv` from the global
# `data_path_gwas_raw`, which must be defined before calling.
read_and_format <- function(file_gwas, data_version = "ieu_gwas_pipeline") {

  supported_versions <- c("ieu_gwas_pipeline", "ieu_gwas_pipeline_v2",
                          "shared_subset", "bcac_md", "ukb_neale")

  # Fail early with a clear message instead of the cryptic
  # "object 'out' not found" raised when no branch matches.
  if (!is.character(data_version) || length(data_version) != 1 ||
      !(data_version %in% supported_versions)) {
    stop("Unsupported data_version: '", paste(data_version, collapse = ", "),
         "'. Expected one of: ", paste(supported_versions, collapse = ", "),
         call. = FALSE)
  }

  # different versions of data formats to read in
  if (data_version == "ieu_gwas_pipeline") {
    # data produced by IEU GWAS pipeline
    out <- vroom(file_gwas,
                 col_select = c("SNP", "BETA", "SE", "ALLELE1", "ALLELE0",
                                "A1FREQ", "P_BOLT_LMM_INF")) %>%
      format_data(., type = "outcome",
                  snp_col = "SNP",
                  beta_col = "BETA",
                  se_col = "SE",
                  effect_allele_col = "ALLELE1",
                  other_allele_col = "ALLELE0",
                  eaf_col = "A1FREQ",
                  pval_col = "P_BOLT_LMM_INF")

  } else if (data_version == "ieu_gwas_pipeline_v2") {
    # data produced by IEU GWAS pipeline: pval col P_BOLT_LMM
    out <- vroom(file_gwas,
                 col_select = c("SNP", "BETA", "SE", "ALLELE1", "ALLELE0",
                                "A1FREQ", "P_BOLT_LMM")) %>%
      format_data(., type = "outcome",
                  snp_col = "SNP",
                  beta_col = "BETA",
                  se_col = "SE",
                  effect_allele_col = "ALLELE1",
                  other_allele_col = "ALLELE0",
                  eaf_col = "A1FREQ",
                  pval_col = "P_BOLT_LMM")

  } else if (data_version == "shared_subset") {
    # data shared as text files; pre-processed in script 00v1/2_..
    out <- vroom(file_gwas) %>%
      format_data(., type = "outcome",
                  snp_col = "SNP",
                  beta_col = "beta",
                  se_col = "se",
                  effect_allele_col = "effect_allele",
                  other_allele_col = "other_allele",
                  eaf_col = "effect_allele_freq",
                  pval_col = "pval")

  } else if (data_version == "bcac_md") {
    # processing of BCAC MD data from Chen paper

    # Z-score conversion to beta/se method from here:
    # https://ctg.cncr.nl/documents/p1651/readme.txt
    out <- vroom(file_gwas,
                 col_select = c("rsid", "Allele1", "Allele2", "freq",
                                "Zscore", "Weight", "P-value"))

    # calc beta and se:
    #   Beta = Zscore / sqrt( 2 * MAF * ( 1 - MAF ) * ( N + Zscore^2 ) )
    #   SE   = 1      / sqrt( 2 * MAF * ( 1 - MAF ) * ( N + Zscore^2 ) )
    # The `freq` column is used as MAF and the `Weight` column as N.
    out <- out %>%
      mutate(beta = Zscore / sqrt(2 * freq * (1 - freq) * (Weight + Zscore^2))) %>%
      mutate(se = 1 / sqrt(2 * freq * (1 - freq) * (Weight + Zscore^2)))

    out <- format_data(out, type = "outcome",
                       snp_col = "rsid",
                       beta_col = "beta",
                       se_col = "se",
                       effect_allele_col = "Allele1",
                       other_allele_col = "Allele2",
                       eaf_col = "freq",
                       pval_col = "P-value")

  } else if (data_version == "ukb_neale") {
    # UKB data produced by Neale lab
    print("reading variants")
    # this is a col subset of variants.tsv.bgz from Neale lab
    variants <- vroom(paste0(data_path_gwas_raw, "variants_rsid_only.tsv"))
    print("reading gwas")
    gwas <- vroom(file_gwas,
                  col_select = c("variant", "minor_allele", "minor_AF",
                                 "beta", "se", "pval"))

    print("joining")
    # Previously a mismatch skipped the join silently and the function then
    # failed with "object 'merged' not found"; report the real cause instead.
    if (nrow(gwas) != nrow(variants)) {
      stop("Row count mismatch between GWAS file (", nrow(gwas),
           " rows) and variants file (", nrow(variants),
           " rows); cannot attach rsIDs for: ", file_gwas,
           call. = FALSE)
    }
    # NOTE(review): the join requires minor_allele == alt, so rows whose minor
    # allele is not the alt allele get NA for `rsid` and `ref` -- confirm this
    # is the intended handling.
    merged <- left_join(gwas, variants,
                        by = c("variant" = "variant", "minor_allele" = "alt")) %>%
      select(-variant) %>%
      select(SNP = rsid, everything())

    print("formatting")
    out <- format_data(merged, type = "outcome",
                       snp_col = "SNP",
                       beta_col = "beta",
                       se_col = "se",
                       effect_allele_col = "minor_allele",
                       other_allele_col = "ref",
                       eaf_col = "minor_AF",
                       pval_col = "pval")

  }
  return(out)

}

# Select genome-wide significant SNPs (pval.outcome < 5e-8) from an
# outcome-formatted GWAS, convert them to exposure format and clump them
# with clump_r2 = 0.001.
extract_tophits <- function(outcome_gwas){
    significant_snps <- filter(outcome_gwas, pval.outcome < 5e-8)
    as_exposure <- convert_outcome_to_exposure(significant_snps)
    clump_data(as_exposure, clump_r2 = 0.001)
}

# Same as extract_tophits() but without the clumping step: keep SNPs with
# pval.outcome < 5e-8 and convert them to exposure format.
extract_tophits_wo_clump <- function(outcome_gwas){
    significant_snps <- filter(outcome_gwas, pval.outcome < 5e-8)
    convert_outcome_to_exposure(significant_snps)
}
```

```{r}
# Choose which full summary stats GWAS files to process: the data source
# and the trait category select the matching rows of the lookup table.
data_source <- "shared_subset"
trait_cat <- "md_unadj"
traits <- pull(
  filter(data_lookup, source == data_source, trait_category == trait_cat),
  trait
)
```

# Process every file in a standard way

- Format GWAS data into outcome format and save as `GWAS_tidy_outcome.txt.gz`
- Extract instruments and save as `tophits.tsv`

```{r message=F}

# File name suffixes appended to each trait's `trait_file_name`
tidy_gwas <- "_GWAS_tidy_outcome.txt.gz"
tidy_tophits <- "_tophits.tsv"


# For every selected trait: read and format the raw GWAS file, save the tidy
# outcome-format file, then extract the clumped tophits and save them.
for (current_trait in traits) {

  # Look the trait up once; both the raw file name and the output file stem
  # come from the same metadata row.
  trait_info <- data_lookup %>%
    filter(trait == current_trait, source == data_source, trait_category == trait_cat)

  # More than one match would silently turn the file paths below into vectors.
  if (nrow(trait_info) != 1) {
    stop("Expected exactly one data_lookup entry for trait '", current_trait,
         "' (source: ", data_source, ", trait_category: ", trait_cat,
         "), found ", nrow(trait_info), call. = FALSE)
  }

  gwas_filename <- trait_info[["original_file"]]
  data_name <- paste0(trait_info[["trait_file_name"]])
  file_gwas <- paste0(data_path_gwas_raw, gwas_filename)

  print(paste0("Processing: ", current_trait, ", ", gwas_filename))
  gwas_outcome_format <- read_and_format(file_gwas, data_version = data_source)
  gwas_outcome_format$outcome <- current_trait

  print("   -> finished formatting")

  tidy_gwas_path <- paste0(data_path_gwas, data_name, tidy_gwas)
  print(paste0("Saving tidy GWAS outcome file to: ", tidy_gwas_path))
  vroom_write(gwas_outcome_format, tidy_gwas_path)

  print("Extracting tophits")
  #gwas_outcome_format<-vroom(paste0(data_path_gwas, data_name, tidy_gwas )) # uncomment if need to run from here
  tophits <- extract_tophits(gwas_outcome_format)
  # Checking exists("tophits") could never fail after the assignment above;
  # check the returned value instead.
  if (!is.data.frame(tophits)) { stop("Extracting instruments failed!") }
  print(paste0("Found ", nrow(tophits), " SNPs at < 5e-8"))

  tophits_path <- paste0(data_path_tophits, data_name, tidy_tophits)
  write_tsv(tophits, tophits_path)
  print(paste0("Saved tophits file: ", tophits_path))

  # Free the large objects before the next trait is processed
  rm(gwas_outcome_format)
  rm(tophits)

}
```