08_gene_pathway_exploration.Rmd

---
title: "07_pathway_exploring"
author: "Marina Vabistsevits"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(vroom)

library(tidyr)
library(tibble)
library(dplyr)
library(ggplot2)

#library(gprofiler2)
#library(httr)
#library(jsonlite)
#library(GenomicRanges)
#library(biomaRt)

library(ReactomeContentService4R)
library(enrichR)
library(eulerr)
```

```{r message=F}
# Pre-calculated data is kept outside the code repo; choose where to read it
# from: `local` / `remote` (remote = reading data from RDSF).
currently_working_env <- "local"
source("set_paths.R")
# presumably defines `results_path` used by the chunks below -- confirm in set_paths.R
set_paths(currently_working_env)

# metafile
#data_lookup<-read_csv(paste0("metadata/data_lookup.csv")) 
#bcac_lookup<-read_csv(paste0("metadata/data_lookup_BCAC.csv")) 

# project helper functions
source("functions.R")
```

# Select and load data 

```{r}
# dense
exp_code <- "NDA_unadj"

# SNP-to-cluster table for the selected experiment code
cluster_df <- paste0(results_path, "MRClust_results/", exp_code, "_snps_clusters.tsv") %>%
  read_tsv()

```

# Map SNPs to genes

Run a FUMA job with the list of SNPs (the input file is produced in 06_instruments_review.Rmd) and save the FUMA results; the `snps.txt` and `eqtl.txt` output files are read below.

## FUMA

```{r}
# Read the FUMA SNP-to-gene mapping and attach it to the clustered SNPs.

# FUMA was run on all gws SNPs; the left joins onto cluster_df below keep only
# what is needed when we are working with a subset.
if (grepl("all", exp_code)) {
  exp_code_fuma <- exp_code
} else {
  exp_code_fuma <- paste0(exp_code, "_all_eQTL")
}

# positional mapping (nearest gene per SNP); the join uses the columns the two
# tables have in common
snps <- paste0(results_path, "FUMA/FUMA_", exp_code_fuma, "/snps.txt") %>%
  read_tsv()
map1 <- distinct(
  left_join(
    cluster_df,
    dplyr::select(snps, uniqID, rsID, nearestGene, r2, eqtlMapFilt)
  )
)

# eQTL mapping: one row per SNP / tissue / eQTL gene, matched on uniqID
eqtls <- paste0(results_path, "FUMA/FUMA_", exp_code_fuma, "/eqtl.txt") %>%
  read_tsv()
map2 <- left_join(
  map1,
  distinct(dplyr::select(eqtls, uniqID, tissue, FDR, gene_eQTL = symbol)),
  by = "uniqID"
)

# keep the wide table on disk; it is re-read by the next chunk
write_tsv(
  map2,
  paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_snps_clusters_genes_wide.tsv")
)


```

```{r}
# Transform the wide SNP-gene mapping into long format: one row per
# cluster / SNP / gene / mapping source (plus tissue and FDR for eQTL genes).
# NB: remember to select the cluster direction annotation appropriate for the
# MD pheno being analysed (see the alternatives at the end of the pipeline)!

map2 <- read_tsv(paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_snps_clusters_genes_wide.tsv"))

cluster_df_map <- map2 %>%
  pivot_longer(cols = c("nearestGene", "gene_eQTL"), names_to = "source", values_to = "gene") %>%
  mutate(source = ifelse(source == "nearestGene", "positional mapping", "eQTL")) %>%
  # tissue only describes eQTL-derived genes; blank it for positional rows
  mutate(qtl_group = ifelse(source != "eQTL", NA, tissue)) %>%
  filter(!is.na(gene)) %>%
  dplyr::select(cluster, rsID, gene, source, qtl_group, qtl_FDR = FDR) %>%
  # likewise FDR is only meaningful where a tissue is recorded
  mutate(qtl_FDR = ifelse(is.na(qtl_group), NA, qtl_FDR)) %>%
  # De-duplicate only AFTER the eQTL-specific columns have been dropped/blanked:
  # the wide table repeats each SNP once per eQTL record, so the positional
  # mapping rows would otherwise be saved as exact duplicates (and later fan
  # out the gene-to-pathway join and inflate the pathway counts).
  distinct() %>%
  arrange(cluster, rsID) %>%
   # DA & PD
  #mutate(cluster_effect_direction = case_when(cluster %in% c("cluster_1", "cluster_Null") ~ "negative",
  #                               rsID == "rs73169097" ~ "negative",
  #                               TRUE ~ "positive"))
  # NDA
  mutate(cluster_effect_direction = case_when(rsID == "rs75772194" ~ "positive",
                                 TRUE ~ "negative"))

# save tidy version
write_tsv(cluster_df_map,
          paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_snps_clusters_genes_long.tsv")) # supl file for 
```


# Pathways - enrichR
```{r}


# Optional check of the new mapping against the older annotation file:
#cluster_df_genes2_subOLD <-  read_tsv(paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_snps_clusters_gene_annotation.tsv")) 
#setdiff(cluster_df_map$gene, cluster_df_genes2_subOLD$gene)
#setdiff( cluster_df_genes2_subOLD$gene, cluster_df_map$gene)

# load manually updated mapping file (long format, one gene per row)
cluster_df_map <- paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_snps_clusters_genes_long.tsv") %>%
  read_tsv()

```

```{r}
# Annotate the MR-Clust forest plot with the gene names mapped to each SNP
library(ggplot2)

# load() restores the saved object `forest_data` into the session
load(paste0(results_path, "MRClust_results/", exp_code, "_forest_plot.Rdata"))
forest_by_clusters <- forest_data$forest_by_clusters
res_single <- forest_data$res_single

# attach genes to the per-SNP results, then collapse all genes of a SNP into a
# single comma-separated label (grouping by every column except `gene`)
res_single2 <- left_join(
  res_single,
  dplyr::select(cluster_df_map, rsID, gene),
  by = c("SNP" = "rsID")
) %>%
  distinct() %>%
  group_by_at(vars(-gene)) %>%
  summarize(gene = toString(gene)) %>%
  ungroup()

# gene labels are drawn slightly above each SNP estimate
p_genes <- forest_by_clusters$p +
  geom_text(
    data = res_single2,
    aes(y = SNP, x = b, label = gene),
    size = 2.5, hjust = 0.5, vjust = -0.5
  ) +
  xlim(-3, 3) +
  labs(subtitle = "")

### NB: if you get the error "Error in discrete_range() : could not find function "discrete_range",
### you need to install an older version of ggplot2:
### devtools::install_version("ggplot2", "3.3.5")

# single plot save
#ggsave(paste0(results_path, "SNP_to_gene_to_pathway/forest_plot_gene_names_", exp_code, ".png"),
#       plot=p_genes, scale=1.4, 
#       width=10, height=10,
#       #width=10, height=5,
#       units=c("cm"), dpi=300, limitsize=F)

# building 3-plot figure
# NOTE(review): as written all three panels receive the same `p_genes`;
# presumably each line is meant to be run after re-running the chunk with the
# matching exp_code (DA / PD / NDA) -- confirm.
da_plot <- p_genes
pd_plot <- p_genes
nda_plot <- p_genes

library(cowplot)
three_plots <- plot_grid(
  da_plot, pd_plot, nda_plot,
  labels = c("a", "b", "c"),
  label_size = 12,
  rel_heights = c(0.75, 0.50, 0.39),
  axis = "t",
  nrow = 3
)

ggsave(
  filename = paste0(results_path, "MRClust_results/forest_plot_w_genes_combinedV2.png"),
  plot = three_plots,
  height = 14, width = 7, scale = 0.8, dpi = 300
)


```


```{r}
library(enrichR)
# full list of libraries offered by Enrichr (kept for looking up names)
alldbs <- listEnrichrDbs()
# pathway / ontology libraries queried in the next chunk
pathway_dbs <- c(
  "Reactome_2016",
  "KEGG_2021_Human",
  "GO_Biological_Process_2021",
  "GO_Cellular_Component_2021",
  "GO_Molecular_Function_2021",
  "WikiPathway_2021_Human"
)
```


```{r}
# Run enrichr on every mapped gene to collect all pathways they appear in
gene_list <- unique(pull(cluster_df_map, gene))
enriched <- enrichr(gene_list, pathway_dbs)

# Flatten the per-database list into one table: databases that returned no
# rows are removed, the rest are tagged with the database name
for (db_name in names(enriched)) {
  if (dim(enriched[[db_name]])[1] == 0) {
    # empty result table -- delete it
    enriched[[db_name]] <- NULL
    next
  }
  enriched[[db_name]]$db <- db_name
}
enriched_df <- dplyr::select(bind_rows(enriched), Term, Genes, db, Overlap)

# one row per pathway-gene pair (Genes is a ";"-separated string)
enriched_df_unw <- tidyr::separate_rows(enriched_df, Genes, sep = ";", convert = TRUE)

# genes for which no pathway was returned by any database
genes_not_mapped <- setdiff(gene_list, unique(enriched_df_unw$Genes))
genes_not_mapped # check if any genes can be further mapped -- update manual annotation file


```

```{r}
# enrichr only has Reactome up to 2016, so query the latest Reactome db
# directly, one gene at a time
# (`queryReactome` is not defined in this file -- presumably in functions.R)
reactome_res <- lapply(gene_list, queryReactome)
reactome_res_df <- reactome_res %>%
  bind_rows() %>%
  mutate(db = "Reactome_2022")

# append the Reactome rows to the enrichr pathway table
enriched_df_unw <- enriched_df_unw %>%
  bind_rows(reactome_res_df)
```


```{r}
# attach SNP / cluster annotation to every pathway row, matching on gene symbol
enriched_df_unw <- enriched_df_unw %>%
  left_join(cluster_df_map, by = c("Genes" = "gene"))

```


## compare clusters

```{r}
# Compare which pathways are reached by genes of SNPs with a positive vs a
# negative cluster effect direction, and save the annotated pathway tables.

# pathway table without the gene / cluster columns (only referenced by the
# commented-out `null` set below)
enriched_df_unw_clust <- enriched_df_unw %>% dplyr::select(-Genes, -cluster) %>% distinct()

library(eulerr)
# unique, sorted pathway terms per effect direction
sX <- list(positive = enriched_df_unw %>% filter(cluster_effect_direction == "positive") %>%
             pull(Term) %>% unique() %>% sort(),
           #null = enriched_df_unw_clust %>% filter(cluster_effect_direction == "null") %>% pull(Term) %>% sort(), # only add if needed
           negative = enriched_df_unw %>% filter(cluster_effect_direction == "negative") %>%
             pull(Term) %>% unique() %>% sort())
venn <- plot(euler(sX, shape = "ellipse"), quantities = TRUE)

ggsave(paste0(results_path, "SNP_to_gene_to_pathway/pathway_venn_", exp_code, ".png"),
       plot = venn, scale = 2,
       width = 5, height = 5,
       #width=10, height=5,
       units = c("cm"), dpi = 300, limitsize = FALSE)


# pathway terms split by the direction(s) they occur in
onlyneg <- setdiff(sX$negative, sX$positive)
onlypos <- setdiff(sX$positive, sX$negative)
bothclust <- intersect(sX$positive, sX$negative)

#enriched_df_unw %>% filter(Term %in% onlyneg) %>% View()


# n: number of rows per pathway term; n_path: number of distinct genes per term
path_count <- enriched_df_unw %>% count(Term, sort = TRUE) # 
path_count2 <- enriched_df_unw %>% dplyr::select(Term, Genes) %>% distinct() %>% count(Term, sort = TRUE) %>% dplyr::rename(n_path = n)

enriched_df_unw <- enriched_df_unw %>%
  mutate(clust_status =
         case_when(Term %in% onlyneg ~ "only in negative",
                   Term %in% onlypos ~ "only in positive",
                   # label only terms that really occur in both sets; the value
                   # (including its trailing space) is kept unchanged so it
                   # matches previously written output files
                   Term %in% bothclust ~ "in both ",
                   # terms found in neither set (e.g. their genes did not join
                   # back to a cluster) are left missing rather than being
                   # reported as "in both"
                   TRUE ~ NA_character_)) %>%
  left_join(path_count, by = "Term") %>%
  left_join(path_count2, by = "Term")

write_tsv(enriched_df_unw,  paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_pathways_by_clusters.tsv")) # raw

# tidy version: fixed database order, renamed columns, then sorted by effect
# direction, cluster and gene
enriched_df_unw_tidy <- enriched_df_unw %>%
  mutate(db = factor(db, levels = c("Reactome_2022", "Reactome_2016", "KEGG_2021_Human" ,   "WikiPathway_2021_Human",
                                    "GO_Biological_Process_2021" ,"GO_Cellular_Component_2021", "GO_Molecular_Function_2021"))) %>%
  arrange(db) %>%
  dplyr::select(rsID, gene = Genes, cluster, cluster_effect_direction,
                pathway = Term, db, pathway_only_in_genes_with_this_effect = clust_status) %>%
  arrange(desc(cluster_effect_direction), cluster, desc(gene))

write_tsv(enriched_df_unw_tidy,  paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_pathways_by_clusters_tidy.tsv"))


# for manipulating out genes for slides:
#enriched_df_unw %>% dplyr::select(cluster, Genes) %>% distinct() %>% View()
#enriched_df_unw %>% dplyr::select(cluster, Genes) %>% distinct() %>% filter(cluster == "cluster_1") %>% pull(Genes) 

```

# Review 3 phenotypes
```{r}
# Tidy pathway tables for the three phenotypes.
# NOTE(review): all three reads use the current `exp_code`, so as written they
# load the same file; presumably `exp_code` is meant to be switched to the
# matching phenotype before each read -- confirm.
da_pathways <- paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_pathways_by_clusters_tidy.tsv") %>%
  read_tsv()
nda_pathways <- paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_pathways_by_clusters_tidy.tsv") %>%
  read_tsv()
pd_pathways <- paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_pathways_by_clusters_tidy.tsv") %>%
  read_tsv()

```


# OUTTAKES - not used in the analysis


```{r}
# Read the FUMA SNP-gene mapping (positional only), keeping SNPs with r2 > 0.8
if (!grepl("all", exp_code)) {
  exp_code_fuma <- paste0(exp_code, "_all")
} else {
  exp_code_fuma <- exp_code
}
# we FUMA-d all gws SNPs, but the left join below will keep only what is needed
# if we are working with a subset
snpstxt <- read_tsv(paste0(results_path, "FUMA/FUMA_", exp_code_fuma, "/snps.txt")) %>%
  filter(r2 > 0.8)

# NB: this overwrites cluster_df with the FUMA columns added
cluster_df <- cluster_df %>%
  left_join(snpstxt %>% dplyr::select(rsID, geneFUMA = nearestGene, uniqID, r2), by = "rsID")

# how many clusters each FUMA gene appears in
cluster_df_genes <- cluster_df %>% dplyr::select(cluster, geneFUMA) %>% distinct()
cluster_df_genes %>% count(geneFUMA, sort = TRUE)
```


## gProfiler - not used

```{r}
# Map the clustered SNPs to genes with gProfiler (gsnpense)
library(gprofiler2)
grof_res <- distinct(
  dplyr::select(
    gprofiler2::gsnpense(query = unique(cluster_df$rsID)),
    rsID = rs_id, gprofiler_gene = gene_names
  )
)

# just check: FUMA and gProfiler genes side by side, per cluster
cluster_df_tmp <- cluster_df %>% left_join(grof_res, by = "rsID")
cluster_df_genes2 <- cluster_df_tmp %>%
  dplyr::select(cluster, geneFUMA, gprofiler_gene) %>%
  distinct()
```


## eQTL catalogue - not used
```{r}
### Querying eQTL catalogue for variants
# (`eqtl_for_snps` is not defined in this file -- presumably in functions.R)

eqtl1 <- eqtl_for_snps(unique(cluster_df$rsID))

# unique SNP-gene pairs across all tissues / studies
eqtl1_sub <- eqtl1 %>% dplyr::select(rsid, hgnc_symbol) %>% distinct()

print(paste("mapped", length(unique(eqtl1_sub$rsid)), "SNPs out of", length(unique(cluster_df$rsID))))

# The data viewer is only opened in an interactive session, so knitting the
# document does not try to call View()
if (interactive()) {
  eqtl1_sub %>% count(rsid, sort = TRUE) %>% View()
  eqtl1 %>% count(qtl_group, sort = TRUE) %>% View()
}

tissues_keep <- c("Adipose_Subcutaneous", "Adipose_Visceral_Omentum",
                  "Breast_Mammary_Tissue") # GTEx V8 only

#eqtl1 %>% dplyr::select(qtl_group, study_id) %>% filter(qtl_group %in% tissues_keep) %>% distinct() %>%  View()


# restrict to the selected tissues, keeping rows with median_tpm > 0 and a
# non-empty gene symbol
eqtl1_sub_selected_tissues <- eqtl1 %>%
  filter(qtl_group %in% tissues_keep) %>%
  dplyr::select(rsid, eQTL_gene = hgnc_symbol, qtl_group, median_tpm, pvalue) %>%
  distinct() %>%
  filter(median_tpm > 0) %>%
  mutate(eQTL_gene = ifelse(eQTL_gene == "", NA, eQTL_gene)) %>%
  filter(!is.na(eQTL_gene))


print(paste("mapped", length(unique(eqtl1_sub$rsid)), "SNPs out of", length(unique(cluster_df$rsID)), "; in selected tissues:", length(unique(eqtl1_sub_selected_tissues$rsid)) ))
```


## joining and saving mappings -not used 

```{r}
# Combine the three SNP-to-gene mappings built in the chunks above
# (FUMA columns already in cluster_df, gProfiler in grof_res, eQTL catalogue in
# eqtl1_sub_selected_tissues) and save them in wide and long form.
cluster_df_multi <- cluster_df %>%
  left_join(grof_res, by = "rsID") %>%  # fuma +gprof
  left_join(eqtl1_sub_selected_tissues, by = c("rsID" = "rsid")) # . + eQTL

# gene columns first, everything else after
cluster_df_genes3 <- distinct(
  dplyr::select(cluster_df_multi,
                cluster, rsID, geneFUMA, gprofiler_gene, eQTL_gene, everything())
)
# save version with tissues 
write_tsv(
  cluster_df_genes3,
  paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_snps_clusters_gene_annotation_w_tissues.tsv")
)

# collapse the tissues of each SNP / gene combination into one comma-separated
# string (grouping by every column except qtl_group)
cluster_df_genes3_sub <- cluster_df_genes3 %>%
  dplyr::select(cluster, rsID, geneFUMA, gprofiler_gene, eQTL_gene, qtl_group) %>%
  distinct() %>%
  group_by_at(vars(-qtl_group)) %>%
  summarize(qtl_group = toString(qtl_group)) %>%
  ungroup()

# long format: one row per cluster / SNP / gene / source; FUMA and gProfiler
# genes are both labelled as positional mapping, and tissues are kept for eQTL
# genes only
cluster_df_genes3_subtidy <- cluster_df_genes3_sub %>%
  pivot_longer(
    cols = c("geneFUMA", "gprofiler_gene", "eQTL_gene"),
    names_to = "source",
    values_to = "gene"
  ) %>%
  mutate(source = ifelse(source != "eQTL_gene", "positional mapping", "eQTL")) %>%
  mutate(qtl_group = ifelse(source != "eQTL", NA, qtl_group)) %>%
  distinct() %>%
  filter(!is.na(gene)) %>%
  dplyr::select(cluster, rsID, gene, source, qtl_group) %>%
  arrange(cluster, rsID) %>%
   # DA & PD
  #mutate(cluster_effect_direction = case_when(cluster %in% c("cluster_1", "cluster_Null") ~ "negative",
  #                               rsID == "rs73169097" ~ "negative",
  #                               TRUE ~ "positive"))
  # NDA
  mutate(cluster_effect_direction = case_when(rsID == "rs75772194" ~ "positive",
                                 TRUE ~ "negative"))


# save tidy version
write_tsv(
  cluster_df_genes3_subtidy,
  paste0(results_path, "SNP_to_gene_to_pathway/", exp_code, "_snps_clusters_gene_annotation.tsv")
) # supl file for 
```