-
Notifications
You must be signed in to change notification settings - Fork 0
/
2.1.1_prepare_pregnancy_for_ldak.py
executable file
·66 lines (50 loc) · 2.1 KB
/
2.1.1_prepare_pregnancy_for_ldak.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
# coding: utf-8
# In[151]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
VARIANTS = "/home/achangalidi/ukb_finn_extra_data/variants.tsv"
DATA_PATH_N = "/media/MIRROR/ukb_finngen/meta_analysis/file_mapping.csv"
DATA_PATH_N_PREG = "/media/MIRROR/ukb_finngen/meta_analysis/file_mapping_preg.csv"
TO_RENAME = {'Predictor':'Predictor', 'Allele1':'A1','Allele2':'A2','n':'n', 'Zscore':'Z'}
PREG_DIR = "/media/MIRROR/ukb_finngen/meta_analysis/PREG_DATA/"
CUR_DIR = "/media/MIRROR/ukb_finngen/meta_analysis/"
SCRIPT = '''/home/achangalidi/tools/ldak5.2.linux --sum-cors gencor \\
--summary {} \\
--summary2 {} \\
--tagfile /home/achangalidi/tools/ldak.thin.hapmap.gbr.tagging \\
--allow-ambiguous YES \\
--check-sums NO \\
> {}'''
# In[163]:
variants = pd.read_csv(VARIANTS, sep='\t')
path_n = pd.read_csv(DATA_PATH_N, header=None)
path_n_preg = pd.read_csv(DATA_PATH_N_PREG, header=None)
variants.head()
## for every
def return_ldak_df(FILE):
pair = FILE.split('/')[-1].replace('1.TBL.gz', '').replace('1.TBL', '')
if len(path_n[path_n.iloc[:,0]==pair][3]) > 0:
N = int(path_n[path_n.iloc[:,0]==pair][3])
else:
N = int(path_n_preg[path_n_preg.iloc[:,0]==pair][3])
data = pd.read_csv(FILE, sep='\t')
data_m = pd.merge(data, variants, left_on='MarkerName', right_on='rsid')
data_m['Predictor'] = data_m.chr.astype(str)+':'+data_m.pos.astype(str)
data_m['n'] = data_m['n_called']+N
f_data = data_m.rename(columns = TO_RENAME)[list(TO_RENAME.values())]
f_data = f_data[(f_data.A1.apply(lambda x: len(str(x))<2)) & (f_data.A2.apply(lambda x: len(str(x))<2))]
f_data.A1 = f_data.A1.apply(lambda x: x.upper())
f_data.A2 = f_data.A2.apply(lambda x: x.upper())
return f_data
FILES_PREG = [f for f in os.listdir(path=PREG_DIR) if f[-4:]=='.TBL']
preg_names = []
for f in FILES_PREG:
preg_data = return_ldak_df(os.path.join(PREG_DIR, f))
name = f.replace('.TBL', '.tsv')
out_file = os.path.join(PREG_DIR, name)
print(out_file )
preg_data.to_csv(out_file , index=False, sep='\t')
preg_names.append(out_file )