-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_excel_to_txt.py
106 lines (91 loc) · 3.41 KB
/
convert_excel_to_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/python3
# -*- coding:utf-8 -*-
import pathlib
import pandas
class FileConverter:
def __init__(self):
""" """
self.excel_files = []
self.acc_dataframe = None
self.default_columns = [
"field",
"filter",
"public_value",
"code",
"swedish",
"english",
"synonyms",
"ices_biology",
"ices_physical_and_chemical",
"bodc_nerc",
"darwincore",
"comments",
"source",
]
self.new_column_names = {
"filter": "filter",
"value": "public_value",
"Code": "code",
"Beskrivning/Svensk översättning": "swedish",
"Description/English translate": "english",
"Synonym": "synonyms",
"ICES biology": "ices_biology",
"ICES_biology": "ices_biology",
"ICES physchem": "ices_physchem",
"ICES_physical_and_chemical": "ices_physical_and_chemical",
}
def scan_source(self, source_dir="data_in"):
""" """
source_path = pathlib.Path(source_dir)
self.excel_files = list(source_path.glob("translate_*.xlsx"))
def convert_and_save(self, target_dir="data_out"):
""" """
self.acc_dataframe = pandas.DataFrame(columns=self.default_columns)
for excel_file in sorted(self.excel_files):
source_path = pathlib.Path(excel_file)
print("FILE: ", source_path.name)
df = pandas.read_excel(source_path)
df = df.rename(columns=self.new_column_names)
df["source"] = source_path.stem
public_value_list = df["public_value"]
code_list = df["code"]
swedish_list = df["swedish"]
english_list = df["english"]
public_values_new = []
for index, public_value in enumerate(public_value_list):
if str(public_value) in ["<use_code>", "", "nan"]:
public_values_new.append(str(code_list[index]))
elif str(public_value) == "<use_swedish>":
public_values_new.append(str(swedish_list[index]))
elif str(public_value) == "<use_english>":
public_values_new.append(str(english_list[index]))
elif str(public_value) == "<use_blank>":
public_values_new.append("")
else:
public_values_new.append(str(public_value_list[index]))
df["public_value"] = public_values_new
target_dir_path = pathlib.Path(target_dir)
target_file_path = pathlib.Path(target_dir_path, source_path.stem + ".txt")
df.to_csv(
target_file_path,
sep="\t",
index=False,
encoding="cp1252",
errors="replace",
)
# Add to concatenated list.
self.acc_dataframe = pandas.concat([self.acc_dataframe, df])
# Save concatenated list.
target_file_path = pathlib.Path(target_dir, "translate_codes_NEW.txt")
self.acc_dataframe.to_csv(
target_file_path,
sep="\t",
index=False,
encoding="cp1252",
errors="replace",
)
if __name__ == "__main__":
""" """
conv = FileConverter()
conv.scan_source()
conv.convert_and_save()