-
Notifications
You must be signed in to change notification settings - Fork 0
/
table-extractor.py
121 lines (90 loc) · 3.97 KB
/
table-extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
from tabula import read_pdf
import pandas as pd
import numpy as np
import re
import auxiliar_functions as af
import csv_zipper as cz
# 0. Setting Pandas printing options
pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 140
# 1. Reading desired tables from .pdf and saving them as Pandas Dataframes
pdf_path = os.path.join(os.path.abspath("src"), "Componente Organizacional.pdf")
pages = [79, 80, 81, 82, 83, 84, 85]
df = read_pdf(pdf_path, pages = pages)
# 2. Organizing Quadro 30
## Creating MultiIndex
header = [df[0].columns.values[0], df[0].columns.values[0]]
subheader = ["Código", "Descrição da Categoria"]
Q30_header = pd.MultiIndex.from_tuples(list(zip(header, subheader)))
## Dividing column with one string into column with list of strings
df[0] = af.divide_string_column_into_list_column(df[0], 5)
## Dividing column with list of two strings into two columns
df[0] = pd.DataFrame(df[0]["Tabela de Tipo do Demandante"].to_list(), columns = subheader)
## Creating a new empty Dataframe to fill it with df[0] Dataframe
Q30 = pd.DataFrame(np.empty((5, 2), dtype = str), columns = Q30_header)
for i in range(5):
for j in range(2):
Q30.iloc[i][j] = df[0].iloc[i + 1][j]
print(Q30, "\n")
# 3. Saving Quadro 30 as a .csv file
Q30_filename = "quadro30"
Q30.to_csv(path_or_buf = Q30_filename + ".csv", encoding='utf-8-sig')
# 4. Organizing Quadro 31
## Creating MultiIndex
header = [df[1].columns.values[1], df[1].columns.values[1]]
Q31_header = pd.MultiIndex.from_tuples(list(zip(header, subheader)))
## Creating auxiliar matrix for df[2]
df[2] = af.drag_header_to_first_line(df[2], 27, 2, Q31_header)
## Creating auxiliar matrix for df[3]
df[3] = af.drag_header_to_first_line(df[3], 27, 2, Q31_header)
## Creating auxiliar matrix for df[4]
df[4] = af.drag_header_to_first_line(df[4], 26, 2, Q31_header)
## Creating auxiliar matrix for df[5]
df[5] = af.drag_header_to_first_line(df[5], 25, 2, Q31_header)
## Creating auxiliar matrix for df[6]
df[6] = af.drag_header_to_first_line(df[6], 23, 2, Q31_header)
## Creating a new empty Dataframe and filling it with df[1:6] Dataframes
Q31 = pd.DataFrame(np.empty((131, 2), dtype = str), columns = Q31_header)
for i in range(131):
for j in range(2):
if (i <= 2):
Q31.iloc[i][j] = df[1].iloc[i + 1][j]
elif (3 <= i) and (i <= 29):
Q31.iloc[i][j] = df[2].iloc[i - 3][j]
elif (30 <= i) and (i <= 56):
Q31.iloc[i][j] = df[3].iloc[i - 30][j]
elif (57 <= i) and (i <= 82):
Q31.iloc[i][j] = df[4].iloc[i - 57][j]
elif (83 <= i) and (i <= 107):
Q31.iloc[i][j] = df[5].iloc[i - 83][j]
elif (108 <= i) and (i <= 130):
Q31.iloc[i][j] = df[6].iloc[i - 108][j]
## Replacing '\r' chars
for i in range(131):
if ("\r" in Q31.iloc[i][1]):
Q31.iloc[i][1] = Q31.iloc[i][1].replace("\r", " ")
print(Q31, "\n")
# 5. Saving Quadro 31 as a .csv file
Q31_filename = "quadro31"
Q31.to_csv(path_or_buf = Q31_filename + ".csv", encoding='utf-8-sig')
# 6. Organizing Quadro 32
## Creating MultiIndex
header = [df[7].columns.values[0], df[7].columns.values[0]]
subheader = ["Código", "Descrição da Categoria"]
Q32_header = pd.MultiIndex.from_tuples(list(zip(header, subheader)))
## Dividing column with one string into column with list of strings
df[7] = af.divide_string_column_into_list_column(df[7], 3)
## Dividing column with list of two strings into two columns
df[7] = pd.DataFrame(df[7]["Tabela de Tipo de Solicitação"].to_list(), columns = subheader)
## Creating a new empty Dataframe to fill it with df[0] Dataframe
Q32 = pd.DataFrame(np.empty((3, 2), dtype = str), columns = Q32_header)
for i in range(3):
for j in range(2):
Q32.iloc[i][j] = df[7].iloc[i + 1][j]
print(Q32, "\n")
# 7. Saving Quadro 32 as a .csv file
Q32_filename = "quadro32"
Q32.to_csv(path_or_buf = Q32_filename + ".csv", encoding='utf-8-sig')
# 8. Zipping all .csv files
cz.zip_this_folder(path = os.path.abspath(os.getcwd()))