import json
from typing import Tuple

import numpy as np
import modin.pandas as pd


def get_xgboost_x_y(
    indices: list,
    data: np.ndarray,
    target_sequence_length: int,
    input_seq_len: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Slice a univariate series into (input, target) pairs for XGBoost.

    Args:
        indices: List of (start_idx, end_idx) positions at which data
                 should be sliced
        data: A univariate time series
        target_sequence_length: The forecasting horizon, m
        input_seq_len: The length of the model input, n

    Returns:
        all_x: np.ndarray of shape (number of instances, input_seq_len)
        all_y: np.ndarray of shape (number of instances, target_sequence_length)
    """
    all_x = []
    all_y = []

    # Loop over the list of (start, end) index pairs
    for idx in indices:
        # Slice the data into an instance of length input_seq_len + target_sequence_length
        data_instance = data[idx[0]:idx[1]]

        x = data_instance[0:input_seq_len]
        assert len(x) == input_seq_len

        y = data_instance[input_seq_len:input_seq_len + target_sequence_length]
        assert len(y) == target_sequence_length

        all_x.append(x.reshape(1, -1))
        all_y.append(y.reshape(1, -1))

    # Stack the per-instance rows into 2-D arrays
    return np.concatenate(all_x, axis=0), np.concatenate(all_y, axis=0)
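
# Example (illustrative sketch on a toy series; the values are made up):
#
#     series = np.arange(10)
#     pairs = [(0, 5), (1, 6)]  # each window holds 3 input + 2 target steps
#     x, y = get_xgboost_x_y(pairs, series, target_sequence_length=2, input_seq_len=3)
#     # x -> [[0 1 2], [1 2 3]], shape (2, 3)
#     # y -> [[3 4], [4 5]],     shape (2, 2)
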
def get_indices_entire_sequence(
    data: pd.DataFrame,
    window_size: int,
    step_size: int
) -> list:
    """
    Produce all the start and end index positions that are needed to produce
    the sub-sequences.

    Returns a list of tuples. Each tuple is (start_idx, end_idx) of a
    sub-sequence. These tuples should be used to slice the dataset into
    sub-sequences, which should then be passed to a function that splits
    them into input and target sequences.

    Args:
        data (pd.DataFrame): Partitioned data set, e.g. training data
        window_size (int): The desired length of each sub-sequence. Should be
                           (input_sequence_length + target_sequence_length),
                           e.g. if you want the model to consider the past 100
                           time steps in order to predict the future 50
                           time steps, window_size = 100 + 50 = 150
        step_size (int): Size of each step as the data sequence is traversed
                         by the moving window.
                         If 1, the first sub-sequence will be [0:window_size],
                         and the next will be [1:window_size + 1].

    Returns:
        indices: a list of (start_idx, end_idx) tuples
    """
    # The end index of a Python slice is exclusive, so the last valid
    # window ends exactly at len(data); using len(data) - 1 would silently
    # drop the final full window
    stop_position = len(data)

    # Start the first sub-sequence at index position 0
    subseq_first_idx = 0
    subseq_last_idx = window_size

    indices = []
    while subseq_last_idx <= stop_position:
        indices.append((subseq_first_idx, subseq_last_idx))
        subseq_first_idx += step_size
        subseq_last_idx += step_size

    return indices
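
# Example (illustrative; only len(data) is used, so a plain list stands in
# for the DataFrame here):
#
#     get_indices_entire_sequence(data=list(range(10)), window_size=5, step_size=2)
#     # -> [(0, 5), (2, 7), (4, 9)]
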
def prepare_data_for_xgb(
    y_train_data_,
    y_test_data_,
    in_length: int,
    step_size: int,
    target_sequence_length: int
):
    """Turn train/test series into (x, y) supervised-learning arrays for XGBoost."""
    training_indices = get_indices_entire_sequence(
        data=y_train_data_,
        window_size=in_length + target_sequence_length,
        step_size=step_size
    )

    # Obtain (X, Y) pairs of training data
    x_train, y_train = get_xgboost_x_y(
        indices=training_indices,
        data=y_train_data_.to_numpy(),
        target_sequence_length=target_sequence_length,
        input_seq_len=in_length
    )

    test_indices = get_indices_entire_sequence(
        data=y_test_data_,
        window_size=in_length + target_sequence_length,
        step_size=step_size
    )

    # Obtain (X, Y) pairs of test data
    x_test, y_test = get_xgboost_x_y(
        indices=test_indices,
        data=y_test_data_.to_numpy(),
        target_sequence_length=target_sequence_length,
        input_seq_len=in_length
    )

    return x_train, y_train, x_test, y_test
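
# Example (illustrative sketch; the sine series and lengths are made up,
# and the shape counts assume stop_position = len(data) as above):
#
#     y_train = pd.Series(np.sin(np.linspace(0, 10, 200)))
#     y_test = pd.Series(np.sin(np.linspace(10, 12, 60)))
#     x_tr, y_tr, x_te, y_te = prepare_data_for_xgb(
#         y_train, y_test, in_length=24, step_size=1, target_sequence_length=6
#     )
#     # x_tr.shape == (171, 24), y_tr.shape == (171, 6)
#     # x_te.shape == (31, 24),  y_te.shape == (31, 6)
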
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python types."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
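
# Example (illustrative): NumPy-typed values serialize cleanly with the encoder.
#
#     json.dumps({"n_estimators": np.int64(300), "weights": np.array([0.2, 0.8])},
#                cls=NpEncoder)
#     # -> '{"n_estimators": 300, "weights": [0.2, 0.8]}'
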
def save_to_json(filename, data, mode='w'):
    # Assumes the ./best_params directory already exists
    with open(f"./best_params/{filename}.json", mode) as f:
        json.dump(data, f, cls=NpEncoder)


def read_from_json(filename, mode="r"):
    with open(f"./best_params/{filename}.json", mode) as f:
        # Load the JSON data into a Python dictionary
        data = json.load(f)
    return data
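
# Minimal end-to-end sketch (a hypothetical sanity check on a synthetic
# series; run the module directly to exercise the window/slice plumbing).
# File I/O is skipped since save_to_json() expects ./best_params/ to exist.
if __name__ == "__main__":
    series = pd.Series(np.arange(100, dtype=float))

    x_train, y_train, x_test, y_test = prepare_data_for_xgb(
        y_train_data_=series[:80],
        y_test_data_=series[80:],
        in_length=10,
        step_size=1,
        target_sequence_length=5,
    )
    print("train:", x_train.shape, y_train.shape)  # (66, 10) (66, 5)
    print("test:", x_test.shape, y_test.shape)     # (6, 10) (6, 5)

    # Round-trip a NumPy-typed dict through the custom encoder
    blob = json.dumps({"best_iter": np.int64(42)}, cls=NpEncoder)
    print(json.loads(blob))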