-
Notifications
You must be signed in to change notification settings - Fork 17
/
run.py
197 lines (160 loc) · 9.08 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import streamlit as st
import sys
sys.path.insert(0, 'lib/')
#sys.tracebacklimit = 0 # Hide traceback on errors
from decompose_series import decompose_series
from file_selector import file_selector
from find_acf_pacf import find_acf_pacf
from generate_code import generate_code
from grid_search_arima import grid_search_arima
from mean_abs_pct_error import mean_abs_pct_error
from plot_forecast import plot_forecasts
from predict_set import predict_set
from sidebar_menus import sidebar_menus
from test_stationary import test_stationary
from train_ts_model import train_ts_model
from transform_time_series import transform_time_series
# Render floats with three fixed decimals so pandas does not fall back to
# scientific notation when displaying floating-point fields.
pd.set_option('display.float_format', '{:.3f}'.format)

# Introductory Markdown text shown to the user at the top of the page.
description = '''
**Arauto** is an open-source project that will help you to forecast the future from historical data.
It uses statistical models to give you accurate predictions for time series data, which is helpful for
financial data, network traffic, sales, and much more.
'''
# ---- Page header: banner, tagline and project description ----
st.image('img/banner.png')
st.write('*An equivalent exchange: you give me data, I give you answers*')
st.write(description)

# ---- Sidebar: dataset selection ----
st.sidebar.title('Your data')
filename, df = file_selector()

# Preview the first ten rows of the loaded DataFrame
st.markdown('## **First lines of your data**')
st.dataframe(df.head(10))

# Date column, target column, frequency, test-set size and exogenous columns
ds_column, y, data_frequency, test_set_size, exog_variables = sidebar_menus('feature_target', df=df)

# Keep the selected column names around; `exog_variables` itself is rebound
# to the matching columns of `df`, or to None when nothing was selected.
exog_variables_names = exog_variables
if len(exog_variables) > 0:
    exog_variables = df[exog_variables]
else:
    exog_variables = None
# ---- Sidebar: which charts should be rendered ----
plot_menu_title = st.sidebar.markdown('### Charts')
plot_menu_text = st.sidebar.text('Select which charts you want to see')
show_absolute_plot = sidebar_menus('absolute')
show_seasonal_decompose = sidebar_menus('seasonal')
show_adfuller_test = sidebar_menus('adfuller')
show_train_prediction = sidebar_menus('train_predictions')
show_test_prediction = sidebar_menus('test_predictions')

# The user may force a specific transformation technique
force_transformation = sidebar_menus('force_transformations')

difference_size = None
seasonal_difference_size = None
# Later code compares `force_transformation` against None, so None is a
# possible value here; guard the membership test, which would otherwise raise
# TypeError ("argument of type 'NoneType' is not iterable").
if force_transformation is not None and 'Custom Difference' in force_transformation:
    # A custom transformation was selected: expose the difference options
    difference_size = st.sidebar.slider('Difference size: ', 0, 30, 1)
    seasonal_difference_size = st.sidebar.slider('Seasonal Difference size: ', 0, 30, 1)

# Plain boolean flag handed to the stationarity test further down
plot_adfuller_result = bool(show_adfuller_test)
# Prepare the data as a time series through the project helper; the result is
# still indexed by the target column name `y` below.
df = transform_time_series(df, ds_column, data_frequency, y)

# Historical plot of the target column, if requested
if show_absolute_plot:
    st.markdown('# Historical data ')
    # Draw on an explicit figure instead of the implicit global pyplot figure:
    # Streamlit deprecates calling st.pyplot() without a figure because the
    # global figure is not thread-safe.
    historical_fig, historical_ax = plt.subplots()
    df[y].plot(color='green', ax=historical_ax)
    historical_ax.set_title('Absolute historical data')
    st.pyplot(historical_fig)
    # The script re-runs on every interaction; close the figure so that
    # figures do not pile up in memory across reruns.
    plt.close(historical_fig)

# Seasonal decomposition plot, if requested
if show_seasonal_decompose:
    st.markdown('# Seasonal decomposition')
    decompose_series(df)
# ---- Stationarity check ----
st.title('Checking stationarity')

# When the user did not force a transformation the default pipeline is used
# (no extra arguments). Otherwise the forced technique and the custom
# difference sizes are forwarded. `is None` is the correct identity test;
# `== None` can be hijacked by objects overriding equality.
forced_transformation_kwargs = {}
if force_transformation is not None:
    forced_transformation_kwargs = {
        'force_transformation_technique': force_transformation,
        'custom_transformation_size': (difference_size, seasonal_difference_size),
    }

(ts, d, D, seasonality, acf_pacf_data,
 transformation_function, test_stationarity_code) = test_stationary(
    df[y], plot_adfuller_result, data_frequency, **forced_transformation_kwargs)

# ---- ACF / PACF based suggestion for the model terms ----
st.title('ACF and PACF estimation')
p, q, P, Q = find_acf_pacf(acf_pacf_data, seasonality)
st.markdown('**Suggested parameters for your model**: {}x{}{}'.format((p, d, q), (P, D, Q), seasonality))

st.title('Time to train!')
st.write('Select the terms on the side bar and click "Do your Magic!" button')
# Let the user adjust the model terms in the sidebar, seeded with the
# suggested values; a ValueError from the menu is re-raised with a hint about
# the FREQUENCY field.
suggested_terms = (p, d, q, P, D, Q, seasonality)
try:
    (p, d, q, P, D, Q, s,
     train_model, periods_to_forecast, execute_grid_search) = sidebar_menus(
        'terms', test_set_size, seasonality, suggested_terms, df=ts)
except ValueError:
    error_message = '''
A problem has occurred while we tried to find the best initial parameters for p, d, and q.
Please, check if your FREQUENCY field is correct for your dataset. For example, if your dataset
was collected in a daily basis, check if you selected DAILY in the FREQUENCY field.
'''
    raise ValueError(error_message)

# Warn in the sidebar when the requested Grid Search is likely to be costly:
# fine-grained data frequency or large AR/MA terms. The frequency/term checks
# are only evaluated when Grid Search was requested.
if execute_grid_search and (data_frequency in ('Hourly', 'Daily') or p >= 5 or q >= 5):
    warning_grid_search = '''
Apply Grid Search on this dataset with these settings might be computationally expensive.
Be sure you have enough memory for this operation, otherwise, it will fail
'''
    st.sidebar.warning(warning_grid_search)
# Everything below runs only once the train button has been clicked.
# NOTE(review): the indentation of this file was lost; the nesting here is
# reconstructed from data dependencies (train_set / exog_train only exist in
# this branch). Confirm that the out-of-sample forecast and the generated-code
# sections are meant to live inside this branch as well.
if train_model:
    # Exogenous data is a DataFrame only when the user selected such columns;
    # otherwise it is None. isinstance() also accepts DataFrame subclasses.
    has_exog = isinstance(exog_variables, pd.DataFrame)

    exog_train = None
    exog_test = None
    if has_exog:
        # Align the exogenous index with the endogenous series, then split
        exog_variables.index = ts.index
        exog_train = exog_variables.iloc[:-test_set_size]
        exog_test = exog_variables.iloc[-test_set_size:]

    # The last `test_set_size` observations are held out as the test set.
    # NOTE(review): a test_set_size of 0 would make `[:-0]` an empty slice;
    # presumably the sidebar enforces a positive size - confirm.
    train_set = transformation_function(ts.iloc[:-test_set_size])
    test_set = transformation_function(ts.iloc[-test_set_size:])

    try:
        model = train_ts_model(train_set, p, d, q, P, D, Q, s, exog_variables=exog_train, quiet=False)
    except ValueError as ve:
        # Guard against a ValueError raised without arguments, which would
        # otherwise turn into an unrelated IndexError here.
        if ve.args and ve.args[0] == 'maxlag should be < nobs':
            raise ValueError('Seems that you don\'t have enough data. Try to use smaller terms for AR and MA (p, q, P, Q)') from ve
        # Any other ValueError propagates unchanged, original traceback intact
        raise

    # Data transformed with log1p is mapped back with expm1; in every other
    # case the transformation function itself is handed to predict_set.
    log_transformed = transformation_function == np.log1p
    inverse_function = np.expm1 if log_transformed else transformation_function

    st.markdown('## **Train set prediction**')
    st.write('The model was trained with this data. It\'s trying to predict the same data')
    # Only the last 24 observations of the train set are passed on
    predict_set(train_set.iloc[-24:], y, seasonality, inverse_function, model,
                show_train_prediction=show_train_prediction,
                show_test_prediction=show_test_prediction)

    st.markdown('## **Test set forecast**')
    st.write('Unseen data. The model was not trained with this data and it\'s trying to forecast')
    predict_set(test_set, y, seasonality, inverse_function, model,
                exog_variables=exog_test, forecast=True,
                show_train_prediction=show_train_prediction,
                show_test_prediction=show_test_prediction)

    # Optional Grid Search: replaces the model terms with the ones it returns
    if execute_grid_search:
        st.markdown('# Executing Grid Search')
        st.markdown('''
We\'re going to find the best parameters for your model. This might take some minutes.
Now it's a good time to grab some coffee.
''')
        p, d, q, P, D, Q, s = grid_search_arima(train_set, exog_train, range(p+2), range(q+2), range(P+2), range(Q+2), d=d, D=D, s=s)

    # ---- Out-of-sample forecast ----
    st.markdown('# Out-of-sample Forecast')

    # Final model is trained on the entire (transformed) series
    with st.spinner('Training model with entire dataset. Please wait.'):
        final_model = train_ts_model(transformation_function(ts), p, d, q, P, D, Q, s, exog_variables=exog_variables, quiet=True)
    st.success('Done!')

    if has_exog:
        st.write('You are using exogenous variables. We can\'t forecast the future since we don\'t have the exogenous variables for future periods. Adapt the code below to use them.' )
    else:
        forecasts = final_model.forecast(periods_to_forecast)
        confidence_interval = final_model.get_forecast(periods_to_forecast).conf_int()
        if log_transformed:
            # Bring forecasts and interval bounds back to the original scale
            forecasts = np.expm1(forecasts)
            confidence_interval = np.expm1(confidence_interval)
        confidence_interval.columns = ['ci_lower', 'ci_upper']
        plot_forecasts(forecasts, confidence_interval, data_frequency)

    # ---- Generated code for the user ----
    st.write('# Here\'s your code')
    st.markdown(generate_code(filename, ds_column, y, test_stationarity_code, test_set_size,
                              seasonality, p, d, q, P, D, Q, s, exog_variables_names, transformation_function,
                              periods_to_forecast, data_frequency))