-
Notifications
You must be signed in to change notification settings - Fork 432
/
main.py
95 lines (74 loc) · 2.68 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json
import mlflow
import tempfile
import os
import wandb
import hydra
from omegaconf import DictConfig
_steps = [
"download",
"basic_cleaning",
"data_check",
"data_split",
"train_random_forest",
# NOTE: We do not include this in the steps so it is not run by mistake.
# You first need to promote a model export to "prod" before you can run this,
# then you need to run this step explicitly
# "test_regression_model"
]
# This automatically reads in the configuration
@hydra.main(config_name='config')
def go(config: DictConfig):
    """Run the requested pipeline steps.

    Reads the W&B project and experiment names from ``config["main"]`` and
    exports them as environment variables, then runs every step named in
    ``config["main"]["steps"]``. That setting is either the literal string
    ``"all"`` (run everything listed in ``_steps``) or a comma-separated list
    of step names; whitespace around names and empty entries are ignored.

    Only the "download" step is implemented so far; the remaining steps are
    placeholders still to be filled in.

    Args:
        config: The Hydra configuration. This function reads the keys
            ``main.project_name``, ``main.experiment_name``, ``main.steps``,
            ``main.components_repository``, ``etl.sample`` and
            ``modeling.random_forest``.
    """
    # Setup the wandb experiment. All runs will be grouped under this name
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # Steps to execute
    steps_par = config['main']['steps']
    if steps_par == "all":
        active_steps = _steps
    else:
        # Strip each name and drop empty entries so that values such as
        # "download, basic_cleaning" or a trailing comma do not cause a step
        # to be skipped silently because its name fails to match below.
        active_steps = [
            name.strip() for name in steps_par.split(",") if name.strip()
        ]

    # Create a temporary directory that exists for the duration of the run.
    # NOTE: the working directory is not changed here, and none of the steps
    # implemented so far use tmp_dir.
    with tempfile.TemporaryDirectory() as tmp_dir:

        if "download" in active_steps:
            # Download file and load in W&B
            _ = mlflow.run(
                f"{config['main']['components_repository']}/get_data",
                "main",
                parameters={
                    "sample": config["etl"]["sample"],
                    "artifact_name": "sample.csv",
                    "artifact_type": "raw_data",
                    "artifact_description": "Raw file as downloaded"
                },
            )

        if "basic_cleaning" in active_steps:
            ##################
            # Implement here #
            ##################
            pass

        if "data_check" in active_steps:
            ##################
            # Implement here #
            ##################
            pass

        if "data_split" in active_steps:
            ##################
            # Implement here #
            ##################
            pass

        if "train_random_forest" in active_steps:

            # NOTE: we need to serialize the random forest configuration into JSON
            rf_config = os.path.abspath("rf_config.json")
            with open(rf_config, "w+") as fp:
                json.dump(dict(config["modeling"]["random_forest"].items()), fp)  # DO NOT TOUCH

            # NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
            # step

            ##################
            # Implement here #
            ##################
            pass

        if "test_regression_model" in active_steps:
            ##################
            # Implement here #
            ##################
            pass
# Script entry point: only start the pipeline when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    go()