-
Notifications
You must be signed in to change notification settings - Fork 2
/
engineeringclassification.py
126 lines (115 loc) · 4.43 KB
/
engineeringclassification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""Conduct basic classification independent of voting."""
import pandas as pd
from sklearn.ensemble import BaggingClassifier as BC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.neighbors.nearest_centroid import NearestCentroid as NC
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import ExtraTreeClassifier as ExTC1
from sklearn.ensemble import ExtraTreesClassifier as ExTC2
from sklearn.neural_network import MLPClassifier as NNC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split as tts
import pickle as pk
from os import listdir
import matplotlib.pyplot as plt
from copy import copy
import numpy as np
def _regret_table(scores):
    """Return, for every row of ``scores``, the row maximum minus each entry."""
    return -scores.sub(scores.max(axis=1), axis=0)


def _mean_regret(regret, predicted_labels):
    """Average the regret of the predicted column over all rows of ``regret``."""
    column_positions = regret.columns.get_indexer(predicted_labels)
    if (column_positions < 0).any():
        # get_indexer marks unknown labels with -1, which would otherwise
        # silently select the last column.
        raise KeyError("a predicted label is not a column of the score table")
    row_positions = np.arange(len(predicted_labels))
    return regret.to_numpy()[row_positions, column_positions].mean()


def train_classifiers(numruns=50):
    """
    Train every classifier type repeatedly and pickle the best run of each.

    Reads ``./results/featuresR2/bigdatafortraining.csv``. Columns 1-13 are
    used as inputs and columns 14 onwards as per-class scores; the training
    target of a row is the name of its highest-scoring column. The original
    docstring states that this file holds DTLZ, WFG and ZDT data.

    The data is split once into train and test parts. Each classifier is
    then constructed with default arguments and fitted ``numruns`` times on
    the same split. The cost of a run is the mean, over rows, of
    (row maximum score - score of the predicted column).

    For each classifier the run with the lowest test cost is dumped to
    ``./results/bestmodels/<name>best.p`` as a dict with the keys ``model``,
    ``cost_train``, ``cost_test`` and ``name``. The per-run costs are
    written to ``./results/classification/traincost_on_bench.csv`` and
    ``./results/classification/testcost_on_bench.csv``.

    Args:
        numruns: Number of fits performed per classifier type.
    """
    import os  # local import: only ``listdir`` is imported at file level

    classifiers = {
        "BC": BC,
        "SVC": SVC,
        "KNC": KNC,
        "NC": NC,
        "GPC": GPC,
        "DTC": DTC,
        "NNC": NNC,
        "ExTC1": ExTC1,
        "ExTC2": ExTC2,
    }
    rootfolder = "./results/featuresR2/"
    outputfolder = "./results/bestmodels/"
    costfolder = "./results/classification/"
    # Create the output folders up front so a missing directory cannot
    # discard the results after all the training has already been done.
    for folder in (outputfolder, costfolder):
        os.makedirs(folder, exist_ok=True)

    data = pd.read_csv(rootfolder + "bigdatafortraining.csv")
    inputs = data[data.keys()[1:14]]
    outputs = data[data.keys()[14:]]
    inputs_train, inputs_test, outputs_train, outputs_test = tts(inputs, outputs)
    target_train = outputs_train.idxmax(axis=1)

    # The split is fixed, so the regret tables are loop invariants.
    regret_train = _regret_table(outputs_train)
    regret_test = _regret_table(outputs_test)

    names = list(classifiers)
    temp_t = pd.DataFrame(np.zeros((numruns, len(names))), columns=names)
    temp_p = pd.DataFrame(np.zeros((numruns, len(names))), columns=names)

    for key, value in classifiers.items():
        # Start from +inf so the first run is always recorded; a fixed
        # threshold of 1 could leave "model" as an empty list.
        model = {
            "model": [],
            "cost_train": float("inf"),
            "cost_test": float("inf"),
            "name": key,
        }
        print(key, "\n")
        for i in range(numruns):
            clf = value()
            clf.fit(inputs_train, target_train)
            cost_train = _mean_regret(regret_train, clf.predict(inputs_train))
            cost_test = _mean_regret(regret_test, clf.predict(inputs_test))
            # .at writes into the frame itself; chained ``frame[key][i] = x``
            # can end up assigning to a temporary copy.
            temp_t.at[i, key] = cost_train
            temp_p.at[i, key] = cost_test
            if cost_test < model["cost_test"]:
                model["model"] = clf
                model["cost_test"] = cost_test
                model["cost_train"] = cost_train
        with open(outputfolder + key + "best.p", "wb") as handle:
            pk.dump(model, handle)

    temp_p.to_csv(costfolder + "testcost_on_bench.csv", index=False)
    temp_t.to_csv(costfolder + "traincost_on_bench.csv", index=False)
def test_classifiers():
    """
    Apply every saved best model to the engineering data set.

    Loads each ``*best.p`` pickle found in ``./results/bestmodels/`` and
    predicts a column label for every row of
    ``./results/featuresR2/engineering.csv``, using columns 1-13 as
    features. For each model it writes
    ``./results/predictionsonengineering/<name>prediction.csv``, which
    contains the score columns (14 onwards), a ``Predicted`` column holding
    the score of the predicted label in each row, and the ``files`` column
    of the input data.

    Raises:
        KeyError: If a model predicts a label that is not a score column.
    """
    import os  # local import: only ``listdir`` is imported at file level

    # Kept from the original; the plotting code that used it was commented out.
    plt.ion()
    rootfolder = "./results/featuresR2/"
    data = pd.read_csv(rootfolder + "engineering.csv")
    root = "./results/bestmodels/"
    outputfolder = "./results/predictionsonengineering/"
    os.makedirs(outputfolder, exist_ok=True)

    suffix = "best.p"
    models = {}
    for filename in listdir(root):
        if not filename.endswith(suffix):
            # Skip anything that is not a model file (e.g. placeholders).
            continue
        # NOTE(security): unpickling executes arbitrary code; only load
        # model files produced by a trusted run of train_classifiers().
        with open(root + filename, "rb") as handle:
            models[filename[: -len(suffix)]] = pk.load(handle)

    data_features = data[data.keys()[1:14]]
    R2all = data[data.keys()[14:]]
    for key, value in models.items():
        model = value["model"]
        algo_predicted = model.predict(data_features)
        column_positions = R2all.columns.get_indexer(algo_predicted)
        if (column_positions < 0).any():
            # get_indexer marks unknown labels with -1, which would
            # otherwise silently select the last column.
            raise KeyError("a predicted label is not a column of the score table")
        R2predicted = [
            R2all.iat[row, column] for row, column in enumerate(column_positions)
        ]
        newR2 = R2all.copy()
        newR2["Predicted"] = R2predicted
        newR2["files"] = data["files"]
        newR2.to_csv(outputfolder + key + "prediction.csv", index=False)
# Script entry point: running this file directly trains the classifiers;
# importing it as a module does nothing.
if __name__ == "__main__":
    train_classifiers()