-
Notifications
You must be signed in to change notification settings - Fork 0
/
E2_clf_real.py
82 lines (64 loc) · 2.41 KB
/
E2_clf_real.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
E2 - classification for real-world streams
"""
import numpy as np
from sklearn import clone
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from tqdm import tqdm
from sklearn.metrics import balanced_accuracy_score
# Seed NumPy's global RNG so the row permutations drawn later in this script
# are reproducible between runs.
np.random.seed(1233)

# Names of the measure groups. Each name is substituted into the results file
# name 'results/real_<stream index>_<measure>.npy' when the data is loaded.
measures = [
    "clustering",
    "complexity",
    "concept",
    "general",
    "info-theory",
    "itemset",
    "landmarking",
    "model-based",
    "statistical",
]

# Real-world streams. The visible code only uses the number of entries and
# their positions (as the stream index); the paths themselves are not opened
# here.
real_streams_full = [
    'real_streams/covtypeNorm-1-2vsAll-pruned.arff',
    'real_streams/electricity.npy',
    'real_streams/poker-lsn-1-2vsAll-pruned.arff',
    'real_streams/INSECTS-abrupt_imbalanced_norm.arff',
    'real_streams/INSECTS-gradual_imbalanced_norm.arff',
    'real_streams/INSECTS-incremental_imbalanced_norm.arff',
]

# Classifier templates; they are cloned before every fit, so these instances
# themselves are never trained.
base_clfs = [
    GaussianNB(),
    KNeighborsClassifier(),
    SVC(random_state=11313),
    DecisionTreeClassifier(random_state=11313),
    MLPClassifier(random_state=11313),
]

# Cross-validation protocol: n_splits folds, repeated n_repeats times.
n_repeats = 5
n_splits = 2

# Number of streams (the misspelled name is kept because other code uses it).
origial_datasets = len(real_streams_full)

# Score tensor indexed as [measure, stream, fold, classifier].
clf_res = np.zeros(
    (len(measures), origial_datasets, n_splits * n_repeats, len(base_clfs))
)

# One progress tick per stored score, i.e. one per cell of clf_res.
pbar = tqdm(total=clf_res.size)
# For every (stream, measure group) pair: load the stored results, run
# repeated stratified cross-validation with each base classifier and record
# the balanced accuracy of every fold in clf_res.
# NOTE(review): the nesting below was reconstructed from a copy of the script
# whose indentation had been lost -- confirm that the summary print belongs at
# the per-measure level and the save at the very end.
for f_id in range(len(real_streams_full)):
    for m_id, m in enumerate(measures):
        res = np.load('results/real_%s_%s.npy' % (f_id, m))

        # Shuffle the rows with the globally seeded NumPy RNG.
        p = np.random.permutation(res.shape[0])
        res = res[p]

        # Last column is the target, everything before it is the feature set
        # (per the original author's note: rows are chunks, columns are
        # measures + label).
        X = res[:, :-1]
        y = res[:, -1]

        # Replace every non-finite feature value (NaN, +inf, -inf) with 1 so
        # the estimators can be fitted.
        X[~np.isfinite(X)] = 1

        rskf = RepeatedStratifiedKFold(
            n_splits=n_splits, n_repeats=n_repeats, random_state=3242
        )
        for fold, (train, test) in enumerate(rskf.split(X, y)):
            for base_id, base_c in enumerate(base_clfs):
                # Fit a fresh, unfitted copy so no state leaks between folds.
                clf = clone(base_c)
                pred = clf.fit(X[train], y[train]).predict(X[test])
                acc = balanced_accuracy_score(y[test], pred)
                clf_res[m_id, f_id, fold, base_id] = acc
                pbar.update(1)

        # Mean over folds: one balanced-accuracy value per base classifier.
        print(m, np.mean(clf_res[m_id, f_id], axis=0))

# Release the progress bar explicitly so its final state is flushed.
pbar.close()

np.save('results/real_clf.npy', clf_res)