drift annotation

w4k2 · Oct 25, 2023 · 37d1646 · 37d1646
1 parent 9006197
commit 37d1646
Show file tree

Hide file tree

Showing 16 changed files with 90 additions and 10 deletions.
diff --git a/vis_bar.py → E4P_vis_bar.py b/vis_bar.py → E4P_vis_bar.py
diff --git a/data/real_streams_gt/INSECTS-abrupt_imbalanced_norm.png b/data/real_streams_gt/INSECTS-abrupt_imbalanced_norm.png
diff --git a/data/real_streams_gt/INSECTS-gradual_imbalanced_norm.png b/data/real_streams_gt/INSECTS-gradual_imbalanced_norm.png
diff --git a/data/real_streams_gt/INSECTS-incremental_imbalanced_norm.png b/data/real_streams_gt/INSECTS-incremental_imbalanced_norm.png
diff --git a/data/real_streams_gt/clf_INSECTS-abrupt_imbalanced_norm.npy b/data/real_streams_gt/clf_INSECTS-abrupt_imbalanced_norm.npy
diff --git a/data/real_streams_gt/clf_INSECTS-gradual_imbalanced_norm.npy b/data/real_streams_gt/clf_INSECTS-gradual_imbalanced_norm.npy
diff --git a/data/real_streams_gt/clf_INSECTS-incremental_imbalanced_norm.npy b/data/real_streams_gt/clf_INSECTS-incremental_imbalanced_norm.npy
diff --git a/data/real_streams_gt/clf_covtypeNorm-1-2vsAll-pruned.npy b/data/real_streams_gt/clf_covtypeNorm-1-2vsAll-pruned.npy
diff --git a/data/real_streams_gt/clf_electricity.npy b/data/real_streams_gt/clf_electricity.npy
diff --git a/data/real_streams_gt/clf_poker-lsn-1-2vsAll-pruned.npy b/data/real_streams_gt/clf_poker-lsn-1-2vsAll-pruned.npy
diff --git a/data/real_streams_gt/covtypeNorm-1-2vsAll-pruned.png b/data/real_streams_gt/covtypeNorm-1-2vsAll-pruned.png
diff --git a/data/real_streams_gt/electricity.png b/data/real_streams_gt/electricity.png
diff --git a/data/real_streams_gt/poker-lsn-1-2vsAll-pruned.png b/data/real_streams_gt/poker-lsn-1-2vsAll-pruned.png
diff --git a/drift_marking.md b/drift_marking.md
@@ -0,0 +1,29 @@
+# Drift annotation procedure
+
+Real concept drifts are associated with a change (usually a decrease) in the classification quality achieved by the classifier. If the classifier was trained using data from a concept other than the current one, its recognition quality should decrease, as the classifier is not *familiar* with the current data distribution. If an increase in quality is observed, it can be suspected that the data distribution is close to the previous concept and that there are fewer samples in the areas where the classes overlap - which is also related to a change in concept.
+
+A human expert marked the locations of drifts based on the classification quality of three classifiers - Gaussian Naive Bayes (GNB), Multilayer Perceptron (MLP), and Extreme Learning Machine (ELM) - evaluated in the Test-Then-Train experimental protocol. For every chunk of data, the classifiers were first used for inference and quality evaluation, and then trained using that new portion of data. Such a protocol should allow for the most accurate determination of real concept drifts at the beginning of stream processing, in particular a clear identification of the change between the first and second concept. Training the classifier with subsequent portions of data, especially in the case of MLP, which is *forgetting* the previous data distributions, should enable the identification of further concept changes.
+
+When fitted partially, the MLP by default performs only one iteration of weight optimization, so at the beginning of stream processing the recognition quality of the MLP is lower and later, if the concept is stable, it increases.
+
+It should be emphasized that the processed streams were previously divided into chunks and pruned of those batches containing samples of only a single class. This makes the identified drift moments specific to the transformed streams used in the experiments; they should not be used as unambiguous drift moments in the original streams for the purposes of other studies.
+
+Below we present the classification results for the processed streams using a scatter plot (top row) and a line plot (bottom row). The quality obtained by GNB is marked in blue, by MLP in gold, and by ELM in red. The x-axis shows the identified moments of drift, determined based on changes in classification quality.
+
+### Electricity
+![electricity](data/real_streams_gt/electricity.png)
+
+### Covtype
+![covtype](data/real_streams_gt/covtypeNorm-1-2vsAll-pruned.png)
+
+### Poker
+![poker](data/real_streams_gt/poker-lsn-1-2vsAll-pruned.png)
+
+### Insect abrupt
+![insect-abrupt](data/real_streams_gt/INSECTS-abrupt_imbalanced_norm.png)
+
+### Insect gradual
+![insect-grad](data/real_streams_gt/INSECTS-gradual_imbalanced_norm.png)
+
+### Insect incremental
+![insect-incremental](data/real_streams_gt/INSECTS-incremental_imbalanced_norm.png)
diff --git a/real_gt.py b/real_gt.py
@@ -1,13 +1,13 @@
 """
 Script for detecting and marking moments of drift for real-world datastreams.
 """
-
 import numpy as np
 import strlearn as sl
 import matplotlib.pyplot as plt
 from sklearn.naive_bayes import GaussianNB
 from sklearn.neural_network import MLPClassifier
 from tqdm import tqdm
+from utils import ELMI
 
 real_streams = [
     'data/real_streams/covtypeNorm-1-2vsAll-pruned.arff',
@@ -84,19 +84,16 @@
     if f_id==5:
         drfs=[9,35,60,180,220]
 
-    clf = [GaussianNB(), MLPClassifier()]
+    clf = [GaussianNB(), MLPClassifier(), ELMI()]
 
     evaluator = sl.evaluators.TestThenTrain()
     evaluator.process(stream, clf)
 
-    if f_id == 2:
-        fig, ax = plt.subplots(2,1,figsize=(13,7))
-    else:
-        fig, ax = plt.subplots(2,1,figsize=(7,7))
+    fig, ax = plt.subplots(2,1,figsize=(14,7))
 
     for i in range(len(clf)):
-        ax[0].scatter(np.arange(len(evaluator.scores[i,:,1])),evaluator.scores[i,:,1], alpha=0.9, label=['GNB', 'MLP'][i], c=['blue','tomato'][i],s=3)
-        ax[1].plot(evaluator.scores[i,:,1], alpha=0.9, label=['GNB', 'MLP'][i], c=['blue','tomato'][i],lw=1)
+        ax[0].scatter(np.arange(len(evaluator.scores[i,:,1])),evaluator.scores[i,:,1], alpha=0.9, label=['GNB', 'MLP', 'ELM'][i], c=['blue', 'gold', 'tomato'][i],s=3)
+        ax[1].plot(evaluator.scores[i,:,1], alpha=0.7, label=['GNB', 'MLP', 'ELM'][i], c=['blue', 'gold', 'tomato'][i],lw = 1 if f_id==2 else 2)
     for aa in ax:
         aa.spines['top'].set_visible(False)
         aa.spines['right'].set_visible(False)
@@ -112,4 +109,4 @@
 
     np.save('data/real_streams_gt/%s.npy' % fname, drfs)
     np.save('data/real_streams_gt/clf_%s.npy' % fname, evaluator.scores)
-
+
diff --git a/utils.py b/utils.py
@@ -61,4 +61,58 @@ def find_real_drift(chunks, drifts):
     'INSECTS-abrupt_imbalanced_norm': [125],
     'INSECTS-gradual_imbalanced_norm': [  9,  60,  90, 125, 190],
     'INSECTS-incremental_imbalanced_norm': [  9,  35,  60, 180, 220]
-}
+}
+
+from sklearn.base import BaseEstimator, ClassifierMixin
+
+class ELMI(BaseEstimator, ClassifierMixin):
+    def __init__(self, hidden_layer_size=1024, 
+                 probing_rate=.1,
+                 update_rate=.1):
+        self.hidden_layer_size = hidden_layer_size
+        self.probing_rate = probing_rate
+        self.update_rate = update_rate
+
+    def partial_fit(self, X, y, classes=None):
+
+        if classes is None:
+            classes = np.unique(y)
+        if not hasattr(self, 'enc'):
+            self.enc = np.arange(len(classes))
+        _y = (np.array([yi==self.enc for yi in y]).astype(int))
+
+        # Check if first
+        if not hasattr(self, 'beta_'):
+            # Get problem info
+            self.n_classes = _y.shape[1]
+            self.n_features = X.shape[1]
+
+            # Initialize W
+            self.coefs_ = np.random.uniform(-1, 1, size=(self.n_features,
+                                                         self.hidden_layer_size))
+            # Initialize bias
+            self.intercepts_ = np.random.normal(size=(self.hidden_layer_size,))
+
+            # Initialize empty beta
+            self.beta_ = np.zeros((self.hidden_layer_size, self.n_classes))
+
+        pmask = np.random.uniform(size=_y.shape[0]) < self.probing_rate
+
+        H = self.activation(X[pmask].dot(self.coefs_) + self.intercepts_)  # Propagate
+        H_pinv = np.linalg.pinv(H)                                  # Inverse by Moore–Penrose
+
+        # Calculate partial beta and update beta
+        partial_beta = H_pinv.dot(_y[pmask]) 
+        self.beta_ = self.beta_ * (1-self.update_rate) + partial_beta * self.update_rate
+
+        return self
+
+    def predict_proba(self, X):
+        H = self.activation(X.dot(self.coefs_) + self.intercepts_)
+        return H.dot(self.beta_)
+
+    def predict(self, X):
+        return np.argmax(self.predict_proba(X), axis=1)
+
+    def activation(self, x):
+        return 1. / (1. + np.exp(-x))