Skip to content

Commit

Permalink
normalize: speedup preprocessing with lru_cache
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 17, 2021
1 parent 96c295c commit 5daa2e8
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 7 deletions.
26 changes: 19 additions & 7 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import lru_cache
from typing import List, Callable
import os
import json
Expand All @@ -24,6 +25,10 @@ class BaseNormalizer(TokenizedPreprocessor):
"""
normalizer = NotImplemented

def __init__(self):
    """Set up an empty per-instance cache of normalized strings."""
    # Repeated tokens are looked up here instead of being re-normalized;
    # a per-instance dict (rather than lru_cache on the method) avoids
    # keeping the normalizer instance alive via a global cache.
    self._normalization_cache = {}

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
if callback is None:
callback = dummy_callback
Expand All @@ -33,7 +38,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:

def _preprocess(self, string: str) -> str:
    """Normalize a token to its canonical form, memoizing the result.

    The first occurrence of a token is normalized with ``self.normalizer``;
    every later occurrence is served from ``self._normalization_cache``.
    """
    cache = self._normalization_cache
    try:
        # Fast path: this exact token was normalized before.
        return cache[string]
    except KeyError:
        normalized = self.normalizer(string)
        cache[string] = normalized
        return normalized

def __getstate__(self):
    """Return picklable state with the normalization cache emptied.

    The cache can grow large and is only a speed optimization, so it is
    dropped before pickling; it is rebuilt lazily on later calls.
    """
    state = dict(self.__dict__)
    state["_normalization_cache"] = {}
    return state


class WordNetLemmatizer(BaseNormalizer):
Expand All @@ -55,10 +69,8 @@ class SnowballStemmer(BaseNormalizer):
supported_languages = [l.capitalize() for l in stem.SnowballStemmer.languages]

def __init__(self, language='English'):
self.normalizer = stem.SnowballStemmer(language.lower())

def _preprocess(self, token):
return self.normalizer.stem(token)
super().__init__()
self.normalizer = stem.SnowballStemmer(language.lower()).stem


def language_to_name(language):
Expand Down Expand Up @@ -119,6 +131,7 @@ class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
self.models = UDPipeModels()
Expand Down Expand Up @@ -178,8 +191,7 @@ def __getstate__(self):
Note: __setstate__ is not required since we do not make any harm if
model is not restored. It will be loaded on __call__
"""
# copy to avoid editing original dict
state = self.__dict__.copy()
state = super().__getstate__()
# Remove the unpicklable Model and output format.
state['_UDPipeLemmatizer__model'] = None
state['_UDPipeLemmatizer__output_format'] = None
Expand Down
11 changes: 11 additions & 0 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,17 @@ def test_udpipe_deepcopy(self):
self.assertEqual(list(copied(self.corpus).tokens[0]),
['gora', 'na', 'gora', 'hiša', 'goreti'])

def test_cache(self):
    """Lemmatizer memoizes normalized tokens; the cache is not pickled."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian')
    self.corpus.metas[0, 0] = 'sem'
    normalizer(self.corpus)

    cache = normalizer._normalization_cache
    self.assertEqual(cache['sem'], 'biti')
    self.assertEqual(40, len(cache))

    # Round-trip through pickle: __getstate__ drops the cache entirely.
    restored = pickle.loads(pickle.dumps(normalizer))
    self.assertEqual(0, len(restored._normalization_cache))


class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self):
Expand Down

0 comments on commit 5daa2e8

Please sign in to comment.