Skip to content

Commit

Permalink
normalize: speedup preprocessing with lru_cache
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 17, 2021
1 parent 96c295c commit 5daa2e8
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 7 deletions.
26 changes: 19 additions & 7 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import lru_cache
from typing import List, Callable
import os
import json
Expand All @@ -24,6 +25,10 @@ class BaseNormalizer(TokenizedPreprocessor):
"""
normalizer = NotImplemented

def __init__(self):
    """Set up an empty per-instance cache of normalized strings."""
    # Repeated tokens are looked up here instead of being re-normalized;
    # a per-instance dict (rather than lru_cache on the method) avoids
    # keeping the normalizer instance alive via a global cache.
    self._normalization_cache = {}

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
if callback is None:
callback = dummy_callback
Expand All @@ -33,7 +38,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:

def _preprocess(self, string: str) -> str:
    """Normalize a token to its canonical form, memoizing the result.

    The first occurrence of a token is normalized with ``self.normalizer``;
    every later occurrence is served from ``self._normalization_cache``.
    """
    cache = self._normalization_cache
    try:
        # Fast path: this exact token was normalized before.
        return cache[string]
    except KeyError:
        normalized = self.normalizer(string)
        cache[string] = normalized
        return normalized

def __getstate__(self):
    """Return picklable state with the normalization cache emptied.

    The cache can grow large and is only a speed optimization, so it is
    dropped before pickling; it is rebuilt lazily on later calls.
    """
    state = dict(self.__dict__)
    state["_normalization_cache"] = {}
    return state


class WordNetLemmatizer(BaseNormalizer):
Expand All @@ -55,10 +69,8 @@ class SnowballStemmer(BaseNormalizer):
supported_languages = [l.capitalize() for l in stem.SnowballStemmer.languages]

def __init__(self, language='English'):
self.normalizer = stem.SnowballStemmer(language.lower())

def _preprocess(self, token):
return self.normalizer.stem(token)
super().__init__()
self.normalizer = stem.SnowballStemmer(language.lower()).stem


def language_to_name(language):
Expand Down Expand Up @@ -119,6 +131,7 @@ class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
self.models = UDPipeModels()
Expand Down Expand Up @@ -178,8 +191,7 @@ def __getstate__(self):
Note: __setstate__ is not required since we do not make any harm if
model is not restored. It will be loaded on __call__
"""
# copy to avoid editing original dict
state = self.__dict__.copy()
state = super().__getstate__()
# Remove the unpicklable Model and output format.
state['_UDPipeLemmatizer__model'] = None
state['_UDPipeLemmatizer__output_format'] = None
Expand Down
11 changes: 11 additions & 0 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,17 @@ def test_udpipe_deepcopy(self):
self.assertEqual(list(copied(self.corpus).tokens[0]),
['gora', 'na', 'gora', 'hiša', 'goreti'])

def test_cache(self):
    """Lemmatizer memoizes normalized tokens; the cache is not pickled."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian')
    self.corpus.metas[0, 0] = 'sem'
    normalizer(self.corpus)

    cache = normalizer._normalization_cache
    self.assertEqual(cache['sem'], 'biti')
    self.assertEqual(40, len(cache))

    # Round-trip through pickle: __getstate__ drops the cache entirely.
    restored = pickle.loads(pickle.dumps(normalizer))
    self.assertEqual(0, len(restored._normalization_cache))


class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self):
Expand Down

0 comments on commit 5daa2e8

Please sign in to comment.