Skip to content

Commit

Permalink
normalize: speed up preprocessing with caching
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 23, 2021
1 parent 994ff6a commit cedb7d6
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 15 deletions.
35 changes: 22 additions & 13 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import List, Callable
import os
import json
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
import serverfiles
Expand All @@ -25,6 +24,10 @@ class BaseNormalizer(TokenizedPreprocessor):
"""
normalizer = NotImplemented

def __init__(self):
    # Per-instance map of raw token -> normalized form. Reused across
    # calls so each distinct token is normalized only once.
    self._normalization_cache = dict()

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
if callback is None:
callback = dummy_callback
Expand All @@ -34,7 +37,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:

def _preprocess(self, string: str) -> str:
    """Normalize a token to its canonical form, memoizing the result.

    Normalization (stemming/lemmatization) is comparatively expensive
    and corpora repeat tokens heavily, so results are cached per
    instance in ``self._normalization_cache``.
    """
    try:
        # EAFP: one dict lookup on the common cache-hit path instead of
        # a membership test followed by a second lookup.
        return self._normalization_cache[string]
    except KeyError:
        norm_string = self._normalization_cache[string] = self.normalizer(string)
        return norm_string

def __getstate__(self):
    """Return picklable state with the normalization cache emptied.

    The cache can grow large and is cheap to rebuild after unpickling,
    so it is not worth serializing.
    """
    state = dict(self.__dict__)
    state["_normalization_cache"] = {}
    return state


class WordNetLemmatizer(BaseNormalizer):
Expand All @@ -57,10 +69,8 @@ class SnowballStemmer(BaseNormalizer):
stem.SnowballStemmer.languages]

def __init__(self, language='English'):
    """Create a Snowball stemmer for *language* (e.g. ``'English'``).

    The bound ``stem`` method is stored directly as the normalizer
    callable consumed by ``BaseNormalizer._preprocess``, so no
    per-class ``_preprocess`` override is needed.
    """
    super().__init__()  # set up the per-instance normalization cache
    self.normalizer = stem.SnowballStemmer(language.lower()).stem


def language_to_name(language):
Expand Down Expand Up @@ -121,6 +131,7 @@ class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
self.models = UDPipeModels()
Expand Down Expand Up @@ -156,8 +167,7 @@ def __normalize_token(self, token: str) -> str:
sentence = udpipe.Sentence()
sentence.addWord(token)
self.__model.tag(sentence, self.__model.DEFAULT)
output = self.__output_format.writeSentence(sentence)
return json.loads(output)['nodes'][0]['properties']['lemma']
return sentence.words[1].lemma

def __normalize_document(self, document: str) -> List[str]:
tokens = []
Expand All @@ -167,10 +177,9 @@ def __normalize_document(self, document: str) -> List[str]:
sentence = udpipe.Sentence()
while tokenizer.nextSentence(sentence, error):
self.__model.tag(sentence, self.__model.DEFAULT)
output = self.__output_format.writeSentence(sentence)
# 1: is used because words[0] is the root required by the dependency trees
tokens.extend([w.lemma for w in sentence.words[1:]])
sentence = udpipe.Sentence()
tokens.extend([t['properties']['lemma']
for t in json.loads(output)['nodes']])
return tokens

def __getstate__(self):
Expand All @@ -180,8 +189,7 @@ def __getstate__(self):
Note: __setstate__ is not required since we do not make any harm if
model is not restored. It will be loaded on __call__
"""
# copy to avoid editing original dict
state = self.__dict__.copy()
state = super().__getstate__()
# Remove the unpicklable Model and output format.
state['_UDPipeLemmatizer__model'] = None
state['_UDPipeLemmatizer__output_format'] = None
Expand Down Expand Up @@ -213,6 +221,7 @@ class LemmagenLemmatizer(BaseNormalizer):
}

def __init__(self, language='English'):
    # Initialize the base class first so the normalization cache exists.
    super().__init__()
    lang_code = self.lemmagen_languages[language]
    self.lemmatizer = Lemmatizer(lang_code)

def normalizer(self, token):
Expand Down
11 changes: 11 additions & 0 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,17 @@ def test_lemmagen(self):
Lemmatizer("sl").lemmatize(token)
)

def test_cache(self):
    """Normalization results are cached; the cache is dropped on pickle."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian')
    self.corpus.metas[0, 0] = 'sem'
    normalizer(self.corpus)

    cache = normalizer._normalization_cache
    self.assertEqual('biti', cache['sem'])
    self.assertEqual(40, len(cache))

    # a pickling round-trip must come back with an empty cache
    restored = pickle.loads(pickle.dumps(normalizer))
    self.assertEqual(0, len(restored._normalization_cache))


class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self):
Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,13 +402,13 @@ def test_createinstance(self):
params = {"method": NormalizationModule.Snowball}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertEqual(str(pp.normalizer.stemmer), "<EnglishStemmer>")
self.assertIn("<EnglishStemmer>", str(pp.normalizer))

params = {"method": NormalizationModule.Snowball,
"snowball_language": "Dutch"}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertEqual(str(pp.normalizer.stemmer), "<DutchStemmer>")
self.assertIn("<DutchStemmer>", str(pp.normalizer))

params = {"method": NormalizationModule.UDPipe,
"udpipe_language": "Finnish",
Expand Down

0 comments on commit cedb7d6

Please sign in to comment.