Skip to content

Commit

Permalink
normalize: speedup preprocessing with caching
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 19, 2021
1 parent 96c295c commit e5e35d1
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 15 deletions.
34 changes: 21 additions & 13 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import List, Callable
import os
import json
import ufal.udpipe as udpipe
import serverfiles
from nltk import stem
Expand All @@ -24,6 +23,10 @@ class BaseNormalizer(TokenizedPreprocessor):
"""
normalizer = NotImplemented

def __init__(self):
    """Initialize the normalizer with an empty memoization table."""
    # Maps each raw string to its normalized form so that repeated
    # tokens are normalized only once per normalizer instance.
    self._normalization_cache = {}

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
if callback is None:
callback = dummy_callback
Expand All @@ -33,7 +36,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:

def _preprocess(self, string: str) -> str:
    """Normalize a token to its canonical form, memoizing the result.

    Normalization (e.g. stemming or UDPipe lemmatization) can be
    expensive, so each distinct string is computed once and then
    served from ``self._normalization_cache``.
    """
    # Fast path: this exact string was already normalized.
    if string in self._normalization_cache:
        return self._normalization_cache[string]
    # Compute once, remember, and return.
    self._normalization_cache[string] = norm_string = self.normalizer(string)
    return norm_string

def __getstate__(self):
    """Return a picklable copy of the state with the cache emptied.

    The cache is purely a speed optimization and may grow large, so
    it is dropped before pickling and rebuilt lazily after loading.
    """
    state = dict(self.__dict__)
    state["_normalization_cache"] = {}
    return state


class WordNetLemmatizer(BaseNormalizer):
Expand All @@ -55,10 +67,8 @@ class SnowballStemmer(BaseNormalizer):
supported_languages = [l.capitalize() for l in stem.SnowballStemmer.languages]

def __init__(self, language='English'):
    """Create a Snowball stemmer for *language* (default ``'English'``).

    Parameters
    ----------
    language : str
        Capitalized language name accepted by NLTK's SnowballStemmer;
        it is lower-cased before being passed through.
    """
    # Initialize the shared normalization cache on the base class.
    super().__init__()
    # Bind the bound ``stem`` method directly so the inherited
    # BaseNormalizer._preprocess can call ``self.normalizer(string)``.
    self.normalizer = stem.SnowballStemmer(language.lower()).stem


def language_to_name(language):
Expand Down Expand Up @@ -119,6 +129,7 @@ class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
self.models = UDPipeModels()
Expand Down Expand Up @@ -154,8 +165,7 @@ def __normalize_token(self, token: str) -> str:
sentence = udpipe.Sentence()
sentence.addWord(token)
self.__model.tag(sentence, self.__model.DEFAULT)
output = self.__output_format.writeSentence(sentence)
return json.loads(output)['nodes'][0]['properties']['lemma']
return sentence.words[1].lemma

def __normalize_document(self, document: str) -> List[str]:
tokens = []
Expand All @@ -165,10 +175,9 @@ def __normalize_document(self, document: str) -> List[str]:
sentence = udpipe.Sentence()
while tokenizer.nextSentence(sentence, error):
self.__model.tag(sentence, self.__model.DEFAULT)
output = self.__output_format.writeSentence(sentence)
# 1: is used because words[0] is the root required by the dependency trees
tokens.extend([w.lemma for w in sentence.words[1:]])
sentence = udpipe.Sentence()
tokens.extend([t['properties']['lemma']
for t in json.loads(output)['nodes']])
return tokens

def __getstate__(self):
Expand All @@ -178,8 +187,7 @@ def __getstate__(self):
Note: __setstate__ is not required since we do not make any harm if
model is not restored. It will be loaded on __call__
"""
# copy to avoid editing original dict
state = self.__dict__.copy()
state = super().__getstate__()
# Remove the unpicklable Model and output format.
state['_UDPipeLemmatizer__model'] = None
state['_UDPipeLemmatizer__output_format'] = None
Expand Down
11 changes: 11 additions & 0 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,17 @@ def test_udpipe_deepcopy(self):
self.assertEqual(list(copied(self.corpus).tokens[0]),
['gora', 'na', 'gora', 'hiša', 'goreti'])

def test_cache(self):
    """Lemmas are memoized per string and the cache is not pickled."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian')
    # 'sem' is a form of the Slovenian verb 'biti' ('to be') -- a
    # known-good probe for the lemmatizer output.
    self.corpus.metas[0, 0] = 'sem'
    normalizer(self.corpus)
    self.assertEqual(normalizer._normalization_cache['sem'], 'biti')
    # NOTE(review): 40 = number of distinct tokens in the test corpus
    # after the metas edit above -- confirm against the fixture.
    self.assertEqual(40, len(normalizer._normalization_cache))

    # cache should not be pickled
    loaded_normalizer = pickle.loads(pickle.dumps(normalizer))
    self.assertEqual(0, len(loaded_normalizer._normalization_cache))


class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self):
Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,13 +395,13 @@ def test_createinstance(self):
params = {"method": NormalizationModule.Snowball}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertEqual(str(pp.normalizer.stemmer), "<EnglishStemmer>")
self.assertIn("<EnglishStemmer>", str(pp.normalizer))

params = {"method": NormalizationModule.Snowball,
"snowball_language": "Dutch"}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertEqual(str(pp.normalizer.stemmer), "<DutchStemmer>")
self.assertIn("<DutchStemmer>", str(pp.normalizer))

params = {"method": NormalizationModule.UDPipe,
"udpipe_language": "Finnish",
Expand Down

0 comments on commit e5e35d1

Please sign in to comment.