Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

normalize: speedup preprocessing with caching #709

Merged
merged 1 commit into the base branch on
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import List, Callable
import os
import json
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
import serverfiles
Expand All @@ -25,6 +24,10 @@ class BaseNormalizer(TokenizedPreprocessor):
"""
normalizer = NotImplemented

def __init__(self):
# cache already normalized string to speedup normalization
self._normalization_cache = {}

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
if callback is None:
callback = dummy_callback
Expand All @@ -34,7 +37,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:

def _preprocess(self, string: str) -> str:
""" Normalizes token to canonical form. """
return self.normalizer(string)
if string in self._normalization_cache:
return self._normalization_cache[string]
self._normalization_cache[string] = norm_string = self.normalizer(string)
return norm_string

def __getstate__(self):
d = self.__dict__.copy()
# since cache can be quite big, empty cache before pickling
d["_normalization_cache"] = {}
return d


class WordNetLemmatizer(BaseNormalizer):
Expand All @@ -57,10 +69,8 @@ class SnowballStemmer(BaseNormalizer):
stem.SnowballStemmer.languages]

def __init__(self, language='English'):
self.normalizer = stem.SnowballStemmer(language.lower())

def _preprocess(self, token):
return self.normalizer.stem(token)
super().__init__()
self.normalizer = stem.SnowballStemmer(language.lower()).stem


def language_to_name(language):
Expand Down Expand Up @@ -121,6 +131,7 @@ class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
self.models = UDPipeModels()
Expand Down Expand Up @@ -156,8 +167,7 @@ def __normalize_token(self, token: str) -> str:
sentence = udpipe.Sentence()
sentence.addWord(token)
self.__model.tag(sentence, self.__model.DEFAULT)
output = self.__output_format.writeSentence(sentence)
return json.loads(output)['nodes'][0]['properties']['lemma']
return sentence.words[1].lemma

def __normalize_document(self, document: str) -> List[str]:
tokens = []
Expand All @@ -167,10 +177,9 @@ def __normalize_document(self, document: str) -> List[str]:
sentence = udpipe.Sentence()
while tokenizer.nextSentence(sentence, error):
self.__model.tag(sentence, self.__model.DEFAULT)
output = self.__output_format.writeSentence(sentence)
# 1: is used because words[0] is the root required by the dependency trees
tokens.extend([w.lemma for w in sentence.words[1:]])
sentence = udpipe.Sentence()
tokens.extend([t['properties']['lemma']
for t in json.loads(output)['nodes']])
return tokens

def __getstate__(self):
Expand All @@ -180,8 +189,7 @@ def __getstate__(self):
Note: __setstate__ is not required since we do not make any harm if
model is not restored. It will be loaded on __call__
"""
# copy to avoid editing original dict
state = self.__dict__.copy()
state = super().__getstate__()
# Remove the unpicklable Model and output format.
state['_UDPipeLemmatizer__model'] = None
state['_UDPipeLemmatizer__output_format'] = None
Expand Down Expand Up @@ -213,6 +221,7 @@ class LemmagenLemmatizer(BaseNormalizer):
}

def __init__(self, language='English'):
super().__init__()
self.lemmatizer = Lemmatizer(self.lemmagen_languages[language])

def normalizer(self, token):
Expand Down
11 changes: 11 additions & 0 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,17 @@ def test_lemmagen(self):
Lemmatizer("sl").lemmatize(token)
)

def test_cache(self):
normalizer = preprocess.UDPipeLemmatizer('Slovenian')
self.corpus.metas[0, 0] = 'sem'
normalizer(self.corpus)
self.assertEqual(normalizer._normalization_cache['sem'], 'biti')
self.assertEqual(40, len(normalizer._normalization_cache))

# cache should not be pickled
loaded_normalizer = pickle.loads(pickle.dumps(normalizer))
self.assertEqual(0, len(loaded_normalizer._normalization_cache))


class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self):
Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,13 +402,13 @@ def test_createinstance(self):
params = {"method": NormalizationModule.Snowball}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertEqual(str(pp.normalizer.stemmer), "<EnglishStemmer>")
self.assertIn("<EnglishStemmer>", str(pp.normalizer))

params = {"method": NormalizationModule.Snowball,
"snowball_language": "Dutch"}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertEqual(str(pp.normalizer.stemmer), "<DutchStemmer>")
self.assertIn("<DutchStemmer>", str(pp.normalizer))

params = {"method": NormalizationModule.UDPipe,
"udpipe_language": "Finnish",
Expand Down