Skip to content

Commit

Permalink
Preprocess - Use ISO language codes for Lemmagen
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 1, 2023
1 parent 887ae3c commit f2fb09f
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 45 deletions.
36 changes: 22 additions & 14 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from Orange.widgets.widget import Input, Output, Msg, Message

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
Expand Down Expand Up @@ -475,13 +475,15 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_LANGUAGE = "English"
DEFAULT_SNOWBALL_LANG = "English" # TODO: remove once Snowball uses ISO language codes
DEFAULT_UDPIPE_LANG = "English" # TODO: remove once UDPipe uses ISO language codes
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_LANGUAGE
self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG
self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

Expand All @@ -490,15 +492,17 @@ def __init__(self, parent=None, **kwargs):
self.__snowball_lang, self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_LANGUAGE,
self.__set_udpipe_lang
self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
)
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
self, LemmagenLemmatizer.lemmagen_languages,
self.__lemmagen_lang, self.__set_lemmagen_lang
self.__combo_lemm = LanguageComboBox(
self,
LemmagenLemmatizer.supported_languages,
self.__lemmagen_lang,
False,
self.__set_lemmagen_lang,
)

label = QLabel("Language:")
Expand Down Expand Up @@ -530,9 +534,9 @@ def __enable_udpipe(self):

def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG)
self.__set_snowball_lang(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
self.__set_udpipe_lang(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
Expand Down Expand Up @@ -562,7 +566,7 @@ def __set_udpipe_lang(self, language: str):
def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
self.__combo_lemm.setCurrentText(language)
self.__combo_lemm.set_current_language(language)
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()
Expand All @@ -587,12 +591,14 @@ def parameters(self) -> Dict:
def createinstance(params: Dict) -> BaseNormalizer:
method = params.get("method", NormalizationModule.DEFAULT_METHOD)
args = {}
def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG
def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
def_lang = NormalizationModule.DEFAULT_LANGUAGE
if method == NormalizationModule.Snowball:
args = {"language": params.get("snowball_language", def_lang)}
args = {"language": params.get("snowball_language", def_snowball)}
elif method == NormalizationModule.UDPipe:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_lang),
args = {"language": params.get("udpipe_language", def_udpipe),
"use_tokenizer": params.get("udpipe_tokenizer", def_use)}
elif method == NormalizationModule.Lemmagen:
args = {"language": params.get("lemmagen_language", def_lang)}
Expand Down Expand Up @@ -1384,6 +1390,8 @@ def str_into_paths(label):
pp["language"] = None
else:
pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])
if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]


if __name__ == "__main__":
Expand Down
74 changes: 43 additions & 31 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,30 +271,16 @@ def test_migrate_settings(self):
}
self.create_widget(OWPreprocess, stored_settings=settings)

def test_migrate_language_settings(self):
def test_migrate_filter_language_settings(self):
"""Test migration to iso langauge codes"""
settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
(
"preprocess.normalize",
{
"snowball_language": "French",
"udpipe_language": "German",
"lemmagen_language": "Slovenian",
},
),
("preprocess.filter", {"language": "Finnish"}),
]
"preprocessors": [("preprocess.filter", {"language": "Finnish"})]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
filter_settings = widget.storedsettings["preprocessors"][1][1]
self.assertEqual("Slovenian", normalize_settings["lemmagen_language"])
self.assertEqual("French", normalize_settings["snowball_language"])
self.assertEqual("German", normalize_settings["udpipe_language"])
filter_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("fi", filter_settings["language"])

# NLTK uses Slovene instead of Slovenian, this is also the reason
Expand All @@ -320,6 +306,32 @@ def test_migrate_language_settings(self):
filter_settings = widget.storedsettings["preprocessors"][0][1]
self.assertIsNone(filter_settings["language"])

def test_migrate_lemmagen_language_settings(self):
    """Test migration of the Lemmagen language to ISO language codes.

    Each case stores version-3 settings holding a single
    ``preprocess.normalize`` entry with a full language name and checks
    that the widget's stored settings expose the ISO code afterwards.
    """
    cases = (("Slovenian", "sl"), ("English", "en"))
    for stored_name, expected_code in cases:
        normalize_entry = (
            "preprocess.normalize",
            {"lemmagen_language": stored_name},
        )
        settings = {
            "__version__": 3,
            "storedsettings": {"preprocessors": [normalize_entry]},
        }
        # A fresh widget per case so every migration starts from the
        # stored settings of that case only.
        widget = self.create_widget(OWPreprocess, stored_settings=settings)
        first_preprocessor = widget.storedsettings["preprocessors"][0]
        migrated_params = first_preprocessor[1]
        self.assertEqual(expected_code, migrated_params["lemmagen_language"])


class TestTransformationModule(WidgetTest):
def setUp(self):
Expand Down Expand Up @@ -459,19 +471,23 @@ def test_init(self):
self.assertFalse(self.check_use.isChecked())

def test_parameters(self):
params = {"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "English",
"udpipe_tokenizer": False}
params = {
"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "en",
"udpipe_tokenizer": False,
}
self.assertDictEqual(self.editor.parameters(), params)

def test_set_parameters(self):
params = {"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"udpipe_language": "Slovenian",
"lemmagen_language": "Bulgarian",
"udpipe_tokenizer": True}
params = {
"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"udpipe_language": "Slovenian",
"lemmagen_language": "bg",
"udpipe_tokenizer": True,
}
self.editor.setParameters(params)
self.assertDictEqual(self.editor.parameters(), params)
self.assertEqual(self.combo_sbl.currentText(), "Dutch")
Expand Down Expand Up @@ -738,10 +754,6 @@ def test_createinstance(self):
pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt})
self.assertIsInstance(pp, MaxEntTagger)

# TODO - implement StanfordPOSTagger
# pp = self.editor.createinstance({"method": POSTaggingModule.Stanford})
# self.assertIsInstance(pp, StanfordPOSTagger)

def test_repr(self):
self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")

Expand Down

0 comments on commit f2fb09f

Please sign in to comment.