From f4676cceb79aa1b49da425d31a451dd17135a707 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 14 Apr 2023 09:24:17 +0200 Subject: [PATCH 1/4] Filter - language form corpus in StopwordsFilter --- orangecontrib/text/annotate_documents.py | 2 +- orangecontrib/text/preprocess/filter.py | 73 +++++++++++++++---- orangecontrib/text/tests/test_preprocess.py | 34 +++++++-- orangecontrib/text/widgets/owannotator.py | 2 +- .../text/widgets/tests/test_owannotator.py | 2 +- 5 files changed, 90 insertions(+), 23 deletions(-) diff --git a/orangecontrib/text/annotate_documents.py b/orangecontrib/text/annotate_documents.py index c975e2023..9f166101e 100644 --- a/orangecontrib/text/annotate_documents.py +++ b/orangecontrib/text/annotate_documents.py @@ -289,7 +289,7 @@ def _hypergeom_clusters( corpus_ = Corpus.from_file("book-excerpts") for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"), - StopwordsFilter("English"), FrequencyFilter(0.1)): + StopwordsFilter("en"), FrequencyFilter(0.1)): corpus_ = pp(corpus_) transformed_corpus = BowVectorizer().transform(corpus_) diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py index 50f748c31..ae8ccfc77 100644 --- a/orangecontrib/text/preprocess/filter.py +++ b/orangecontrib/text/preprocess/filter.py @@ -1,5 +1,5 @@ from itertools import compress -from typing import List, Callable +from typing import List, Callable, Optional, Set import os import re @@ -11,6 +11,7 @@ from Orange.util import wrap_callback, dummy_callback from orangecontrib.text import Corpus +from orangecontrib.text.language import ISO2LANG, LANG2ISO from orangecontrib.text.misc import wait_nltk_data from orangecontrib.text.preprocess import TokenizedPreprocessor @@ -71,27 +72,69 @@ class StopwordsFilter(BaseTokenFilter, FileWordListMixin): """ Remove tokens present in NLTK's language specific lists or a file. 
""" name = 'Stopwords' - @wait_nltk_data - def __init__(self, language='English', path: str = None): + # nltk uses different language names for some languages + LANG2NLTK = {"Slovenian": "Slovene"} + NLTK2LANG = {v: k for k, v in LANG2NLTK.items()} + + def __init__( + self, + language: Optional[str] = "en", + path: Optional[str] = None, + ): + """ + Parameters + ---------- + language + The language code in ISO format for NLTK stopwords selection. + If None, only words from file are used (NLTK stopwords are not used). + path + The path to the file with its stopwords will be used if present. + The file must contain a newline-separated list of words. + """ super().__init__() FileWordListMixin.__init__(self, path) - self.__stopwords = set(x.strip() for x in - stopwords.words(language.lower())) \ - if language else [] + self.__stopwords = set() + if language: + # transform iso code to NLTK's language name + language = ISO2LANG[language] + language = self.LANG2NLTK.get(language, language).lower() + self.__stopwords = set(x.strip() for x in stopwords.words(language)) + + @staticmethod + def lang_to_iso(language: str) -> str: + """ + Returns the ISO language code for the NLTK language. NLTK has a different name + for Slovenian. This function takes it into account while transforming to ISO. 
+ + Parameters + ---------- + language + NLTK language name + + Returns + ------- + ISO language code for input language + """ + return LANG2ISO[StopwordsFilter.NLTK2LANG.get(language, language)] @staticmethod @wait_nltk_data - def supported_languages(): - # get NLTK list of stopwords - stopwords_listdir = [] + def supported_languages() -> Set[str]: + """ + List all languages supported by NLTK + + Returns + ------- + Set of all languages supported by NLTK + """ try: - stopwords_listdir = [file for file in - os.listdir(stopwords._get_root()) - if file.islower()] + return { + StopwordsFilter.lang_to_iso(file.title()) + for file in os.listdir(stopwords._get_root()) + if file.islower() + } except LookupError: # when no NLTK data is available - pass - - return sorted(file.capitalize() for file in stopwords_listdir) + return set() def _check(self, token): return token not in self.__stopwords and token not in self._lexicon diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py index f94ba5f81..26dc54821 100644 --- a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -15,9 +15,17 @@ from orangecontrib.text import preprocess, tag from orangecontrib.text.corpus import Corpus -from orangecontrib.text.preprocess import BASE_TOKENIZER, PreprocessorList -from orangecontrib.text.preprocess.normalize import file_to_language, \ - file_to_name, language_to_name, UDPipeModels +from orangecontrib.text.preprocess import ( + BASE_TOKENIZER, + PreprocessorList, + StopwordsFilter, +) +from orangecontrib.text.preprocess.normalize import ( + file_to_language, + file_to_name, + language_to_name, + UDPipeModels, +) SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles" @@ -430,7 +438,7 @@ def _check(self, token): self.assertEqual(filtered, ['a']) def test_stopwords(self): - f = preprocess.StopwordsFilter('english') + f = preprocess.StopwordsFilter("en") 
self.assertFalse(f._check('a')) self.assertTrue(f._check('filter')) with self.corpus.unlocked(): @@ -440,7 +448,7 @@ def test_stopwords(self): self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2) def test_stopwords_slovene(self): - f = preprocess.StopwordsFilter('slovene') + f = preprocess.StopwordsFilter("sl") self.assertFalse(f._check('in')) self.assertTrue(f._check('abeceda')) with self.corpus.unlocked(): @@ -449,6 +457,22 @@ def test_stopwords_slovene(self): self.assertListEqual(["kača", "hiši"], corpus.tokens[0]) self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2) + def test_supported_languages(self): + langs = preprocess.StopwordsFilter.supported_languages() + self.assertIsInstance(langs, set) + # just testing few of most important languages since I want for test to be + # resistant for any potentially newly introduced languages by NLTK + self.assertIn("en", langs) + self.assertIn("sl", langs) + self.assertIn("fr", langs) + self.assertIn("sv", langs) + self.assertIn("fi", langs) + self.assertIn("de", langs) + + def test_lang_to_iso(self): + self.assertEqual("en", StopwordsFilter.lang_to_iso("English")) + self.assertEqual("sl", StopwordsFilter.lang_to_iso("Slovene")) + def test_lexicon(self): f = tempfile.NamedTemporaryFile(delete=False) f.write(b'filter\n') diff --git a/orangecontrib/text/widgets/owannotator.py b/orangecontrib/text/widgets/owannotator.py index 8bffc5294..e3cd5f25f 100644 --- a/orangecontrib/text/widgets/owannotator.py +++ b/orangecontrib/text/widgets/owannotator.py @@ -618,7 +618,7 @@ def onDeleteWidget(self): corpus_ = Corpus.from_file("book-excerpts") for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"), - StopwordsFilter("English"), FrequencyFilter(0.1)): + StopwordsFilter("en"), FrequencyFilter(0.1)): corpus_ = pp(corpus_) transformed_corpus = BowVectorizer().transform(corpus_) diff --git a/orangecontrib/text/widgets/tests/test_owannotator.py b/orangecontrib/text/widgets/tests/test_owannotator.py index 
5ee0b52d9..19e4951d2 100644 --- a/orangecontrib/text/widgets/tests/test_owannotator.py +++ b/orangecontrib/text/widgets/tests/test_owannotator.py @@ -21,7 +21,7 @@ def preprocess(corpus: Corpus) -> Corpus: for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"), - StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)): + StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)): corpus = pp(corpus) corpus = BowVectorizer().transform(corpus) return add_embedding(corpus, 4) From 56cfc5aafee1490c5f478b3b44275e597c4e6257 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 17 Nov 2023 11:23:08 +0100 Subject: [PATCH 2/4] Language - Add Hinglish --- orangecontrib/text/language.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py index 57256f9c7..d65c03d29 100644 --- a/orangecontrib/text/language.py +++ b/orangecontrib/text/language.py @@ -46,6 +46,9 @@ "he": "Hebrew", "hi": "Hindi", "hi-Latn": "Hindi (latin)", + # https://en.wikipedia.org/wiki/Hinglish - since it doesn't really have ISO + # code we made one up to be able to use it for stopwords (supported in NLTK) + "hi_eng": "Hinglish", "hr": "Croatian", "ht": "Haitian", "hu": "Hungarian", From f1567f699bda17aa0b8604871f398a7f462577ec Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 17 Nov 2023 14:09:26 +0100 Subject: [PATCH 3/4] Preprocess Widget - Use ISO language format for stop words settings --- orangecontrib/text/widgets/owpreprocess.py | 87 +++++++++-- .../text/widgets/tests/test_owpreprocess.py | 135 ++++++++++++++++-- 2 files changed, 201 insertions(+), 21 deletions(-) diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index d42df3ed7..35f1b245c 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -1,10 +1,10 @@ -from typing import Dict, Optional, List, Callable, Tuple, Type, Union +from typing import Dict, Optional, List, Callable, 
Tuple, Type, Union, Iterable from types import SimpleNamespace import os import random import pkg_resources -from AnyQt.QtCore import Qt, pyqtSignal +from AnyQt.QtCore import Qt, pyqtSignal, QModelIndex from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \ QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \ QDoubleSpinBox, QFileDialog, QAbstractSpinBox @@ -24,6 +24,7 @@ from Orange.widgets.widget import Input, Output, Msg, Message from orangecontrib.text import Corpus +from orangecontrib.text.language import ISO2LANG from orangecontrib.text.misc import nltk_data_dir from orangecontrib.text.preprocess import * from orangecontrib.text.preprocess.normalize import UDPipeStopIteration @@ -63,6 +64,57 @@ def __init__(self, master: BaseEditor, items: List[str], value: str, self.currentTextChanged.connect(callback) +class LanguageComboBox(QComboBox): + """A combo box for selecting language.""" + def __init__( + self, + parent: Optional[BaseEditor], + items: Iterable[str], + value: Optional[str], + include_none: bool, + callback: Callable, + ): + """ + Parameters + ---------- + parent + Combo box's parent widget + items + Combo box's languages (items) as ISO codes. + include_none + Boolean indicating whether to include none option at the start of the list + value + Box's initial value (as an ISO code). + """ + super().__init__(parent) + self.setMinimumWidth(80) + self.__add_items(items, include_none) + self.set_current_language(value) + self.currentIndexChanged.connect(self.__index_changed) + self.callback = callback + + def __add_items(self, items: Iterable[str], include_non: bool): + if include_non: + self.addItem(_DEFAULT_NONE, None) + for itm in sorted(items, key=ISO2LANG.get): + self.addItem(ISO2LANG[itm], itm) + + def __index_changed(self, index: QModelIndex): + self.callback(self.itemData(index)) + + def set_current_language(self, iso_language: Optional[str]): + """ + Set current element of dropdown from ISO language code. 
+ + Parameters + ---------- + iso_language + The ISO language code of element to be selected. + """ + index = self.findData(iso_language) + self.setCurrentIndex(index) + + class UDPipeComboBox(QComboBox): def __init__(self, master: BaseEditor, value: str, default: str, callback: Callable): @@ -570,7 +622,7 @@ class FilteringModule(MultipleMethodModule): MostFreq: MostFrequentTokensFilter, PosTag: PosTagFilter} DEFAULT_METHODS = [Stopwords] - DEFAULT_LANG = "English" + DEFAULT_LANG = "en" DEFAULT_NONE = None DEFAULT_INCL_NUM = False DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \ @@ -597,9 +649,12 @@ def __init__(self, parent=None, **kwargs): self.__pos_tag = self.DEFAULT_POS_TAGS self.__invalidated = False - self.__combo = ComboBox( - self, [_DEFAULT_NONE] + StopwordsFilter.supported_languages(), - self.__sw_lang, self.__set_language + self.__combo = LanguageComboBox( + self, + StopwordsFilter.supported_languages(), + self.__sw_lang, + True, + self.__set_language, ) self.__sw_loader = FileLoader() self.__sw_loader.set_file_list() @@ -755,10 +810,10 @@ def setParameters(self, params: Dict): self.__set_tags(params.get("pos_tags", self.DEFAULT_POS_TAGS)) self.__invalidated = False - def __set_language(self, language: str): + def __set_language(self, language: Optional[str]): if self.__sw_lang != language: self.__sw_lang = language - self.__combo.setCurrentText(language) + self.__combo.set_current_language(language) self.changed.emit() if self.Stopwords in self.methods: self.edited.emit() @@ -899,8 +954,8 @@ def __repr__(self): texts = [] for method in self.methods: if method == self.Stopwords: - append = f"Language: {self.__sw_lang}, " \ - f"File: {_to_abspath(self.__sw_file)}" + language = ISO2LANG[self.__sw_lang] + append = f"Language: {language}, File: {_to_abspath(self.__sw_file)}" elif method == self.Lexicon: append = f"File: {_to_abspath(self.__lx_file)}" elif method == self.Numbers: @@ -1026,7 +1081,7 @@ class 
OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess, priority = 200 keywords = "preprocess text, text" - settings_version = 3 + settings_version = 4 class Inputs: corpus = Input("Corpus", Corpus) @@ -1320,6 +1375,16 @@ def str_into_paths(label): del pp_settings["start"] del pp_settings["end"] + # before version 4 languages were saved as full-word language strings + if version < 4: + preprocessors = settings["storedsettings"]["preprocessors"] + for pp_name, pp in preprocessors: + if pp_name == "preprocess.filter" and "language" in pp: + if pp["language"] == _DEFAULT_NONE: + pp["language"] = None + else: + pp["language"] = StopwordsFilter.lang_to_iso(pp["language"]) + if __name__ == "__main__": from Orange.widgets.utils.widgetpreview import WidgetPreview diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py index d1e47f2a3..d5cfc633b 100644 --- a/orangecontrib/text/widgets/tests/test_owpreprocess.py +++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py @@ -5,6 +5,7 @@ from Orange.data import Domain, StringVariable from orangewidget.utils.filedialogs import RecentPath from Orange.widgets.tests.base import WidgetTest +from Orange.widgets.tests.utils import simulate from orangecontrib.text.corpus import Corpus from orangecontrib.text.preprocess import RegexpTokenizer, WhitespaceTokenizer, \ @@ -12,9 +13,17 @@ UDPipeLemmatizer, StopwordsFilter, MostFrequentTokensFilter, NGrams from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger from orangecontrib.text.tests.test_preprocess import SF_LIST, SERVER_FILES -from orangecontrib.text.widgets.owpreprocess import OWPreprocess, \ - TransformationModule, TokenizerModule, NormalizationModule, \ - FilteringModule, NgramsModule, POSTaggingModule +from orangecontrib.text.widgets.owpreprocess import ( + OWPreprocess, + TransformationModule, + TokenizerModule, + NormalizationModule, + FilteringModule, + NgramsModule, + POSTaggingModule, 
+ LanguageComboBox, + _DEFAULT_NONE, +) @patch(SF_LIST, new=Mock(return_value=SERVER_FILES)) @@ -211,7 +220,7 @@ def test_migrate_settings_filter(self): "use_df": False, "use_keep_n": False}} widget = self.create_widget(OWPreprocess, stored_settings=settings) params = [("preprocess.filter", - {"methods": [0, 2, 4], "language": "Finnish", + {"methods": [0, 2, 4], "language": "fi", "sw_path": None, "sw_list": [], "lx_path": None, "lx_list": [], "pattern": "foo", "rel_start": 0.3, @@ -262,6 +271,55 @@ def test_migrate_settings(self): } self.create_widget(OWPreprocess, stored_settings=settings) + def test_migrate_language_settings(self): + """Test migration to ISO language codes""" + settings = { + "__version__": 3, + "storedsettings": { + "preprocessors": [ + ( + "preprocess.normalize", + { + "snowball_language": "French", + "udpipe_language": "German", + "lemmagen_language": "Slovenian", + }, + ), + ("preprocess.filter", {"language": "Finnish"}), + ] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + normalize_settings = widget.storedsettings["preprocessors"][0][1] + filter_settings = widget.storedsettings["preprocessors"][1][1] + self.assertEqual("Slovenian", normalize_settings["lemmagen_language"]) + self.assertEqual("French", normalize_settings["snowball_language"]) + self.assertEqual("German", normalize_settings["udpipe_language"]) + self.assertEqual("fi", filter_settings["language"]) + + # NLTK uses Slovene instead of Slovenian, this is also the reason + # that preprocess widget stored language as Slovene before + # check if it is mapped correctly + settings = { + "__version__": 3, + "storedsettings": { + "preprocessors": [("preprocess.filter", {"language": "Slovene"})] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + filter_settings = widget.storedsettings["preprocessors"][0][1] + self.assertEqual("sl", filter_settings["language"]) + + settings = { + "__version__": 3, + "storedsettings": { + 
"preprocessors": [("preprocess.filter", {"language": _DEFAULT_NONE})] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + filter_settings = widget.storedsettings["preprocessors"][0][1] + self.assertIsNone(filter_settings["language"]) + class TestTransformationModule(WidgetTest): def setUp(self): @@ -522,7 +580,7 @@ def test_init(self): def test_parameters(self): params = {"methods": [FilteringModule.Stopwords], - "language": "English", "sw_path": None, "lx_path": None, + "language": "en", "sw_path": None, "lx_path": None, "sw_list": [], "lx_list": [], "incl_num": False, "pattern": FilteringModule.DEFAULT_PATTERN, @@ -537,7 +595,7 @@ def test_set_parameters(self): sw_path = RecentPath.create("Foo", []) lx_path = RecentPath.create("Bar", []) params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp], - "language": "Finnish", + "language": "fi", "sw_path": sw_path, "lx_path": lx_path, "sw_list": [sw_path], "lx_list": [lx_path], "incl_num": False, @@ -581,10 +639,13 @@ def test_createinstance(self): self.assertIsInstance(pp[1], MostFrequentTokensFilter) def test_repr(self): - self.assertEqual(str(self.editor), - "Stopwords (Language: English, File: None)") - params = {"methods": [FilteringModule.Lexicon, - FilteringModule.Regexp]} + self.assertEqual(str(self.editor), "Stopwords (Language: English, File: None)") + params = self.editor.parameters() + params["language"] = None + self.editor.setParameters(params) + self.assertEqual(str(self.editor), "Stopwords (Language: None, File: None)") + + params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp]} self.editor.setParameters(params) self.assertEqual( str(self.editor), @@ -685,5 +746,59 @@ def test_repr(self): self.assertEqual(str(self.editor), "Averaged Perceptron Tagger") +class TestLanguageComboBox(WidgetTest): + def test_basic_setup(self): + mock = Mock() + cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], "fi", False, mock) + self.assertEqual(4, cb.count()) + 
self.assertEqual( + ["English", "Finnish", "Slovenian", "Swedish"], + [cb.itemText(i) for i in range(cb.count())], + ) + self.assertEqual("Finnish", cb.currentText()) + + def test_include_none(self): + mock = Mock() + cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], "fi", True, mock) + self.assertEqual(5, cb.count()) + self.assertEqual( + [_DEFAULT_NONE, "English", "Finnish", "Slovenian", "Swedish"], + [cb.itemText(i) for i in range(cb.count())], + ) + self.assertEqual("Finnish", cb.currentText()) + + # test with current item None + cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], None, True, mock) + self.assertEqual(5, cb.count()) + self.assertEqual( + [_DEFAULT_NONE, "English", "Finnish", "Slovenian", "Swedish"], + [cb.itemText(i) for i in range(cb.count())], + ) + self.assertEqual(_DEFAULT_NONE, cb.currentText()) + + def test_set_current_language(self): + mock = Mock() + cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], "fi", True, mock) + self.assertEqual("Finnish", cb.currentText()) + cb.set_current_language("sl") + self.assertEqual("Slovenian", cb.currentText()) + cb.set_current_language(None) + self.assertEqual(_DEFAULT_NONE, cb.currentText()) + + def test_change_item(self): + mock = Mock() + cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], "fi", True, mock) + self.assertEqual( + [_DEFAULT_NONE, "English", "Finnish", "Slovenian", "Swedish"], + [cb.itemText(i) for i in range(cb.count())], + ) + mock.assert_not_called() + simulate.combobox_activate_item(cb, "Slovenian") + mock.assert_called_once_with("sl") + mock.reset_mock() + simulate.combobox_activate_item(cb, _DEFAULT_NONE) + mock.assert_called_once_with(None) + + if __name__ == "__main__": unittest.main() From 3b5004fd42896270b5b8a0fcfa01b72eca12f6b3 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 17 Nov 2023 14:26:32 +0100 Subject: [PATCH 4/4] Keywords - Temporary solution for supported languages --- orangecontrib/text/keywords/__init__.py | 6 +++++- 1 file changed, 5 
insertions(+), 1 deletion(-) diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py index 16a9527dd..783ecad13 100644 --- a/orangecontrib/text/keywords/__init__.py +++ b/orangecontrib/text/keywords/__init__.py @@ -15,12 +15,16 @@ from orangecontrib.text import Corpus from orangecontrib.text.keywords.mbert import mbert_keywords from orangecontrib.text.keywords.rake import Rake +from orangecontrib.text.language import ISO2LANG from orangecontrib.text.preprocess import StopwordsFilter # all available languages for RAKE from orangecontrib.text.vectorization import BowVectorizer -RAKE_LANGUAGES = StopwordsFilter.supported_languages() + +# todo: refactor when refactoring language for keywords module +# this is a temporary solution since supported_languages now returns lang ISO codes +RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()] # all available languages for YAKE! YAKE_LANGUAGE_MAPPING = { "Arabic": "ar",