From f4676cceb79aa1b49da425d31a451dd17135a707 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 09:24:17 +0200
Subject: [PATCH 1/4] Filter - language from corpus in StopwordsFilter
---
orangecontrib/text/annotate_documents.py | 2 +-
orangecontrib/text/preprocess/filter.py | 73 +++++++++++++++----
orangecontrib/text/tests/test_preprocess.py | 34 +++++++--
orangecontrib/text/widgets/owannotator.py | 2 +-
.../text/widgets/tests/test_owannotator.py | 2 +-
5 files changed, 90 insertions(+), 23 deletions(-)
diff --git a/orangecontrib/text/annotate_documents.py b/orangecontrib/text/annotate_documents.py
index c975e2023..9f166101e 100644
--- a/orangecontrib/text/annotate_documents.py
+++ b/orangecontrib/text/annotate_documents.py
@@ -289,7 +289,7 @@ def _hypergeom_clusters(
corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
- StopwordsFilter("English"), FrequencyFilter(0.1)):
+ StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)
transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 50f748c31..ae8ccfc77 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -1,5 +1,5 @@
from itertools import compress
-from typing import List, Callable
+from typing import List, Callable, Optional, Set
import os
import re
@@ -11,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback
from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor
@@ -71,27 +72,69 @@ class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
""" Remove tokens present in NLTK's language specific lists or a file. """
name = 'Stopwords'
- @wait_nltk_data
- def __init__(self, language='English', path: str = None):
+ # nltk uses different language names for some languages
+ LANG2NLTK = {"Slovenian": "Slovene"}
+ NLTK2LANG = {v: k for k, v in LANG2NLTK.items()}
+
+ def __init__(
+ self,
+ language: Optional[str] = "en",
+ path: Optional[str] = None,
+ ):
+ """
+ Parameters
+ ----------
+ language
+ The language code in ISO format for NLTK stopwords selection.
+ If None, only words from the file are used (NLTK stopwords are not used).
+ path
+ The path to a file whose stopwords will be used if present.
+ The file must contain a newline-separated list of words.
+ """
super().__init__()
FileWordListMixin.__init__(self, path)
- self.__stopwords = set(x.strip() for x in
- stopwords.words(language.lower())) \
- if language else []
+ self.__stopwords = set()
+ if language:
+ # transform iso code to NLTK's language name
+ language = ISO2LANG[language]
+ language = self.LANG2NLTK.get(language, language).lower()
+ self.__stopwords = set(x.strip() for x in stopwords.words(language))
+
+ @staticmethod
+ def lang_to_iso(language: str) -> str:
+ """
+ Returns the ISO language code for the NLTK language. NLTK has a different name
+ for Slovenian. This function takes it into account while transforming to ISO.
+
+ Parameters
+ ----------
+ language
+ NLTK language name
+
+ Returns
+ -------
+ ISO language code for input language
+ """
+ return LANG2ISO[StopwordsFilter.NLTK2LANG.get(language, language)]
@staticmethod
@wait_nltk_data
- def supported_languages():
- # get NLTK list of stopwords
- stopwords_listdir = []
+ def supported_languages() -> Set[str]:
+ """
+ List all languages supported by NLTK
+
+ Returns
+ -------
+ Set of all languages supported by NLTK
+ """
try:
- stopwords_listdir = [file for file in
- os.listdir(stopwords._get_root())
- if file.islower()]
+ return {
+ StopwordsFilter.lang_to_iso(file.title())
+ for file in os.listdir(stopwords._get_root())
+ if file.islower()
+ }
except LookupError: # when no NLTK data is available
- pass
-
- return sorted(file.capitalize() for file in stopwords_listdir)
+ return set()
def _check(self, token):
return token not in self.__stopwords and token not in self._lexicon
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index f94ba5f81..26dc54821 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -15,9 +15,17 @@
from orangecontrib.text import preprocess, tag
from orangecontrib.text.corpus import Corpus
-from orangecontrib.text.preprocess import BASE_TOKENIZER, PreprocessorList
-from orangecontrib.text.preprocess.normalize import file_to_language, \
- file_to_name, language_to_name, UDPipeModels
+from orangecontrib.text.preprocess import (
+ BASE_TOKENIZER,
+ PreprocessorList,
+ StopwordsFilter,
+)
+from orangecontrib.text.preprocess.normalize import (
+ file_to_language,
+ file_to_name,
+ language_to_name,
+ UDPipeModels,
+)
SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles"
@@ -430,7 +438,7 @@ def _check(self, token):
self.assertEqual(filtered, ['a'])
def test_stopwords(self):
- f = preprocess.StopwordsFilter('english')
+ f = preprocess.StopwordsFilter("en")
self.assertFalse(f._check('a'))
self.assertTrue(f._check('filter'))
with self.corpus.unlocked():
@@ -440,7 +448,7 @@ def test_stopwords(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
def test_stopwords_slovene(self):
- f = preprocess.StopwordsFilter('slovene')
+ f = preprocess.StopwordsFilter("sl")
self.assertFalse(f._check('in'))
self.assertTrue(f._check('abeceda'))
with self.corpus.unlocked():
@@ -449,6 +457,22 @@ def test_stopwords_slovene(self):
self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
+ def test_supported_languages(self):
+ langs = preprocess.StopwordsFilter.supported_languages()
+ self.assertIsInstance(langs, set)
+ # just testing a few of the most important languages since I want the test to
+ # be resistant to any languages potentially newly introduced by NLTK
+ self.assertIn("en", langs)
+ self.assertIn("sl", langs)
+ self.assertIn("fr", langs)
+ self.assertIn("sv", langs)
+ self.assertIn("fi", langs)
+ self.assertIn("de", langs)
+
+ def test_lang_to_iso(self):
+ self.assertEqual("en", StopwordsFilter.lang_to_iso("English"))
+ self.assertEqual("sl", StopwordsFilter.lang_to_iso("Slovene"))
+
def test_lexicon(self):
f = tempfile.NamedTemporaryFile(delete=False)
f.write(b'filter\n')
diff --git a/orangecontrib/text/widgets/owannotator.py b/orangecontrib/text/widgets/owannotator.py
index 8bffc5294..e3cd5f25f 100644
--- a/orangecontrib/text/widgets/owannotator.py
+++ b/orangecontrib/text/widgets/owannotator.py
@@ -618,7 +618,7 @@ def onDeleteWidget(self):
corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
- StopwordsFilter("English"), FrequencyFilter(0.1)):
+ StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)
transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/widgets/tests/test_owannotator.py b/orangecontrib/text/widgets/tests/test_owannotator.py
index 5ee0b52d9..19e4951d2 100644
--- a/orangecontrib/text/widgets/tests/test_owannotator.py
+++ b/orangecontrib/text/widgets/tests/test_owannotator.py
@@ -21,7 +21,7 @@
def preprocess(corpus: Corpus) -> Corpus:
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
- StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
+ StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
corpus = pp(corpus)
corpus = BowVectorizer().transform(corpus)
return add_embedding(corpus, 4)
From 56cfc5aafee1490c5f478b3b44275e597c4e6257 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 17 Nov 2023 11:23:08 +0100
Subject: [PATCH 2/4] Language - Add Hinglish
---
orangecontrib/text/language.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py
index 57256f9c7..d65c03d29 100644
--- a/orangecontrib/text/language.py
+++ b/orangecontrib/text/language.py
@@ -46,6 +46,9 @@
"he": "Hebrew",
"hi": "Hindi",
"hi-Latn": "Hindi (latin)",
+ # https://en.wikipedia.org/wiki/Hinglish - since it doesn't really have ISO
+ # code we made one up to be able to use it for stopwords (supported in NLTK)
+ "hi_eng": "Hinglish",
"hr": "Croatian",
"ht": "Haitian",
"hu": "Hungarian",
From f1567f699bda17aa0b8604871f398a7f462577ec Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 17 Nov 2023 14:09:26 +0100
Subject: [PATCH 3/4] Preprocess Widget - Use ISO language format for stop
words settings
---
orangecontrib/text/widgets/owpreprocess.py | 87 +++++++++--
.../text/widgets/tests/test_owpreprocess.py | 135 ++++++++++++++++--
2 files changed, 201 insertions(+), 21 deletions(-)
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index d42df3ed7..35f1b245c 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -1,10 +1,10 @@
-from typing import Dict, Optional, List, Callable, Tuple, Type, Union
+from typing import Dict, Optional, List, Callable, Tuple, Type, Union, Iterable
from types import SimpleNamespace
import os
import random
import pkg_resources
-from AnyQt.QtCore import Qt, pyqtSignal
+from AnyQt.QtCore import Qt, pyqtSignal, QModelIndex
from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \
QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \
QDoubleSpinBox, QFileDialog, QAbstractSpinBox
@@ -24,6 +24,7 @@
from Orange.widgets.widget import Input, Output, Msg, Message
from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
@@ -63,6 +64,57 @@ def __init__(self, master: BaseEditor, items: List[str], value: str,
self.currentTextChanged.connect(callback)
+class LanguageComboBox(QComboBox):
+ """A combo box for selecting language."""
+ def __init__(
+ self,
+ parent: Optional[BaseEditor],
+ items: Iterable[str],
+ value: Optional[str],
+ include_none: bool,
+ callback: Callable,
+ ):
+ """
+ Parameters
+ ----------
+ parent
+ Combo box's parent widget
+ items
+ Combo box's languages (items) as ISO codes.
+ include_none
+ Boolean indicating whether to include a none option at the start of the list
+ value
+ Box's initial value (as an ISO code).
+ """
+ super().__init__(parent)
+ self.setMinimumWidth(80)
+ self.__add_items(items, include_none)
+ self.set_current_language(value)
+ self.currentIndexChanged.connect(self.__index_changed)
+ self.callback = callback
+
+ def __add_items(self, items: Iterable[str], include_non: bool):
+ if include_non:
+ self.addItem(_DEFAULT_NONE, None)
+ for itm in sorted(items, key=ISO2LANG.get):
+ self.addItem(ISO2LANG[itm], itm)
+
+ def __index_changed(self, index: QModelIndex):
+ self.callback(self.itemData(index))
+
+ def set_current_language(self, iso_language: Optional[str]):
+ """
+ Set current element of dropdown from ISO language code.
+
+ Parameters
+ ----------
+ iso_language
+ The ISO language code of element to be selected.
+ """
+ index = self.findData(iso_language)
+ self.setCurrentIndex(index)
+
+
class UDPipeComboBox(QComboBox):
def __init__(self, master: BaseEditor, value: str, default: str,
callback: Callable):
@@ -570,7 +622,7 @@ class FilteringModule(MultipleMethodModule):
MostFreq: MostFrequentTokensFilter,
PosTag: PosTagFilter}
DEFAULT_METHODS = [Stopwords]
- DEFAULT_LANG = "English"
+ DEFAULT_LANG = "en"
DEFAULT_NONE = None
DEFAULT_INCL_NUM = False
DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \
@@ -597,9 +649,12 @@ def __init__(self, parent=None, **kwargs):
self.__pos_tag = self.DEFAULT_POS_TAGS
self.__invalidated = False
- self.__combo = ComboBox(
- self, [_DEFAULT_NONE] + StopwordsFilter.supported_languages(),
- self.__sw_lang, self.__set_language
+ self.__combo = LanguageComboBox(
+ self,
+ StopwordsFilter.supported_languages(),
+ self.__sw_lang,
+ True,
+ self.__set_language,
)
self.__sw_loader = FileLoader()
self.__sw_loader.set_file_list()
@@ -755,10 +810,10 @@ def setParameters(self, params: Dict):
self.__set_tags(params.get("pos_tags", self.DEFAULT_POS_TAGS))
self.__invalidated = False
- def __set_language(self, language: str):
+ def __set_language(self, language: Optional[str]):
if self.__sw_lang != language:
self.__sw_lang = language
- self.__combo.setCurrentText(language)
+ self.__combo.set_current_language(language)
self.changed.emit()
if self.Stopwords in self.methods:
self.edited.emit()
@@ -899,8 +954,8 @@ def __repr__(self):
texts = []
for method in self.methods:
if method == self.Stopwords:
- append = f"Language: {self.__sw_lang}, " \
- f"File: {_to_abspath(self.__sw_file)}"
+ language = ISO2LANG[self.__sw_lang]
+ append = f"Language: {language}, File: {_to_abspath(self.__sw_file)}"
elif method == self.Lexicon:
append = f"File: {_to_abspath(self.__lx_file)}"
elif method == self.Numbers:
@@ -1026,7 +1081,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess,
priority = 200
keywords = "preprocess text, text"
- settings_version = 3
+ settings_version = 4
class Inputs:
corpus = Input("Corpus", Corpus)
@@ -1320,6 +1375,16 @@ def str_into_paths(label):
del pp_settings["start"]
del pp_settings["end"]
+ # before version 4 languages were saved as full-word language strings
+ if version < 4:
+ preprocessors = settings["storedsettings"]["preprocessors"]
+ for pp_name, pp in preprocessors:
+ if pp_name == "preprocess.filter" and "language" in pp:
+ if pp["language"] == _DEFAULT_NONE:
+ pp["language"] = None
+ else:
+ pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])
+
if __name__ == "__main__":
from Orange.widgets.utils.widgetpreview import WidgetPreview
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index d1e47f2a3..d5cfc633b 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -5,6 +5,7 @@
from Orange.data import Domain, StringVariable
from orangewidget.utils.filedialogs import RecentPath
from Orange.widgets.tests.base import WidgetTest
+from Orange.widgets.tests.utils import simulate
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.preprocess import RegexpTokenizer, WhitespaceTokenizer, \
@@ -12,9 +13,17 @@
UDPipeLemmatizer, StopwordsFilter, MostFrequentTokensFilter, NGrams
from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger
from orangecontrib.text.tests.test_preprocess import SF_LIST, SERVER_FILES
-from orangecontrib.text.widgets.owpreprocess import OWPreprocess, \
- TransformationModule, TokenizerModule, NormalizationModule, \
- FilteringModule, NgramsModule, POSTaggingModule
+from orangecontrib.text.widgets.owpreprocess import (
+ OWPreprocess,
+ TransformationModule,
+ TokenizerModule,
+ NormalizationModule,
+ FilteringModule,
+ NgramsModule,
+ POSTaggingModule,
+ LanguageComboBox,
+ _DEFAULT_NONE,
+)
@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
@@ -211,7 +220,7 @@ def test_migrate_settings_filter(self):
"use_df": False, "use_keep_n": False}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.filter",
- {"methods": [0, 2, 4], "language": "Finnish",
+ {"methods": [0, 2, 4], "language": "fi",
"sw_path": None, "sw_list": [],
"lx_path": None, "lx_list": [],
"pattern": "foo", "rel_start": 0.3,
@@ -262,6 +271,55 @@ def test_migrate_settings(self):
}
self.create_widget(OWPreprocess, stored_settings=settings)
+ def test_migrate_language_settings(self):
+ """Test migration to iso langauge codes"""
+ settings = {
+ "__version__": 3,
+ "storedsettings": {
+ "preprocessors": [
+ (
+ "preprocess.normalize",
+ {
+ "snowball_language": "French",
+ "udpipe_language": "German",
+ "lemmagen_language": "Slovenian",
+ },
+ ),
+ ("preprocess.filter", {"language": "Finnish"}),
+ ]
+ },
+ }
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ normalize_settings = widget.storedsettings["preprocessors"][0][1]
+ filter_settings = widget.storedsettings["preprocessors"][1][1]
+ self.assertEqual("Slovenian", normalize_settings["lemmagen_language"])
+ self.assertEqual("French", normalize_settings["snowball_language"])
+ self.assertEqual("German", normalize_settings["udpipe_language"])
+ self.assertEqual("fi", filter_settings["language"])
+
+ # NLTK uses Slovene instead of Slovenian; this is also the reason
+ # that the preprocess widget stored the language as Slovene before.
+ # check if it is mapped correctly
+ settings = {
+ "__version__": 3,
+ "storedsettings": {
+ "preprocessors": [("preprocess.filter", {"language": "Slovene"})]
+ },
+ }
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ filter_settings = widget.storedsettings["preprocessors"][0][1]
+ self.assertEqual("sl", filter_settings["language"])
+
+ settings = {
+ "__version__": 3,
+ "storedsettings": {
+ "preprocessors": [("preprocess.filter", {"language": _DEFAULT_NONE})]
+ },
+ }
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ filter_settings = widget.storedsettings["preprocessors"][0][1]
+ self.assertIsNone(filter_settings["language"])
+
class TestTransformationModule(WidgetTest):
def setUp(self):
@@ -522,7 +580,7 @@ def test_init(self):
def test_parameters(self):
params = {"methods": [FilteringModule.Stopwords],
- "language": "English", "sw_path": None, "lx_path": None,
+ "language": "en", "sw_path": None, "lx_path": None,
"sw_list": [], "lx_list": [],
"incl_num": False,
"pattern": FilteringModule.DEFAULT_PATTERN,
@@ -537,7 +595,7 @@ def test_set_parameters(self):
sw_path = RecentPath.create("Foo", [])
lx_path = RecentPath.create("Bar", [])
params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp],
- "language": "Finnish",
+ "language": "fi",
"sw_path": sw_path, "lx_path": lx_path,
"sw_list": [sw_path], "lx_list": [lx_path],
"incl_num": False,
@@ -581,10 +639,13 @@ def test_createinstance(self):
self.assertIsInstance(pp[1], MostFrequentTokensFilter)
def test_repr(self):
- self.assertEqual(str(self.editor),
- "Stopwords (Language: English, File: None)")
- params = {"methods": [FilteringModule.Lexicon,
- FilteringModule.Regexp]}
+ self.assertEqual(str(self.editor), "Stopwords (Language: English, File: None)")
+ params = self.editor.parameters()
+ params["language"] = None
+ self.editor.setParameters(params)
+ self.assertEqual(str(self.editor), "Stopwords (Language: None, File: None)")
+
+ params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp]}
self.editor.setParameters(params)
self.assertEqual(
str(self.editor),
@@ -685,5 +746,59 @@ def test_repr(self):
self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")
+class TestLanguageComboBox(WidgetTest):
+ def test_basic_setup(self):
+ mock = Mock()
+ cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], "fi", False, mock)
+ self.assertEqual(4, cb.count())
+ self.assertEqual(
+ ["English", "Finnish", "Slovenian", "Swedish"],
+ [cb.itemText(i) for i in range(cb.count())],
+ )
+ self.assertEqual("Finnish", cb.currentText())
+
+ def test_include_none(self):
+ mock = Mock()
+ cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], "fi", True, mock)
+ self.assertEqual(5, cb.count())
+ self.assertEqual(
+ [_DEFAULT_NONE, "English", "Finnish", "Slovenian", "Swedish"],
+ [cb.itemText(i) for i in range(cb.count())],
+ )
+ self.assertEqual("Finnish", cb.currentText())
+
+ # test with current item None
+ cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], None, True, mock)
+ self.assertEqual(5, cb.count())
+ self.assertEqual(
+ [_DEFAULT_NONE, "English", "Finnish", "Slovenian", "Swedish"],
+ [cb.itemText(i) for i in range(cb.count())],
+ )
+ self.assertEqual(_DEFAULT_NONE, cb.currentText())
+
+ def test_set_current_language(self):
+ mock = Mock()
+ cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], "fi", True, mock)
+ self.assertEqual("Finnish", cb.currentText())
+ cb.set_current_language("sl")
+ self.assertEqual("Slovenian", cb.currentText())
+ cb.set_current_language(None)
+ self.assertEqual(_DEFAULT_NONE, cb.currentText())
+
+ def test_change_item(self):
+ mock = Mock()
+ cb = LanguageComboBox(None, ["sl", "en", "sv", "fi"], "fi", True, mock)
+ self.assertEqual(
+ [_DEFAULT_NONE, "English", "Finnish", "Slovenian", "Swedish"],
+ [cb.itemText(i) for i in range(cb.count())],
+ )
+ mock.assert_not_called()
+ simulate.combobox_activate_item(cb, "Slovenian")
+ mock.assert_called_once_with("sl")
+ mock.reset_mock()
+ simulate.combobox_activate_item(cb, _DEFAULT_NONE)
+ mock.assert_called_once_with(None)
+
+
if __name__ == "__main__":
unittest.main()
From 3b5004fd42896270b5b8a0fcfa01b72eca12f6b3 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 17 Nov 2023 14:26:32 +0100
Subject: [PATCH 4/4] Keywords - Temporary solution for supported languages
---
orangecontrib/text/keywords/__init__.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py
index 16a9527dd..783ecad13 100644
--- a/orangecontrib/text/keywords/__init__.py
+++ b/orangecontrib/text/keywords/__init__.py
@@ -15,12 +15,16 @@
from orangecontrib.text import Corpus
from orangecontrib.text.keywords.mbert import mbert_keywords
from orangecontrib.text.keywords.rake import Rake
+from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.preprocess import StopwordsFilter
# all available languages for RAKE
from orangecontrib.text.vectorization import BowVectorizer
-RAKE_LANGUAGES = StopwordsFilter.supported_languages()
+
+# todo: refactor when refactoring language for keywords module
+# this is a temporary solution since supported_languages now returns lang ISO codes
+RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()]
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
"Arabic": "ar",