Skip to content

Commit

Permalink
Merge pull request #1024 from PrimozGodec/language-filter
Browse files Browse the repository at this point in the history
[ENH] Filter - Use ISO language in StopwordsFilter
  • Loading branch information
VesnaT authored Nov 24, 2023
2 parents fd9b246 + 3b5004f commit b4367d5
Show file tree
Hide file tree
Showing 9 changed files with 299 additions and 45 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/annotate_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def _hypergeom_clusters(

corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.1)):
StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)

transformed_corpus = BowVectorizer().transform(corpus_)
Expand Down
6 changes: 5 additions & 1 deletion orangecontrib/text/keywords/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@
from orangecontrib.text import Corpus
from orangecontrib.text.keywords.mbert import mbert_keywords
from orangecontrib.text.keywords.rake import Rake
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.preprocess import StopwordsFilter

# all available languages for RAKE
from orangecontrib.text.vectorization import BowVectorizer

RAKE_LANGUAGES = StopwordsFilter.supported_languages()

# todo: refactor when refactoring language for keywords module
# this is a temporary solution since supported_languages now returns lang ISO codes
RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()]
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
"Arabic": "ar",
Expand Down
3 changes: 3 additions & 0 deletions orangecontrib/text/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
"he": "Hebrew",
"hi": "Hindi",
"hi-Latn": "Hindi (latin)",
# https://en.wikipedia.org/wiki/Hinglish - since it doesn't really have an ISO
# code, we made one up to be able to use it for stopwords (supported in NLTK)
"hi_eng": "Hinglish",
"hr": "Croatian",
"ht": "Haitian",
"hu": "Hungarian",
Expand Down
73 changes: 58 additions & 15 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from itertools import compress
from typing import List, Callable
from typing import List, Callable, Optional, Set
import os
import re

Expand All @@ -11,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor

Expand Down Expand Up @@ -71,27 +72,69 @@ class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
""" Remove tokens present in NLTK's language specific lists or a file. """
name = 'Stopwords'

@wait_nltk_data
def __init__(self, language='English', path: str = None):
# NLTK uses different language names for some languages
LANG2NLTK = {"Slovenian": "Slovene"}
NLTK2LANG = {v: k for k, v in LANG2NLTK.items()}

def __init__(
self,
language: Optional[str] = "en",
path: Optional[str] = None,
):
"""
Parameters
----------
language
The language code in ISO format for NLTK stopwords selection.
If None, only words from file are used (NLTK stopwords are not used).
path
Path to a file with stopwords; its words are used if the path is given.
The file must contain a newline-separated list of words.
"""
super().__init__()
FileWordListMixin.__init__(self, path)
self.__stopwords = set(x.strip() for x in
stopwords.words(language.lower())) \
if language else []
self.__stopwords = set()
if language:
# transform iso code to NLTK's language name
language = ISO2LANG[language]
language = self.LANG2NLTK.get(language, language).lower()
self.__stopwords = set(x.strip() for x in stopwords.words(language))

@staticmethod
def lang_to_iso(language: str) -> str:
    """
    Translate an NLTK language name into its ISO language code.

    The name is first mapped through ``NLTK2LANG`` (names without an entry
    there are used unchanged) and the result is then looked up in
    ``LANG2ISO``. This accounts for NLTK naming Slovenian differently.

    Parameters
    ----------
    language
        NLTK language name

    Returns
    -------
    ISO language code for input language
    """
    canonical_name = StopwordsFilter.NLTK2LANG.get(language, language)
    return LANG2ISO[canonical_name]

@staticmethod
@wait_nltk_data
def supported_languages():
# get NLTK list of stopwords
stopwords_listdir = []
def supported_languages() -> Set[str]:
"""
List all languages supported by NLTK
Returns
-------
Set of all languages supported by NLTK
"""
try:
stopwords_listdir = [file for file in
os.listdir(stopwords._get_root())
if file.islower()]
return {
StopwordsFilter.lang_to_iso(file.title())
for file in os.listdir(stopwords._get_root())
if file.islower()
}
except LookupError: # when no NLTK data is available
pass

return sorted(file.capitalize() for file in stopwords_listdir)
return set()

def _check(self, token):
return token not in self.__stopwords and token not in self._lexicon
Expand Down
34 changes: 29 additions & 5 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,17 @@

from orangecontrib.text import preprocess, tag
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.preprocess import BASE_TOKENIZER, PreprocessorList
from orangecontrib.text.preprocess.normalize import file_to_language, \
file_to_name, language_to_name, UDPipeModels
from orangecontrib.text.preprocess import (
BASE_TOKENIZER,
PreprocessorList,
StopwordsFilter,
)
from orangecontrib.text.preprocess.normalize import (
file_to_language,
file_to_name,
language_to_name,
UDPipeModels,
)


SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles"
Expand Down Expand Up @@ -430,7 +438,7 @@ def _check(self, token):
self.assertEqual(filtered, ['a'])

def test_stopwords(self):
f = preprocess.StopwordsFilter('english')
f = preprocess.StopwordsFilter("en")
self.assertFalse(f._check('a'))
self.assertTrue(f._check('filter'))
with self.corpus.unlocked():
Expand All @@ -440,7 +448,7 @@ def test_stopwords(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_stopwords_slovene(self):
f = preprocess.StopwordsFilter('slovene')
f = preprocess.StopwordsFilter("sl")
self.assertFalse(f._check('in'))
self.assertTrue(f._check('abeceda'))
with self.corpus.unlocked():
Expand All @@ -449,6 +457,22 @@ def test_stopwords_slovene(self):
self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_supported_languages(self):
    """Supported languages are returned as a set that includes common ISO codes."""
    langs = preprocess.StopwordsFilter.supported_languages()
    self.assertIsInstance(langs, set)
    # Only a few of the most important languages are checked, so that the test
    # stays robust against languages newly introduced by NLTK.
    for iso_code in ("en", "sl", "fr", "sv", "fi", "de"):
        self.assertIn(iso_code, langs)

def test_lang_to_iso(self):
    """NLTK language names map to ISO codes, including NLTK's "Slovene"."""
    expected_codes = (("English", "en"), ("Slovene", "sl"))
    for nltk_name, iso_code in expected_codes:
        self.assertEqual(iso_code, StopwordsFilter.lang_to_iso(nltk_name))

def test_lexicon(self):
f = tempfile.NamedTemporaryFile(delete=False)
f.write(b'filter\n')
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/owannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ def onDeleteWidget(self):

corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.1)):
StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)

transformed_corpus = BowVectorizer().transform(corpus_)
Expand Down
87 changes: 76 additions & 11 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from typing import Dict, Optional, List, Callable, Tuple, Type, Union
from typing import Dict, Optional, List, Callable, Tuple, Type, Union, Iterable
from types import SimpleNamespace
import os
import random
import pkg_resources

from AnyQt.QtCore import Qt, pyqtSignal
from AnyQt.QtCore import Qt, pyqtSignal, QModelIndex
from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \
QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \
QDoubleSpinBox, QFileDialog, QAbstractSpinBox
Expand All @@ -24,6 +24,7 @@
from Orange.widgets.widget import Input, Output, Msg, Message

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
Expand Down Expand Up @@ -63,6 +64,57 @@ def __init__(self, master: BaseEditor, items: List[str], value: str,
self.currentTextChanged.connect(callback)


class LanguageComboBox(QComboBox):
    """A combo box for selecting language.

    Each language item shows the name found in ``ISO2LANG`` and carries its
    ISO code as item data; the optional none item carries ``None``.
    """

    def __init__(
        self,
        parent: Optional[BaseEditor],
        items: Iterable[str],
        value: Optional[str],
        include_none: bool,
        callback: Callable,
    ):
        """
        Parameters
        ----------
        parent
            Combo box's parent widget
        items
            Combo box's languages (items) as ISO codes.
        value
            Box's initial value (as an ISO code).
        include_none
            Boolean indicating whether to include the none option at the start
            of the list
        callback
            Called with the item data of the current item (an ISO code or
            None) whenever the current index changes.
        """
        super().__init__(parent)
        self.setMinimumWidth(80)
        # Store the callback before wiring the signal so the slot can never
        # run while the attribute is still missing.
        self.callback = callback
        self.__add_items(items, include_none)
        # The initial value is selected before the signal is connected, so
        # initialisation itself does not invoke the callback.
        self.set_current_language(value)
        self.currentIndexChanged.connect(self.__index_changed)

    def __add_items(self, items: Iterable[str], include_none: bool):
        """Fill the box with language names sorted by name, ISO code as data."""
        if include_none:
            self.addItem(_DEFAULT_NONE, None)
        for itm in sorted(items, key=ISO2LANG.get):
            self.addItem(ISO2LANG[itm], itm)

    def __index_changed(self, index: int):
        """Forward the item data at the new index to the callback."""
        # currentIndexChanged delivers the integer index of the new item
        self.callback(self.itemData(index))

    def set_current_language(self, iso_language: Optional[str]):
        """
        Set current element of dropdown from ISO language code.

        Parameters
        ----------
        iso_language
            The ISO language code of element to be selected.
        """
        # the result of findData is passed to setCurrentIndex unchanged
        index = self.findData(iso_language)
        self.setCurrentIndex(index)


class UDPipeComboBox(QComboBox):
def __init__(self, master: BaseEditor, value: str, default: str,
callback: Callable):
Expand Down Expand Up @@ -570,7 +622,7 @@ class FilteringModule(MultipleMethodModule):
MostFreq: MostFrequentTokensFilter,
PosTag: PosTagFilter}
DEFAULT_METHODS = [Stopwords]
DEFAULT_LANG = "English"
DEFAULT_LANG = "en"
DEFAULT_NONE = None
DEFAULT_INCL_NUM = False
DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \
Expand All @@ -597,9 +649,12 @@ def __init__(self, parent=None, **kwargs):
self.__pos_tag = self.DEFAULT_POS_TAGS
self.__invalidated = False

self.__combo = ComboBox(
self, [_DEFAULT_NONE] + StopwordsFilter.supported_languages(),
self.__sw_lang, self.__set_language
self.__combo = LanguageComboBox(
self,
StopwordsFilter.supported_languages(),
self.__sw_lang,
True,
self.__set_language,
)
self.__sw_loader = FileLoader()
self.__sw_loader.set_file_list()
Expand Down Expand Up @@ -755,10 +810,10 @@ def setParameters(self, params: Dict):
self.__set_tags(params.get("pos_tags", self.DEFAULT_POS_TAGS))
self.__invalidated = False

def __set_language(self, language: str):
def __set_language(self, language: Optional[str]):
if self.__sw_lang != language:
self.__sw_lang = language
self.__combo.setCurrentText(language)
self.__combo.set_current_language(language)
self.changed.emit()
if self.Stopwords in self.methods:
self.edited.emit()
Expand Down Expand Up @@ -899,8 +954,8 @@ def __repr__(self):
texts = []
for method in self.methods:
if method == self.Stopwords:
append = f"Language: {self.__sw_lang}, " \
f"File: {_to_abspath(self.__sw_file)}"
language = ISO2LANG[self.__sw_lang]
append = f"Language: {language}, File: {_to_abspath(self.__sw_file)}"
elif method == self.Lexicon:
append = f"File: {_to_abspath(self.__lx_file)}"
elif method == self.Numbers:
Expand Down Expand Up @@ -1026,7 +1081,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess,
priority = 200
keywords = "preprocess text, text"

settings_version = 3
settings_version = 4

class Inputs:
corpus = Input("Corpus", Corpus)
Expand Down Expand Up @@ -1320,6 +1375,16 @@ def str_into_paths(label):
del pp_settings["start"]
del pp_settings["end"]

# before version 4 languages were saved as full-word language strings
if version < 4:
preprocessors = settings["storedsettings"]["preprocessors"]
for pp_name, pp in preprocessors:
if pp_name == "preprocess.filter" and "language" in pp:
if pp["language"] == _DEFAULT_NONE:
pp["language"] = None
else:
pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])


if __name__ == "__main__":
from Orange.widgets.utils.widgetpreview import WidgetPreview
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/tests/test_owannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

def preprocess(corpus: Corpus) -> Corpus:
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
corpus = pp(corpus)
corpus = BowVectorizer().transform(corpus)
return add_embedding(corpus, 4)
Expand Down
Loading

0 comments on commit b4367d5

Please sign in to comment.