diff --git a/orangecontrib/text/widgets/owscoredocuments.py b/orangecontrib/text/widgets/owscoredocuments.py index cac8e932f..68a8495fd 100644 --- a/orangecontrib/text/widgets/owscoredocuments.py +++ b/orangecontrib/text/widgets/owscoredocuments.py @@ -10,7 +10,6 @@ Domain, StringVariable, ContinuousVariable, - DiscreteVariable, ) from Orange.util import wrap_callback from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState @@ -116,6 +115,16 @@ def _preprocess_words( with words preprocessors that change words (e.g. normalization) must be applied to words too. """ + # workaround to preprocess words + # TODO: currently preprocessors work only on corpus, when there will be more + # cases like this think about implementation of preprocessors for a list + # of strings + words_feature = StringVariable("words") + words_c = Corpus( + Domain([], metas=[words_feature]), + metas=np.array([[w] for w in words]), + text_features=[words_feature] + ) # only transformers and normalizers preprocess on the word level pps = [ pp @@ -123,15 +132,14 @@ def _preprocess_words( if isinstance(pp, (BaseTransformer, BaseNormalizer)) ] for i, pp in enumerate(pps): - # TODO: _preprocess is protected make it public - words = [pp._preprocess(w) for w in words] + words_c = pp(words_c) callback((i + 1) / len(pps)) - return words + return [w[0] for w in words_c.tokens] def _run( corpus: Corpus, - words: Table, + words: List[str], scoring_methods: List[str], aggregation: str, additional_params: dict, @@ -155,7 +163,6 @@ def _run( state TaskState for reporting the task status and giving partial results """ - def callback(i: float) -> None: state.set_progress_value(i * 100) if state.is_interruption_requested(): diff --git a/orangecontrib/text/widgets/tests/test_owscoredocuments.py b/orangecontrib/text/widgets/tests/test_owscoredocuments.py index 02aed9cd8..52d24baa2 100644 --- a/orangecontrib/text/widgets/tests/test_owscoredocuments.py +++ b/orangecontrib/text/widgets/tests/test_owscoredocuments.py @@ -1,10 +1,11 @@ import unittest from math import isclose -from typing import List, Union +from typing import List from unittest.mock import patch import numpy as np from AnyQt.QtCore import Qt +from Orange.util import dummy_callback from Orange.widgets.tests.base import WidgetTest from Orange.data import Table, StringVariable, Domain, ContinuousVariable from Orange.widgets.tests.utils import simulate @@ -13,7 +14,10 @@ from orangecontrib.text import Corpus from orangecontrib.text import preprocess from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder -from orangecontrib.text.widgets.owscoredocuments import OWScoreDocuments +from orangecontrib.text.widgets.owscoredocuments import ( + OWScoreDocuments, + _preprocess_words, +) def embedding_mock(_, corpus, __): @@ -297,6 +301,42 @@ def test_sort_table(self): data = [model.data(model.index(i, 0)) for i in range(model.rowCount())] self.assertListEqual(data, natural_sorted(self.corpus.titles)[::-1]) + def test_preprocess_words(self): + corpus = Corpus.from_file("book-excerpts") + words = [ + "House", + "dóctor", + "boy", + "way", + "Rum https://google.com", + "

abracadabra

", + ] + + pp_list = [ + preprocess.LowercaseTransformer(), + preprocess.StripAccentsTransformer(), + preprocess.UrlRemover(), + preprocess.HtmlTransformer(), + ] + for p in pp_list: + corpus = p(corpus) + + self.assertListEqual( + ["house", "doctor", "boy", "way", "rum", "abracadabra"], + _preprocess_words(corpus, words, dummy_callback), + ) + + words = ["House", "dóctor", "boys", "way", "Rum"] + + pp_list = [preprocess.SnowballStemmer()] + for p in pp_list: + corpus = p(corpus) + + self.assertListEqual( + ["hous", "doctor", "boy", "way", "rum"], + _preprocess_words(corpus, words, dummy_callback), + ) + if __name__ == "__main__": unittest.main()