From ffc33fe25b5feefcaa5e5b6cf91b371fcb4b1d02 Mon Sep 17 00:00:00 2001
From: Primoz Godec
Date: Fri, 13 Aug 2021 12:04:23 +0200
Subject: [PATCH] Score documents: fix word preprocessing
---
.../text/widgets/owscoredocuments.py | 19 +++++---
.../widgets/tests/test_owscoredocuments.py | 44 ++++++++++++++++++-
2 files changed, 55 insertions(+), 8 deletions(-)
diff --git a/orangecontrib/text/widgets/owscoredocuments.py b/orangecontrib/text/widgets/owscoredocuments.py
index cac8e932f..68a8495fd 100644
--- a/orangecontrib/text/widgets/owscoredocuments.py
+++ b/orangecontrib/text/widgets/owscoredocuments.py
@@ -10,7 +10,6 @@
Domain,
StringVariable,
ContinuousVariable,
- DiscreteVariable,
)
from Orange.util import wrap_callback
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
@@ -116,6 +115,16 @@ def _preprocess_words(
with words preprocessors that change words (e.g. normalization) must
be applied to words too.
"""
+ # workaround to preprocess words
+ # TODO: preprocessors currently work only on a corpus; if more cases
+ # like this appear, consider implementing preprocessors that accept a
+ # list of strings
+ words_feature = StringVariable("words")
+ words_c = Corpus(
+ Domain([], metas=[words_feature]),
+ metas=np.array([[w] for w in words]),
+ text_features=[words_feature]
+ )
# only transformers and normalizers preprocess on the word level
pps = [
pp
@@ -123,15 +132,14 @@ def _preprocess_words(
if isinstance(pp, (BaseTransformer, BaseNormalizer))
]
for i, pp in enumerate(pps):
- # TODO: _preprocess is protected make it public
- words = [pp._preprocess(w) for w in words]
+ words_c = pp(words_c)
callback((i + 1) / len(pps))
- return words
+ return [w[0] for w in words_c.tokens]
def _run(
corpus: Corpus,
- words: Table,
+ words: List[str],
scoring_methods: List[str],
aggregation: str,
additional_params: dict,
@@ -155,7 +163,6 @@ def _run(
state
TaskState for reporting the task status and giving partial results
"""
-
def callback(i: float) -> None:
state.set_progress_value(i * 100)
if state.is_interruption_requested():
diff --git a/orangecontrib/text/widgets/tests/test_owscoredocuments.py b/orangecontrib/text/widgets/tests/test_owscoredocuments.py
index 02aed9cd8..52d24baa2 100644
--- a/orangecontrib/text/widgets/tests/test_owscoredocuments.py
+++ b/orangecontrib/text/widgets/tests/test_owscoredocuments.py
@@ -1,10 +1,11 @@
import unittest
from math import isclose
-from typing import List, Union
+from typing import List
from unittest.mock import patch
import numpy as np
from AnyQt.QtCore import Qt
+from Orange.util import dummy_callback
from Orange.widgets.tests.base import WidgetTest
from Orange.data import Table, StringVariable, Domain, ContinuousVariable
from Orange.widgets.tests.utils import simulate
@@ -13,7 +14,10 @@
from orangecontrib.text import Corpus
from orangecontrib.text import preprocess
from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
-from orangecontrib.text.widgets.owscoredocuments import OWScoreDocuments
+from orangecontrib.text.widgets.owscoredocuments import (
+ OWScoreDocuments,
+ _preprocess_words,
+)
def embedding_mock(_, corpus, __):
@@ -297,6 +301,42 @@ def test_sort_table(self):
data = [model.data(model.index(i, 0)) for i in range(model.rowCount())]
self.assertListEqual(data, natural_sorted(self.corpus.titles)[::-1])
+ def test_preprocess_words(self):
+ corpus = Corpus.from_file("book-excerpts")
+ words = [
+ "House",
+ "dóctor",
+ "boy",
+ "way",
+ "Rum https://google.com",
+ "<p>abracadabra</p>",
+ ]
+
+ pp_list = [
+ preprocess.LowercaseTransformer(),
+ preprocess.StripAccentsTransformer(),
+ preprocess.UrlRemover(),
+ preprocess.HtmlTransformer(),
+ ]
+ for p in pp_list:
+ corpus = p(corpus)
+
+ self.assertListEqual(
+ ["house", "doctor", "boy", "way", "rum", "abracadabra"],
+ _preprocess_words(corpus, words, dummy_callback),
+ )
+
+ words = ["House", "dóctor", "boys", "way", "Rum"]
+
+ pp_list = [preprocess.SnowballStemmer()]
+ for p in pp_list:
+ corpus = p(corpus)
+
+ self.assertListEqual(
+ ["hous", "doctor", "boy", "way", "rum"],
+ _preprocess_words(corpus, words, dummy_callback),
+ )
+
if __name__ == "__main__":
unittest.main()