diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py
index dbe931dc5..ebde42410 100644
--- a/orangecontrib/text/widgets/owstatistics.py
+++ b/orangecontrib/text/widgets/owstatistics.py
@@ -3,7 +3,7 @@
 from copy import copy
 from itertools import groupby
 from string import punctuation
-from typing import Callable, List, Optional, Tuple, Union, Generator, Iterator
+from typing import Callable, List, Optional, Tuple, Union, Generator, Iterator, Dict
 
 import numpy as np
 from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit, QSizePolicy
@@ -14,6 +14,7 @@ from Orange.widgets.utils.widgetpreview import WidgetPreview
 from Orange.widgets.widget import Input, Output, OWWidget
 from nltk import tokenize
+from orangecanvas.gui.utils import disconnected
 from orangewidget.widget import Msg
 
 from orangecontrib.text import Corpus
@@ -73,28 +74,6 @@ def count_appearances(
     return sum(d.lower().count(c) for c in characters for d in document)
 
 
-def preprocess_only_words(corpus: Corpus) -> Corpus:
-    """
-    Apply the preprocessor that splits words, transforms them to lower case
-    (and removes punctuations).
-
-    Parameters
-    ----------
-    corpus
-        Corpus on which the preprocessor will be applied.
-
-    Returns
-    -------
-    Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
-    """
-    p = PreprocessorList(
-        [LowercaseTransformer(),
-         # by default regexp keeps only words (no punctuations, no spaces)
-         RegexpTokenizer()]
-    )
-    return p(corpus)
-
-
 def get_source(corpus: Corpus, source: str) -> Union[List[str], Iterator[List[str]]]:
     """
     Extract source from corpus according to source variable:
@@ -252,7 +231,6 @@ def per_cent_unique_words(
     Ratio between unique words count and all words count
     """
     assert source == Sources.TOKENS
-    corpus = preprocess_only_words(corpus)
 
     def perc_unique(tokens: str):
         callback()
@@ -270,7 +248,6 @@ def starts_with(
     Number of words that starts with the string in `prefix`.
     """
     assert source == Sources.TOKENS
-    corpus = preprocess_only_words(corpus)
 
     def number_starts_with(tokens: List[str]):
         callback()
@@ -289,7 +266,6 @@ def ends_with(
     Number of words that ends with the string in `postfix`.
     """
     assert source == Sources.TOKENS
-    corpus = preprocess_only_words(corpus)
 
     def number_ends_with(tokens: List[str]):
         callback()
@@ -393,7 +369,6 @@ def lix(
     https://en.wikipedia.org/wiki/Lix_(readability_test)
     """
     assert source == Sources.TOKENS
-    corpus = preprocess_only_words(corpus)
     tokenizer = tokenize.PunktSentenceTokenizer()
 
     def lix_index(document, tokens):
@@ -426,18 +401,21 @@ class ComputeValue:
     pattern
         Some statistics need additional parameter with the pattern
        (e.g. starts with), for others it is set to empty string.
+    source
+        Part of the corpus used for computation: either tokens/ngrams or whole documents
     """
 
-    def __init__(self, function: Callable, pattern: str) -> None:
+    def __init__(self, function: Callable, pattern: str, source: str) -> None:
         self.function = function
         self.pattern = pattern
+        self.source = source
 
     def __call__(self, data: Corpus) -> np.ndarray:
         """
        This function compute values on new table.
         """
         # lambda is added as a placeholder for a callback.
-        return self.function(data, self.pattern, lambda: True)[0]
+        return self.function(data, self.pattern, self.source, lambda: True)[0]
 
     def __eq__(self, other):
         return self.function == other.function and self.pattern == other.pattern
@@ -455,7 +433,7 @@ def __hash__(self):
     ("Word count", words_count, None, (Sources.DOCUMENTS,)),
     ("Character count", characters_count, None, (Sources.DOCUMENTS, Sources.TOKENS)),
     ("N-gram count", n_gram_count, None, (Sources.TOKENS,)),
-    ("Average word length", average_word_len, None, (Sources.DOCUMENTS,)),  # todo: discuss
+    ("Average word length", average_word_len, None, (Sources.DOCUMENTS,)),
     ("Punctuation count", punctuation_count, None, (Sources.DOCUMENTS,)),
     ("Capital letter count", capital_count, None, (Sources.DOCUMENTS,)),
     ("Vowel count", vowel_count, "a,e,i,o,u", (Sources.DOCUMENTS,)),
@@ -505,7 +483,7 @@ def advance():
         fun = STATISTICS_FUNCTIONS[s]
         result = fun(corpus, patern, source, advance)
         if result is not None:
-            result = result + (ComputeValue(fun, patern),)
+            result = result + (ComputeValue(fun, patern, source),)
         state.set_partial_result((s, patern, source, result))
 
 
@@ -530,6 +508,7 @@ class Warning(OWWidget.Warning):
     want_main_area = False
     mainArea_width_height_ratio = None
 
+    settings_version = 2
     # rules used to reset the active rules
     default_rules = [(0, "", STATISTICS[0][-1][0]), (1, "", STATISTICS[0][-1][0])]
     active_rules: List[Tuple[int, str, str]] = Setting(default_rules[:])
@@ -633,10 +612,10 @@ def _add_line():
        def _remove_line():
            self.statistics_combos.pop().deleteLater()
            self.line_edits.pop().deleteLater()
+            self.source_combos.pop().deleteLater()
            self.remove_buttons.pop().deleteLater()
 
        def _fix_tab_order():
-            # TODO: write it differently - check create class
            for i, (r, c, l, s) in enumerate(
                zip(self.active_rules, self.statistics_combos, self.line_edits, self.source_combos)
            ):
@@ -646,9 +625,10 @@ def _fix_tab_order():
                l.setVisible(True)
            else:
                l.setVisible(False)
-            s.clear()
-            s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
-            s.setCurrentText(r[2])
+            with disconnected(s.currentIndexChanged, self._sync_edit_source_combo):
+                s.clear()
+                s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
+                s.setCurrentText(r[2])
 
        n = len(self.active_rules)
        while n > len(self.statistics_combos):
@@ -673,7 +653,7 @@ def _sync_edit_combo(self) -> None:
        combo = self.sender()
        edit_index = self.statistics_combos.index(combo)
        selected_i = combo.currentIndex()
-        default_value = STATISTICS_DEFAULT_VALUE[selected_i]
+        default_value = STATISTICS_DEFAULT_VALUE[selected_i] or ""
        default_source = STATISTICS_DEFAULT_SOURCES[selected_i][0]
        self.active_rules[edit_index] = (selected_i, default_value, default_source)
        self.adjust_n_rule_rows()
@@ -682,18 +662,14 @@ def _sync_edit_line(self) -> None:
        """ Update rules when line edit value changed """
        line_edit = self.sender()
        edit_index = self.line_edits.index(line_edit)
-        self.active_rules[edit_index] = (
-            self.active_rules[edit_index][0],
-            line_edit.text(),
-            STATISTICS_DEFAULT_SOURCES[edit_index][0]
-        )
+        arules = self.active_rules[edit_index]
+        self.active_rules[edit_index] = (arules[0], line_edit.text(), arules[2])
 
    def _sync_edit_source_combo(self) -> None:
-        """ Update rules when line edit value changed """
+        """ Update rules when the source value changes """
        combo = self.sender()
        edit_index = self.source_combos.index(combo)
        value = combo.currentText()
-        print(value)
        arules = self.active_rules[edit_index]
        self.active_rules[edit_index] = (arules[0], arules[1], value)
 
@@ -766,6 +742,21 @@ def output_results(self) -> None:
         )
         self.Outputs.corpus.send(new_corpus)
 
+    @classmethod
+    def migrate_settings(cls, settings: Dict, version: int):
+        def def_source(idx):
+            """Return the source whose behaviour is most similar to the previous version"""
+            if STATISTICS_NAMES[idx] == "Regex":
+                # regex was working on tokens in the previous version
+                return Sources.TOKENS
+            # all other statistics that allow both sources were working on documents
+            return STATISTICS_DEFAULT_SOURCES[idx][0]
+
+        if version < 2:
+            if "active_rules" in settings:
+                new_rules = [(r, v, def_source(r)) for r, v in settings["active_rules"]]
+                settings["active_rules"] = new_rules
+
 
 if __name__ == "__main__":
     WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts"))
diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py
index e802d9622..e3082e406 100644
--- a/orangecontrib/text/widgets/tests/test_owstatistics.py
+++ b/orangecontrib/text/widgets/tests/test_owstatistics.py
@@ -7,6 +7,12 @@
 from Orange.widgets.tests.base import WidgetTest
 from Orange.widgets.tests.utils import simulate
 from orangecontrib.text import Corpus
+from orangecontrib.text.preprocess import (
+    PreprocessorList,
+    LowercaseTransformer,
+    RegexpTokenizer,
+    StopwordsFilter,
+)
 from orangecontrib.text.tag import AveragedPerceptronTagger
 from orangecontrib.text.widgets.owstatistics import (
     STATISTICS_NAMES,
@@ -57,14 +63,12 @@ def _set_feature(
         """
         simulate.combobox_activate_item(self.widget.statistics_combos[0], feature_name)
         self.widget.line_edits[0].setText(value)
-        print(self.widget.active_rules, feature_name, value, source)
         simulate.combobox_activate_item(self.widget.source_combos[0], source)
-        print(self.widget.active_rules)
         for button in self.widget.remove_buttons[1:]:
             button.click()
 
     def _compute_features(
-            self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS
+        self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS
     ) -> Corpus:
         """
         Send `self.corpus` to widget, set statistic which need bo be computed,
@@ -118,7 +122,6 @@ def test_characters_count(self):
         self.send_signal(self.widget.Inputs.corpus, None)
         self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
 
-    # todo: make different preprocessing and the test all tokens statistics again
     def test_n_gram_count(self):
         """ Test n-grams count statistic """
@@ -176,14 +179,14 @@ def test_per_cent_unique_words(self):
         """ Test per-cent unique words statistic """
         data = self._compute_features("Per cent unique terms", source=Sources.TOKENS)
         np.testing.assert_array_almost_equal(
-            data.X.flatten(), [1, 1, 0.909091, 1]
+            data.X.flatten(), [1, 1, 0.84615, 1], decimal=5
         )
 
         with self.corpus.unlocked():
-            self.corpus[1][-1] = ""
+            self.corpus[1][-1] = " "
         data = self._compute_features("Per cent unique terms", source=Sources.TOKENS)
         np.testing.assert_array_almost_equal(
-            data.X.flatten(), [1, np.nan, 0.909091, 1]
+            data.X.flatten(), [1, np.nan, 0.84615, 1], decimal=5
         )
 
         self.send_signal(self.widget.Inputs.corpus, None)
@@ -202,10 +205,10 @@ def test_starts_with(self):
     def test_ends_with(self):
         """ Test ends with count statistic """
-        data = self._compute_features("Ends with", "t")
+        data = self._compute_features("Ends with", "t", Sources.TOKENS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [3, 3, 1, 2])
 
-        data = self._compute_features("Ends with", "et")
+        data = self._compute_features("Ends with", "et", Sources.TOKENS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 0, 0])
 
         self.send_signal(self.widget.Inputs.corpus, None)
@@ -213,32 +216,50 @@ def test_ends_with(self):
     def test_contains(self):
         """ Test contains count statistic """
-        data = self._compute_features("Contains", "t")
+        data = self._compute_features("Contains", "t", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
 
-        data = self._compute_features("Contains", "et")
+        data = self._compute_features("Contains", "et", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [2, 1, 0, 0])
 
-        data = self._compute_features("Contains", "is")
+        data = self._compute_features("Contains", "is", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
 
+        data = self._compute_features("Contains", "t", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
+
+        data = self._compute_features("Contains", " ", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
         self.send_signal(self.widget.Inputs.corpus, None)
         self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
 
     def test_regex(self):
         """ Test regex statistic """
         # words that contain digit
-        data = self._compute_features("Regex", r"\w*\d\w*")
+        data = self._compute_features("Regex", r"\w*\d\w*", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
 
-        # words that contain digit
-        data = self._compute_features("Regex", r"\w*is\w*")
+        # words that contain "is"
+        data = self._compute_features("Regex", r"\w*is\w*", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
 
         # count specific n-gram
-        data = self._compute_features("Regex", r"ipsum\ dolor")
+        data = self._compute_features("Regex", r"ipsum\ dolor", Sources.DOCUMENTS)
         np.testing.assert_array_almost_equal(data.X.flatten(), [1, 0, 0, 0])
 
+        # words that contain digit
+        data = self._compute_features("Regex", r"\w*\d\w*", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
+
+        # words that contain "is"
+        data = self._compute_features("Regex", r"\w*is\w*", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+
+        # count specific n-gram
+        data = self._compute_features("Regex", r"ipsum\ dolor", Sources.TOKENS)
+        np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
         self.send_signal(self.widget.Inputs.corpus, None)
         self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
 
@@ -249,7 +270,7 @@ def test_pos(self):
         - test with corpus that has pos tags
         """
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self._set_feature("POS tag", "NN")
+        self._set_feature("POS tag", "NN", Sources.TOKENS)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -260,7 +281,7 @@ def test_pos(self):
         result = tagger(self.corpus)
         self.send_signal(self.widget.Inputs.corpus, result)
-        self._set_feature("POS tag", "NN")
+        self._set_feature("POS tag", "NN", Sources.TOKENS)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -275,7 +296,7 @@ def test_yule(self):
         - test with corpus that has pos tags
         """
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self._set_feature("Yule's I")
+        self._set_feature("Yule's I", source=Sources.TOKENS)
         self.widget.apply()
         self.wait_until_finished()
         res = self.get_output(self.widget.Outputs.corpus)
@@ -288,7 +309,7 @@ def test_yule(self):
         result = tagger(self.corpus)
         self.send_signal(self.widget.Inputs.corpus, result)
-        self._set_feature("Yule's I")
self._set_feature("Yule's I", source=Sources.TOKENS) self.widget.apply() self.wait_until_finished() res = self.get_output(self.widget.Outputs.corpus) @@ -304,7 +325,7 @@ def test_lix(self): with self.corpus.unlocked(): self.corpus[1][-1] = "simple. simple." self.send_signal(self.widget.Inputs.corpus, self.corpus) - self._set_feature("LIX index") + self._set_feature("LIX index", source=Sources.TOKENS) self.widget.apply() self.wait_until_finished() res = self.get_output(self.widget.Outputs.corpus) @@ -312,6 +333,40 @@ def test_lix(self): # the second document will have lower complexity than the first one self.assertLess(res[1][0], res[0][0]) + def test_stats_different_preprocessing(self): + pp = [LowercaseTransformer(), RegexpTokenizer(), StopwordsFilter(language="en")] + pp = PreprocessorList(pp) + self.corpus = pp(self.corpus) + + data = self._compute_features("Character count", "", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [47, 44, 46, 51]) + + data = self._compute_features("N-gram count", "", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [8, 9, 9, 9]) + + data = self._compute_features("Per cent unique terms", "", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 1, 1]) + + # none start with the capital because of Lowercase preprocessor + data = self._compute_features("Starts with", "L", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0]) + + data = self._compute_features("Starts with", "a", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 0, 2]) + + data = self._compute_features("Ends with", "a", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [0, 1, 2, 1]) + + # non contain comma since we use RegexP preprocessor + data = self._compute_features("Contains", ",", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0]) + + data = self._compute_features("Contains", "a", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [2, 2, 6, 5]) + + data = self._compute_features("Regex", "{e", Sources.TOKENS) + np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0]) + def test_statistics_combination(self): """ Testing three statistics at same time and see if column concatenated @@ -323,9 +378,9 @@ def test_statistics_combination(self): starts_with_index = STATISTICS_NAMES.index("Starts with") capital_counts_index = STATISTICS_NAMES.index("Capital letter count") self.widget.active_rules = [ - (wc_index, ""), - (starts_with_index, "a"), - (capital_counts_index, ""), + (wc_index, "", Sources.DOCUMENTS), + (starts_with_index, "a", Sources.TOKENS), + (capital_counts_index, "", Sources.DOCUMENTS), ] self.widget.adjust_n_rule_rows() @@ -350,43 +405,44 @@ def test_dictionary_statistics(self): """ self.send_signal(self.widget.Inputs.corpus, self.corpus) - self.widget.active_rules = [ - (1, ""), - ] + self.widget.active_rules = [(1, "", Sources.DOCUMENTS)] self.widget.adjust_n_rule_rows() self.widget.apply() self.wait_until_finished() - self.assertListEqual([(1, None)], list(self.widget.result_dict.keys())) + expected = [(1, "", Sources.DOCUMENTS)] + self.assertListEqual(expected, list(self.widget.result_dict.keys())) - self.widget.active_rules = [(1, ""), (2, "")] + self.widget.active_rules = [(1, "", Sources.DOCUMENTS), (2, "", Sources.TOKENS)] self.widget.adjust_n_rule_rows() self.widget.apply() self.wait_until_finished() - self.assertListEqual( - [(1, ""), (2, None)], 
-            list(self.widget.result_dict.keys())
-        )
+        expected = [(1, "", Sources.DOCUMENTS), (2, "", Sources.TOKENS)]
+        self.assertListEqual(expected, list(self.widget.result_dict.keys()))
 
-        self.widget.active_rules = [(2, "")]
+        self.widget.active_rules = [(2, "", Sources.TOKENS)]
         self.widget.adjust_n_rule_rows()
         self.widget.apply()
         self.wait_until_finished()
-        self.assertListEqual([(2, None)], list(self.widget.result_dict.keys()))
+        expected = [(2, "", Sources.TOKENS)]
+        self.assertListEqual(expected, list(self.widget.result_dict.keys()))
 
         # dict should empty on new data
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.assertListEqual([], list(self.widget.result_dict.keys()))
 
     def test_settings(self):
-        """ Test whether context correctly restore rules """
-        rules = [(0, ""), (1, ""), (2, None)]
+        """Test whether the context correctly restores rules"""
+        doc, tk = Sources.DOCUMENTS, Sources.TOKENS
+        rules = [(0, "", doc), (1, "", doc), (2, "", tk)]
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.widget.active_rules = rules[:]
 
         self.send_signal(self.widget.Inputs.corpus, self.book_data)
-        self.assertListEqual([(0, ""), (1, ""), (2, None)], self.widget.active_rules)
+        expected = [(0, "", doc), (1, "", doc), (2, "", tk)]
+        self.assertListEqual(expected, self.widget.active_rules)
 
     def test_compute_values(self):
         """ Test compute values on new data """
@@ -418,13 +474,13 @@ def test_add_row(self):
             if x.text() == "+"
         ][0]
         add_button.click()
-        self.assertListEqual([(0, "")], self.widget.active_rules)
+        self.assertListEqual([(0, "", Sources.DOCUMENTS)], self.widget.active_rules)
 
     def test_remove_row(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.widget.active_rules = [(0, "")]
+        self.widget.active_rules = [(0, "", Sources.DOCUMENTS)]
         self.widget.adjust_n_rule_rows()
-        self.assertListEqual([(0, "")], self.widget.active_rules)
+        self.assertListEqual([(0, "", Sources.DOCUMENTS)], self.widget.active_rules)
 
         remove_button = [
             x
@@ -434,6 +490,32 @@ def test_remove_row(self):
         remove_button.click()
         self.assertListEqual([], self.widget.active_rules)
 
+    def test_migrate_settings(self):
+        vals = [""] * 6 + ["a,e", "b,c", "", "a", "b", "c", r"\w*is", "NN,VV", "", ""]
+        settings = {"__version__": 1, "active_rules": list(zip(range(17), vals))}
+        widget = self.create_widget(OWStatistics, stored_settings=settings)
+        self.send_signal(self.widget.Inputs.corpus, self.corpus, widget=widget)
+
+        expected = [
+            (0, "", Sources.DOCUMENTS),
+            (1, "", Sources.DOCUMENTS),
+            (2, "", Sources.TOKENS),
+            (3, "", Sources.DOCUMENTS),
+            (4, "", Sources.DOCUMENTS),
+            (5, "", Sources.DOCUMENTS),
+            (6, "a,e", Sources.DOCUMENTS),
+            (7, "b,c", Sources.DOCUMENTS),
+            (8, "", Sources.TOKENS),
+            (9, "a", Sources.TOKENS),
+            (10, "b", Sources.TOKENS),
+            (11, "c", Sources.DOCUMENTS),
+            (12, r"\w*is", Sources.TOKENS),
+            (13, "NN,VV", Sources.TOKENS),
+            (14, "", Sources.TOKENS),
+            (15, "", Sources.TOKENS),
+        ]
+        self.assertListEqual(expected, widget.active_rules)
+
 
 if __name__ == "__main__":
     unittest.main()
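--
A quick illustration of the new settings migration: `migrate_settings` extends each
stored version-1 rule (statistic_index, pattern) into the version-2 triple
(statistic_index, pattern, source), choosing the source that preserves the old
behaviour. The sketch below is self-contained and illustrative only; the Sources,
STATISTICS_NAMES, and STATISTICS_DEFAULT_SOURCES stand-ins carry toy values, not
the widget's real tables.

    from typing import Dict

    class Sources:  # stand-in for the widget's Sources constants
        DOCUMENTS = "Documents"
        TOKENS = "Tokens"

    # Toy subset of the widget's module-level tables (illustrative only).
    STATISTICS_NAMES = ["Word count", "Regex"]
    STATISTICS_DEFAULT_SOURCES = [
        [Sources.DOCUMENTS],                  # Word count: documents only
        [Sources.DOCUMENTS, Sources.TOKENS],  # Regex: both sources allowed
    ]

    def migrate_settings(settings: Dict, version: int) -> None:
        def def_source(idx: int) -> str:
            # Regex used to run on tokens; other statistics that allow both
            # sources used to run on documents, so keep that behaviour.
            if STATISTICS_NAMES[idx] == "Regex":
                return Sources.TOKENS
            return STATISTICS_DEFAULT_SOURCES[idx][0]

        if version < 2 and "active_rules" in settings:
            settings["active_rules"] = [
                (rule, value, def_source(rule))
                for rule, value in settings["active_rules"]
            ]

    # Old two-element rules gain a behaviour-preserving source.
    old = {"__version__": 1, "active_rules": [(0, ""), (1, r"\w*is")]}
    migrate_settings(old, old["__version__"])
    assert old["active_rules"] == [
        (0, "", Sources.DOCUMENTS),
        (1, r"\w*is", Sources.TOKENS),
    ]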