From 3597ccbeb533eb10929fc7119b6a29158606bec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Sun, 1 Mar 2020 12:59:20 +0100 Subject: [PATCH] Statistics widget --- .../text/widgets/icons/Statistics.svg | 99 +++ orangecontrib/text/widgets/owstatistics.py | 685 ++++++++++++++++++ .../text/widgets/tests/test_owstatistics.py | 439 +++++++++++ orangecontrib/text/widgets/utils/context.py | 28 + orangecontrib/text/widgets/utils/widgets.py | 65 ++ 5 files changed, 1316 insertions(+) create mode 100644 orangecontrib/text/widgets/icons/Statistics.svg create mode 100644 orangecontrib/text/widgets/owstatistics.py create mode 100644 orangecontrib/text/widgets/tests/test_owstatistics.py create mode 100644 orangecontrib/text/widgets/utils/context.py diff --git a/orangecontrib/text/widgets/icons/Statistics.svg b/orangecontrib/text/widgets/icons/Statistics.svg new file mode 100644 index 000000000..f5371108e --- /dev/null +++ b/orangecontrib/text/widgets/icons/Statistics.svg @@ -0,0 +1,99 @@ + + + + + + + + image/svg+xml + + + + + + + + + + + + + diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py new file mode 100644 index 000000000..5ef07b72f --- /dev/null +++ b/orangecontrib/text/widgets/owstatistics.py @@ -0,0 +1,685 @@ +import re +from copy import copy +from string import punctuation +from typing import Callable, List, Optional, Tuple + +import numpy as np +from AnyQt.QtCore import QSize +from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit + +from Orange.widgets import gui +from Orange.widgets.settings import ContextSetting +from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState +from Orange.widgets.utils.widgetpreview import WidgetPreview +from Orange.widgets.widget import Input, Output, OWWidget +from orangewidget.widget import Msg + +from orangecontrib.text import Corpus + +# those functions are implemented here since they are used in more statistics +from 
orangecontrib.text.preprocess import ( + LowercaseTransformer, + Preprocessor, + RegexpTokenizer, + UrlRemover, +) +from orangecontrib.text.widgets.utils import format_summary_details +from orangecontrib.text.widgets.utils.context import ( + AlmostPerfectContextHandler, +) + + +def num_words(document: str, callback: Callable) -> int: + """ + Return number of words in document-string. Word is every entity divided by + space, tab, newline. + """ + callback() + return len(document.split()) + + +def char_count(document: str, callback: Callable) -> int: + """ + Count number of alpha-numerical in document/string. + """ + callback() + return sum(c.isalnum() for c in document) + + +def digit_count(document: str, callback: Callable) -> int: + """ + Count number of digits in document/string. + """ + callback() + return sum(c.isdigit() for c in document) + + +def count_appearances( + document: str, characters: List[str], callback: Callable +) -> int: + """ + Count number of appearances of chars from `characters` list. + """ + callback() + # I think it supports the majority of main languages + # Y can be vowel too sometimes - it is not possible to distinguish + return sum(document.lower().count(c) for c in characters) + + +def preprocess_only_words(corpus: Corpus) -> Corpus: + """ + Apply the preprocessor that splits words, transforms them to lower case + (and removes punctuations). + + Parameters + ---------- + corpus + Corpus on which the preprocessor will be applied. + + Returns + ------- + Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams. 
+ """ + p = Preprocessor( + transformers=[LowercaseTransformer()], + # by default regexp keeps only words (no punctuations, no spaces) + tokenizer=RegexpTokenizer(), + ) + return p(corpus, inplace=False) + + +# every statistic returns a np.ndarray with statistics +# and list with variables names - it must be implemented here since some +# statistics in the future will have more variables + + +def words_count( + corpus: Corpus, _: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Count number of words in each document. + """ + corpus = preprocess_only_words(corpus) + # np.c_ makes column vector (ndarray) out of the list + # [1, 2, 3] -> [[1], [2], [3]] + return ( + np.c_[[num_words(d, callback) for d in corpus.documents]], + ["Word count"], + ) + + +def characters_count( + corpus: Corpus, _: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Count number of characters without spaces, newlines, tabs, ... + """ + return ( + np.c_[[char_count(d, callback) for d in corpus.documents]], + ["Character count"], + ) + + +def n_gram_count( + corpus: Corpus, _: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Count number of n-grams in every document + """ + + def ng_count(n_gram: List[str]): + callback() + return len(n_gram) + + return np.c_[list(map(ng_count, corpus.ngrams))], ["N-gram count"] + + +def word_density( + corpus: Corpus, _: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Computes word density as: word count / character count + 1 + """ + return ( + np.c_[ + [ + char_count(d, lambda: True) / num_words(d, callback) + for d in corpus.documents + ] + ], + ["Average word length"], + ) + + +def punctuation_count( + corpus: Corpus, _: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Count number of punctuation signs + """ + + def num_punctuation(document: str): + callback() + return sum(document.count(c) for c in punctuation) + + return ( + np.c_[list(map(num_punctuation, 
corpus.documents))], + ["Punctuation count"], + ) + + +def capital_count( + corpus: Corpus, _: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Count number of capital letters in documents + """ + + def num_capitals(document: str): + callback() + return sum(1 for c in document if c.isupper()) + + return ( + np.c_[list(map(num_capitals, corpus.documents))], + ["Capital letter count"], + ) + + +def vowel_count( + corpus: Corpus, vowels: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Count number of vowels in documents + """ + # comma separated string of vowels to list + vowels = [v.strip() for v in vowels.split(",")] + return ( + np.c_[ + [count_appearances(d, vowels, callback) for d in corpus.documents] + ], + ["Vowel count"], + ) + + +def consonant_count( + corpus: Corpus, consonants: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Count number of consonants in documents. Consonants are all alnum + characters except vowels and numbers + """ + # comma separated string of consonants to list + consonants = [v.strip() for v in consonants.split(",")] + return ( + np.c_[ + [ + count_appearances(d, consonants, callback) + for d in corpus.documents + ] + ], + ["Consonant count"], + ) + + +def per_cent_unique_words( + corpus: Corpus, _: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Ratio between unique words count and all words count + """ + corpus = preprocess_only_words(corpus) + + def perc_unique(tokens: str): + callback() + return len(set(tokens)) / len(tokens) + + return np.c_[list(map(perc_unique, corpus.tokens))], ["% unique words"] + + +def starts_with( + corpus: Corpus, prefix: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Number of words that starts with the string in `prefix`. 
+ """ + corpus = preprocess_only_words(corpus) + + def number_starts_with(tokens: List[str]): + callback() + return sum(t.startswith(prefix) for t in tokens) + + return ( + np.c_[list(map(number_starts_with, corpus.tokens))], + [f"Starts with {prefix}"], + ) + + +def ends_with( + corpus: Corpus, postfix: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Number of words that ends with the string in `postfix`. + """ + corpus = preprocess_only_words(corpus) + + def number_ends_with(tokens: List[str]): + callback() + return sum(t.endswith(postfix) for t in tokens) + + return ( + np.c_[list(map(number_ends_with, corpus.tokens))], + [f"Ends with {postfix}"], + ) + + +def contains( + corpus: Corpus, text: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Number of words that contains string in `text`. + """ + return ( + np.c_[ + [count_appearances(d, [text], callback) for d in corpus.documents] + ], + [f"Contains {text}"], + ) + + +def regex( + corpus: Corpus, expression: str, callback: Callable +) -> Tuple[np.ndarray, List[str]]: + """ + Count occurrences of pattern in `expression`. + """ + pattern = re.compile(expression) + + def number_regex(tokens: List[str]): + callback() + return sum(bool(pattern.match(t)) for t in tokens) + + return ( + np.c_[list(map(number_regex, corpus.tokens))], + [f"Regex {expression}"], + ) + + +def pos_tags( + corpus: Corpus, pos_tags: str, callback: Callable +) -> Optional[Tuple[np.ndarray, List[str]]]: + """ + Count number of specified pos tags in corpus + """ + p_tags = [v.strip().lower() for v in pos_tags.split(",")] + + def cust_count(tags): + callback() + tags = [t.lower() for t in tags] + return sum(tags.count(t) for t in p_tags) + + if corpus.pos_tags is None: + return None + return ( + np.c_[[cust_count(p) for p in corpus.pos_tags]], + [f"POS tags {pos_tags}"], + ) + + +class ComputeValue: + """ + Class which provides compute value functionality. 
It stores the function + that is used to compute values on new data table using this domain. + + Attributes + ---------- + function + Function that computes new values + pattern + Some statistics need additional parameter with the pattern + (e.g. starts with), for others it is set to empty string. + """ + + def __init__(self, function: Callable, pattern: str) -> None: + self.function = function + self.pattern = pattern + + def __call__(self, data: Corpus) -> np.ndarray: + """ + This function compute values on new table. + """ + # lambda is added as a placeholder for a callback. + return self.function(data, self.pattern, lambda: True)[0] + + +# the definition of all statistics used in this widget, if new statistic +# is required ad it to this list + +STATISTICS = [ + # (name of the statistics, function to compute, default value) + # if default value is None - text box is not required + ("Word count", words_count, None), + ("Character count", characters_count, None), + ("N-gram count", n_gram_count, None), + ("Average word length", word_density, None), + ("Punctuation count", punctuation_count, None), + ("Capital letter count", capital_count, None), + ("Vowel count", vowel_count, "a,e,i,o,u"), + ( + "Consonant count", + consonant_count, + "b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z", + ), + ("Per cent unique words", per_cent_unique_words, None), + ("Starts with", starts_with, ""), + ("Ends with", ends_with, ""), + ("Contains", contains, ""), + ("Regex", regex, ""), + ("POS tag", pos_tags, "NN,VV,JJ"), +] +STATISTICS_NAMES = list(list(zip(*STATISTICS))[0]) +STATISTICS_FUNCTIONS = list(list(zip(*STATISTICS))[1]) +STATISTICS_DEFAULT_VALUE = list(list(zip(*STATISTICS))[2]) + + +def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None: + """ + This function runs the computation for new features. + All results will be reported as a partial results. + + Parameters + ---------- + corpus + The corpus on which the computation is held. 
+ statistics + Tuple of statistic pairs to be computed: + (statistics id, string pattern) + state + State used to report progress and partial results. + """ + # callback is called for each corpus element statistics time + tick_values = iter(np.linspace(0, 100, len(corpus) * len(statistics))) + + def advance(): + state.set_progress_value(next(tick_values)) + + for s, patern in statistics: + fun = STATISTICS_FUNCTIONS[s] + result = fun(corpus, patern, advance) + if result is not None: + result = result + (ComputeValue(fun, patern),) + state.set_partial_result((s, patern, result)) + + +class OWStatistics(OWWidget, ConcurrentWidgetMixin): + name = "Statistics" + description = "Create new statistic variables for documents." + keywords = [] + icon = "icons/Statistics.svg" + + class Inputs: + corpus = Input("Corpus", Corpus) + + class Outputs: + corpus = Output("Corpus", Corpus) + + class Warning(OWWidget.Warning): + not_computed = Msg( + "{} statistics cannot be computed and is omitted from results." 
+ ) + + want_main_area = False + settingsHandler = AlmostPerfectContextHandler(0.9) + + # settings + default_rules = [(0, ""), (1, "")] # rules used to reset the active rules + active_rules: List[Tuple[int, str]] = ContextSetting(default_rules[:]) + # rules active at time of apply clicked + applied_rules: Optional[List[Tuple[int, str]]] = None + + result_dict = {} + + def __init__(self) -> None: + OWWidget.__init__(self) + ConcurrentWidgetMixin.__init__(self) + self.corpus = None + + # the list with combos from the widget + self.combos = [] + # the list with line edits from the widget + self.line_edits = [] + # the list of buttons in front of controls that removes them + self.remove_buttons = [] + + self._init_controls() + + def _init_controls(self) -> None: + """ Init all controls of the widget """ + self._init_statistics_box() + box = gui.hBox(self.controlArea) + gui.rubber(box) + gui.button( + box, + self, + "Apply", + autoDefault=False, + width=180, + callback=self.apply, + ) + + def _init_statistics_box(self) -> None: + """ + Init the statistics box in control area - place where used statistics + are listed, remove, and added. + """ + patternbox = gui.vBox(self.controlArea, box=True) + self.rules_box = rules_box = QGridLayout() + patternbox.layout().addLayout(self.rules_box) + box = gui.hBox(patternbox) + gui.button( + box, + self, + "+", + callback=self._add_row, + autoDefault=False, + flat=True, + minimumSize=(QSize(20, 20)), + ) + gui.rubber(box) + self.rules_box.setColumnMinimumWidth(1, 70) + self.rules_box.setColumnMinimumWidth(0, 10) + self.rules_box.setColumnStretch(0, 1) + self.rules_box.setColumnStretch(1, 1) + self.rules_box.setColumnStretch(2, 100) + rules_box.addWidget(QLabel("Feature"), 0, 1) + rules_box.addWidget(QLabel("Pattern"), 0, 2) + self.adjust_n_rule_rows() + + def adjust_n_rule_rows(self) -> None: + """ + Add or remove lines in statistics box if needed and fix the tab order. 
+ """ + + def _add_line(): + n_lines = len(self.combos) + 1 + + # add delete symbol + button = gui.button( + None, + self, + label="×", + flat=True, + height=20, + styleSheet="* {font-size: 16pt; color: silver}" + "*:hover {color: black}", + autoDefault=False, + callback=self._remove_row, + ) + button.setMinimumSize(QSize(12, 20)) + self.rules_box.addWidget(button, n_lines, 0) + self.remove_buttons.append(button) + + # add statistics type dropdown + combo = QComboBox() + combo.addItems(STATISTICS_NAMES) + combo.currentIndexChanged.connect(self._sync_edit_combo) + self.rules_box.addWidget(combo, n_lines, 1) + self.combos.append(combo) + + # add line edit for patern + line_edit = QLineEdit() + self.rules_box.addWidget(line_edit, n_lines, 2) + line_edit.textChanged.connect(self._sync_edit_line) + self.line_edits.append(line_edit) + + def _remove_line(): + self.combos.pop().deleteLater() + self.line_edits.pop().deleteLater() + self.remove_buttons.pop().deleteLater() + + def _fix_tab_order(): + # TODO: write it differently - check create class + for i, (r, c, l) in enumerate( + zip(self.active_rules, self.combos, self.line_edits) + ): + c.setCurrentIndex(r[0]) # update combo + l.setText(r[1]) # update line edit + if STATISTICS_DEFAULT_VALUE[r[0]] is not None: + l.setVisible(True) + else: + l.setVisible(False) + + n = len(self.active_rules) + while n > len(self.combos): + _add_line() + while len(self.combos) > n: + _remove_line() + _fix_tab_order() + + def _add_row(self) -> None: + """ Add a new row to the statistic box """ + self.active_rules.append((0, "")) + self.adjust_n_rule_rows() + + def _remove_row(self) -> None: + """ Removes the clicked row in the statistic box """ + remove_idx = self.remove_buttons.index(self.sender()) + del self.active_rules[remove_idx] + self.adjust_n_rule_rows() + + def _sync_edit_combo(self) -> None: + """ Update rules when combo value changed """ + combo = self.sender() + edit_index = self.combos.index(combo) + selected_i = 
combo.currentIndex() + default_value = STATISTICS_DEFAULT_VALUE[selected_i] + self.active_rules[edit_index] = ( + selected_i, + default_value or self.active_rules[edit_index][1], + ) + self.adjust_n_rule_rows() + + def _sync_edit_line(self) -> None: + """ Update rules when line edit value changed """ + line_edit = self.sender() + edit_index = self.line_edits.index(line_edit) + self.active_rules[edit_index] = ( + self.active_rules[edit_index][0], + line_edit.text(), + ) + + @Inputs.corpus + def set_data(self, corpus) -> None: + self.closeContext() + self.corpus = corpus + self.active_rules = self.default_rules[:] + self.openContext(corpus) + self.adjust_n_rule_rows() + self.result_dict = {} # empty computational results when new data + # reset old output - it also handle case with corpus == None + self.Outputs.corpus.send(None) + + # summary + if corpus: + self.info.set_input_summary( + len(corpus), format_summary_details(corpus) + ) + self.apply() + else: + self.info.set_input_summary(self.info.NoInput) + self.info.set_output_summary(self.info.NoOutput) + + def apply(self) -> None: + """ + This function is called when user click apply button. It starts + the computation. When computation is finished results are shown + on the output - on_done. 
+ """ + if self.corpus is None: + return + self.applied_rules = copy(self.active_rules) + self.cancel() # cancel task since user clicked apply again + rules_to_compute = [ + r for r in self.active_rules if r not in self.result_dict + ] + self.start(run, self.corpus, rules_to_compute) + + def on_exception(self, exception: Exception) -> None: + raise exception + + def on_partial_result( + self, result: Tuple[int, str, Tuple[np.ndarray, List[str], Callable]] + ) -> None: + statistic, patern, result = result + self.result_dict[(statistic, patern)] = result + + def on_done(self, result: None) -> None: + # join results + if self.corpus: + self.output_results() + + # remove unnecessary results from dict - it can happen that user + # already removes the statistic from gui but it is still computed + for k in list(self.result_dict.keys()): + if k not in self.active_rules: + del self.result_dict[k] + + def output_results(self) -> None: + self.Warning.not_computed.clear() + to_stack = [] + attributes = [] + comput_values = [] + not_computed = [] + for rule in self.applied_rules: + # check for safety reasons - in practice should not happen + if rule in self.result_dict: + res = self.result_dict[rule] + if res is None: + not_computed.append(STATISTICS_NAMES[rule[0]]) + else: + data, variables, comp_value = res + to_stack.append(data) + attributes += variables + comput_values.append(comp_value) + if not_computed: + self.Warning.not_computed(", ".join(not_computed)) + # here we will use extend_attributes function - this function add + # attributes to existing corpus so it must be copied first + # TODO: when change of pre-processing is finished change this function + # to have inplace parameter which is False by default, + # also I would prefer extend_attriubtes where you give variables + # instead of strings on input + new_corpus = self.corpus.copy() + if to_stack: + new_corpus.extend_attributes( + np.hstack(to_stack), attributes, compute_values=comput_values + ) + 
self.Outputs.corpus.send(new_corpus) + + # summary + self.info.set_output_summary( + len(new_corpus), format_summary_details(new_corpus) + ) + + +if __name__ == "__main__": + WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts")) diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py new file mode 100644 index 000000000..930376727 --- /dev/null +++ b/orangecontrib/text/widgets/tests/test_owstatistics.py @@ -0,0 +1,439 @@ +import unittest +from unittest.mock import Mock + +import numpy as np +import pkg_resources +from AnyQt.QtWidgets import QPushButton + +from Orange.data import Domain, StringVariable +from Orange.widgets.tests.base import WidgetTest +from orangecontrib.text import Corpus +from orangecontrib.text.tag import AveragedPerceptronTagger +from orangecontrib.text.widgets.owstatistics import ( + STATISTICS_NAMES, + OWStatistics, +) + + +class TestStatisticsWidget(WidgetTest): + def setUp(self) -> None: + self.widget = self.create_widget(OWStatistics) + self.book_data = Corpus.from_file("book-excerpts") + self._create_simple_data() + + def _create_simple_data(self) -> None: + """ + Creat a simple dataset with 4 documents. Save it to `self.corpus`. + """ + metas = np.array( + [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Duis viverra elit eu mi blandit, {et} sollicitudin nisi ", + " a porta\tleo. Duis vitae ultrices massa. Mauris ut pulvinar a", + "tortor. Class (aptent) taciti\nsociosqu ad lit1ora torquent per", + ] + ).reshape(-1, 1) + text_var = StringVariable("text") + domain = Domain([], metas=[text_var]) + self.corpus = Corpus( + domain, + X=np.empty((len(metas), 0)), + metas=metas, + text_features=[text_var], + ) + + def _set_feature(self, feature_name: str, value: str = ""): + """ + Set statistic which need to be computed by widget. It sets only one + statistics. 
+ + Parameters + ---------- + feature_name + The name of statistic + value + If statistic need a value (e.g. prefix) it is passed here. + """ + feature_index = STATISTICS_NAMES.index(feature_name) + self.widget.active_rules = [(feature_index, value)] + self.widget.adjust_n_rule_rows() + + def _compute_features(self, feature_name: str, value: str = "") -> Corpus: + """ + Send `self.corpus` to widget, set statistic which need bo be computed, + run the computation, and return widget output. + + Parameters + ---------- + feature_name + The name of the statistic, only one statistic is set + value + The value if statistic need it. + + Returns + ------- + Resulting corpus. + """ + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.wait_until_finished() + self._set_feature(feature_name, value) + self.widget.apply() + self.wait_until_finished() + res = self.get_output(self.widget.Outputs.corpus) + self.assertTupleEqual((len(self.corpus), 1), res.X.shape) + return res + + def test_send_data(self): + """ Test with basic data, and empty data """ + self.send_signal(self.widget.Inputs.corpus, self.book_data) + self.assertEqual(len(self.book_data), len(self.widget.corpus)) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.widget.corpus) + self.widget.apply() + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_words_count(self): + """ Test words count statistic """ + data = self._compute_features("Word count") + np.testing.assert_array_equal(data.X.flatten(), [8, 9, 11, 9]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_characters_count(self): + """ Test characters count statistic """ + data = self._compute_features("Character count") + np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def 
test_n_gram_count(self): + """ Test n-grams count statistic """ + data = self._compute_features("N-gram count") + np.testing.assert_array_equal(data.X.flatten(), [10, 12, 13, 12]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_word_density(self): + """ Test word density statistic """ + data = self._compute_features("Average word length") + np.testing.assert_array_almost_equal( + data.X.flatten(), [5.875, 4.888889, 4.363636, 5.666667] + ) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_punctuations_cont(self): + """ Test punctuations count statistic """ + data = self._compute_features("Punctuation count") + np.testing.assert_array_equal(data.X.flatten(), [2, 3, 2, 3]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_capitals_count(self): + """ Test capitals count statistic """ + data = self._compute_features("Capital letter count") + np.testing.assert_array_equal(data.X.flatten(), [1, 1, 2, 1]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_vowels_count(self): + """ Test vowels count statistic """ + data = self._compute_features("Vowel count", "a,e,i,o,u") + np.testing.assert_array_equal(data.X.flatten(), [19, 20, 23, 20]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_consonants_count(self): + """ Test consonants count statistic """ + data = self._compute_features( + "Consonant count", "b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z" + ) + np.testing.assert_array_equal(data.X.flatten(), [28, 24, 25, 30]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def 
test_per_cent_unique_words(self): + """ Test per-cent unique words statistic """ + data = self._compute_features("Per cent unique words") + np.testing.assert_array_almost_equal( + data.X.flatten(), [1, 1, 0.909091, 1] + ) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_starts_with(self): + """ Test starts with count statistic """ + data = self._compute_features("Starts with", "a") + np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 2, 2]) + + data = self._compute_features("Starts with", "ap") + np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_ends_with(self): + """ Test ends with count statistic """ + data = self._compute_features("Ends with", "t") + np.testing.assert_array_almost_equal(data.X.flatten(), [3, 3, 1, 2]) + + data = self._compute_features("Ends with", "et") + np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 0, 0]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_contains(self): + """ Test contains count statistic """ + data = self._compute_features("Contains", "t") + np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9]) + + data = self._compute_features("Contains", "et") + np.testing.assert_array_almost_equal(data.X.flatten(), [2, 1, 0, 0]) + + data = self._compute_features("Contains", "is") + np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0]) + + self.send_signal(self.widget.Inputs.corpus, None) + self.assertIsNone(self.get_output(self.widget.Outputs.corpus)) + + def test_regex(self): + """ Test regex statistic """ + # words that contains digit + data = self._compute_features("Regex", "\w*\d\w*") + np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1]) + + # words 
that contain "is"
+ """ + self.send_signal(self.widget.Inputs.corpus, self.corpus) + + wc_index = STATISTICS_NAMES.index("Word count") + starts_with_index = STATISTICS_NAMES.index("Starts with") + capital_counts_index = STATISTICS_NAMES.index("Capital letter count") + self.widget.active_rules = [ + (wc_index, ""), + (starts_with_index, "a"), + (capital_counts_index, ""), + ] + self.widget.adjust_n_rule_rows() + + self.widget.apply() + self.wait_until_finished() + res = self.get_output(self.widget.Outputs.corpus) + + self.assertTupleEqual((len(self.corpus), 3), res.X.shape) + np.testing.assert_array_almost_equal( + res.X[:, 0].flatten(), [8, 9, 11, 9] + ) + np.testing.assert_array_almost_equal( + res.X[:, 1].flatten(), [2, 0, 2, 2] + ) + np.testing.assert_array_almost_equal( + res.X[:, 2].flatten(), [1, 1, 2, 1] + ) + + def test_dictionary_statistics(self): + """ + Test remove statistic from the dictionary when they are not required + """ + self.send_signal(self.widget.Inputs.corpus, self.corpus) + + self.widget.active_rules = [ + (1, ""), + ] + self.widget.adjust_n_rule_rows() + self.widget.apply() + self.wait_until_finished() + + self.assertListEqual([(1, "")], list(self.widget.result_dict.keys())) + + self.widget.active_rules = [(1, ""), (2, "")] + self.widget.adjust_n_rule_rows() + self.widget.apply() + self.wait_until_finished() + + self.assertListEqual( + [(1, ""), (2, "")], list(self.widget.result_dict.keys()) + ) + + self.widget.active_rules = [(2, "")] + self.widget.adjust_n_rule_rows() + self.widget.apply() + self.wait_until_finished() + + self.assertListEqual([(2, "")], list(self.widget.result_dict.keys())) + + # dict should empty on new data + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertListEqual([], list(self.widget.result_dict.keys())) + + def test_context(self): + """ Test whether context correctly restore rules """ + rules = [(0, ""), (1, ""), (2, "")] + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.widget.active_rules = 
rules[:] + + self.send_signal(self.widget.Inputs.corpus, self.book_data) + self.assertListEqual([(0, ""), (1, "")], self.widget.active_rules) + + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertListEqual(rules, self.widget.active_rules) + + def test_compute_values(self): + """ Test compute values on new data """ + data = self._compute_features("Word count") + + computed = Corpus.from_table(data.domain, self.book_data) + self.assertEqual(data.domain, computed.domain) + self.assertTupleEqual((len(self.book_data), 1), computed.X.shape) + + def test_append_to_existing_X(self): + """ Test if new features are correctly attached to X matrix """ + data = Corpus.from_file("election-tweets-2016") + self.send_signal(self.widget.Inputs.corpus, data) + self.wait_until_finished() + statistics = self.get_output(self.widget.Outputs.corpus) + + self.assertTupleEqual( + (data.X.shape[0], data.X.shape[1] + 2), statistics.X.shape + ) + + def test_add_row(self): + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.wait_until_finished() + self.widget.active_rules = [] + self.widget.adjust_n_rule_rows() + add_button = [ + x + for x in self.widget.controlArea.findChildren(QPushButton) + if x.text() == "+" + ][0] + add_button.click() + self.assertListEqual([(0, "")], self.widget.active_rules) + + def test_remove_row(self): + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.widget.active_rules = [(0, "")] + self.widget.adjust_n_rule_rows() + self.assertListEqual([(0, "")], self.widget.active_rules) + + remove_button = [ + x + for x in self.widget.controlArea.findChildren(QPushButton) + if x.text() == "×" + ][0] + remove_button.click() + self.assertListEqual([], self.widget.active_rules) + + def test_input_summary(self): + """ Test correctness of the input summary """ + self.widget.info.set_input_summary = in_sum = Mock() + + self.send_signal(self.widget.Inputs.corpus, self.corpus) + in_sum.assert_called_with( + len(self.corpus), + "4 
instances, 1 variable\nFeatures: —\nTarget: —\nMetas: string " + "(not shown)", + ) + in_sum.reset_mock() + + self.send_signal(self.widget.Inputs.corpus, self.book_data) + in_sum.assert_called_with( + len(self.book_data), + "140 instances, 2 variables\nFeatures: —\nTarget: categorical\n" + "Metas: string (not shown)", + ) + in_sum.reset_mock() + + self.send_signal(self.widget.Inputs.corpus, None) + in_sum.assert_called_with(self.widget.info.NoInput) + + def test_output_summary(self): + """ Test correctness of the output summary""" + self.widget.info.set_output_summary = out_sum = Mock() + + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.wait_until_finished() + out_sum.assert_called_with( + len(self.corpus), + "4 instances, 3 variables\nFeatures: 2 numeric\nTarget: —\nMetas: " + "string (not shown)", + ) + out_sum.reset_mock() + + self.send_signal(self.widget.Inputs.corpus, self.book_data) + self.wait_until_finished() + out_sum.assert_called_with( + len(self.book_data), + "140 instances, 4 variables\nFeatures: 2 numeric\nTarget: " + "categorical\nMetas: string (not shown)", + ) + out_sum.reset_mock() + + self.send_signal(self.widget.Inputs.corpus, None) + self.wait_until_finished() + out_sum.assert_called_with(self.widget.info.NoOutput) + + def test_remove_function(self): + """ + This test will start to fail when version of Orange > 3.26.0 + When this tests fails: + - removes `format_summary_details` and `format_variables_string` from + utils.widget + - replace `format_summary_details` in statistics widget with the same + function from core orange + - set minimum orange version to 3.25 for the text add-on + """ + self.assertLessEqual( + pkg_resources.get_distribution("orange3").version, "3.26.0" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/orangecontrib/text/widgets/utils/context.py b/orangecontrib/text/widgets/utils/context.py new file mode 100644 index 000000000..fe383c866 --- /dev/null +++ 
b/orangecontrib/text/widgets/utils/context.py
@@ -0,0 +1,35 @@
+from Orange.widgets.settings import PerfectDomainContextHandler
+
+
+class AlmostPerfectContextHandler(PerfectDomainContextHandler):
+    """
+    A relaxed PerfectDomainContextHandler: a stored context is reused when
+    more than `share_domain_matches` (e.g. 0.9) of the new domain's variables
+    also appear in the context's domain. The position of variables
+    (attribute, meta, class_var) is not important since widgets that use
+    this handler do not use their values directly.
+
+    Attributes
+    ----------
+    share_domain_matches
+        The share of domain variables that need to match.
+    """
+    def __init__(self, share_domain_matches: float) -> None:
+        super().__init__()
+        self.share_domain_matches = share_domain_matches
+
+    def match(self, context, domain, attributes, class_vars, metas):
+        """Return PERFECT_MATCH when enough of the domain's variables appear
+        anywhere in the stored context, NO_MATCH otherwise."""
+        context_vars = context.attributes + context.class_vars + context.metas
+        domain_vars = attributes + class_vars + metas
+        matching_vars = [var for var in context_vars if var in domain_vars]
+
+        # An empty domain can never reach the required share; the explicit
+        # guard also avoids a ZeroDivisionError in the ratio below.
+        if not domain_vars:
+            return self.NO_MATCH
+        return (self.PERFECT_MATCH
+                if (len(matching_vars) / len(domain_vars)
+                    > self.share_domain_matches)
+                else self.NO_MATCH)
diff --git a/orangecontrib/text/widgets/utils/widgets.py b/orangecontrib/text/widgets/utils/widgets.py
index b06fcb5e4..effdb5c1f 100644
--- a/orangecontrib/text/widgets/utils/widgets.py
+++ b/orangecontrib/text/widgets/utils/widgets.py
@@ -7,6 +7,8 @@
                              QGridLayout, QCheckBox, QStackedLayout)
 from AnyQt.QtGui import QColor
 from AnyQt.QtCore import QDate, pyqtSignal, Qt, QSize
+from Orange.data import DiscreteVariable, ContinuousVariable, TimeVariable, \
+    StringVariable
 from Orange.widgets.gui import OWComponent, hBox
 from Orange.widgets import settings
 
@@ -565,3 +567,66 @@ def load_provider(self, path_to_file):
         self.resource_path = path_to_file
         self.valueChanged.emit(self.model_path, self.resource_path)
+
+
+def format_variables_string(variables):
+    """
+    A function that formats the descriptive part of the input/output summary for
+    either features, targets or metas of the input dataset.
+
+    :param variables: Features, targets or metas of the input dataset
+    :return: A formatted string
+    """
+    if not variables:
+        return '—'
+
+    agg = []
+    for var_type_name, var_type in [('categorical', DiscreteVariable),
+                                    ('numeric', ContinuousVariable),
+                                    ('time', TimeVariable),
+                                    ('string', StringVariable)]:
+        # Disable pylint here because a `TimeVariable` is also a
+        # `ContinuousVariable`, and should be labelled as such. That is why
+        # it is necessary to check the type this way instead of using
+        # `isinstance`, which would fail in the above case
+        var_type_list = [v for v in variables if type(v) is var_type]  # pylint: disable=unidiomatic-typecheck
+        if var_type_list:
+            not_shown = ' (not shown)' if issubclass(var_type, StringVariable)\
+                else ''
+            agg.append((f'{var_type_name}{not_shown}', len(var_type_list)))
+
+    attrs, counts = list(zip(*agg))
+    if len(attrs) > 1:
+        var_string = [f'{i} {j}' for i, j in zip(counts, attrs)]
+        var_string = f'{sum(counts)} ({", ".join(var_string)})'
+    elif counts[0] == 1:
+        var_string = attrs[0]
+    else:
+        var_string = f'{counts[0]} {attrs[0]}'
+    return var_string
+
+
+def format_summary_details(data):
+    """
+    A function that forms the entire descriptive part of the input/output
+    summary.
+
+    :param data: A dataset
+    :type data: Orange.data.Table
+    :return: A formatted string
+    """
+    def _plural(number):
+        return 's' * (number != 1)
+
+    details = ''
+    if data:
+        features = format_variables_string(data.domain.attributes)
+        targets = format_variables_string(data.domain.class_vars)
+        metas = format_variables_string(data.domain.metas)
+
+        n_features = len(data.domain.variables) + len(data.domain.metas)
+        details = \
+            f'{len(data)} instance{_plural(len(data))}, ' \
+            f'{n_features} variable{_plural(n_features)}\n' \
+            f'Features: {features}\nTarget: {targets}\nMetas: {metas}'
+
+    return details