From 3597ccbeb533eb10929fc7119b6a29158606bec7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Sun, 1 Mar 2020 12:59:20 +0100
Subject: [PATCH] Statistics widget
---
.../text/widgets/icons/Statistics.svg | 99 +++
orangecontrib/text/widgets/owstatistics.py | 685 ++++++++++++++++++
.../text/widgets/tests/test_owstatistics.py | 439 +++++++++++
orangecontrib/text/widgets/utils/context.py | 28 +
orangecontrib/text/widgets/utils/widgets.py | 65 ++
5 files changed, 1316 insertions(+)
create mode 100644 orangecontrib/text/widgets/icons/Statistics.svg
create mode 100644 orangecontrib/text/widgets/owstatistics.py
create mode 100644 orangecontrib/text/widgets/tests/test_owstatistics.py
create mode 100644 orangecontrib/text/widgets/utils/context.py
diff --git a/orangecontrib/text/widgets/icons/Statistics.svg b/orangecontrib/text/widgets/icons/Statistics.svg
new file mode 100644
index 000000000..f5371108e
--- /dev/null
+++ b/orangecontrib/text/widgets/icons/Statistics.svg
@@ -0,0 +1,99 @@
+
+
diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py
new file mode 100644
index 000000000..5ef07b72f
--- /dev/null
+++ b/orangecontrib/text/widgets/owstatistics.py
@@ -0,0 +1,685 @@
+import re
+from copy import copy
+from string import punctuation
+from typing import Callable, List, Optional, Tuple
+
+import numpy as np
+from AnyQt.QtCore import QSize
+from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit
+
+from Orange.widgets import gui
+from Orange.widgets.settings import ContextSetting
+from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
+from Orange.widgets.utils.widgetpreview import WidgetPreview
+from Orange.widgets.widget import Input, Output, OWWidget
+from orangewidget.widget import Msg
+
+from orangecontrib.text import Corpus
+
+# those functions are implemented here since they are used in more statistics
+from orangecontrib.text.preprocess import (
+ LowercaseTransformer,
+ Preprocessor,
+ RegexpTokenizer,
+ UrlRemover,
+)
+from orangecontrib.text.widgets.utils import format_summary_details
+from orangecontrib.text.widgets.utils.context import (
+ AlmostPerfectContextHandler,
+)
+
+
+def num_words(document: str, callback: Callable) -> int:
+ """
+    Return number of words in document-string. A word is every entity
+    separated by whitespace (space, tab, newline).
+ """
+ callback()
+ return len(document.split())
+
+
+def char_count(document: str, callback: Callable) -> int:
+ """
+    Count number of alpha-numeric characters in document/string.
+ """
+ callback()
+ return sum(c.isalnum() for c in document)
+
+
+def digit_count(document: str, callback: Callable) -> int:
+ """
+ Count number of digits in document/string.
+ """
+ callback()
+ return sum(c.isdigit() for c in document)
+
+
+def count_appearances(
+ document: str, characters: List[str], callback: Callable
+) -> int:
+ """
+ Count number of appearances of chars from `characters` list.
+ """
+ callback()
+ # I think it supports the majority of main languages
+ # Y can be vowel too sometimes - it is not possible to distinguish
+ return sum(document.lower().count(c) for c in characters)
+
+
+def preprocess_only_words(corpus: Corpus) -> Corpus:
+ """
+ Apply the preprocessor that splits words, transforms them to lower case
+    (and removes punctuation).
+
+ Parameters
+ ----------
+ corpus
+ Corpus on which the preprocessor will be applied.
+
+ Returns
+ -------
+ Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
+ """
+ p = Preprocessor(
+ transformers=[LowercaseTransformer()],
+ # by default regexp keeps only words (no punctuations, no spaces)
+ tokenizer=RegexpTokenizer(),
+ )
+ return p(corpus, inplace=False)
+
+
+# every statistic returns a np.ndarray with statistics
+# and list with variables names - it must be implemented here since some
+# statistics in the future will have more variables
+
+
+def words_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of words in each document.
+ """
+ corpus = preprocess_only_words(corpus)
+ # np.c_ makes column vector (ndarray) out of the list
+ # [1, 2, 3] -> [[1], [2], [3]]
+ return (
+ np.c_[[num_words(d, callback) for d in corpus.documents]],
+ ["Word count"],
+ )
+
+
+def characters_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of characters without spaces, newlines, tabs, ...
+ """
+ return (
+ np.c_[[char_count(d, callback) for d in corpus.documents]],
+ ["Character count"],
+ )
+
+
+def n_gram_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of n-grams in every document
+ """
+
+ def ng_count(n_gram: List[str]):
+ callback()
+ return len(n_gram)
+
+ return np.c_[list(map(ng_count, corpus.ngrams))], ["N-gram count"]
+
+
+def word_density(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Computes average word length as: character count / word count
+ """
+ return (
+ np.c_[
+ [
+ char_count(d, lambda: True) / num_words(d, callback)
+ for d in corpus.documents
+ ]
+ ],
+ ["Average word length"],
+ )
+
+
+def punctuation_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of punctuation signs
+ """
+
+ def num_punctuation(document: str):
+ callback()
+ return sum(document.count(c) for c in punctuation)
+
+ return (
+ np.c_[list(map(num_punctuation, corpus.documents))],
+ ["Punctuation count"],
+ )
+
+
+def capital_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of capital letters in documents
+ """
+
+ def num_capitals(document: str):
+ callback()
+ return sum(1 for c in document if c.isupper())
+
+ return (
+ np.c_[list(map(num_capitals, corpus.documents))],
+ ["Capital letter count"],
+ )
+
+
+def vowel_count(
+ corpus: Corpus, vowels: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of vowels in documents
+ """
+ # comma separated string of vowels to list
+ vowels = [v.strip() for v in vowels.split(",")]
+ return (
+ np.c_[
+ [count_appearances(d, vowels, callback) for d in corpus.documents]
+ ],
+ ["Vowel count"],
+ )
+
+
+def consonant_count(
+ corpus: Corpus, consonants: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of consonants in documents. Consonants are all alnum
+ characters except vowels and numbers
+ """
+ # comma separated string of consonants to list
+ consonants = [v.strip() for v in consonants.split(",")]
+ return (
+ np.c_[
+ [
+ count_appearances(d, consonants, callback)
+ for d in corpus.documents
+ ]
+ ],
+ ["Consonant count"],
+ )
+
+
+def per_cent_unique_words(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Ratio between unique words count and all words count
+ """
+ corpus = preprocess_only_words(corpus)
+
+ def perc_unique(tokens: str):
+ callback()
+ return len(set(tokens)) / len(tokens)
+
+ return np.c_[list(map(perc_unique, corpus.tokens))], ["% unique words"]
+
+
+def starts_with(
+ corpus: Corpus, prefix: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Number of words that start with the string in `prefix`.
+ """
+ corpus = preprocess_only_words(corpus)
+
+ def number_starts_with(tokens: List[str]):
+ callback()
+ return sum(t.startswith(prefix) for t in tokens)
+
+ return (
+ np.c_[list(map(number_starts_with, corpus.tokens))],
+ [f"Starts with {prefix}"],
+ )
+
+
+def ends_with(
+ corpus: Corpus, postfix: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Number of words that end with the string in `postfix`.
+ """
+ corpus = preprocess_only_words(corpus)
+
+ def number_ends_with(tokens: List[str]):
+ callback()
+ return sum(t.endswith(postfix) for t in tokens)
+
+ return (
+ np.c_[list(map(number_ends_with, corpus.tokens))],
+ [f"Ends with {postfix}"],
+ )
+
+
+def contains(
+ corpus: Corpus, text: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Number of occurrences of the string in `text` in each document.
+ """
+ return (
+ np.c_[
+ [count_appearances(d, [text], callback) for d in corpus.documents]
+ ],
+ [f"Contains {text}"],
+ )
+
+
+def regex(
+ corpus: Corpus, expression: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Count tokens that match the regex pattern in `expression`.
+ """
+ pattern = re.compile(expression)
+
+ def number_regex(tokens: List[str]):
+ callback()
+ return sum(bool(pattern.match(t)) for t in tokens)
+
+ return (
+ np.c_[list(map(number_regex, corpus.tokens))],
+ [f"Regex {expression}"],
+ )
+
+
+def pos_tags(
+ corpus: Corpus, pos_tags: str, callback: Callable
+) -> Optional[Tuple[np.ndarray, List[str]]]:
+ """
+ Count number of specified pos tags in corpus
+ """
+ p_tags = [v.strip().lower() for v in pos_tags.split(",")]
+
+ def cust_count(tags):
+ callback()
+ tags = [t.lower() for t in tags]
+ return sum(tags.count(t) for t in p_tags)
+
+ if corpus.pos_tags is None:
+ return None
+ return (
+ np.c_[[cust_count(p) for p in corpus.pos_tags]],
+ [f"POS tags {pos_tags}"],
+ )
+
+
+class ComputeValue:
+ """
+ Class which provides compute value functionality. It stores the function
+ that is used to compute values on new data table using this domain.
+
+ Attributes
+ ----------
+ function
+ Function that computes new values
+ pattern
+ Some statistics need additional parameter with the pattern
+ (e.g. starts with), for others it is set to empty string.
+ """
+
+ def __init__(self, function: Callable, pattern: str) -> None:
+ self.function = function
+ self.pattern = pattern
+
+ def __call__(self, data: Corpus) -> np.ndarray:
+ """
+        Compute values for a new data table.
+ """
+ # lambda is added as a placeholder for a callback.
+ return self.function(data, self.pattern, lambda: True)[0]
+
+
+# the definition of all statistics used in this widget, if a new statistic
+# is required add it to this list
+
+STATISTICS = [
+ # (name of the statistics, function to compute, default value)
+ # if default value is None - text box is not required
+ ("Word count", words_count, None),
+ ("Character count", characters_count, None),
+ ("N-gram count", n_gram_count, None),
+ ("Average word length", word_density, None),
+ ("Punctuation count", punctuation_count, None),
+ ("Capital letter count", capital_count, None),
+ ("Vowel count", vowel_count, "a,e,i,o,u"),
+ (
+ "Consonant count",
+ consonant_count,
+ "b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z",
+ ),
+ ("Per cent unique words", per_cent_unique_words, None),
+ ("Starts with", starts_with, ""),
+ ("Ends with", ends_with, ""),
+ ("Contains", contains, ""),
+ ("Regex", regex, ""),
+ ("POS tag", pos_tags, "NN,VV,JJ"),
+]
+STATISTICS_NAMES = list(list(zip(*STATISTICS))[0])
+STATISTICS_FUNCTIONS = list(list(zip(*STATISTICS))[1])
+STATISTICS_DEFAULT_VALUE = list(list(zip(*STATISTICS))[2])
+
+
+def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
+ """
+ This function runs the computation for new features.
+ All results will be reported as a partial results.
+
+ Parameters
+ ----------
+ corpus
+ The corpus on which the computation is held.
+ statistics
+ Tuple of statistic pairs to be computed:
+ (statistics id, string pattern)
+ state
+ State used to report progress and partial results.
+ """
+    # callback is called once per (document, statistic) pair
+ tick_values = iter(np.linspace(0, 100, len(corpus) * len(statistics)))
+
+ def advance():
+ state.set_progress_value(next(tick_values))
+
+ for s, patern in statistics:
+ fun = STATISTICS_FUNCTIONS[s]
+ result = fun(corpus, patern, advance)
+ if result is not None:
+ result = result + (ComputeValue(fun, patern),)
+ state.set_partial_result((s, patern, result))
+
+
+class OWStatistics(OWWidget, ConcurrentWidgetMixin):
+ name = "Statistics"
+ description = "Create new statistic variables for documents."
+ keywords = []
+ icon = "icons/Statistics.svg"
+
+ class Inputs:
+ corpus = Input("Corpus", Corpus)
+
+ class Outputs:
+ corpus = Output("Corpus", Corpus)
+
+ class Warning(OWWidget.Warning):
+ not_computed = Msg(
+ "{} statistics cannot be computed and is omitted from results."
+ )
+
+ want_main_area = False
+ settingsHandler = AlmostPerfectContextHandler(0.9)
+
+ # settings
+ default_rules = [(0, ""), (1, "")] # rules used to reset the active rules
+ active_rules: List[Tuple[int, str]] = ContextSetting(default_rules[:])
+ # rules active at time of apply clicked
+ applied_rules: Optional[List[Tuple[int, str]]] = None
+
+ result_dict = {}
+
+ def __init__(self) -> None:
+ OWWidget.__init__(self)
+ ConcurrentWidgetMixin.__init__(self)
+ self.corpus = None
+
+ # the list with combos from the widget
+ self.combos = []
+ # the list with line edits from the widget
+ self.line_edits = []
+ # the list of buttons in front of controls that removes them
+ self.remove_buttons = []
+
+ self._init_controls()
+
+ def _init_controls(self) -> None:
+ """ Init all controls of the widget """
+ self._init_statistics_box()
+ box = gui.hBox(self.controlArea)
+ gui.rubber(box)
+ gui.button(
+ box,
+ self,
+ "Apply",
+ autoDefault=False,
+ width=180,
+ callback=self.apply,
+ )
+
+ def _init_statistics_box(self) -> None:
+ """
+ Init the statistics box in control area - place where used statistics
+        are listed, removed, and added.
+ """
+ patternbox = gui.vBox(self.controlArea, box=True)
+ self.rules_box = rules_box = QGridLayout()
+ patternbox.layout().addLayout(self.rules_box)
+ box = gui.hBox(patternbox)
+ gui.button(
+ box,
+ self,
+ "+",
+ callback=self._add_row,
+ autoDefault=False,
+ flat=True,
+ minimumSize=(QSize(20, 20)),
+ )
+ gui.rubber(box)
+ self.rules_box.setColumnMinimumWidth(1, 70)
+ self.rules_box.setColumnMinimumWidth(0, 10)
+ self.rules_box.setColumnStretch(0, 1)
+ self.rules_box.setColumnStretch(1, 1)
+ self.rules_box.setColumnStretch(2, 100)
+ rules_box.addWidget(QLabel("Feature"), 0, 1)
+ rules_box.addWidget(QLabel("Pattern"), 0, 2)
+ self.adjust_n_rule_rows()
+
+ def adjust_n_rule_rows(self) -> None:
+ """
+ Add or remove lines in statistics box if needed and fix the tab order.
+ """
+
+ def _add_line():
+ n_lines = len(self.combos) + 1
+
+ # add delete symbol
+ button = gui.button(
+ None,
+ self,
+ label="×",
+ flat=True,
+ height=20,
+ styleSheet="* {font-size: 16pt; color: silver}"
+ "*:hover {color: black}",
+ autoDefault=False,
+ callback=self._remove_row,
+ )
+ button.setMinimumSize(QSize(12, 20))
+ self.rules_box.addWidget(button, n_lines, 0)
+ self.remove_buttons.append(button)
+
+ # add statistics type dropdown
+ combo = QComboBox()
+ combo.addItems(STATISTICS_NAMES)
+ combo.currentIndexChanged.connect(self._sync_edit_combo)
+ self.rules_box.addWidget(combo, n_lines, 1)
+ self.combos.append(combo)
+
+            # add line edit for pattern
+ line_edit = QLineEdit()
+ self.rules_box.addWidget(line_edit, n_lines, 2)
+ line_edit.textChanged.connect(self._sync_edit_line)
+ self.line_edits.append(line_edit)
+
+ def _remove_line():
+ self.combos.pop().deleteLater()
+ self.line_edits.pop().deleteLater()
+ self.remove_buttons.pop().deleteLater()
+
+ def _fix_tab_order():
+ # TODO: write it differently - check create class
+ for i, (r, c, l) in enumerate(
+ zip(self.active_rules, self.combos, self.line_edits)
+ ):
+ c.setCurrentIndex(r[0]) # update combo
+ l.setText(r[1]) # update line edit
+ if STATISTICS_DEFAULT_VALUE[r[0]] is not None:
+ l.setVisible(True)
+ else:
+ l.setVisible(False)
+
+ n = len(self.active_rules)
+ while n > len(self.combos):
+ _add_line()
+ while len(self.combos) > n:
+ _remove_line()
+ _fix_tab_order()
+
+ def _add_row(self) -> None:
+ """ Add a new row to the statistic box """
+ self.active_rules.append((0, ""))
+ self.adjust_n_rule_rows()
+
+ def _remove_row(self) -> None:
+ """ Removes the clicked row in the statistic box """
+ remove_idx = self.remove_buttons.index(self.sender())
+ del self.active_rules[remove_idx]
+ self.adjust_n_rule_rows()
+
+ def _sync_edit_combo(self) -> None:
+ """ Update rules when combo value changed """
+ combo = self.sender()
+ edit_index = self.combos.index(combo)
+ selected_i = combo.currentIndex()
+ default_value = STATISTICS_DEFAULT_VALUE[selected_i]
+ self.active_rules[edit_index] = (
+ selected_i,
+ default_value or self.active_rules[edit_index][1],
+ )
+ self.adjust_n_rule_rows()
+
+ def _sync_edit_line(self) -> None:
+ """ Update rules when line edit value changed """
+ line_edit = self.sender()
+ edit_index = self.line_edits.index(line_edit)
+ self.active_rules[edit_index] = (
+ self.active_rules[edit_index][0],
+ line_edit.text(),
+ )
+
+ @Inputs.corpus
+ def set_data(self, corpus) -> None:
+ self.closeContext()
+ self.corpus = corpus
+ self.active_rules = self.default_rules[:]
+ self.openContext(corpus)
+ self.adjust_n_rule_rows()
+ self.result_dict = {} # empty computational results when new data
+        # reset old output - it also handles the case with corpus == None
+ self.Outputs.corpus.send(None)
+
+ # summary
+ if corpus:
+ self.info.set_input_summary(
+ len(corpus), format_summary_details(corpus)
+ )
+ self.apply()
+ else:
+ self.info.set_input_summary(self.info.NoInput)
+ self.info.set_output_summary(self.info.NoOutput)
+
+ def apply(self) -> None:
+ """
+ This function is called when user click apply button. It starts
+ the computation. When computation is finished results are shown
+ on the output - on_done.
+ """
+ if self.corpus is None:
+ return
+ self.applied_rules = copy(self.active_rules)
+ self.cancel() # cancel task since user clicked apply again
+ rules_to_compute = [
+ r for r in self.active_rules if r not in self.result_dict
+ ]
+ self.start(run, self.corpus, rules_to_compute)
+
+ def on_exception(self, exception: Exception) -> None:
+ raise exception
+
+ def on_partial_result(
+ self, result: Tuple[int, str, Tuple[np.ndarray, List[str], Callable]]
+ ) -> None:
+ statistic, patern, result = result
+ self.result_dict[(statistic, patern)] = result
+
+ def on_done(self, result: None) -> None:
+ # join results
+ if self.corpus:
+ self.output_results()
+
+        # remove unnecessary results from dict - it can happen that the user
+        # already removed the statistic from the gui but it is still computed
+ for k in list(self.result_dict.keys()):
+ if k not in self.active_rules:
+ del self.result_dict[k]
+
+ def output_results(self) -> None:
+ self.Warning.not_computed.clear()
+ to_stack = []
+ attributes = []
+ comput_values = []
+ not_computed = []
+ for rule in self.applied_rules:
+ # check for safety reasons - in practice should not happen
+ if rule in self.result_dict:
+ res = self.result_dict[rule]
+ if res is None:
+ not_computed.append(STATISTICS_NAMES[rule[0]])
+ else:
+ data, variables, comp_value = res
+ to_stack.append(data)
+ attributes += variables
+ comput_values.append(comp_value)
+ if not_computed:
+ self.Warning.not_computed(", ".join(not_computed))
+    # here we will use extend_attributes function - this function adds
+ # attributes to existing corpus so it must be copied first
+ # TODO: when change of pre-processing is finished change this function
+ # to have inplace parameter which is False by default,
+    # also I would prefer extend_attributes where you give variables
+ # instead of strings on input
+ new_corpus = self.corpus.copy()
+ if to_stack:
+ new_corpus.extend_attributes(
+ np.hstack(to_stack), attributes, compute_values=comput_values
+ )
+ self.Outputs.corpus.send(new_corpus)
+
+ # summary
+ self.info.set_output_summary(
+ len(new_corpus), format_summary_details(new_corpus)
+ )
+
+
+if __name__ == "__main__":
+ WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts"))
diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py
new file mode 100644
index 000000000..930376727
--- /dev/null
+++ b/orangecontrib/text/widgets/tests/test_owstatistics.py
@@ -0,0 +1,439 @@
+import unittest
+from unittest.mock import Mock
+
+import numpy as np
+import pkg_resources
+from AnyQt.QtWidgets import QPushButton
+
+from Orange.data import Domain, StringVariable
+from Orange.widgets.tests.base import WidgetTest
+from orangecontrib.text import Corpus
+from orangecontrib.text.tag import AveragedPerceptronTagger
+from orangecontrib.text.widgets.owstatistics import (
+ STATISTICS_NAMES,
+ OWStatistics,
+)
+
+
+class TestStatisticsWidget(WidgetTest):
+ def setUp(self) -> None:
+ self.widget = self.create_widget(OWStatistics)
+ self.book_data = Corpus.from_file("book-excerpts")
+ self._create_simple_data()
+
+ def _create_simple_data(self) -> None:
+ """
+        Create a simple dataset with 4 documents. Save it to `self.corpus`.
+ """
+ metas = np.array(
+ [
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+ "Duis viverra elit eu mi blandit, {et} sollicitudin nisi ",
+ " a porta\tleo. Duis vitae ultrices massa. Mauris ut pulvinar a",
+ "tortor. Class (aptent) taciti\nsociosqu ad lit1ora torquent per",
+ ]
+ ).reshape(-1, 1)
+ text_var = StringVariable("text")
+ domain = Domain([], metas=[text_var])
+ self.corpus = Corpus(
+ domain,
+ X=np.empty((len(metas), 0)),
+ metas=metas,
+ text_features=[text_var],
+ )
+
+ def _set_feature(self, feature_name: str, value: str = ""):
+ """
+        Set the statistic which needs to be computed by the widget. Only one
+        statistic is set.
+
+ Parameters
+ ----------
+ feature_name
+ The name of statistic
+ value
+ If statistic need a value (e.g. prefix) it is passed here.
+ """
+ feature_index = STATISTICS_NAMES.index(feature_name)
+ self.widget.active_rules = [(feature_index, value)]
+ self.widget.adjust_n_rule_rows()
+
+ def _compute_features(self, feature_name: str, value: str = "") -> Corpus:
+ """
+ Send `self.corpus` to widget, set statistic which need bo be computed,
+ run the computation, and return widget output.
+
+ Parameters
+ ----------
+ feature_name
+ The name of the statistic, only one statistic is set
+ value
+ The value if statistic need it.
+
+ Returns
+ -------
+ Resulting corpus.
+ """
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
+ self._set_feature(feature_name, value)
+ self.widget.apply()
+ self.wait_until_finished()
+ res = self.get_output(self.widget.Outputs.corpus)
+ self.assertTupleEqual((len(self.corpus), 1), res.X.shape)
+ return res
+
+ def test_send_data(self):
+ """ Test with basic data, and empty data """
+ self.send_signal(self.widget.Inputs.corpus, self.book_data)
+ self.assertEqual(len(self.book_data), len(self.widget.corpus))
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.widget.corpus)
+ self.widget.apply()
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_words_count(self):
+ """ Test words count statistic """
+ data = self._compute_features("Word count")
+ np.testing.assert_array_equal(data.X.flatten(), [8, 9, 11, 9])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_characters_count(self):
+ """ Test characters count statistic """
+ data = self._compute_features("Character count")
+ np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_n_gram_count(self):
+ """ Test n-grams count statistic """
+ data = self._compute_features("N-gram count")
+ np.testing.assert_array_equal(data.X.flatten(), [10, 12, 13, 12])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_word_density(self):
+ """ Test word density statistic """
+ data = self._compute_features("Average word length")
+ np.testing.assert_array_almost_equal(
+ data.X.flatten(), [5.875, 4.888889, 4.363636, 5.666667]
+ )
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_punctuations_cont(self):
+ """ Test punctuations count statistic """
+ data = self._compute_features("Punctuation count")
+ np.testing.assert_array_equal(data.X.flatten(), [2, 3, 2, 3])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_capitals_count(self):
+ """ Test capitals count statistic """
+ data = self._compute_features("Capital letter count")
+ np.testing.assert_array_equal(data.X.flatten(), [1, 1, 2, 1])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_vowels_count(self):
+ """ Test vowels count statistic """
+ data = self._compute_features("Vowel count", "a,e,i,o,u")
+ np.testing.assert_array_equal(data.X.flatten(), [19, 20, 23, 20])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_consonants_count(self):
+ """ Test consonants count statistic """
+ data = self._compute_features(
+ "Consonant count", "b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z"
+ )
+ np.testing.assert_array_equal(data.X.flatten(), [28, 24, 25, 30])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_per_cent_unique_words(self):
+ """ Test per-cent unique words statistic """
+ data = self._compute_features("Per cent unique words")
+ np.testing.assert_array_almost_equal(
+ data.X.flatten(), [1, 1, 0.909091, 1]
+ )
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_starts_with(self):
+ """ Test starts with count statistic """
+ data = self._compute_features("Starts with", "a")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 2, 2])
+
+ data = self._compute_features("Starts with", "ap")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_ends_with(self):
+ """ Test ends with count statistic """
+ data = self._compute_features("Ends with", "t")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [3, 3, 1, 2])
+
+ data = self._compute_features("Ends with", "et")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 0, 0])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_contains(self):
+ """ Test contains count statistic """
+ data = self._compute_features("Contains", "t")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
+
+ data = self._compute_features("Contains", "et")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [2, 1, 0, 0])
+
+ data = self._compute_features("Contains", "is")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_regex(self):
+ """ Test regex statistic """
+        # words that contain a digit
+ data = self._compute_features("Regex", "\w*\d\w*")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
+
+        # words that contain "is"
+ data = self._compute_features("Regex", "\w*is\w*")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_pos(self):
+ """
+        Test POS tags count
+ - test with corpus that has no pos tags - warning raised
+ - test with corpus that has pos tags
+ """
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self._set_feature("POS tag", "NN")
+ self.widget.apply()
+ self.wait_until_finished()
+ res = self.get_output(self.widget.Outputs.corpus)
+ self.assertEqual(0, res.X.shape[1])
+ self.assertTrue(self.widget.Warning.not_computed.is_shown())
+
+ tagger = AveragedPerceptronTagger()
+ result = tagger.tag_corpus(self.corpus)
+
+ self.send_signal(self.widget.Inputs.corpus, result)
+ self._set_feature("POS tag", "NN")
+ self.widget.apply()
+ self.wait_until_finished()
+ res = self.get_output(self.widget.Outputs.corpus)
+ self.assertTupleEqual((len(self.corpus), 1), res.X.shape)
+ np.testing.assert_array_almost_equal(res.X.flatten(), [7, 6, 4, 6])
+ self.assertFalse(self.widget.Warning.not_computed.is_shown())
+
+ def test_statistics_combination(self):
+ """
+ Testing three statistics at same time and see if column concatenated
+ correctly.
+ """
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+
+ wc_index = STATISTICS_NAMES.index("Word count")
+ starts_with_index = STATISTICS_NAMES.index("Starts with")
+ capital_counts_index = STATISTICS_NAMES.index("Capital letter count")
+ self.widget.active_rules = [
+ (wc_index, ""),
+ (starts_with_index, "a"),
+ (capital_counts_index, ""),
+ ]
+ self.widget.adjust_n_rule_rows()
+
+ self.widget.apply()
+ self.wait_until_finished()
+ res = self.get_output(self.widget.Outputs.corpus)
+
+ self.assertTupleEqual((len(self.corpus), 3), res.X.shape)
+ np.testing.assert_array_almost_equal(
+ res.X[:, 0].flatten(), [8, 9, 11, 9]
+ )
+ np.testing.assert_array_almost_equal(
+ res.X[:, 1].flatten(), [2, 0, 2, 2]
+ )
+ np.testing.assert_array_almost_equal(
+ res.X[:, 2].flatten(), [1, 1, 2, 1]
+ )
+
+ def test_dictionary_statistics(self):
+ """
+ Test remove statistic from the dictionary when they are not required
+ """
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+
+ self.widget.active_rules = [
+ (1, ""),
+ ]
+ self.widget.adjust_n_rule_rows()
+ self.widget.apply()
+ self.wait_until_finished()
+
+ self.assertListEqual([(1, "")], list(self.widget.result_dict.keys()))
+
+ self.widget.active_rules = [(1, ""), (2, "")]
+ self.widget.adjust_n_rule_rows()
+ self.widget.apply()
+ self.wait_until_finished()
+
+ self.assertListEqual(
+ [(1, ""), (2, "")], list(self.widget.result_dict.keys())
+ )
+
+ self.widget.active_rules = [(2, "")]
+ self.widget.adjust_n_rule_rows()
+ self.widget.apply()
+ self.wait_until_finished()
+
+ self.assertListEqual([(2, "")], list(self.widget.result_dict.keys()))
+
+        # dict should be empty on new data
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertListEqual([], list(self.widget.result_dict.keys()))
+
+ def test_context(self):
+ """ Test whether context correctly restore rules """
+ rules = [(0, ""), (1, ""), (2, "")]
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.widget.active_rules = rules[:]
+
+ self.send_signal(self.widget.Inputs.corpus, self.book_data)
+ self.assertListEqual([(0, ""), (1, "")], self.widget.active_rules)
+
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertListEqual(rules, self.widget.active_rules)
+
+ def test_compute_values(self):
+ """ Test compute values on new data """
+ data = self._compute_features("Word count")
+
+ computed = Corpus.from_table(data.domain, self.book_data)
+ self.assertEqual(data.domain, computed.domain)
+ self.assertTupleEqual((len(self.book_data), 1), computed.X.shape)
+
+ def test_append_to_existing_X(self):
+ """ Test if new features are correctly attached to X matrix """
+ data = Corpus.from_file("election-tweets-2016")
+ self.send_signal(self.widget.Inputs.corpus, data)
+ self.wait_until_finished()
+ statistics = self.get_output(self.widget.Outputs.corpus)
+
+ self.assertTupleEqual(
+ (data.X.shape[0], data.X.shape[1] + 2), statistics.X.shape
+ )
+
+ def test_add_row(self):
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
+ self.widget.active_rules = []
+ self.widget.adjust_n_rule_rows()
+ add_button = [
+ x
+ for x in self.widget.controlArea.findChildren(QPushButton)
+ if x.text() == "+"
+ ][0]
+ add_button.click()
+ self.assertListEqual([(0, "")], self.widget.active_rules)
+
+ def test_remove_row(self):
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.widget.active_rules = [(0, "")]
+ self.widget.adjust_n_rule_rows()
+ self.assertListEqual([(0, "")], self.widget.active_rules)
+
+ remove_button = [
+ x
+ for x in self.widget.controlArea.findChildren(QPushButton)
+ if x.text() == "×"
+ ][0]
+ remove_button.click()
+ self.assertListEqual([], self.widget.active_rules)
+
+ def test_input_summary(self):
+ """ Test correctness of the input summary """
+ self.widget.info.set_input_summary = in_sum = Mock()
+
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ in_sum.assert_called_with(
+ len(self.corpus),
+ "4 instances, 1 variable\nFeatures: —\nTarget: —\nMetas: string "
+ "(not shown)",
+ )
+ in_sum.reset_mock()
+
+ self.send_signal(self.widget.Inputs.corpus, self.book_data)
+ in_sum.assert_called_with(
+ len(self.book_data),
+ "140 instances, 2 variables\nFeatures: —\nTarget: categorical\n"
+ "Metas: string (not shown)",
+ )
+ in_sum.reset_mock()
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ in_sum.assert_called_with(self.widget.info.NoInput)
+
+ def test_output_summary(self):
+ """ Test correctness of the output summary"""
+ self.widget.info.set_output_summary = out_sum = Mock()
+
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
+ out_sum.assert_called_with(
+ len(self.corpus),
+ "4 instances, 3 variables\nFeatures: 2 numeric\nTarget: —\nMetas: "
+ "string (not shown)",
+ )
+ out_sum.reset_mock()
+
+ self.send_signal(self.widget.Inputs.corpus, self.book_data)
+ self.wait_until_finished()
+ out_sum.assert_called_with(
+ len(self.book_data),
+ "140 instances, 4 variables\nFeatures: 2 numeric\nTarget: "
+ "categorical\nMetas: string (not shown)",
+ )
+ out_sum.reset_mock()
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.wait_until_finished()
+ out_sum.assert_called_with(self.widget.info.NoOutput)
+
+ def test_remove_function(self):
+ """
+ This test will start to fail when version of Orange > 3.26.0
+ When this tests fails:
+ - removes `format_summary_details` and `format_variables_string` from
+ utils.widget
+ - replace `format_summary_details` in statistics widget with the same
+ function from core orange
+ - set minimum orange version to 3.25 for the text add-on
+ """
+ self.assertLessEqual(
+ pkg_resources.get_distribution("orange3").version, "3.26.0"
+ )
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/orangecontrib/text/widgets/utils/context.py b/orangecontrib/text/widgets/utils/context.py
new file mode 100644
index 000000000..fe383c866
--- /dev/null
+++ b/orangecontrib/text/widgets/utils/context.py
@@ -0,0 +1,28 @@
+from Orange.widgets.settings import PerfectDomainContextHandler
+
+
class AlmostPerfectContextHandler(PerfectDomainContextHandler):
    """
    A context handler that matches when a sufficient share of the variables
    in both domains coincides (e.g. 0.9), instead of requiring a perfect
    match. The position of variables (attribute, meta, class_var) is not
    important, since widgets that use this handler do not use their values
    directly.

    Attributes
    ----------
    share_domain_matches
        The share of domain variables that need to match (strictly exceeded).
    """
    def __init__(self, share_domain_matches: float) -> None:
        super().__init__()
        self.share_domain_matches = share_domain_matches

    def match(self, context, domain, attributes, class_vars, metas):
        """
        Return PERFECT_MATCH when the share of the new domain's variables
        also present in the stored context exceeds `share_domain_matches`,
        otherwise NO_MATCH.
        """
        context_vars = context.attributes + context.class_vars + context.metas
        domain_vars = attributes + class_vars + metas
        if not domain_vars:
            # Guard: an empty domain would raise ZeroDivisionError below;
            # with nothing to match, treat the context as not matching.
            return self.NO_MATCH
        matching_vars = [var for var in context_vars if var in domain_vars]

        share = len(matching_vars) / len(domain_vars)
        return (self.PERFECT_MATCH
                if share > self.share_domain_matches
                else self.NO_MATCH)
diff --git a/orangecontrib/text/widgets/utils/widgets.py b/orangecontrib/text/widgets/utils/widgets.py
index b06fcb5e4..effdb5c1f 100644
--- a/orangecontrib/text/widgets/utils/widgets.py
+++ b/orangecontrib/text/widgets/utils/widgets.py
@@ -7,6 +7,8 @@
QGridLayout, QCheckBox, QStackedLayout)
from AnyQt.QtGui import QColor
from AnyQt.QtCore import QDate, pyqtSignal, Qt, QSize
+from Orange.data import DiscreteVariable, ContinuousVariable, TimeVariable, \
+ StringVariable
from Orange.widgets.gui import OWComponent, hBox
from Orange.widgets import settings
@@ -565,3 +567,66 @@ def load_provider(self, path_to_file):
self.resource_path = path_to_file
self.valueChanged.emit(self.model_path, self.resource_path)
+
def format_variables_string(variables):
    """
    Format the descriptive part of the input/output summary for
    either features, targets or metas of the input dataset.

    :param variables: Features, targets or metas of the input dataset
    :return: A formatted string
    """
    if not variables:
        return '—'

    labelled_types = (('categorical', DiscreteVariable),
                      ('numeric', ContinuousVariable),
                      ('time', TimeVariable),
                      ('string', StringVariable))
    counts_per_type = []
    for label, var_type in labelled_types:
        # A `TimeVariable` is also a `ContinuousVariable`, so each variable
        # must be classified by its exact type rather than `isinstance`,
        # which would count time variables twice.
        n_of_type = sum(1 for v in variables if type(v) is var_type)  # pylint: disable=unidiomatic-typecheck
        if n_of_type:
            suffix = ' (not shown)' \
                if issubclass(var_type, StringVariable) else ''
            counts_per_type.append((f'{label}{suffix}', n_of_type))

    names, counts = list(zip(*counts_per_type))
    if len(names) > 1:
        per_type = [f'{c} {t}' for c, t in zip(counts, names)]
        return f'{sum(counts)} ({", ".join(per_type)})'
    if counts[0] == 1:
        # A single variable of a single type: show just the type name.
        return names[0]
    return f'{counts[0]} {names[0]}'
+
+
def format_summary_details(data):
    """
    Form the entire descriptive part of the input/output summary.

    :param data: A dataset
    :type data: Orange.data.Table
    :return: A formatted string; empty when there is no data
    """
    if not data:
        return ''

    domain = data.domain
    n_instances = len(data)
    # Metas count towards the variable total shown in the first line.
    n_variables = len(domain.variables) + len(domain.metas)
    instance_s = 's' if n_instances != 1 else ''
    variable_s = 's' if n_variables != 1 else ''
    return (
        f'{n_instances} instance{instance_s}, '
        f'{n_variables} variable{variable_s}\n'
        f'Features: {format_variables_string(domain.attributes)}\n'
        f'Target: {format_variables_string(domain.class_vars)}\n'
        f'Metas: {format_variables_string(domain.metas)}'
    )