From 3597ccbeb533eb10929fc7119b6a29158606bec7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Sun, 1 Mar 2020 12:59:20 +0100
Subject: [PATCH] Statistics widget
---
.../text/widgets/icons/Statistics.svg | 99 +++
orangecontrib/text/widgets/owstatistics.py | 685 ++++++++++++++++++
.../text/widgets/tests/test_owstatistics.py | 439 +++++++++++
orangecontrib/text/widgets/utils/context.py | 28 +
orangecontrib/text/widgets/utils/widgets.py | 65 ++
5 files changed, 1316 insertions(+)
create mode 100644 orangecontrib/text/widgets/icons/Statistics.svg
create mode 100644 orangecontrib/text/widgets/owstatistics.py
create mode 100644 orangecontrib/text/widgets/tests/test_owstatistics.py
create mode 100644 orangecontrib/text/widgets/utils/context.py
diff --git a/orangecontrib/text/widgets/icons/Statistics.svg b/orangecontrib/text/widgets/icons/Statistics.svg
new file mode 100644
index 000000000..f5371108e
--- /dev/null
+++ b/orangecontrib/text/widgets/icons/Statistics.svg
@@ -0,0 +1,99 @@
+
+
diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py
new file mode 100644
index 000000000..5ef07b72f
--- /dev/null
+++ b/orangecontrib/text/widgets/owstatistics.py
@@ -0,0 +1,685 @@
+import re
+from copy import copy
+from string import punctuation
+from typing import Callable, List, Optional, Tuple
+
+import numpy as np
+from AnyQt.QtCore import QSize
+from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit
+
+from Orange.widgets import gui
+from Orange.widgets.settings import ContextSetting
+from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
+from Orange.widgets.utils.widgetpreview import WidgetPreview
+from Orange.widgets.widget import Input, Output, OWWidget
+from orangewidget.widget import Msg
+
+from orangecontrib.text import Corpus
+
+# those functions are implemented here since they are used in more statistics
+from orangecontrib.text.preprocess import (
+ LowercaseTransformer,
+ Preprocessor,
+ RegexpTokenizer,
+ UrlRemover,
+)
+from orangecontrib.text.widgets.utils import format_summary_details
+from orangecontrib.text.widgets.utils.context import (
+ AlmostPerfectContextHandler,
+)
+
+
+def num_words(document: str, callback: Callable) -> int:
+ """
+    Return number of words in document-string. A word is every entity
+    separated by whitespace (space, tab, newline).
+ """
+ callback()
+ return len(document.split())
+
+
+def char_count(document: str, callback: Callable) -> int:
+ """
+    Count number of alpha-numeric characters in document/string.
+ """
+ callback()
+ return sum(c.isalnum() for c in document)
+
+
+def digit_count(document: str, callback: Callable) -> int:
+ """
+ Count number of digits in document/string.
+ """
+ callback()
+ return sum(c.isdigit() for c in document)
+
+
+def count_appearances(
+ document: str, characters: List[str], callback: Callable
+) -> int:
+ """
+ Count number of appearances of chars from `characters` list.
+ """
+ callback()
+ # I think it supports the majority of main languages
+ # Y can be vowel too sometimes - it is not possible to distinguish
+ return sum(document.lower().count(c) for c in characters)
+
+
+def preprocess_only_words(corpus: Corpus) -> Corpus:
+ """
+ Apply the preprocessor that splits words, transforms them to lower case
+    (and removes punctuation).
+
+ Parameters
+ ----------
+ corpus
+ Corpus on which the preprocessor will be applied.
+
+ Returns
+ -------
+ Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
+ """
+ p = Preprocessor(
+ transformers=[LowercaseTransformer()],
+ # by default regexp keeps only words (no punctuations, no spaces)
+ tokenizer=RegexpTokenizer(),
+ )
+ return p(corpus, inplace=False)
+
+
+# every statistic returns a np.ndarray with statistics
+# and list with variables names - it must be implemented here since some
+# statistics in the future will have more variables
+
+
+def words_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of words in each document.
+ """
+ corpus = preprocess_only_words(corpus)
+ # np.c_ makes column vector (ndarray) out of the list
+ # [1, 2, 3] -> [[1], [2], [3]]
+ return (
+ np.c_[[num_words(d, callback) for d in corpus.documents]],
+ ["Word count"],
+ )
+
+
+def characters_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of characters without spaces, newlines, tabs, ...
+ """
+ return (
+ np.c_[[char_count(d, callback) for d in corpus.documents]],
+ ["Character count"],
+ )
+
+
+def n_gram_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of n-grams in every document
+ """
+
+ def ng_count(n_gram: List[str]):
+ callback()
+ return len(n_gram)
+
+ return np.c_[list(map(ng_count, corpus.ngrams))], ["N-gram count"]
+
+
+def word_density(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Computes average word length as: character count / word count
+ """
+ return (
+ np.c_[
+ [
+ char_count(d, lambda: True) / num_words(d, callback)
+ for d in corpus.documents
+ ]
+ ],
+ ["Average word length"],
+ )
+
+
+def punctuation_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of punctuation signs
+ """
+
+ def num_punctuation(document: str):
+ callback()
+ return sum(document.count(c) for c in punctuation)
+
+ return (
+ np.c_[list(map(num_punctuation, corpus.documents))],
+ ["Punctuation count"],
+ )
+
+
+def capital_count(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of capital letters in documents
+ """
+
+ def num_capitals(document: str):
+ callback()
+ return sum(1 for c in document if c.isupper())
+
+ return (
+ np.c_[list(map(num_capitals, corpus.documents))],
+ ["Capital letter count"],
+ )
+
+
+def vowel_count(
+ corpus: Corpus, vowels: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of vowels in documents
+ """
+ # comma separated string of vowels to list
+ vowels = [v.strip() for v in vowels.split(",")]
+ return (
+ np.c_[
+ [count_appearances(d, vowels, callback) for d in corpus.documents]
+ ],
+ ["Vowel count"],
+ )
+
+
+def consonant_count(
+ corpus: Corpus, consonants: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Count number of consonants in documents. Consonants are all alnum
+ characters except vowels and numbers
+ """
+ # comma separated string of consonants to list
+ consonants = [v.strip() for v in consonants.split(",")]
+ return (
+ np.c_[
+ [
+ count_appearances(d, consonants, callback)
+ for d in corpus.documents
+ ]
+ ],
+ ["Consonant count"],
+ )
+
+
+def per_cent_unique_words(
+ corpus: Corpus, _: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+ Ratio between unique words count and all words count
+ """
+ corpus = preprocess_only_words(corpus)
+
+ def perc_unique(tokens: str):
+ callback()
+ return len(set(tokens)) / len(tokens)
+
+ return np.c_[list(map(perc_unique, corpus.tokens))], ["% unique words"]
+
+
+def starts_with(
+ corpus: Corpus, prefix: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Number of words that start with the string in `prefix`.
+ """
+ corpus = preprocess_only_words(corpus)
+
+ def number_starts_with(tokens: List[str]):
+ callback()
+ return sum(t.startswith(prefix) for t in tokens)
+
+ return (
+ np.c_[list(map(number_starts_with, corpus.tokens))],
+ [f"Starts with {prefix}"],
+ )
+
+
+def ends_with(
+ corpus: Corpus, postfix: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Number of words that end with the string in `postfix`.
+ """
+ corpus = preprocess_only_words(corpus)
+
+ def number_ends_with(tokens: List[str]):
+ callback()
+ return sum(t.endswith(postfix) for t in tokens)
+
+ return (
+ np.c_[list(map(number_ends_with, corpus.tokens))],
+ [f"Ends with {postfix}"],
+ )
+
+
+def contains(
+ corpus: Corpus, text: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Number of occurrences of the string in `text` in each document.
+ """
+ return (
+ np.c_[
+ [count_appearances(d, [text], callback) for d in corpus.documents]
+ ],
+ [f"Contains {text}"],
+ )
+
+
+def regex(
+ corpus: Corpus, expression: str, callback: Callable
+) -> Tuple[np.ndarray, List[str]]:
+ """
+    Count tokens that match the regex pattern in `expression`.
+ """
+ pattern = re.compile(expression)
+
+ def number_regex(tokens: List[str]):
+ callback()
+ return sum(bool(pattern.match(t)) for t in tokens)
+
+ return (
+ np.c_[list(map(number_regex, corpus.tokens))],
+ [f"Regex {expression}"],
+ )
+
+
+def pos_tags(
+ corpus: Corpus, pos_tags: str, callback: Callable
+) -> Optional[Tuple[np.ndarray, List[str]]]:
+ """
+ Count number of specified pos tags in corpus
+ """
+ p_tags = [v.strip().lower() for v in pos_tags.split(",")]
+
+ def cust_count(tags):
+ callback()
+ tags = [t.lower() for t in tags]
+ return sum(tags.count(t) for t in p_tags)
+
+ if corpus.pos_tags is None:
+ return None
+ return (
+ np.c_[[cust_count(p) for p in corpus.pos_tags]],
+ [f"POS tags {pos_tags}"],
+ )
+
+
+class ComputeValue:
+ """
+ Class which provides compute value functionality. It stores the function
+ that is used to compute values on new data table using this domain.
+
+ Attributes
+ ----------
+ function
+ Function that computes new values
+ pattern
+ Some statistics need additional parameter with the pattern
+ (e.g. starts with), for others it is set to empty string.
+ """
+
+ def __init__(self, function: Callable, pattern: str) -> None:
+ self.function = function
+ self.pattern = pattern
+
+ def __call__(self, data: Corpus) -> np.ndarray:
+ """
+        Compute values for a new data table.
+ """
+ # lambda is added as a placeholder for a callback.
+ return self.function(data, self.pattern, lambda: True)[0]
+
+
+# the definition of all statistics used in this widget, if a new statistic
+# is required add it to this list
+
+STATISTICS = [
+ # (name of the statistics, function to compute, default value)
+ # if default value is None - text box is not required
+ ("Word count", words_count, None),
+ ("Character count", characters_count, None),
+ ("N-gram count", n_gram_count, None),
+ ("Average word length", word_density, None),
+ ("Punctuation count", punctuation_count, None),
+ ("Capital letter count", capital_count, None),
+ ("Vowel count", vowel_count, "a,e,i,o,u"),
+ (
+ "Consonant count",
+ consonant_count,
+ "b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z",
+ ),
+ ("Per cent unique words", per_cent_unique_words, None),
+ ("Starts with", starts_with, ""),
+ ("Ends with", ends_with, ""),
+ ("Contains", contains, ""),
+ ("Regex", regex, ""),
+ ("POS tag", pos_tags, "NN,VV,JJ"),
+]
+STATISTICS_NAMES = list(list(zip(*STATISTICS))[0])
+STATISTICS_FUNCTIONS = list(list(zip(*STATISTICS))[1])
+STATISTICS_DEFAULT_VALUE = list(list(zip(*STATISTICS))[2])
+
+
+def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
+ """
+ This function runs the computation for new features.
+ All results will be reported as a partial results.
+
+ Parameters
+ ----------
+ corpus
+ The corpus on which the computation is held.
+ statistics
+ Tuple of statistic pairs to be computed:
+ (statistics id, string pattern)
+ state
+ State used to report progress and partial results.
+ """
+    # callback is called once per (document, statistic) pair
+ tick_values = iter(np.linspace(0, 100, len(corpus) * len(statistics)))
+
+ def advance():
+ state.set_progress_value(next(tick_values))
+
+ for s, patern in statistics:
+ fun = STATISTICS_FUNCTIONS[s]
+ result = fun(corpus, patern, advance)
+ if result is not None:
+ result = result + (ComputeValue(fun, patern),)
+ state.set_partial_result((s, patern, result))
+
+
+class OWStatistics(OWWidget, ConcurrentWidgetMixin):
+ name = "Statistics"
+ description = "Create new statistic variables for documents."
+ keywords = []
+ icon = "icons/Statistics.svg"
+
+ class Inputs:
+ corpus = Input("Corpus", Corpus)
+
+ class Outputs:
+ corpus = Output("Corpus", Corpus)
+
+ class Warning(OWWidget.Warning):
+ not_computed = Msg(
+ "{} statistics cannot be computed and is omitted from results."
+ )
+
+ want_main_area = False
+ settingsHandler = AlmostPerfectContextHandler(0.9)
+
+ # settings
+ default_rules = [(0, ""), (1, "")] # rules used to reset the active rules
+ active_rules: List[Tuple[int, str]] = ContextSetting(default_rules[:])
+ # rules active at time of apply clicked
+ applied_rules: Optional[List[Tuple[int, str]]] = None
+
+ result_dict = {}
+
+ def __init__(self) -> None:
+ OWWidget.__init__(self)
+ ConcurrentWidgetMixin.__init__(self)
+ self.corpus = None
+
+ # the list with combos from the widget
+ self.combos = []
+ # the list with line edits from the widget
+ self.line_edits = []
+ # the list of buttons in front of controls that removes them
+ self.remove_buttons = []
+
+ self._init_controls()
+
+ def _init_controls(self) -> None:
+ """ Init all controls of the widget """
+ self._init_statistics_box()
+ box = gui.hBox(self.controlArea)
+ gui.rubber(box)
+ gui.button(
+ box,
+ self,
+ "Apply",
+ autoDefault=False,
+ width=180,
+ callback=self.apply,
+ )
+
+ def _init_statistics_box(self) -> None:
+ """
+ Init the statistics box in control area - place where used statistics
+        are listed, removed, and added.
+ """
+ patternbox = gui.vBox(self.controlArea, box=True)
+ self.rules_box = rules_box = QGridLayout()
+ patternbox.layout().addLayout(self.rules_box)
+ box = gui.hBox(patternbox)
+ gui.button(
+ box,
+ self,
+ "+",
+ callback=self._add_row,
+ autoDefault=False,
+ flat=True,
+ minimumSize=(QSize(20, 20)),
+ )
+ gui.rubber(box)
+ self.rules_box.setColumnMinimumWidth(1, 70)
+ self.rules_box.setColumnMinimumWidth(0, 10)
+ self.rules_box.setColumnStretch(0, 1)
+ self.rules_box.setColumnStretch(1, 1)
+ self.rules_box.setColumnStretch(2, 100)
+ rules_box.addWidget(QLabel("Feature"), 0, 1)
+ rules_box.addWidget(QLabel("Pattern"), 0, 2)
+ self.adjust_n_rule_rows()
+
+ def adjust_n_rule_rows(self) -> None:
+ """
+ Add or remove lines in statistics box if needed and fix the tab order.
+ """
+
+ def _add_line():
+ n_lines = len(self.combos) + 1
+
+ # add delete symbol
+ button = gui.button(
+ None,
+ self,
+ label="×",
+ flat=True,
+ height=20,
+ styleSheet="* {font-size: 16pt; color: silver}"
+ "*:hover {color: black}",
+ autoDefault=False,
+ callback=self._remove_row,
+ )
+ button.setMinimumSize(QSize(12, 20))
+ self.rules_box.addWidget(button, n_lines, 0)
+ self.remove_buttons.append(button)
+
+ # add statistics type dropdown
+ combo = QComboBox()
+ combo.addItems(STATISTICS_NAMES)
+ combo.currentIndexChanged.connect(self._sync_edit_combo)
+ self.rules_box.addWidget(combo, n_lines, 1)
+ self.combos.append(combo)
+
+            # add line edit for pattern
+ line_edit = QLineEdit()
+ self.rules_box.addWidget(line_edit, n_lines, 2)
+ line_edit.textChanged.connect(self._sync_edit_line)
+ self.line_edits.append(line_edit)
+
+ def _remove_line():
+ self.combos.pop().deleteLater()
+ self.line_edits.pop().deleteLater()
+ self.remove_buttons.pop().deleteLater()
+
+ def _fix_tab_order():
+ # TODO: write it differently - check create class
+ for i, (r, c, l) in enumerate(
+ zip(self.active_rules, self.combos, self.line_edits)
+ ):
+ c.setCurrentIndex(r[0]) # update combo
+ l.setText(r[1]) # update line edit
+ if STATISTICS_DEFAULT_VALUE[r[0]] is not None:
+ l.setVisible(True)
+ else:
+ l.setVisible(False)
+
+ n = len(self.active_rules)
+ while n > len(self.combos):
+ _add_line()
+ while len(self.combos) > n:
+ _remove_line()
+ _fix_tab_order()
+
+ def _add_row(self) -> None:
+ """ Add a new row to the statistic box """
+ self.active_rules.append((0, ""))
+ self.adjust_n_rule_rows()
+
+ def _remove_row(self) -> None:
+ """ Removes the clicked row in the statistic box """
+ remove_idx = self.remove_buttons.index(self.sender())
+ del self.active_rules[remove_idx]
+ self.adjust_n_rule_rows()
+
+ def _sync_edit_combo(self) -> None:
+ """ Update rules when combo value changed """
+ combo = self.sender()
+ edit_index = self.combos.index(combo)
+ selected_i = combo.currentIndex()
+ default_value = STATISTICS_DEFAULT_VALUE[selected_i]
+ self.active_rules[edit_index] = (
+ selected_i,
+ default_value or self.active_rules[edit_index][1],
+ )
+ self.adjust_n_rule_rows()
+
+ def _sync_edit_line(self) -> None:
+ """ Update rules when line edit value changed """
+ line_edit = self.sender()
+ edit_index = self.line_edits.index(line_edit)
+ self.active_rules[edit_index] = (
+ self.active_rules[edit_index][0],
+ line_edit.text(),
+ )
+
+ @Inputs.corpus
+ def set_data(self, corpus) -> None:
+ self.closeContext()
+ self.corpus = corpus
+ self.active_rules = self.default_rules[:]
+ self.openContext(corpus)
+ self.adjust_n_rule_rows()
+ self.result_dict = {} # empty computational results when new data
+        # reset old output - it also handles the case with corpus == None
+ self.Outputs.corpus.send(None)
+
+ # summary
+ if corpus:
+ self.info.set_input_summary(
+ len(corpus), format_summary_details(corpus)
+ )
+ self.apply()
+ else:
+ self.info.set_input_summary(self.info.NoInput)
+ self.info.set_output_summary(self.info.NoOutput)
+
+ def apply(self) -> None:
+ """
+ This function is called when user click apply button. It starts
+ the computation. When computation is finished results are shown
+ on the output - on_done.
+ """
+ if self.corpus is None:
+ return
+ self.applied_rules = copy(self.active_rules)
+ self.cancel() # cancel task since user clicked apply again
+ rules_to_compute = [
+ r for r in self.active_rules if r not in self.result_dict
+ ]
+ self.start(run, self.corpus, rules_to_compute)
+
+ def on_exception(self, exception: Exception) -> None:
+ raise exception
+
+ def on_partial_result(
+ self, result: Tuple[int, str, Tuple[np.ndarray, List[str], Callable]]
+ ) -> None:
+ statistic, patern, result = result
+ self.result_dict[(statistic, patern)] = result
+
+ def on_done(self, result: None) -> None:
+ # join results
+ if self.corpus:
+ self.output_results()
+
+        # remove unnecessary results from dict - it can happen that the user
+        # already removed the statistic from the gui but it is still computed
+ for k in list(self.result_dict.keys()):
+ if k not in self.active_rules:
+ del self.result_dict[k]
+
+ def output_results(self) -> None:
+ self.Warning.not_computed.clear()
+ to_stack = []
+ attributes = []
+ comput_values = []
+ not_computed = []
+ for rule in self.applied_rules:
+ # check for safety reasons - in practice should not happen
+ if rule in self.result_dict:
+ res = self.result_dict[rule]
+ if res is None:
+ not_computed.append(STATISTICS_NAMES[rule[0]])
+ else:
+ data, variables, comp_value = res
+ to_stack.append(data)
+ attributes += variables
+ comput_values.append(comp_value)
+ if not_computed:
+ self.Warning.not_computed(", ".join(not_computed))
+    # here we will use extend_attributes function - this function adds
+ # attributes to existing corpus so it must be copied first
+ # TODO: when change of pre-processing is finished change this function
+ # to have inplace parameter which is False by default,
+    # also I would prefer extend_attributes where you give variables
+ # instead of strings on input
+ new_corpus = self.corpus.copy()
+ if to_stack:
+ new_corpus.extend_attributes(
+ np.hstack(to_stack), attributes, compute_values=comput_values
+ )
+ self.Outputs.corpus.send(new_corpus)
+
+ # summary
+ self.info.set_output_summary(
+ len(new_corpus), format_summary_details(new_corpus)
+ )
+
+
+if __name__ == "__main__":
+ WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts"))
diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py
new file mode 100644
index 000000000..930376727
--- /dev/null
+++ b/orangecontrib/text/widgets/tests/test_owstatistics.py
@@ -0,0 +1,439 @@
+import unittest
+from unittest.mock import Mock
+
+import numpy as np
+import pkg_resources
+from AnyQt.QtWidgets import QPushButton
+
+from Orange.data import Domain, StringVariable
+from Orange.widgets.tests.base import WidgetTest
+from orangecontrib.text import Corpus
+from orangecontrib.text.tag import AveragedPerceptronTagger
+from orangecontrib.text.widgets.owstatistics import (
+ STATISTICS_NAMES,
+ OWStatistics,
+)
+
+
+class TestStatisticsWidget(WidgetTest):
+ def setUp(self) -> None:
+ self.widget = self.create_widget(OWStatistics)
+ self.book_data = Corpus.from_file("book-excerpts")
+ self._create_simple_data()
+
+ def _create_simple_data(self) -> None:
+ """
+        Create a simple dataset with 4 documents. Save it to `self.corpus`.
+ """
+ metas = np.array(
+ [
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+ "Duis viverra elit eu mi blandit, {et} sollicitudin nisi ",
+ " a porta\tleo. Duis vitae ultrices massa. Mauris ut pulvinar a",
+ "tortor. Class (aptent) taciti\nsociosqu ad lit1ora torquent per",
+ ]
+ ).reshape(-1, 1)
+ text_var = StringVariable("text")
+ domain = Domain([], metas=[text_var])
+ self.corpus = Corpus(
+ domain,
+ X=np.empty((len(metas), 0)),
+ metas=metas,
+ text_features=[text_var],
+ )
+
+ def _set_feature(self, feature_name: str, value: str = ""):
+ """
+        Set the statistic which needs to be computed by the widget. Only one
+        statistic is set.
+
+ Parameters
+ ----------
+ feature_name
+ The name of statistic
+ value
+ If statistic need a value (e.g. prefix) it is passed here.
+ """
+ feature_index = STATISTICS_NAMES.index(feature_name)
+ self.widget.active_rules = [(feature_index, value)]
+ self.widget.adjust_n_rule_rows()
+
+ def _compute_features(self, feature_name: str, value: str = "") -> Corpus:
+ """
+ Send `self.corpus` to widget, set statistic which need bo be computed,
+ run the computation, and return widget output.
+
+ Parameters
+ ----------
+ feature_name
+ The name of the statistic, only one statistic is set
+ value
+ The value if statistic need it.
+
+ Returns
+ -------
+ Resulting corpus.
+ """
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
+ self._set_feature(feature_name, value)
+ self.widget.apply()
+ self.wait_until_finished()
+ res = self.get_output(self.widget.Outputs.corpus)
+ self.assertTupleEqual((len(self.corpus), 1), res.X.shape)
+ return res
+
+ def test_send_data(self):
+ """ Test with basic data, and empty data """
+ self.send_signal(self.widget.Inputs.corpus, self.book_data)
+ self.assertEqual(len(self.book_data), len(self.widget.corpus))
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.widget.corpus)
+ self.widget.apply()
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_words_count(self):
+ """ Test words count statistic """
+ data = self._compute_features("Word count")
+ np.testing.assert_array_equal(data.X.flatten(), [8, 9, 11, 9])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_characters_count(self):
+ """ Test characters count statistic """
+ data = self._compute_features("Character count")
+ np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_n_gram_count(self):
+ """ Test n-grams count statistic """
+ data = self._compute_features("N-gram count")
+ np.testing.assert_array_equal(data.X.flatten(), [10, 12, 13, 12])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_word_density(self):
+ """ Test word density statistic """
+ data = self._compute_features("Average word length")
+ np.testing.assert_array_almost_equal(
+ data.X.flatten(), [5.875, 4.888889, 4.363636, 5.666667]
+ )
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_punctuations_cont(self):
+ """ Test punctuations count statistic """
+ data = self._compute_features("Punctuation count")
+ np.testing.assert_array_equal(data.X.flatten(), [2, 3, 2, 3])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_capitals_count(self):
+ """ Test capitals count statistic """
+ data = self._compute_features("Capital letter count")
+ np.testing.assert_array_equal(data.X.flatten(), [1, 1, 2, 1])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_vowels_count(self):
+ """ Test vowels count statistic """
+ data = self._compute_features("Vowel count", "a,e,i,o,u")
+ np.testing.assert_array_equal(data.X.flatten(), [19, 20, 23, 20])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_consonants_count(self):
+ """ Test consonants count statistic """
+ data = self._compute_features(
+ "Consonant count", "b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z"
+ )
+ np.testing.assert_array_equal(data.X.flatten(), [28, 24, 25, 30])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_per_cent_unique_words(self):
+ """ Test per-cent unique words statistic """
+ data = self._compute_features("Per cent unique words")
+ np.testing.assert_array_almost_equal(
+ data.X.flatten(), [1, 1, 0.909091, 1]
+ )
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_starts_with(self):
+ """ Test starts with count statistic """
+ data = self._compute_features("Starts with", "a")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 2, 2])
+
+ data = self._compute_features("Starts with", "ap")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_ends_with(self):
+ """ Test ends with count statistic """
+ data = self._compute_features("Ends with", "t")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [3, 3, 1, 2])
+
+ data = self._compute_features("Ends with", "et")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 0, 0])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_contains(self):
+ """ Test contains count statistic """
+ data = self._compute_features("Contains", "t")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
+
+ data = self._compute_features("Contains", "et")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [2, 1, 0, 0])
+
+ data = self._compute_features("Contains", "is")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_regex(self):
+ """ Test regex statistic """
+        # words that contain a digit
+ data = self._compute_features("Regex", "\w*\d\w*")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
+
+        # words that contain "is"
+ data = self._compute_features("Regex", "\w*is\w*")
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
+
+ def test_pos(self):
+ """
+        Test POS tags count
+ - test with corpus that has no pos tags - warning raised
+ - test with corpus that has pos tags
+ """
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self._set_feature("POS tag", "NN")
+ self.widget.apply()
+ self.wait_until_finished()
+ res = self.get_output(self.widget.Outputs.corpus)
+ self.assertEqual(0, res.X.shape[1])
+ self.assertTrue(self.widget.Warning.not_computed.is_shown())
+
+ tagger = AveragedPerceptronTagger()
+ result = tagger.tag_corpus(self.corpus)
+
+ self.send_signal(self.widget.Inputs.corpus, result)
+ self._set_feature("POS tag", "NN")
+ self.widget.apply()
+ self.wait_until_finished()
+ res = self.get_output(self.widget.Outputs.corpus)
+ self.assertTupleEqual((len(self.corpus), 1), res.X.shape)
+ np.testing.assert_array_almost_equal(res.X.flatten(), [7, 6, 4, 6])
+ self.assertFalse(self.widget.Warning.not_computed.is_shown())
+
+ def test_statistics_combination(self):
+ """
+ Testing three statistics at same time and see if column concatenated
+ correctly.
+ """
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+
+ wc_index = STATISTICS_NAMES.index("Word count")
+ starts_with_index = STATISTICS_NAMES.index("Starts with")
+ capital_counts_index = STATISTICS_NAMES.index("Capital letter count")
+ self.widget.active_rules = [
+ (wc_index, ""),
+ (starts_with_index, "a"),
+ (capital_counts_index, ""),
+ ]
+ self.widget.adjust_n_rule_rows()
+
+ self.widget.apply()
+ self.wait_until_finished()
+ res = self.get_output(self.widget.Outputs.corpus)
+
+ self.assertTupleEqual((len(self.corpus), 3), res.X.shape)
+ np.testing.assert_array_almost_equal(
+ res.X[:, 0].flatten(), [8, 9, 11, 9]
+ )
+ np.testing.assert_array_almost_equal(
+ res.X[:, 1].flatten(), [2, 0, 2, 2]
+ )
+ np.testing.assert_array_almost_equal(
+ res.X[:, 2].flatten(), [1, 1, 2, 1]
+ )
+
+ def test_dictionary_statistics(self):
+ """
+ Test remove statistic from the dictionary when they are not required
+ """
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+
+ self.widget.active_rules = [
+ (1, ""),
+ ]
+ self.widget.adjust_n_rule_rows()
+ self.widget.apply()
+ self.wait_until_finished()
+
+ self.assertListEqual([(1, "")], list(self.widget.result_dict.keys()))
+
+ self.widget.active_rules = [(1, ""), (2, "")]
+ self.widget.adjust_n_rule_rows()
+ self.widget.apply()
+ self.wait_until_finished()
+
+ self.assertListEqual(
+ [(1, ""), (2, "")], list(self.widget.result_dict.keys())
+ )
+
+ self.widget.active_rules = [(2, "")]
+ self.widget.adjust_n_rule_rows()
+ self.widget.apply()
+ self.wait_until_finished()
+
+ self.assertListEqual([(2, "")], list(self.widget.result_dict.keys()))
+
+        # dict should be empty on new data
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertListEqual([], list(self.widget.result_dict.keys()))
+
+ def test_context(self):
+ """ Test whether context correctly restore rules """
+ rules = [(0, ""), (1, ""), (2, "")]
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.widget.active_rules = rules[:]
+
+ self.send_signal(self.widget.Inputs.corpus, self.book_data)
+ self.assertListEqual([(0, ""), (1, "")], self.widget.active_rules)
+
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertListEqual(rules, self.widget.active_rules)
+
+ def test_compute_values(self):
+ """ Test compute values on new data """
+ data = self._compute_features("Word count")
+
+ computed = Corpus.from_table(data.domain, self.book_data)
+ self.assertEqual(data.domain, computed.domain)
+ self.assertTupleEqual((len(self.book_data), 1), computed.X.shape)
+
+ def test_append_to_existing_X(self):
+ """ Test if new features are correctly attached to X matrix """
+ data = Corpus.from_file("election-tweets-2016")
+ self.send_signal(self.widget.Inputs.corpus, data)
+ self.wait_until_finished()
+ statistics = self.get_output(self.widget.Outputs.corpus)
+
+ self.assertTupleEqual(
+ (data.X.shape[0], data.X.shape[1] + 2), statistics.X.shape
+ )
+
+ def test_add_row(self):
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
+ self.widget.active_rules = []
+ self.widget.adjust_n_rule_rows()
+ add_button = [
+ x
+ for x in self.widget.controlArea.findChildren(QPushButton)
+ if x.text() == "+"
+ ][0]
+ add_button.click()
+ self.assertListEqual([(0, "")], self.widget.active_rules)
+
+ def test_remove_row(self):
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.widget.active_rules = [(0, "")]
+ self.widget.adjust_n_rule_rows()
+ self.assertListEqual([(0, "")], self.widget.active_rules)
+
+ remove_button = [
+ x
+ for x in self.widget.controlArea.findChildren(QPushButton)
+ if x.text() == "×"
+ ][0]
+ remove_button.click()
+ self.assertListEqual([], self.widget.active_rules)
+
+ def test_input_summary(self):
+ """ Test correctness of the input summary """
+ self.widget.info.set_input_summary = in_sum = Mock()
+
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ in_sum.assert_called_with(
+ len(self.corpus),
+ "4 instances, 1 variable\nFeatures: —\nTarget: —\nMetas: string "
+ "(not shown)",
+ )
+ in_sum.reset_mock()
+
+ self.send_signal(self.widget.Inputs.corpus, self.book_data)
+ in_sum.assert_called_with(
+ len(self.book_data),
+ "140 instances, 2 variables\nFeatures: —\nTarget: categorical\n"
+ "Metas: string (not shown)",
+ )
+ in_sum.reset_mock()
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ in_sum.assert_called_with(self.widget.info.NoInput)
+
+ def test_output_summary(self):
+ """ Test correctness of the output summary"""
+ self.widget.info.set_output_summary = out_sum = Mock()
+
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
+ out_sum.assert_called_with(
+ len(self.corpus),
+ "4 instances, 3 variables\nFeatures: 2 numeric\nTarget: —\nMetas: "
+ "string (not shown)",
+ )
+ out_sum.reset_mock()
+
+ self.send_signal(self.widget.Inputs.corpus, self.book_data)
+ self.wait_until_finished()
+ out_sum.assert_called_with(
+ len(self.book_data),
+ "140 instances, 4 variables\nFeatures: 2 numeric\nTarget: "
+ "categorical\nMetas: string (not shown)",
+ )
+ out_sum.reset_mock()
+
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.wait_until_finished()
+ out_sum.assert_called_with(self.widget.info.NoOutput)
+
+ def test_remove_function(self):
+ """
+ This test will start to fail when version of Orange > 3.26.0
+ When this tests fails:
+ - removes `format_summary_details` and `format_variables_string` from
+ utils.widget
+ - replace `format_summary_details` in statistics widget with the same
+ function from core orange
+ - set minimum orange version to 3.25 for the text add-on
+ """
+ self.assertLessEqual(
+ pkg_resources.get_distribution("orange3").version, "3.26.0"
+ )
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/orangecontrib/text/widgets/utils/context.py b/orangecontrib/text/widgets/utils/context.py
new file mode 100644
index 000000000..fe383c866
--- /dev/null
+++ b/orangecontrib/text/widgets/utils/context.py
@@ -0,0 +1,28 @@
+from Orange.widgets.settings import PerfectDomainContextHandler
+
+
class AlmostPerfectContextHandler(PerfectDomainContextHandler):
    """
    A context handler that matches when a sufficient share of the variables
    in both domains coincides (e.g. 0.9), instead of requiring a perfect
    match. The position of variables (attribute, meta, class_var) is not
    important, since widgets that use this handler do not use their values
    directly.

    Attributes
    ----------
    share_domain_matches
        The share of domain variables that need to match (strictly exceeded).
    """
    def __init__(self, share_domain_matches: float) -> None:
        super().__init__()
        self.share_domain_matches = share_domain_matches

    def match(self, context, domain, attributes, class_vars, metas):
        """
        Return PERFECT_MATCH when the share of the new domain's variables
        also present in the stored context exceeds `share_domain_matches`,
        otherwise NO_MATCH.
        """
        context_vars = context.attributes + context.class_vars + context.metas
        domain_vars = attributes + class_vars + metas
        if not domain_vars:
            # Guard: an empty domain would raise ZeroDivisionError below;
            # with nothing to match, treat the context as not matching.
            return self.NO_MATCH
        matching_vars = [var for var in context_vars if var in domain_vars]

        share = len(matching_vars) / len(domain_vars)
        return (self.PERFECT_MATCH
                if share > self.share_domain_matches
                else self.NO_MATCH)
diff --git a/orangecontrib/text/widgets/utils/widgets.py b/orangecontrib/text/widgets/utils/widgets.py
index b06fcb5e4..effdb5c1f 100644
--- a/orangecontrib/text/widgets/utils/widgets.py
+++ b/orangecontrib/text/widgets/utils/widgets.py
@@ -7,6 +7,8 @@
QGridLayout, QCheckBox, QStackedLayout)
from AnyQt.QtGui import QColor
from AnyQt.QtCore import QDate, pyqtSignal, Qt, QSize
+from Orange.data import DiscreteVariable, ContinuousVariable, TimeVariable, \
+ StringVariable
from Orange.widgets.gui import OWComponent, hBox
from Orange.widgets import settings
@@ -565,3 +567,66 @@ def load_provider(self, path_to_file):
self.resource_path = path_to_file
self.valueChanged.emit(self.model_path, self.resource_path)
+
def format_variables_string(variables):
    """
    Format the descriptive part of the input/output summary for
    either features, targets or metas of the input dataset.

    :param variables: Features, targets or metas of the input dataset
    :return: A formatted string
    """
    if not variables:
        return '—'

    labelled_types = (('categorical', DiscreteVariable),
                      ('numeric', ContinuousVariable),
                      ('time', TimeVariable),
                      ('string', StringVariable))
    counts_per_type = []
    for label, var_type in labelled_types:
        # A `TimeVariable` is also a `ContinuousVariable`, so each variable
        # must be classified by its exact type rather than `isinstance`,
        # which would count time variables twice.
        n_of_type = sum(1 for v in variables if type(v) is var_type)  # pylint: disable=unidiomatic-typecheck
        if n_of_type:
            suffix = ' (not shown)' \
                if issubclass(var_type, StringVariable) else ''
            counts_per_type.append((f'{label}{suffix}', n_of_type))

    names, counts = list(zip(*counts_per_type))
    if len(names) > 1:
        per_type = [f'{c} {t}' for c, t in zip(counts, names)]
        return f'{sum(counts)} ({", ".join(per_type)})'
    if counts[0] == 1:
        # A single variable of a single type: show just the type name.
        return names[0]
    return f'{counts[0]} {names[0]}'
+
+
def format_summary_details(data):
    """
    Form the entire descriptive part of the input/output summary.

    :param data: A dataset
    :type data: Orange.data.Table
    :return: A formatted string; empty when there is no data
    """
    if not data:
        return ''

    domain = data.domain
    n_instances = len(data)
    # Metas count towards the variable total shown in the first line.
    n_variables = len(domain.variables) + len(domain.metas)
    instance_s = 's' if n_instances != 1 else ''
    variable_s = 's' if n_variables != 1 else ''
    return (
        f'{n_instances} instance{instance_s}, '
        f'{n_variables} variable{variable_s}\n'
        f'Features: {format_variables_string(domain.attributes)}\n'
        f'Target: {format_variables_string(domain.class_vars)}\n'
        f'Metas: {format_variables_string(domain.metas)}'
    )