Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OWStatistics: Add new statistics method #676

Merged
merged 2 commits into from
Aug 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion orangecontrib/text/widgets/owstatistics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import re
from collections import Counter
from copy import copy
from itertools import groupby
from string import punctuation
from typing import Callable, List, Optional, Tuple

Expand All @@ -12,14 +14,14 @@
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input, Output, OWWidget
from nltk import tokenize
from orangewidget.widget import Msg

from orangecontrib.text import Corpus

# These functions are implemented here because they are used by more than one statistic
from orangecontrib.text.preprocess import (
LowercaseTransformer,
Preprocessor,
RegexpTokenizer,
PreprocessorList)
from orangecontrib.text.widgets.utils.context import (
Expand Down Expand Up @@ -325,6 +327,63 @@ def cust_count(tags):
)


def yule(
    corpus: Corpus, _: str, callback: Callable
) -> Optional[Tuple[np.ndarray, List[str]]]:
    """
    Yule's I measure: higher number is higher diversity - richer vocabulary
    PSP volume 42 issue 2 Cover and Back matter. (1946).
    Mathematical Proceedings of the Cambridge Philosophical Society, 42(2), B1-B2.
    doi:10.1017/S0305004100022799

    The measure is computed per document over ``corpus.pos_tags``.

    Parameters
    ----------
    corpus
        Object exposing ``pos_tags``: either None or an iterable with one
        sequence of tags per document.
    _
        Unused; present so the signature matches the other statistics.
    callback
        Called with no arguments once per processed document.

    Returns
    -------
    None when ``corpus.pos_tags`` is None, otherwise a tuple of a column
    array with one value per document and the list of column names.
    """
    if corpus.pos_tags is None:
        return None

    def yules_i(tags):
        callback()
        frequencies = Counter(tags)
        # NOTE(review): m1 is the number of *distinct* tags, not the total
        # number of tags - confirm this matches the intended definition.
        m1 = float(len(frequencies))
        # Sum of squared frequencies; same value as grouping the sorted
        # frequencies and weighting each squared frequency by its group size.
        m2 = sum(freq ** 2 for freq in frequencies.values())
        try:
            return (m1 * m1) / (m2 - m1)
        except ZeroDivisionError:
            # m2 == m1: no tags at all, or every tag occurs exactly once
            return 0.0

    return (
        np.c_[[yules_i(tags) for tags in corpus.pos_tags]],
        ["Yule's I"],
    )


def lix(
    corpus: Corpus, _: str, callback: Callable
) -> Optional[Tuple[np.ndarray, List[str]]]:
    """
    Readability index LIX
    https://en.wikipedia.org/wiki/Lix_(readability_test)

    For every document the score is
    ``words / sentences + 100 * long_words / words``, where a long word is a
    token with more than six characters. Sentences are counted with NLTK's
    Punkt sentence tokenizer on the document text; words are the tokens of
    the corpus returned by ``preprocess_only_words``.

    Parameters
    ----------
    corpus
        Corpus to score; it is passed through ``preprocess_only_words`` first.
    _
        Unused; present so the signature matches the other statistics.
    callback
        Called with no arguments once per processed document.

    Returns
    -------
    Tuple of a column array with one score per document and the list of
    column names.
    """
    corpus = preprocess_only_words(corpus)
    sentence_splitter = tokenize.PunktSentenceTokenizer()

    def lix_index(text, words):
        callback()
        # if the text is a single sentence, scores will be high
        n_sentences = len(sentence_splitter.tokenize(text))
        n_words = len(words)
        n_long = sum(1 for word in words if len(word) > 6)
        try:
            return n_words / n_sentences + (n_long * 100 / n_words)
        except ZeroDivisionError:
            # no sentences or no words in the document
            return 0

    scores = [
        lix_index(text, words)
        for text, words in zip(corpus.documents, corpus.tokens)
    ]
    return np.c_[scores], ["LIX index"]


class ComputeValue:
"""
Class which provides compute value functionality. It stores the function
Expand Down Expand Up @@ -375,6 +434,8 @@ def __call__(self, data: Corpus) -> np.ndarray:
("Contains", contains, ""),
("Regex", regex, ""),
("POS tag", pos_tags, "NN,VV,JJ"),
("Yule's I", yule, None),
("LIX index", lix, None),
]
STATISTICS_NAMES = list(list(zip(*STATISTICS))[0])
STATISTICS_FUNCTIONS = list(list(zip(*STATISTICS))[1])
Expand Down Expand Up @@ -428,6 +489,7 @@ class Warning(OWWidget.Warning):
)

want_main_area = False
mainArea_width_height_ratio = None
settingsHandler = AlmostPerfectContextHandler(0.9)

# settings
Expand Down
42 changes: 42 additions & 0 deletions orangecontrib/text/widgets/tests/test_owstatistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,48 @@ def test_pos(self):
np.testing.assert_array_almost_equal(res.X.flatten(), [6, 5, 4, 5])
self.assertFalse(self.widget.Warning.not_computed.is_shown())

def test_yule(self):
    """
    Test Yule's I - complexity index.
    - corpus without pos tags: no column is added and the warning is shown
    - corpus with pos tags: one column is added and no warning is shown
    """
    def compute_statistic():
        # select the statistic, recompute and fetch the widget output
        self._set_feature("Yule's I")
        self.widget.apply()
        self.wait_until_finished()
        return self.get_output(self.widget.Outputs.corpus)

    # corpus as provided by the fixture - not tagged yet
    self.send_signal(self.widget.Inputs.corpus, self.corpus)
    untagged_output = compute_statistic()
    self.assertEqual(0, untagged_output.X.shape[1])
    self.assertTrue(self.widget.Warning.not_computed.is_shown())

    # shorten the second document, then tag the corpus
    self.corpus[1][-1] = "simple"
    tagged_corpus = AveragedPerceptronTagger()(self.corpus)

    self.send_signal(self.widget.Inputs.corpus, tagged_corpus)
    tagged_output = compute_statistic()
    expected_shape = (len(self.corpus), 1)
    self.assertTupleEqual(expected_shape, tagged_output.X.shape)
    # the second document will have lower complexity than the first one
    self.assertLess(tagged_output[1][0], tagged_output[0][0])
    self.assertFalse(self.widget.Warning.not_computed.is_shown())

def test_lix(self):
    """
    Test LIX readability score: one column is produced and the shortened
    second document scores lower than the first one.
    """
    # replace the second document with two one-word sentences
    self.corpus[1][-1] = "simple. simple."
    self.send_signal(self.widget.Inputs.corpus, self.corpus)

    self._set_feature("LIX index")
    self.widget.apply()
    self.wait_until_finished()

    output = self.get_output(self.widget.Outputs.corpus)
    expected_shape = (len(self.corpus), 1)
    self.assertTupleEqual(expected_shape, output.X.shape)
    # the second document will have lower complexity than the first one
    self.assertLess(output[1][0], output[0][0])

def test_statistics_combination(self):
"""
Testing three statistics at same time and see if column concatenated
Expand Down