-
-
Notifications
You must be signed in to change notification settings - Fork 85
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
bd433eb
commit 7c8424d
Showing
2 changed files
with
250 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
from collections import namedtuple | ||
import numpy as np | ||
|
||
from Orange.data.util import get_unique_names | ||
from Orange.widgets.data.owrank import TableModel, TableView | ||
from Orange.widgets.widget import OWWidget | ||
from Orange.data import Domain, StringVariable, ContinuousVariable, Table | ||
from AnyQt.QtCore import Qt | ||
from AnyQt.QtWidgets import QTableView | ||
|
||
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures | ||
from nltk import BigramCollocationFinder, TrigramCollocationFinder | ||
|
||
from orangecontrib.text import Corpus | ||
from orangewidget import settings, gui | ||
from orangewidget.utils.signals import Input, Output | ||
from orangewidget.utils.widgetpreview import WidgetPreview | ||
|
||
# Collocation finder classes, indexed by the widget's ``type_index`` setting
# (0 = bigrams, 1 = trigrams).
NGRAM_TYPES = [
    BigramCollocationFinder,
    TrigramCollocationFinder,
]

# ``scorer`` is a list indexed like NGRAM_TYPES; a one-element list means the
# measure is only provided for bigrams.
ScoreMeta = namedtuple("score_meta", ["name", "scorer"])

bi_measures = BigramAssocMeasures()
tri_measures = TrigramAssocMeasures()

SCORING_METHODS = [
    ScoreMeta(
        name="Pointwise Mutual Information",
        scorer=[bi_measures.pmi, tri_measures.pmi],
    ),
    ScoreMeta(
        name="Chi Square",
        scorer=[bi_measures.chi_sq, tri_measures.chi_sq],
    ),
    ScoreMeta(name="Dice", scorer=[bi_measures.dice]),
    ScoreMeta(name="Fisher", scorer=[bi_measures.fisher]),
    ScoreMeta(
        name="Jaccard",
        scorer=[bi_measures.jaccard, tri_measures.jaccard],
    ),
    ScoreMeta(
        name="Likelihood ratio",
        scorer=[bi_measures.likelihood_ratio, tri_measures.likelihood_ratio],
    ),
    ScoreMeta(
        name="Mi Like",
        scorer=[bi_measures.mi_like, tri_measures.mi_like],
    ),
    ScoreMeta(name="Phi Square", scorer=[bi_measures.phi_sq]),
    ScoreMeta(
        name="Poisson Stirling",
        scorer=[bi_measures.poisson_stirling, tri_measures.poisson_stirling],
    ),
    ScoreMeta(
        name="Raw Frequency",
        scorer=[bi_measures.raw_freq, tri_measures.raw_freq],
    ),
    ScoreMeta(
        name="Student's T",
        scorer=[bi_measures.student_t, tri_measures.student_t],
    ),
]
|
||
|
||
class OWCollocations(OWWidget):
    """Widget that scores bigram or trigram collocations found in a corpus.

    N-grams are collected from ``corpus.tokens`` with the finder class
    selected by ``type_index``, passed through
    ``apply_freq_filter(freq_filter)`` and scored with the measure selected
    by ``selected_method``.  The first 20 rows are shown in the main-area
    table; the complete result is sent to the output as a table.
    """

    name = "Collocations"
    description = "Compute significant bigrams and trigrams."
    keywords = ["PMI"]
    icon = "icons/Collocations.svg"

    class Inputs:
        corpus = Input("Corpus", Corpus)

    class Outputs:
        corpus = Output("Table", Table)

    want_main_area = True

    # settings
    type_index = settings.Setting(0)       # index into NGRAM_TYPES
    selected_method = settings.Setting(0)  # index into SCORING_METHODS
    freq_filter = settings.Setting(1)      # argument of apply_freq_filter
    auto_apply = settings.Setting(True)

    def __init__(self) -> None:
        OWWidget.__init__(self)
        self.corpus = None
        self.type = NGRAM_TYPES[self.type_index]
        self.method = None
        # Either None (nothing to show) or a ``(collocations, scores)`` pair
        # of single-column arrays produced by ``compute_scores``.
        self.results = None

        setting_box = gui.vBox(self.controlArea, box="Settings")
        gui.radioButtons(setting_box, self, "type_index",
                         btnLabels=["Bigrams", "Trigrams"],
                         orientation=Qt.Horizontal,
                         callback=self._change_type)

        gui.spin(setting_box, self, "freq_filter", minv=1, maxv=1000, step=1,
                 label="Frequency", callback=self.commit)

        method_box = gui.vBox(self.controlArea, box="Scoring Method")
        self.method_rb = gui.radioButtons(
            method_box, self, "selected_method",
            btnLabels=[m.name for m in SCORING_METHODS],
            callback=self.commit)

        gui.rubber(self.controlArea)

        gui.button(self.buttonsArea, self, "Restore Original Order",
                   callback=self.restore_order,
                   tooltip="Show rows in the original order",
                   autoDefault=False)

        # GUI
        self.collModel = model = TableModel(parent=self)  # type: TableModel
        # NOTE(review): the first column holds the collocation text, so the
        # "Method" label looks like a misnomer -- confirm before renaming.
        model.setHorizontalHeaderLabels(["Method", "Score"])
        self.collView = view = TableView(self)  # type: TableView
        self.mainArea.layout().addWidget(view)
        view.setModel(model)
        view.resizeColumnsToContents()
        view.setSelectionMode(QTableView.NoSelection)

        # Stored settings may start the widget in trigram mode; make the
        # method buttons (and the selected method) agree with it.
        self._sync_method_buttons()

    @Inputs.corpus
    def set_corpus(self, corpus):
        """Store the new corpus (or ``None``) and recompute the output."""
        self.collModel.clear()
        self.collModel.resetSorting(True)
        self.corpus = corpus
        self.commit()

    def _sync_method_buttons(self):
        """Disable scoring methods that lack a scorer for the current n-gram
        type and fall back to the first method if the selected one is among
        them.

        Does not commit; callers decide when to recompute.
        """
        buttons = self.method_rb.buttons
        for index, (button, method) in enumerate(zip(buttons,
                                                     SCORING_METHODS)):
            supported = self.type_index < len(method.scorer)
            button.setDisabled(not supported)
            if not supported and self.selected_method == index:
                # Assigning the setting updates the radio group without
                # firing its callback, so no extra commit is triggered here.
                self.selected_method = 0

    def _change_type(self):
        """React to a switch between bigrams and trigrams."""
        self.type = NGRAM_TYPES[self.type_index]
        self._sync_method_buttons()
        self.commit()

    def compute_scores(self):
        """Score the n-grams of the current corpus and store the result.

        Sets ``self.results`` to a ``(collocations, scores)`` pair of
        single-column arrays (space-joined n-gram strings and float scores),
        or to ``None`` when the finder yields no n-grams.
        """
        self.collModel.clear()
        self.collModel.resetSorting(True)
        finder = self.type.from_documents(self.corpus.tokens)
        finder.apply_freq_filter(self.freq_filter)

        res = finder.score_ngrams(self.method.scorer[self.type_index])
        if not res:
            # A tuple of two empty arrays would still be truthy, so signal
            # "no results" explicitly.
            self.results = None
            return

        collocations = np.array([" ".join(col) for col, _ in res],
                                dtype=object)[:, None]
        scores = np.array([score for _, score in res], dtype=float)[:, None]

        self.results = (collocations, scores)

    def commit(self):
        """Recompute the scores, refresh the table view and send the output.

        Sends ``None`` when there is no corpus or no n-gram to report.
        """
        if self.corpus is None:
            # Do not leave a table computed from a removed corpus on the
            # output.
            self.results = None
            self.Outputs.corpus.send(None)
            return

        self.type = NGRAM_TYPES[self.type_index]
        self.method = SCORING_METHODS[self.selected_method]

        self.compute_scores()

        if self.results is None:
            self.collModel.clear()
            self.Outputs.corpus.send(None)
            return

        output = self.create_scores_table()
        # Only the first 20 rows are displayed; the output carries all rows.
        self.collModel[:] = np.hstack(self.results)[:20]
        self.collView.resizeColumnsToContents()

        self.Outputs.corpus.send(output)

    def create_scores_table(self):
        """Build the output table from ``self.results``.

        The scores become the single continuous attribute and the
        collocation strings the single string meta.
        """
        col_var, score_var = get_unique_names(self.corpus.domain,
                                              ["Collocations", "Scores"])

        # Scores are numeric, collocations are text: name each variable
        # after the data it actually holds.
        domain = Domain([ContinuousVariable(score_var)],
                        metas=[StringVariable(col_var)])

        collocations, scores = self.results

        new_table = Table.from_numpy(domain, scores, metas=collocations)
        new_table.name = "Collocation Scores"
        return new_table

    def restore_order(self):
        """Restore the original data order of the current view."""
        model = self.collModel
        if model is not None:
            model.resetSorting(yes_reset=True)
|
||
|
||
if __name__ == "__main__":  # pragma: no cover
    # Manual preview: open the widget on a corpus loaded from a file.
    WidgetPreview(OWCollocations).run(
        Corpus.from_file("deerwester.tab"),
        no_exit=True,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import unittest | ||
|
||
from AnyQt.QtCore import Qt | ||
from Orange.widgets.tests.base import WidgetTest | ||
|
||
from orangecontrib.text import Corpus | ||
from orangecontrib.text import preprocess | ||
from orangecontrib.text.widgets.owcollocations import OWCollocations | ||
|
||
|
||
class TestOWCollocations(WidgetTest):
    """Tests for the Collocations widget."""

    def setUp(self) -> None:
        self.widget: OWCollocations = self.create_widget(OWCollocations)

        # create corpus
        self.corpus = Corpus.from_file("book-excerpts")

    def test_set_data(self):
        """The view shows 20 rows while the output carries every n-gram."""
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        output = self.get_output(self.widget.Outputs.corpus)
        self.assertEqual(len(self.widget.collModel), 20)
        self.assertEqual(len(output), 69309)

    def test_preprocessed(self):
        """The widget works on a corpus that went through preprocessing."""
        pp_list = [
            preprocess.LowercaseTransformer(),
            preprocess.PunktSentenceTokenizer(),
            preprocess.SnowballStemmer(),
        ]
        # Chain the preprocessors: each one works on the result of the
        # previous one instead of restarting from the raw corpus.
        pp_corpus = self.corpus
        for p in pp_list:
            pp_corpus = p(pp_corpus)

        self.send_signal(self.widget.Inputs.corpus, pp_corpus)
        self.assertEqual(len(self.widget.collModel), 20)

    def test_trigrams(self):
        """Switching to trigrams yields longer collocations than bigrams."""
        model = self.widget.collModel
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        bigram = len(model[0][0].split(" "))

        # trigrams
        self.widget.controls.type_index.buttons[1].click()
        trigram = len(self.widget.collModel[0][0].split(" "))

        self.assertGreater(trigram, bigram)

    def test_change_scorer(self):
        """Every scoring method produces an output for bigrams."""
        model = self.widget.collModel
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        self.assertEqual(len(model[0]), 2)

        for button in self.widget.controls.selected_method.buttons:
            button.click()
            # Check the value actually sent; the ``Outputs.corpus``
            # descriptor itself is always truthy.
            self.assertIsNotNone(
                self.get_output(self.widget.Outputs.corpus))

    def test_sort_table(self):
        """Test that sorting the table for one method doesn't crash the
        widget when changing method"""
        view = self.widget.collView
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        score = self.widget.collModel[0][1]

        view.horizontalHeader().setSortIndicator(0, Qt.AscendingOrder)

        # change method
        self.widget.controls.selected_method.buttons[1].click()
        self.assertNotEqual(self.widget.collModel[0][1], score)
|
||
|
||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()