Skip to content

Commit

Permalink
[ENH] OWCollocations Widget
Browse files Browse the repository at this point in the history
  • Loading branch information
ajdapretnar committed May 6, 2022
1 parent bd433eb commit 7c8424d
Show file tree
Hide file tree
Showing 2 changed files with 250 additions and 0 deletions.
178 changes: 178 additions & 0 deletions orangecontrib/text/widgets/owcollocations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
from collections import namedtuple
import numpy as np

from Orange.data.util import get_unique_names
from Orange.widgets.data.owrank import TableModel, TableView
from Orange.widgets.widget import OWWidget
from Orange.data import Domain, StringVariable, ContinuousVariable, Table
from AnyQt.QtCore import Qt
from AnyQt.QtWidgets import QTableView

from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk import BigramCollocationFinder, TrigramCollocationFinder

from orangecontrib.text import Corpus
from orangewidget import settings, gui
from orangewidget.utils.signals import Input, Output
from orangewidget.utils.widgetpreview import WidgetPreview

# Collocation finder classes; the position in this list is the value of the
# widget's ``type_index`` setting (first bigrams, then trigrams).
NGRAM_TYPES = [BigramCollocationFinder, TrigramCollocationFinder]

# A scoring method: its display name and the list of scorer callables.
# ``scorer`` is indexed like NGRAM_TYPES, so a method that lists only one
# callable is available for bigrams only.
ScoreMeta = namedtuple("score_meta", ("name", "scorer"))

bi_measures = BigramAssocMeasures()
tri_measures = TrigramAssocMeasures()

SCORING_METHODS = [
    ScoreMeta(name="Pointwise Mutual Information",
              scorer=[bi_measures.pmi, tri_measures.pmi]),
    ScoreMeta(name="Chi Square",
              scorer=[bi_measures.chi_sq, tri_measures.chi_sq]),
    ScoreMeta(name="Dice",
              scorer=[bi_measures.dice]),
    ScoreMeta(name="Fisher",
              scorer=[bi_measures.fisher]),
    ScoreMeta(name="Jaccard",
              scorer=[bi_measures.jaccard, tri_measures.jaccard]),
    ScoreMeta(name="Likelihood ratio",
              scorer=[bi_measures.likelihood_ratio,
                      tri_measures.likelihood_ratio]),
    ScoreMeta(name="Mi Like",
              scorer=[bi_measures.mi_like, tri_measures.mi_like]),
    ScoreMeta(name="Phi Square",
              scorer=[bi_measures.phi_sq]),
    ScoreMeta(name="Poisson Stirling",
              scorer=[bi_measures.poisson_stirling,
                      tri_measures.poisson_stirling]),
    ScoreMeta(name="Raw Frequency",
              scorer=[bi_measures.raw_freq, tri_measures.raw_freq]),
    ScoreMeta(name="Student's T",
              scorer=[bi_measures.student_t, tri_measures.student_t]),
]


class OWCollocations(OWWidget):
    """Score bigram or trigram collocations of a corpus.

    The widget builds an NLTK collocation finder from the corpus tokens,
    drops n-grams below the chosen frequency, scores the rest with the
    selected association measure, shows the first 20 rows in a table view
    and outputs all scored collocations as a data table.
    """
    name = "Collocations"
    description = "Compute significant bigrams and trigrams."
    keywords = ["PMI"]
    icon = "icons/Collocations.svg"

    class Inputs:
        corpus = Input("Corpus", Corpus)

    class Outputs:
        corpus = Output("Table", Table)

    want_main_area = True

    # settings
    type_index = settings.Setting(0)       # index into NGRAM_TYPES
    selected_method = settings.Setting(0)  # index into SCORING_METHODS
    freq_filter = settings.Setting(1)      # passed to apply_freq_filter
    auto_apply = settings.Setting(True)

    def __init__(self) -> None:
        OWWidget.__init__(self)
        self.corpus = None
        self.type = NGRAM_TYPES[self.type_index]
        self.method = None
        # Either None (nothing to show) or a pair of column arrays
        # ``(collocations, scores)`` produced by ``compute_scores``.
        self.results = None

        setting_box = gui.vBox(self.controlArea, box="Settings")
        gui.radioButtons(setting_box, self, "type_index",
                         btnLabels=["Bigrams", "Trigrams"],
                         orientation=Qt.Horizontal,
                         callback=self._change_type)

        gui.spin(setting_box, self, "freq_filter", minv=1, maxv=1000, step=1,
                 label="Frequency", callback=self.commit)

        method_box = gui.vBox(self.controlArea, box="Scoring Method")
        self.method_rb = gui.radioButtons(method_box, self, "selected_method",
                                          btnLabels=[m.name for m in
                                                     SCORING_METHODS],
                                          callback=self.commit)

        gui.rubber(self.controlArea)

        gui.button(self.buttonsArea, self, "Restore Original Order",
                   callback=self.restore_order,
                   tooltip="Show rows in the original order",
                   autoDefault=False)

        # GUI
        self.collModel = model = TableModel(parent=self)  # type: TableModel
        model.setHorizontalHeaderLabels(["Method", "Score"])
        self.collView = view = TableView(self)  # type: TableView
        self.mainArea.layout().addWidget(view)
        view.setModel(model)
        view.resizeColumnsToContents()
        view.setSelectionMode(QTableView.NoSelection)

        # Settings may have been restored with trigrams selected; make the
        # method buttons (and the selected method) agree with that before
        # any data arrives.
        self._sync_method_buttons()

    @Inputs.corpus
    def set_corpus(self, corpus):
        """Store the new corpus (or None), reset the view and recompute."""
        self.collModel.clear()
        self.collModel.resetSorting(True)
        self.corpus = corpus
        self.commit()

    def _sync_method_buttons(self):
        """Make the method buttons consistent with the current n-gram type.

        A scoring method is usable only if it lists a scorer at position
        ``type_index``. Unusable methods are disabled, and if the currently
        selected method is one of them the selection falls back to the
        first method.
        """
        for button, method in zip(self.method_rb.buttons, SCORING_METHODS):
            button.setDisabled(self.type_index >= len(method.scorer))

        current = SCORING_METHODS[self.selected_method]
        if self.type_index >= len(current.scorer):
            # Plain assignment rather than a simulated click, so that the
            # commit callback is not triggered here.
            self.selected_method = 0

    def _change_type(self):
        """Handle a switch between bigrams and trigrams."""
        self.type = NGRAM_TYPES[self.type_index]
        self._sync_method_buttons()
        self.commit()

    def compute_scores(self):
        """Score the n-grams of the current corpus into ``self.results``.

        ``self.results`` becomes a pair of single-column arrays
        ``(collocations, scores)``, or None when no n-gram passes the
        frequency filter.
        """
        self.collModel.clear()
        self.collModel.resetSorting(True)
        finder = self.type.from_documents(self.corpus.tokens)
        finder.apply_freq_filter(self.freq_filter)

        res = finder.score_ngrams(self.method.scorer[self.type_index])
        if not res:
            # Nothing survived the filter; building (0, 1) arrays here would
            # make the emptiness check in ``commit`` pass by accident.
            self.results = None
            return

        collocations = np.array([" ".join(col) for col, _ in res],
                                dtype=object)[:, None]
        scores = np.array([score for _, score in res], dtype=float)[:, None]

        self.results = (collocations, scores)

    def commit(self):
        """Recompute the scores, refresh the table view and send the output.

        Sends None when there is no corpus or no n-gram to report.
        """
        if self.corpus is None:
            # Clear the output too, otherwise the table computed for the
            # previous corpus stays on the output signal.
            self.results = None
            self.Outputs.corpus.send(None)
            return

        self.type = NGRAM_TYPES[self.type_index]
        self.method = SCORING_METHODS[self.selected_method]

        self.compute_scores()

        if not self.results:
            self.collModel.clear()
            self.Outputs.corpus.send(None)
            return

        output = self.create_scores_table()
        collocations, scores = self.results
        # Only the first 20 rows are displayed; slice before stacking so the
        # full result is not copied just to be truncated.
        self.collModel[:] = np.hstack([collocations[:20], scores[:20]])
        self.collView.resizeColumnsToContents()

        self.Outputs.corpus.send(output)

    def create_scores_table(self):
        """Return the current results as a table.

        The score is the only attribute (continuous) and the collocation
        text is a string meta. Variable names are made unique with respect
        to the input corpus domain.
        """
        col_var, score_var = get_unique_names(self.corpus.domain,
                                              ["Collocations", "Scores"])

        # The score column gets the "Scores" name and the text column the
        # "Collocations" name.
        domain = Domain([ContinuousVariable(score_var)],
                        metas=[StringVariable(col_var)])

        collocations, scores = self.results

        new_table = Table.from_numpy(domain, scores, metas=collocations)
        new_table.name = "Collocation Scores"
        return new_table

    def restore_order(self):
        """Restore the original data order of the current view."""
        model = self.collModel
        if model is not None:
            model.resetSorting(yes_reset=True)

if __name__ == "__main__":  # pragma: no cover
    # Manual preview: open the widget on the bundled "deerwester.tab" corpus.
    WidgetPreview(OWCollocations).run(
        Corpus.from_file("deerwester.tab"), no_exit=True
    )
72 changes: 72 additions & 0 deletions orangecontrib/text/widgets/tests/test_owcollocations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import unittest

from AnyQt.QtCore import Qt
from Orange.widgets.tests.base import WidgetTest

from orangecontrib.text import Corpus
from orangecontrib.text import preprocess
from orangecontrib.text.widgets.owcollocations import OWCollocations


class TestOWCollocations(WidgetTest):
    """Tests for the Collocations widget, run on the book-excerpts corpus."""

    def setUp(self) -> None:
        self.widget: OWCollocations = self.create_widget(OWCollocations)

        # create corpus
        self.corpus = Corpus.from_file("book-excerpts")

    def test_set_data(self):
        """The view shows 20 rows while the output holds every collocation."""
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        output = self.get_output(self.widget.Outputs.corpus)
        self.assertEqual(len(self.widget.collModel), 20)
        self.assertEqual(len(output), 69309)

    def test_preprocessed(self):
        """The widget accepts a corpus that went through preprocessing."""
        pp_list = [
            preprocess.LowercaseTransformer(),
            preprocess.PunktSentenceTokenizer(),
            preprocess.SnowballStemmer(),
        ]
        # Feed each step the result of the previous one; applying every
        # preprocessor to the raw corpus would keep only the last step.
        pp_corpus = self.corpus
        for p in pp_list:
            pp_corpus = p(pp_corpus)

        self.send_signal(self.widget.Inputs.corpus, pp_corpus)
        self.assertEqual(len(self.widget.collModel), 20)

    def test_trigrams(self):
        """Switching to trigrams yields rows with more words than bigrams."""
        model = self.widget.collModel
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        bigram = len(model[0][0].split(" "))

        # trigrams
        self.widget.controls.type_index.buttons[1].click()
        trigram = len(model[0][0].split(" "))

        self.assertGreater(trigram, bigram)

    def test_change_scorer(self):
        """Every scoring method produces an output table for bigrams."""
        model = self.widget.collModel
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        self.assertEqual(len(model[0]), 2)

        for button in self.widget.controls.selected_method.buttons:
            button.click()
            # Check the value actually sent; the Output descriptor itself
            # is always truthy and would make this assertion vacuous.
            self.assertIsNotNone(
                self.get_output(self.widget.Outputs.corpus)
            )

    def test_sort_table(self):
        """Test that sorting the table for one method doesn't crash the
        widget when changing method"""
        view = self.widget.collView
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        score = self.widget.collModel[0][1]

        view.horizontalHeader().setSortIndicator(0, Qt.AscendingOrder)

        # change method
        self.widget.controls.selected_method.buttons[1].click()
        self.assertNotEqual(self.widget.collModel[0][1], score)


if __name__ == "__main__":
    # Allow running this test module directly as a script.
    unittest.main()

0 comments on commit 7c8424d

Please sign in to comment.