Skip to content

Commit

Permalink
Word Enrichment: Added ConcurrentWidgetMixin to compute in another thread
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 17, 2020
1 parent b0f6807 commit cf96f20
Showing 1 changed file with 101 additions and 68 deletions.
169 changes: 101 additions & 68 deletions orangecontrib/text/widgets/owwordenrichment.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,48 @@
from types import SimpleNamespace
from typing import List, Optional, Any

import numpy as np
from AnyQt.QtWidgets import QTreeWidget, QTreeView, QTreeWidgetItem, \
QApplication
from AnyQt.QtWidgets import QTreeWidget, QTreeView, QTreeWidgetItem

from Orange.data import Table, Domain
from Orange.widgets import gui
from Orange.widgets.settings import Setting
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.widget import OWWidget, Msg, Input
from Orange.statistics.util import FDR
from PyQt5.QtCore import QSize
from orangecontrib.text import Corpus
from orangecontrib.text.util import np_sp_sum
from orangecontrib.text.stats import hypergeom_p_values
from orangecontrib.text.vectorization import BowVectorizer


class OWWordEnrichment(OWWidget):
class Result(SimpleNamespace):
    """Mutable holder for the values produced by ``Runner.run``.

    Every field defaults to ``None`` until the runner assigns it.
    """

    # Names of the attributes in the transformed selection's domain.
    words: Optional[List[str]] = None
    # Output of ``hypergeom_p_values`` for the data and the selection.
    p_values: Optional[List[float]] = None
    # Output of ``FDR`` applied to ``p_values``.
    fdr_values: Optional[List[float]] = None


class Runner:
    """Namespace for the task function handed to ``self.start`` by the widget."""

    @staticmethod
    def run(
            selected_data_transformed: Table,
            data: Table,
            result: Result,
            state: TaskState
    ) -> None:
        """Compute the word list, p-values and FDR values into ``result``.

        Nothing is returned; the three fields of ``result`` are assigned in
        place, and ``state`` is told which step is running before each one.

        :param selected_data_transformed: the selection; its domain
            attributes provide the words and its ``X`` is the second
            argument to ``hypergeom_p_values``
        :param data: the full data; its ``X`` is the first argument to
            ``hypergeom_p_values``
        :param result: object that receives ``words``, ``p_values`` and
            ``fdr_values``
        :param state: task state used for status text and progress
        """
        state.set_status("Listing words")
        result.words = [
            attribute.name
            for attribute in selected_data_transformed.domain.attributes
        ]

        state.set_status("Computing p-values")
        # Progress reporting is delegated to the helper through the callback.
        p_values = hypergeom_p_values(
            data.X,
            selected_data_transformed.X,
            callback=state.set_progress_value,
        )
        result.p_values = p_values

        state.set_status("Computing FDR values")
        result.fdr_values = FDR(p_values)


class OWWordEnrichment(OWWidget, ConcurrentWidgetMixin):
# Basic widget info
name = "Word Enrichment"
description = "Word enrichment analysis for selected documents."
Expand All @@ -35,28 +63,22 @@ class Error(OWWidget.Error):
all_selected = Msg('All examples can not be selected!')

# Settings
filter_by_p = Setting(False)
filter_p_value = Setting(0.01)
filter_by_fdr = Setting(True)
filter_fdr_value = Setting(0.2)
filter_by_p: bool = Setting(False)
filter_p_value: float = Setting(0.01)
filter_by_fdr: bool = Setting(True)
filter_fdr_value: float = Setting(0.2)

def __init__(self):
super().__init__()
OWWidget.__init__(self)
ConcurrentWidgetMixin.__init__(self)

# Init data
self.data = None
self.selected_data = None
self.selected_data_transformed = None # used for transforming the 'selected data' into the 'data' domain

self.words = []
self.p_values = []
self.fdr_values = []
# used for transforming the 'selected data' into the 'data' domain
self.selected_data_transformed = None

# Info section
fbox = gui.widgetBox(self.controlArea, "Info")
self.info_all = gui.label(fbox, self, 'Cluster words:')
self.info_sel = gui.label(fbox, self, 'Selected words:')
self.info_fil = gui.label(fbox, self, 'After filtering:')
self.results = Result()

# Filtering settings
fbox = gui.widgetBox(self.controlArea, "Filter")
Expand All @@ -68,7 +90,6 @@ def __init__(self):
self.spin_p = gui.doubleSpin(hbox, self, 'filter_p_value',
1e-4, 1, step=1e-4, labelWidth=15,
callback=self.filter_and_display,
callbackOnReturn=True,
tooltip="Max p-value for word")
self.spin_p.setEnabled(self.filter_by_p)

Expand All @@ -79,7 +100,6 @@ def __init__(self):
self.spin_fdr = gui.doubleSpin(hbox, self, 'filter_fdr_value',
1e-4, 1, step=1e-4, labelWidth=15,
callback=self.filter_and_display,
callbackOnReturn=True,
tooltip="Max p-value for word")
self.spin_fdr.setEnabled(self.filter_by_fdr)
gui.rubber(self.controlArea)
Expand All @@ -90,7 +110,7 @@ def __init__(self):
self.sig_words.setColumnCount(len(self.cols))
self.sig_words.setHeaderLabels(self.cols)
self.sig_words.setSortingEnabled(True)
self.sig_words.setSelectionMode(QTreeView.ExtendedSelection)
self.sig_words.setSelectionMode(QTreeView.NoSelection)
self.sig_words.sortByColumn(2, 0) # 0 is ascending order
for i in range(len(self.cols)):
self.sig_words.resizeColumnToContents(i)
Expand All @@ -102,6 +122,8 @@ def sizeHint(self):
@Inputs.data
def set_data(self, data=None):
    """Remember the data arriving on the ``data`` input.

    :param data: the new data, or ``None`` when the input is removed
    """
    # The transformed selection depends on the data domain, so a value
    # computed for the previous data is discarded here.
    self.selected_data_transformed = None
    self.data = data

@Inputs.selected_data
def set_data_selected(self, data=None):
Expand Down Expand Up @@ -135,7 +157,8 @@ def check_data(self):
self.clear()
return
self.data = Corpus.from_table(bow_domain, self.data)
self.selected_data_transformed = Corpus.from_table(bow_domain, self.selected_data)
self.selected_data_transformed = Corpus.from_table(
bow_domain, self.selected_data)

if np_sp_sum(self.selected_data_transformed.X) == 0:
self.Error.no_words_overlap()
Expand All @@ -144,15 +167,15 @@ def check_data(self):
self.Error.all_selected()
self.clear()
else:
self.set_input_info()
self.apply()
else:
self.clear()

def clear(self):
self.sig_words.clear()
self.info_all.setText('Cluster words:')
self.info_sel.setText('Selected words:')
self.info_fil.setText('After filtering:')
self.info.set_input_summary(self.info.NoInput)
self.info.set_output_summary(self.info.NoOutput)

def filter_enabled(self, b):
self.chb_p.setEnabled(b)
Expand All @@ -168,43 +191,58 @@ def filter_and_display(self):
if self.selected_data_transformed is None: # do nothing when no Data
return

count = 0
if self.words:
for word, pval, fval in zip(self.words, self.p_values, self.fdr_values):
if (not self.filter_by_p or pval <= self.filter_p_value) and \
(not self.filter_by_fdr or fval <= self.filter_fdr_value):
it = EATreeWidgetItem(word, pval, fval, self.sig_words)
self.sig_words.addTopLevelItem(it)
count += 1
if self.results.words:
count = self.build_tree()
else:
count = 0

for i in range(len(self.cols)):
self.sig_words.resizeColumnToContents(i)
self.set_output_info(count)

self.info_all.setText('Cluster words: {}'.format(len(self.selected_data_transformed.domain.attributes)))
self.info_sel.setText('Selected words: {}'.format(np.count_nonzero(np_sp_sum(self.selected_data_transformed.X, axis=0))))
if not self.filter_by_p and not self.filter_by_fdr:
self.info_fil.setText('After filtering:')
self.info_fil.setEnabled(False)
else:
self.info_fil.setEnabled(True)
self.info_fil.setText('After filtering: {}'.format(count))

def progress(self, p):
self.progressBarSet(p)
def build_tree(self) -> int:
    """Add one tree item per word that passes the enabled filters.

    A word is kept when each enabled filter (p-value, FDR) has its value at
    or below the configured threshold; a disabled filter lets every word
    through.

    :return: the number of items added to ``self.sig_words``
    """
    shown = 0
    rows = zip(
        self.results.words,
        self.results.p_values,
        self.results.fdr_values,
    )
    for word, p_value, fdr_value in rows:
        # Written as "not <=" rather than ">" so that a value which
        # compares false both ways (e.g. NaN) is still rejected.
        if self.filter_by_p and not p_value <= self.filter_p_value:
            continue
        if self.filter_by_fdr and not fdr_value <= self.filter_fdr_value:
            continue
        item = EATreeWidgetItem(word, p_value, fdr_value, self.sig_words)
        self.sig_words.addTopLevelItem(item)
        shown += 1
    return shown

def set_input_info(self) -> None:
    """Publish word counts of the transformed selection as the input summary.

    The short summary is ``"<total>|<in subset>"``; the detailed text spells
    both numbers out on separate lines.
    """
    transformed = self.selected_data_transformed
    cluster_words = len(transformed.domain.attributes)

    # presumably per-word totals over the selected documents -- np_sp_sum is
    # defined elsewhere; confirm its axis semantics there.
    totals = np_sp_sum(transformed.X, axis=0)
    selected_words = np.count_nonzero(totals)

    brief = f"{cluster_words}|{selected_words}"
    details = (
        f"Total words: {cluster_words}\n"
        f"Words in subset: {selected_words}"
    )
    self.info.set_input_summary(brief, details)

def set_output_info(self, count: int) -> None:
    """Publish the number of displayed words as the output summary.

    :param count: number of words currently shown
    """
    brief = str(count)
    details = f"{count} words displayed"
    self.info.set_output_summary(brief, details)

def apply(self):
self.clear()
self.progressBarInit()
self.sig_words.clear()
self.filter_enabled(False)

self.words = [i.name for i in self.selected_data_transformed.domain.attributes]
self.p_values = hypergeom_p_values(self.data.X,
self.selected_data_transformed.X,
callback=self.progress)
self.fdr_values = FDR(self.p_values)
self.start(
Runner.run,
self.selected_data_transformed,
self.data,
self.results
)

def on_done(self, result: Result) -> None:
self.filter_and_display()
self.filter_enabled(True)
self.progressBarFinished()

def on_exception(self, ex: Exception) -> None:
    """Recover the UI after the background computation failed.

    The filter controls are re-enabled so the widget stays usable, and the
    exception is logged with its traceback so the failure is not silently
    lost.

    :param ex: the exception passed to this handler
    """
    import logging

    self.filter_enabled(True)
    # No exception is being handled at this point, so the instance is passed
    # explicitly through exc_info to get its traceback into the log.
    logging.getLogger(__name__).error(
        "Word enrichment computation failed", exc_info=ex)

def tree_to_table(self):
view = [self.cols]
Expand All @@ -214,12 +252,13 @@ def tree_to_table(self):
for j in range(3):
line.append(self.sig_words.topLevelItem(i).text(j))
view.append(line)
return(view)
return view

def send_report(self):
if self.words:
if self.results.words:
self.report_table("Enriched words", self.tree_to_table())


def fp(score):
    """Format *score* as a short string.

    Values above 0.01 (written ``10e-3`` below) use fixed-point notation with
    five decimals; everything else, including 0.01 itself, uses scientific
    notation with one decimal.
    """
    return "%0.5f" % score if score > 10e-3 else "%0.1e" % score
def fpt(score):
    """Format *score* as a longer, higher-precision string.

    Values above 0.01 (written ``10e-3`` below) use fixed-point notation with
    nine decimals; everything else, including 0.01 itself, uses scientific
    notation with five decimals.
    """
    return "%0.9f" % score if score > 10e-3 else "%0.5e" % score

Expand All @@ -238,19 +277,13 @@ def __lt__(self, other):
col = self.treeWidget().sortColumn()
return self.data[col] < other.data[col]

def main():

if __name__ == '__main__':
from orangewidget.utils.widgetpreview import WidgetPreview
from orangecontrib.text.vectorization import BowVectorizer

corpus = Corpus.from_file('book-excerpts')
vect = BowVectorizer()
corpus_vect = vect.transform(corpus)
app = QApplication([])
widget = OWWordEnrichment()
widget.set_data(corpus_vect)
subset_corpus = corpus_vect[:10]
widget.set_data_selected(subset_corpus)
widget.handleNewSignals()
widget.show()
app.exec()

if __name__ == '__main__':
main()
WidgetPreview(OWWordEnrichment).run(
set_data_selected=corpus_vect[:10], set_data=corpus_vect)

0 comments on commit cf96f20

Please sign in to comment.