Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Word Enrichment: Computing in a separate thread (with ConcurrentMixin) #492

Merged
merged 2 commits into from
Feb 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 103 additions & 66 deletions orangecontrib/text/widgets/owwordenrichment.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,48 @@
from types import SimpleNamespace
from typing import List, Optional, Any

import numpy as np
from AnyQt.QtWidgets import QTreeWidget, QTreeView, QTreeWidgetItem, \
QApplication
from AnyQt.QtWidgets import QTreeWidget, QTreeView, QTreeWidgetItem

from Orange.data import Table, Domain
from Orange.widgets import gui
from Orange.widgets.settings import Setting
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.widget import OWWidget, Msg, Input
from Orange.statistics.util import FDR
from PyQt5.QtCore import QSize
from orangecontrib.text import Corpus
from orangecontrib.text.util import np_sp_sum
from orangecontrib.text.stats import hypergeom_p_values
from orangecontrib.text.vectorization import BowVectorizer


class OWWordEnrichment(OWWidget):
class Result(SimpleNamespace):
    """Mutable container for the output of the enrichment computation.

    All fields default to ``None`` at class level, so a freshly created
    ``Result()`` reads as "nothing computed yet". ``Runner.run`` assigns
    the three fields on the instance it is given.
    """

    # Names of the words the statistics refer to.
    words: Optional[List[str]] = None
    # p-values, stored from the result of ``hypergeom_p_values``.
    p_values: Optional[List[float]] = None
    # FDR values, stored from ``FDR(p_values)``.
    fdr_values: Optional[List[float]] = None


class Runner:
    """Namespace holding the enrichment computation as a static method."""

    @staticmethod
    def run(
            selected_data_transformed: Table,
            data: Table,
            result: Result,
            state: TaskState
    ) -> None:
        """Compute words, p-values and FDR values and store them on `result`.

        Nothing is returned: the three outputs are written to
        ``result.words``, ``result.p_values`` and ``result.fdr_values``
        in that order, so `result` is only partially filled if a step
        raises.

        Parameters
        ----------
        selected_data_transformed
            Selected documents; its domain attributes supply the word names
            and its ``X`` is passed to ``hypergeom_p_values``.
        data
            The full data set; only its ``X`` is used.
        result
            Object that receives the outputs (mutated in place).
        state
            Used to report status text and progress
            (``set_status`` / ``set_progress_value``).
            NOTE(review): presumably supplied by ConcurrentWidgetMixin when
            the task is started on a worker thread - confirm.
        """
        state.set_status("Listing words")
        result.words = [
            attr.name for attr in selected_data_transformed.domain.attributes]

        state.set_status("Computing p-values")
        result.p_values = hypergeom_p_values(
            data.X, selected_data_transformed.X,
            callback=state.set_progress_value
        )

        state.set_status("Computing FDR values")
        result.fdr_values = FDR(result.p_values)


class OWWordEnrichment(OWWidget, ConcurrentWidgetMixin):
# Basic widget info
name = "Word Enrichment"
description = "Word enrichment analysis for selected documents."
Expand All @@ -35,28 +63,26 @@ class Error(OWWidget.Error):
all_selected = Msg('All examples can not be selected!')

# Settings
filter_by_p = Setting(False)
filter_p_value = Setting(0.01)
filter_by_fdr = Setting(True)
filter_fdr_value = Setting(0.2)
filter_by_p: bool = Setting(False)
filter_p_value: float = Setting(0.01)
filter_by_fdr: bool = Setting(True)
filter_fdr_value: float = Setting(0.2)

def __init__(self):
super().__init__()
OWWidget.__init__(self)
ConcurrentWidgetMixin.__init__(self)

# Init data
self.data = None
self.selected_data = None
self.selected_data_transformed = None # used for transforming the 'selected data' into the 'data' domain
# used for transforming the 'selected data' into the 'data' domain
self.selected_data_transformed = None

self.words = []
self.p_values = []
self.fdr_values = []
self.results = Result()

# Info section
# info box
fbox = gui.widgetBox(self.controlArea, "Info")
self.info_all = gui.label(fbox, self, 'Cluster words:')
self.info_sel = gui.label(fbox, self, 'Selected words:')
self.info_fil = gui.label(fbox, self, 'After filtering:')
self.info_fil = gui.label(fbox, self, 'Words displayed: 0')

# Filtering settings
fbox = gui.widgetBox(self.controlArea, "Filter")
Expand All @@ -68,7 +94,6 @@ def __init__(self):
self.spin_p = gui.doubleSpin(hbox, self, 'filter_p_value',
1e-4, 1, step=1e-4, labelWidth=15,
callback=self.filter_and_display,
callbackOnReturn=True,
tooltip="Max p-value for word")
self.spin_p.setEnabled(self.filter_by_p)

Expand All @@ -79,7 +104,6 @@ def __init__(self):
self.spin_fdr = gui.doubleSpin(hbox, self, 'filter_fdr_value',
1e-4, 1, step=1e-4, labelWidth=15,
callback=self.filter_and_display,
callbackOnReturn=True,
tooltip="Max p-value for word")
self.spin_fdr.setEnabled(self.filter_by_fdr)
gui.rubber(self.controlArea)
Expand All @@ -90,7 +114,7 @@ def __init__(self):
self.sig_words.setColumnCount(len(self.cols))
self.sig_words.setHeaderLabels(self.cols)
self.sig_words.setSortingEnabled(True)
self.sig_words.setSelectionMode(QTreeView.ExtendedSelection)
self.sig_words.setSelectionMode(QTreeView.NoSelection)
self.sig_words.sortByColumn(2, 0) # 0 is ascending order
for i in range(len(self.cols)):
self.sig_words.resizeColumnToContents(i)
Expand All @@ -102,6 +126,8 @@ def sizeHint(self):
@Inputs.data
def set_data(self, data=None):
    """Store the `data` input and drop the cached transformed selection.

    ``selected_data_transformed`` is reset to ``None`` because it is
    expressed in the domain of `data` and is therefore stale once `data`
    changes.
    """
    self.data = data
    # selected data transformed depends on data domain
    self.selected_data_transformed = None

@Inputs.selected_data
def set_data_selected(self, data=None):
Expand Down Expand Up @@ -135,7 +161,8 @@ def check_data(self):
self.clear()
return
self.data = Corpus.from_table(bow_domain, self.data)
self.selected_data_transformed = Corpus.from_table(bow_domain, self.selected_data)
self.selected_data_transformed = Corpus.from_table(
bow_domain, self.selected_data)

if np_sp_sum(self.selected_data_transformed.X) == 0:
self.Error.no_words_overlap()
Expand All @@ -144,15 +171,15 @@ def check_data(self):
self.Error.all_selected()
self.clear()
else:
self.set_input_info()
self.apply()
else:
self.clear()

def clear(self):
self.sig_words.clear()
self.info_all.setText('Cluster words:')
self.info_sel.setText('Selected words:')
self.info_fil.setText('After filtering:')
self.info.set_input_summary(self.info.NoInput)
self.set_displayed_info(0)

def filter_enabled(self, b):
self.chb_p.setEnabled(b)
Expand All @@ -168,43 +195,58 @@ def filter_and_display(self):
if self.selected_data_transformed is None: # do nothing when no Data
return

count = 0
if self.words:
for word, pval, fval in zip(self.words, self.p_values, self.fdr_values):
if (not self.filter_by_p or pval <= self.filter_p_value) and \
(not self.filter_by_fdr or fval <= self.filter_fdr_value):
it = EATreeWidgetItem(word, pval, fval, self.sig_words)
self.sig_words.addTopLevelItem(it)
count += 1
if self.results.words:
count = self.build_tree()
else:
count = 0

for i in range(len(self.cols)):
self.sig_words.resizeColumnToContents(i)
self.set_displayed_info(count)

self.info_all.setText('Cluster words: {}'.format(len(self.selected_data_transformed.domain.attributes)))
self.info_sel.setText('Selected words: {}'.format(np.count_nonzero(np_sp_sum(self.selected_data_transformed.X, axis=0))))
if not self.filter_by_p and not self.filter_by_fdr:
self.info_fil.setText('After filtering:')
self.info_fil.setEnabled(False)
else:
self.info_fil.setEnabled(True)
self.info_fil.setText('After filtering: {}'.format(count))

def progress(self, p):
self.progressBarSet(p)
def build_tree(self) -> int:
    """Add a tree item for every word that passes the enabled filters.

    A word is kept when each *enabled* filter accepts it: the p-value
    filter requires ``pval <= self.filter_p_value`` and the FDR filter
    requires ``fval <= self.filter_fdr_value``. A disabled filter accepts
    everything.

    Returns
    -------
    int
        Number of items added to ``self.sig_words``.
    """
    count = 0
    for word, pval, fval in zip(
            self.results.words,
            self.results.p_values,
            self.results.fdr_values
    ):
        if ((not self.filter_by_p or pval <= self.filter_p_value) and
                (not self.filter_by_fdr or fval <= self.filter_fdr_value)):
            it = EATreeWidgetItem(word, pval, fval, self.sig_words)
            self.sig_words.addTopLevelItem(it)
            count += 1
    return count

def set_input_info(self) -> None:
    """Publish the word counts of the current input as the input summary.

    Two numbers are reported, both taken from
    ``self.selected_data_transformed``:

    * the number of attributes in its domain ("Total words"), and
    * the number of columns whose ``np_sp_sum(..., axis=0)`` entry is
      non-zero ("Words in subset").
    """
    cluster_words = len(self.selected_data_transformed.domain.attributes)
    selected_words = np.count_nonzero(np_sp_sum(
        self.selected_data_transformed.X, axis=0))

    # Short form "total|subset" first, then the detailed multi-line text.
    self.info.set_input_summary(
        f"{cluster_words}|{selected_words}",
        f"Total words: {cluster_words}\n"
        f"Words in subset: {selected_words}")

def set_displayed_info(self, count: int) -> None:
    """Show `count` in the "Words displayed" info label."""
    self.info_fil.setText(f"Words displayed: {count}")

def apply(self):
self.clear()
self.progressBarInit()
self.sig_words.clear()
self.filter_enabled(False)

self.words = [i.name for i in self.selected_data_transformed.domain.attributes]
self.p_values = hypergeom_p_values(self.data.X,
self.selected_data_transformed.X,
callback=self.progress)
self.fdr_values = FDR(self.p_values)
self.start(
Runner.run,
self.selected_data_transformed,
self.data,
self.results
)

def on_done(self, result: Result) -> None:
self.filter_and_display()
self.filter_enabled(True)
self.progressBarFinished()

def on_exception(self, ex: Exception) -> None:
    """Re-enable the filter controls after the computation failed.

    `ex` is not inspected here; the handler only undoes the
    ``filter_enabled(False)`` call made when the computation was started.
    """
    self.filter_enabled(True)

def tree_to_table(self):
view = [self.cols]
Expand All @@ -214,12 +256,13 @@ def tree_to_table(self):
for j in range(3):
line.append(self.sig_words.topLevelItem(i).text(j))
view.append(line)
return(view)
return view

def send_report(self):
if self.words:
if self.results.words:
self.report_table("Enriched words", self.tree_to_table())


def fp(score):
    """Format `score` as a short string.

    Values above 0.01 use fixed notation with 5 decimals; anything else
    (including exactly 0.01) uses scientific notation with 1 decimal.
    """
    # 10e-3 is 0.01
    return "%0.5f" % score if score > 10e-3 else "%0.1e" % score


def fpt(score):
    """Format `score` like ``fp`` but with more digits.

    Values above 0.01 use fixed notation with 9 decimals; anything else
    uses scientific notation with 5 decimals.
    """
    # 10e-3 is 0.01
    return "%0.9f" % score if score > 10e-3 else "%0.5e" % score

Expand All @@ -238,19 +281,13 @@ def __lt__(self, other):
col = self.treeWidget().sortColumn()
return self.data[col] < other.data[col]

def main():

if __name__ == '__main__':
from orangewidget.utils.widgetpreview import WidgetPreview
from orangecontrib.text.vectorization import BowVectorizer

corpus = Corpus.from_file('book-excerpts')
vect = BowVectorizer()
corpus_vect = vect.transform(corpus)
app = QApplication([])
widget = OWWordEnrichment()
widget.set_data(corpus_vect)
subset_corpus = corpus_vect[:10]
widget.set_data_selected(subset_corpus)
widget.handleNewSignals()
widget.show()
app.exec()

if __name__ == '__main__':
main()
WidgetPreview(OWWordEnrichment).run(
set_data_selected=corpus_vect[:10], set_data=corpus_vect)
Loading