Skip to content

Commit

Permalink
owbasevectorizer - computation in the thread
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed May 25, 2022
1 parent 4c38b3f commit c809707
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 20 deletions.
11 changes: 11 additions & 0 deletions orangecontrib/text/tests/test_bowvectorizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
from unittest.mock import MagicMock, call

import numpy as np
from Orange.data import Domain, StringVariable
Expand Down Expand Up @@ -231,6 +232,16 @@ def test_tfidf_correctness(self):
idfs_test = self.test_counts * np.log(n / document_appearance)
self.assert_bow_same(bow_test, idfs_test, self.terms)

def test_callback(self):
    """Transforming with a callback reports the expected progress values."""
    progress = MagicMock()
    documents = Corpus.from_file("deerwester")

    transformed = BowVectorizer().transform(documents, callback=progress)

    self.assertIsInstance(transformed, Corpus)
    self.assertEqual(len(transformed.domain.variables), 43)
    expected_calls = [call(fraction) for fraction in (0.3, 0.6, 0.9, 1)]
    progress.assert_has_calls(expected_calls)


if __name__ == "__main__":
unittest.main()
15 changes: 15 additions & 0 deletions orangecontrib/text/tests/test_simhash.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
from unittest.mock import MagicMock, call

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.vectorization import SimhashVectorizer
Expand All @@ -18,3 +19,17 @@ def test_transform(self):
def test_report(self):
    """The vectorizer's report must contain at least one item."""
    report = SimhashVectorizer().report()
    self.assertGreater(len(report), 0)

def test_callback(self):
    """Check the transformed corpus and the progress values sent to the callback."""
    vect = SimhashVectorizer(shingle_len=10, f=64)
    callback = MagicMock()
    result = vect.transform(self.corpus, callback=callback)

    self.assertIsInstance(result, Corpus)
    self.assertEqual(len(result), len(self.corpus))
    self.assertEqual(result.X.shape, (len(self.corpus), 64))
    # Derive the number of expected calls from the corpus instead of
    # hard-coding it, so numerator and denominator always agree with the
    # fixture that is actually loaded.
    n_docs = len(self.corpus)
    callback.assert_has_calls([call(i / n_docs) for i in range(n_docs)])


if __name__ == "__main__":
unittest.main()
7 changes: 6 additions & 1 deletion orangecontrib/text/vectorization/bagofwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from functools import partial

import numpy as np
from Orange.util import dummy_callback
from gensim import corpora, models, matutils
from sklearn.preprocessing import normalize

Expand Down Expand Up @@ -68,18 +69,21 @@ def __init__(self, norm=NONE, wlocal=COUNT, wglobal=NONE):
self.wlocal = wlocal
self.wglobal = wglobal

def _transform(self, corpus, source_dict=None):
def _transform(self, corpus, source_dict=None, callback=dummy_callback):
temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
callback(0.3)
temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
model = models.TfidfModel(dictionary=dic, normalize=False,
wlocal=self.wlocals[self.wlocal],
wglobal=self.wglobals[self.wglobal])
callback(0.6)

X = matutils.corpus2csc(model[temp_corpus], dtype=float, num_terms=len(dic)).T
norm = self.norms[self.norm]
if norm:
X = norm(X)
callback(0.9)

# set compute values
shared_cv = SharedTransform(self, corpus.used_preprocessor,
Expand All @@ -88,6 +92,7 @@ def _transform(self, corpus, source_dict=None):
for i in range(len(dic))]

corpus = self.add_features(corpus, X, dic, cv, var_attrs={'bow-feature': True})
callback(1)
return corpus

def report(self):
Expand Down
7 changes: 4 additions & 3 deletions orangecontrib/text/vectorization/base.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
import numpy as np

from Orange.data.util import SharedComputeValue
from Orange.util import dummy_callback
from orangecontrib.text.util import Sparse2CorpusSliceable


class BaseVectorizer:
"""Base class for vectorization objects. """
name = NotImplemented

def transform(self, corpus, copy=True, source_dict=None):
def transform(self, corpus, copy=True, source_dict=None, callback=dummy_callback):
"""Transforms a corpus to a new one with additional attributes. """
if not (len(corpus.dictionary) or source_dict) or not len(corpus):
return corpus
if copy:
corpus = corpus.copy()
return self._transform(corpus, source_dict)
return self._transform(corpus, source_dict, callback)

def _transform(self, corpus, source_dict):
def _transform(self, corpus, source_dict, callback):
raise NotImplementedError

def report(self):
Expand Down
10 changes: 7 additions & 3 deletions orangecontrib/text/vectorization/simhash.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import nltk
from Orange.util import dummy_callback
from simhash import Simhash
import numpy as np

Expand Down Expand Up @@ -36,7 +37,7 @@ def compute_hash(self, tokens):
def int2binarray(self, num):
    """Format ``num`` with ``self._bin_format`` and return each character as an int."""
    digits = self._bin_format.format(num)
    return list(map(int, digits))

def _transform(self, corpus, source_dict):
def _transform(self, corpus, source_dict, callback=dummy_callback):
""" Computes simhash values from the given corpus
and creates a new one with a simhash attribute.
Expand All @@ -46,8 +47,11 @@ def _transform(self, corpus, source_dict):
Returns:
Corpus with `simhash` variable
"""

X = np.array([self.int2binarray(self.compute_hash(doc)) for doc in corpus.tokens], dtype=float)
hashes = []
for i, doc in enumerate(corpus.tokens):
hashes.append(self.int2binarray(self.compute_hash(doc)))
callback(i / len(corpus))
X = np.array(hashes, dtype=float)
corpus = corpus.extend_attributes(
X,
feature_names=[
Expand Down
22 changes: 21 additions & 1 deletion orangecontrib/text/widgets/tests/test_owbagofwords.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest

from Orange.widgets.tests.base import WidgetTest, WidgetOutputsTestMixin
from Orange.widgets.tests.base import WidgetTest

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.widgets.owbagofwords import OWTBagOfWords
Expand All @@ -19,6 +19,26 @@ def test_corpus(self):
self.send_signal("Corpus", self.corpus)
self.send_signal("Corpus", None)

def test_output(self):
    """Check the widget output for the full corpus and for a two-document slice."""
    self.send_signal("Corpus", self.corpus)
    full_output = self.get_output(self.widget.Outputs.corpus)
    self.assertEqual(len(self.corpus), len(full_output))
    self.assertEqual(42, len(full_output.domain.attributes))

    self.send_signal("Corpus", self.corpus[:2])
    sliced_output = self.get_output(self.widget.Outputs.corpus)
    self.assertEqual(2, len(sliced_output))

    # fmt: off
    expected_names = [
        "a", "abc", "applications", "computer", "for", "human", "interface",
        "lab", "machine", "of", "opinion", "response", "survey", "system",
        "time", "user"
    ]
    # fmt: on
    actual_names = [attr.name for attr in sliced_output.domain.attributes]
    self.assertListEqual(expected_names, actual_names)


if __name__ == "__main__":
unittest.main()
30 changes: 30 additions & 0 deletions orangecontrib/text/widgets/tests/test_owsimhash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import unittest

from Orange.widgets.tests.base import WidgetTest

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.widgets.owsimhash import OWSimhash


class TestOWSimhash(WidgetTest):
    """Basic signal and output tests for the OWSimhash widget."""

    def setUp(self):
        self.widget = self.create_widget(OWSimhash)
        self.corpus = Corpus.from_file("deerwester")

    def test_corpus(self):
        """
        Just basic test.
        GH-247
        """
        # Send a corpus first, then remove it again.
        for signal in (self.corpus, None):
            self.send_signal("Corpus", signal)

    def test_output(self):
        """The output keeps every document and has 64 attributes."""
        self.send_signal("Corpus", self.corpus)
        produced = self.get_output(self.widget.Outputs.corpus)
        self.assertEqual(len(self.corpus), len(produced))
        self.assertEqual(64, len(produced.domain.attributes))


if __name__ == "__main__":
unittest.main()
44 changes: 32 additions & 12 deletions orangecontrib/text/widgets/utils/owbasevectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,21 @@

from Orange.widgets import gui
from Orange.widgets import settings
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin
from Orange.widgets.widget import OWWidget, Input, Output
from orangecontrib.text.corpus import Corpus


class OWBaseVectorizer(OWWidget):
def _run(method, corpus, task_state):
    """Transform ``corpus`` with ``method``, reporting progress to ``task_state``.

    The callback handed to ``method.transform`` raises ``Exception`` when
    ``task_state`` reports an interruption request; otherwise it forwards
    the received progress value, multiplied by 100, to ``task_state``.

    Returns whatever ``method.transform`` returns.
    """

    def report_progress(fraction):
        interrupted = task_state.is_interruption_requested()
        if interrupted:
            raise Exception
        percent = fraction * 100
        task_state.set_progress_value(percent)

    return method.transform(corpus, callback=report_progress)


class OWBaseVectorizer(OWWidget, ConcurrentWidgetMixin, openclass=True):
""" A base class for feature extraction methods.
Notes:
Expand All @@ -30,6 +40,7 @@ class Outputs:

def __init__(self):
super().__init__()
ConcurrentWidgetMixin.__init__(self)
self.corpus = None
self.method = None
self.new_corpus = None
Expand All @@ -52,7 +63,7 @@ def set_data(self, data):
self.corpus = data
if self.corpus is None:
self.new_corpus, self.new_attrs = None, None
self.invalidate()
self.apply()

def hide_attrs(self):
if self.new_corpus:
Expand All @@ -61,28 +72,37 @@ def hide_attrs(self):
if f.name in self.new_attrs:
f.attributes['hidden'] = self.hidden_cb
self.new_corpus = self.new_corpus.transform(new_domain)
self.commit()
self.commit.deferred()

@gui.deferred
def commit(self):
# Send the current value of self.new_corpus on the corpus output.
self.Outputs.corpus.send(self.new_corpus)

def apply(self):
if self.corpus is not None:
self.new_corpus = self.method.transform(self.corpus)
self.new_attrs = {f.name for f in self.new_corpus.domain.attributes} \
- {f.name for f in self.corpus.domain.attributes}

def invalidate(self):
self.apply()
self.hide_attrs()
self.commit()
self.start(_run, self.method, self.corpus)
else:
self.cancel()
self.commit.deferred()

def on_done(self, result: Corpus) -> None:
# Store the finished result and record the names of the attributes it has
# that the input corpus does not; otherwise clear the stored output.
# NOTE(review): both operands are tested by truthiness rather than
# `is not None`; if an empty Corpus evaluates as falsy, an empty result is
# handled like a removed input — confirm this is intended.
if result and self.corpus:
self.new_corpus = result
self.new_attrs = {f.name for f in result.domain.attributes} - {
f.name for f in self.corpus.domain.attributes
}
self.hide_attrs()
else:
# when corpus removed from the input while processing
self.new_attrs = self.new_corpus = None
self.commit.deferred()

def update_method(self):
# Instantiate self.Method and store the instance as self.method.
self.method = self.Method()

def on_change(self):
self.update_method()
self.invalidate()
self.apply()

def send_report(self):
self.report_items(self.method.report())
Expand Down

0 comments on commit c809707

Please sign in to comment.