Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Score Documents - adapt to the latest changes in document embedding #866

Merged
merged 1 commit into from
Jun 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions orangecontrib/text/widgets/owscoredocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,14 @@ def _embedding_similarity(
emb = DocumentEmbedder(language)

cb_part = len(corpus) / (len(corpus) + len(words))
documet_embeddings, skipped = emb(corpus, wrap_callback(callback, 0, cb_part))
documet_embeddings, skipped = emb.transform(
corpus, wrap_callback(callback, 0, cb_part)
)
assert skipped is None

words = [[w] for w in words]
word_embeddings = np.array(
emb([[w] for w in words], wrap_callback(callback, cb_part, 1 - cb_part))
emb.transform(words, wrap_callback(callback, cb_part, 1 - cb_part))
)
return cosine_similarity(documet_embeddings.X, word_embeddings)

Expand Down
19 changes: 5 additions & 14 deletions orangecontrib/text/widgets/tests/test_owscoredocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from Orange.widgets.tests.utils import simulate

from orangecontrib.text import Corpus, preprocess
from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
from orangecontrib.text.vectorization.document_embedder import _ServerEmbedder
from orangecontrib.text.widgets.owscoredocuments import (
OWScoreDocuments,
SelectionMethods,
Expand All @@ -22,17 +22,8 @@
from orangecontrib.text.widgets.utils.words import create_words_table


def embedding_mock(_, corpus, __):
    """Return constant all-ones embeddings in place of a real embedder call.

    Used in this test module as the replacement for
    ``DocumentEmbedder.__call__`` via ``patch.object``; the first
    positional argument therefore takes the place of the embedder instance
    and the third one the place of the progress callback. Both are ignored.

    Parameters
    ----------
    corpus : list or Corpus
        Items to "embed". Only its length is used.

    Returns
    -------
    numpy.ndarray or tuple
        For a ``list`` input, an array of ones with shape
        ``(len(corpus), 10)``. For any other input, a 2-tuple of a
        ``Corpus`` built from an all-ones ``X`` with ten continuous
        variables named ``"0"`` .. ``"9"``, and ``None`` as the second
        element.
    """
    if isinstance(corpus, list):
        return np.ones((len(corpus), 10))

    # corpus is Corpus
    return (
        Corpus.from_numpy(
            domain=Domain([ContinuousVariable(str(i)) for i in range(10)]),
            X=np.ones((len(corpus), 10)),
        ),
        None,
    )
def embedding_mock(_, data, callback=None):
    """Return an all-ones embedding matrix with one row per item in ``data``.

    Used in this test module as the replacement for
    ``_ServerEmbedder.embedd_data`` via ``patch.object``; the first
    positional argument takes the place of the embedder instance and is
    ignored, as is ``callback``.

    Parameters
    ----------
    data : sized collection
        Items to "embed". Only ``len(data)`` is used.
    callback : optional
        Accepted for signature compatibility; never called.

    Returns
    -------
    numpy.ndarray
        Array of ones with shape ``(len(data), 10)``.
    """
    return np.ones((len(data), 10))


class TestOWScoreDocuments(WidgetTest):
Expand Down Expand Up @@ -126,7 +117,7 @@ def test_guess_word_attribute(self):
self.send_signal(self.widget.Inputs.words, None)
self.assertIsNone(self.widget.words)

@patch.object(DocumentEmbedder, "__call__", new=embedding_mock)
@patch.object(_ServerEmbedder, "embedd_data", new=embedding_mock)
def test_change_scorer(self):
model = self.widget.model
self.send_signal(self.widget.Inputs.corpus, self.corpus)
Expand Down Expand Up @@ -224,7 +215,7 @@ def test_word_appearance(self):
self.wait_until_finished()
self.assertListEqual([x[1] for x in self.widget.model], [1, 1, 1])

@patch.object(DocumentEmbedder, "__call__", new=embedding_mock)
@patch.object(_ServerEmbedder, "embedd_data", new=embedding_mock)
def test_embedding_similarity(self):
corpus = self.create_corpus(
[
Expand Down