From 839444a6fc93dd3a938f70ecf21868da6468d4b7 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Thu, 26 Jan 2023 11:23:51 +0100
Subject: [PATCH] Keywords - replace embedding with MBERT
---
orangecontrib/text/widgets/owkeywords.py | 11 +---------
.../text/widgets/tests/test_owkeywords.py | 22 +++++++++----------
2 files changed, 11 insertions(+), 22 deletions(-)
diff --git a/orangecontrib/text/widgets/owkeywords.py b/orangecontrib/text/widgets/owkeywords.py
index 973810f61..eb4d761a4 100644
--- a/orangecontrib/text/widgets/owkeywords.py
+++ b/orangecontrib/text/widgets/owkeywords.py
@@ -324,12 +324,6 @@ def __on_rake_lang_changed(self):
del self.__cached_keywords[ScoringMethods.RAKE]
self.update_scores()
- def __on_emb_lang_changed(self):
- if ScoringMethods.EMBEDDING in self.selected_scoring_methods:
- if ScoringMethods.EMBEDDING in self.__cached_keywords:
- del self.__cached_keywords[ScoringMethods.EMBEDDING]
- self.update_scores()
-
def __on_filter_changed(self):
model = self.view.model()
model.setFilterFixedString(self.__filter_line_edit.text().strip())
@@ -389,10 +383,7 @@ def update_scores(self):
},
ScoringMethods.RAKE: {
"language": RAKE_LANGUAGES[self.rake_lang_index],
- "max_len": self.corpus.ngram_range[1] if self.corpus else 1
- },
- ScoringMethods.EMBEDDING: {
- "language": EMBEDDING_LANGUAGES[self.embedding_lang_index],
+ "max_len": self.corpus.ngram_range[1] if self.corpus else 1,
},
}
self.start(run, self.corpus, self.words, self.__cached_keywords,
diff --git a/orangecontrib/text/widgets/tests/test_owkeywords.py b/orangecontrib/text/widgets/tests/test_owkeywords.py
index 171db94b8..c75734f1f 100644
--- a/orangecontrib/text/widgets/tests/test_owkeywords.py
+++ b/orangecontrib/text/widgets/tests/test_owkeywords.py
@@ -1,5 +1,4 @@
# pylint: disable=missing-docstring
-from typing import List
import unittest
from unittest.mock import Mock, patch
@@ -175,15 +174,17 @@ def test_sort_nans_asc(self):
def test_scoring_methods(self):
# speed-up the test execution
- def dummy_embedding(tokens, language, progress_callback=None):
- return tfidf_keywords(tokens, progress_callback)
-
- methods = [("TF-IDF", Mock(wraps=tfidf_keywords)),
- ("YAKE!", Mock(wraps=yake_keywords)),
- ("Rake", Mock(wraps=rake_keywords)),
- ("Embedding", Mock(side_effect=dummy_embedding))]
+ def dummy_mbert(tokens, progress_callback=None):
+ return [[("kw1", 0.2), ("kw2", 0.3)] for _ in tokens]  # one list per doc
+
+ methods = [
+ ("TF-IDF", Mock(wraps=tfidf_keywords)),
+ ("YAKE!", Mock(wraps=yake_keywords)),
+ ("Rake", Mock(wraps=rake_keywords)),
+ ("MBERT", Mock(side_effect=dummy_mbert)),
+ ]
with patch.object(ScoringMethods, "ITEMS", methods) as m:
- scores = {"TF-IDF", "YAKE!", "Rake", "Embedding"}
+ scores = {"TF-IDF", "YAKE!", "Rake", "MBERT"}
settings = {"selected_scoring_methods": scores}
widget = self.create_widget(OWKeywords, stored_settings=settings)
@@ -191,8 +192,6 @@ def dummy_embedding(tokens, language, progress_callback=None):
simulate.combobox_activate_item(cb, "Arabic")
cb = widget.controls.rake_lang_index
simulate.combobox_activate_item(cb, "Finnish")
- cb = widget.controls.embedding_lang_index
- simulate.combobox_activate_item(cb, "Kazakh")
self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
self.wait_until_finished(widget=widget, timeout=10000)
@@ -205,7 +204,6 @@ def dummy_embedding(tokens, language, progress_callback=None):
m[3][1].assert_called_once()
self.assertEqual(m[1][1].call_args[1]["language"], "Arabic")
self.assertEqual(m[2][1].call_args[1]["language"], "Finnish")
- self.assertEqual(m[3][1].call_args[1]["language"], "Kazakh")
def test_method_change(self):
"""Test method change by clicking"""