From 13cf312368650f889cd10324684f96e455888f53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Thu, 18 Mar 2021 12:53:50 +0100
Subject: [PATCH] Score documents
---
.../text/widgets/owdocumentscorer.py | 28 +++++++++++--------
1 file changed, 16 insertions(+), 12 deletions(-)
diff --git a/orangecontrib/text/widgets/owdocumentscorer.py b/orangecontrib/text/widgets/owdocumentscorer.py
index 281bfd3b9..76cc226f0 100644
--- a/orangecontrib/text/widgets/owdocumentscorer.py
+++ b/orangecontrib/text/widgets/owdocumentscorer.py
@@ -3,7 +3,7 @@
from typing import List
import numpy as np
-from Orange.data import Domain, StringVariable, Table
+from Orange.data import Domain, StringVariable, Table, ContinuousVariable
from Orange.util import wrap_callback
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.widget import Input, Msg, Output, OWWidget
@@ -42,7 +42,7 @@ def _word_ratio(corpus, words, callback):
def _embedding_similarity(corpus, words, callback, embedding_language):
ticks = iter(np.linspace(0, 0.8, len(corpus) + len(words)))
- # TODO: current embedding only report success unify them to report progress float
+ # TODO: embedders currently report only success; unify them to report progress as a float
def emb_cb(sucess: bool):
if sucess:
callback(next(ticks))
@@ -58,9 +58,10 @@ def emb_cb(sucess: bool):
SCORING_METHODS = {
- "word_frequency": ("Word frequency", _word_frequency),
- "word_ratio": ("Word ratio", _word_ratio),
- "embedding_similarity": ("Embedding similarity", _embedding_similarity),
+ # key: (Method's name, Method's function, Tooltip)
+ "word_frequency": ("Word frequency", _word_frequency, "Relative frequency of the word in the document."),
+ "word_ratio": ("Word ratio", _word_ratio, "Percentage of words in the document (Jaccard index)."),
+ "embedding_similarity": ("Embedding similarity", _embedding_similarity, "Cosine distance between the document and the word."),
}
ADDITIONAL_OPTIONS = {
@@ -130,6 +131,7 @@ def callback(i: float):
state.set_partial_result((sm, aggregation, scs))
+# TODO: get rid of this class decorator
def set_cls_attributes(cls):
"""
Class decorator that set widget settings dynamically for each scoring method
@@ -149,7 +151,7 @@ def set_cls_attributes(cls):
@set_cls_attributes
class OWDocumentScorer(OWWidget, ConcurrentWidgetMixin):
- name = "Score documents"
+ name = "Score Documents"
description = ""
icon = "icons/CorpusViewer.svg"
priority = 500
@@ -168,7 +170,7 @@ class Outputs:
class Warning(OWWidget.Warning):
missing_words = Msg("Provide words on the input")
missing_corpus = Msg("Provide corpus on the input")
- corpus_not_normalized = Msg("Use preprocesses to normalize corpus.")
+ corpus_not_normalized = Msg("Use Preprocess Text to normalize corpus.")
class Error(OWWidget.Error):
unknown_err = Msg("An error occurred.\n{}")
@@ -185,11 +187,11 @@ def __init__(self):
def _setup_control_area(self):
box = gui.widgetBox(self.controlArea, "Scoring method")
- for value, (n, _) in SCORING_METHODS.items():
+ for value, (n, _, tt) in SCORING_METHODS.items():
# TODO: Gui - fix controlos layout
b = gui.hBox(box, margin=0)
gui.checkBox(
- b, self, value, label=n, callback=self._setting_changed
+ b, self, value, label=n, callback=self._setting_changed, tooltip=tt
)
if value in ADDITIONAL_OPTIONS:
value, options = ADDITIONAL_OPTIONS[value]
@@ -252,6 +254,7 @@ def set_data(self, corpus: Corpus):
@Inputs.words
def set_words(self, words: Table):
+ # TODO: handle the case when there is no type=words attribute
if words is not None:
self.Warning.missing_words.clear()
words_attr = next(
@@ -272,7 +275,7 @@ def _send_output(self, scores, labels):
orig = self.corpus
domain = orig.domain
corpus = Corpus(
- Domain(domain.X, domain.Y, domain.metas + tuple(labels)),
+ Domain(domain.attributes, domain.class_var, metas=domain.metas + tuple(ContinuousVariable(l) for l in labels)),
orig.X,
orig.Y,
np.hstack([orig.metas, scores]),
@@ -293,11 +296,13 @@ def _fill_table(self, scores, labels):
def _fill_and_output(self):
scores, labels = self._prepare_scores()
self._fill_table(scores, labels)
+ self._send_output(scores, labels)
def _clear(self):
self.scores = {}
self.cancel()
- self._fill_and_output()
+ if self.corpus is not None:
+ self._fill_and_output()
self.commit()
def _setting_changed(self):
@@ -350,7 +355,6 @@ def _get_active_scorers(self):
return [attr for attr in SCORING_METHODS if getattr(self, attr)]
def _get_active_aggregation(self):
- # todo: self.aggregations value instead of int
return list(AGGREGATIONS.keys())[self.aggregation]
@staticmethod