From 13cf312368650f889cd10324684f96e455888f53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?= <p.godec9@gmail.com>
Date: Thu, 18 Mar 2021 12:53:50 +0100
Subject: [PATCH] Score documents

---
 .../text/widgets/owdocumentscorer.py          | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/orangecontrib/text/widgets/owdocumentscorer.py b/orangecontrib/text/widgets/owdocumentscorer.py
index 281bfd3b9..76cc226f0 100644
--- a/orangecontrib/text/widgets/owdocumentscorer.py
+++ b/orangecontrib/text/widgets/owdocumentscorer.py
@@ -3,7 +3,7 @@
 from typing import List
 
 import numpy as np
-from Orange.data import Domain, StringVariable, Table
+from Orange.data import Domain, StringVariable, Table, ContinuousVariable
 from Orange.util import wrap_callback
 from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
 from Orange.widgets.widget import Input, Msg, Output, OWWidget
@@ -42,7 +42,7 @@ def _word_ratio(corpus, words, callback):
 def _embedding_similarity(corpus, words, callback, embedding_language):
     ticks = iter(np.linspace(0, 0.8, len(corpus) + len(words)))
 
-    # TODO: current embedding only report success unify them to report progress float
+    # TODO: embeddings currently only report success; unify them to report progress as a float
     def emb_cb(sucess: bool):
         if sucess:
             callback(next(ticks))
@@ -58,9 +58,10 @@ def emb_cb(sucess: bool):
 
 
 SCORING_METHODS = {
-    "word_frequency": ("Word frequency", _word_frequency),
-    "word_ratio": ("Word ratio", _word_ratio),
-    "embedding_similarity": ("Embedding similarity", _embedding_similarity),
+    # key: (Method's name, Method's function, Tooltip)
+    "word_frequency": ("Word frequency", _word_frequency, "Relative frequency of the word in the document."),
+    "word_ratio": ("Word ratio", _word_ratio, "Percentage of words in the document (Jaccard index)."),
+    "embedding_similarity": ("Embedding similarity", _embedding_similarity, "Cosine distance between the document and the word."),
 }
 
 ADDITIONAL_OPTIONS = {
@@ -130,6 +131,7 @@ def callback(i: float):
         state.set_partial_result((sm, aggregation, scs))
 
 
+# TODO: get rid of this class decorator
 def set_cls_attributes(cls):
     """
     Class decorator that set widget settings dynamically for each scoring method
@@ -149,7 +151,7 @@ def set_cls_attributes(cls):
 
 @set_cls_attributes
 class OWDocumentScorer(OWWidget, ConcurrentWidgetMixin):
-    name = "Score documents"
+    name = "Score Documents"
     description = ""
     icon = "icons/CorpusViewer.svg"
     priority = 500
@@ -168,7 +170,7 @@ class Outputs:
     class Warning(OWWidget.Warning):
         missing_words = Msg("Provide words on the input")
         missing_corpus = Msg("Provide corpus on the input")
-        corpus_not_normalized = Msg("Use preprocesses to normalize corpus.")
+        corpus_not_normalized = Msg("Use Preprocess Text to normalize corpus.")
 
     class Error(OWWidget.Error):
         unknown_err = Msg("An error occurred.\n{}")
@@ -185,11 +187,11 @@ def __init__(self):
 
     def _setup_control_area(self):
         box = gui.widgetBox(self.controlArea, "Scoring method")
-        for value, (n, _) in SCORING_METHODS.items():
+        for value, (n, _, tt) in SCORING_METHODS.items():
             # TODO: Gui - fix controlos layout
             b = gui.hBox(box, margin=0)
             gui.checkBox(
-                b, self, value, label=n, callback=self._setting_changed
+                b, self, value, label=n, callback=self._setting_changed, tooltip=tt
             )
             if value in ADDITIONAL_OPTIONS:
                 value, options = ADDITIONAL_OPTIONS[value]
@@ -252,6 +254,7 @@ def set_data(self, corpus: Corpus):
 
     @Inputs.words
     def set_words(self, words: Table):
+        # TODO: handle the case when there is no type=words
         if words is not None:
             self.Warning.missing_words.clear()
             words_attr = next(
@@ -272,7 +275,7 @@ def _send_output(self, scores, labels):
             orig = self.corpus
             domain = orig.domain
             corpus = Corpus(
-                Domain(domain.X, domain.Y, domain.metas + tuple(labels)),
+                Domain(domain.attributes, domain.class_var, metas=domain.metas + tuple(ContinuousVariable(l) for l in labels)),
                 orig.X,
                 orig.Y,
                 np.hstack([orig.metas, scores]),
@@ -293,11 +296,13 @@ def _fill_table(self, scores, labels):
     def _fill_and_output(self):
         scores, labels = self._prepare_scores()
         self._fill_table(scores, labels)
+        self._send_output(scores, labels)
 
     def _clear(self):
         self.scores = {}
         self.cancel()
-        self._fill_and_output()
+        if self.corpus is not None:
+            self._fill_and_output()
         self.commit()
 
     def _setting_changed(self):
@@ -350,7 +355,6 @@ def _get_active_scorers(self):
         return [attr for attr in SCORING_METHODS if getattr(self, attr)]
 
     def _get_active_aggregation(self):
-        # todo: self.aggregations value instead of int
         return list(AGGREGATIONS.keys())[self.aggregation]
 
     @staticmethod