From 06242177cd9b4e20e7b2c22cb8b8e8a2e4a89047 Mon Sep 17 00:00:00 2001 From: Vesna Tanko Date: Sat, 3 Apr 2021 20:58:10 +0200 Subject: [PATCH] Extract Keywords: New widget --- orangecontrib/text/keywords.py | 234 +++++++++ orangecontrib/text/tests/test_keywords.py | 109 ++++ orangecontrib/text/widgets/owkeywords.py | 467 ++++++++++++++++++ .../text/widgets/tests/test_owkeywords.py | 195 ++++++++ requirements.txt | 1 + 5 files changed, 1006 insertions(+) create mode 100644 orangecontrib/text/keywords.py create mode 100644 orangecontrib/text/tests/test_keywords.py create mode 100644 orangecontrib/text/widgets/owkeywords.py create mode 100644 orangecontrib/text/widgets/tests/test_owkeywords.py diff --git a/orangecontrib/text/keywords.py b/orangecontrib/text/keywords.py new file mode 100644 index 000000000..b66324b54 --- /dev/null +++ b/orangecontrib/text/keywords.py @@ -0,0 +1,234 @@ +""" +Module for keyword extraction. +""" +from collections import defaultdict +from itertools import chain +from typing import List, Tuple, Callable + +import yake +from sklearn.feature_extraction.text import TfidfVectorizer + +from Orange.util import dummy_callback + +YAKE_LANGUAGE_MAPPING = { + "Arabic": "ar", + "Armenian": "hy", + "Breton": "br", + "Bulgarian": "bg", + "Chinese": "zh", + "Croatian": "hr", + "Czech": "cz", + "Danish": "da", + "Dutch": "nl", + "English": "en", + "Estonian": "et", + "Finnish": "fi", + "French": "fr", + "German": "de", + "Greek": "el", + "Hindi": "hi", + "Hungarian": "hu", + "Indonesian": "id", + "Italian": "it", + "Japanese": "ja", + "Latvian": "lv", + "Lithuanian": "lt", + "Norwegian": "no", + "Persian": "fa", + "Polish": "pl", + "Portuguese": "pt", + "Romanian": "ro", + "Russian": "ru", + "Slovak": "sk", + "Slovenian": "sl", + "Spanish": "es", + "Swedish": "sv", + "Turkish": "tr", + "Ukrainian": "uk" +} + + +def tfidf_keywords( + tokens: List[List[str]], + progress_callback: Callable = None +) -> List[List[Tuple[str, float]]]: + """ + Extract keywords 
class AggregationMethods:
    """
    Aggregation methods enum and helper functions.

    ``MEAN``, ``MIN`` and ``MAX`` are integer identifiers. They index both
    ``ITEMS`` (the human-readable labels) and the dispatch table used by
    ``aggregate``.
    """
    MEAN, MIN, MAX = range(3)
    ITEMS = "Mean", "Minimum", "Maximum"

    @staticmethod
    def aggregate(
            keywords: List[List[Tuple[str, float]]],
            agg_method: int
    ) -> List[Tuple[str, float]]:
        """
        Aggregate scores.

        Parameters
        ----------
        keywords : list
            List of keywords for each document.
        agg_method : int
            Method type. One of: MEAN, MIN, MAX.

        Returns
        -------
        Aggregated keyword scores.
        """
        # The list order must match the MEAN, MIN, MAX numbering above.
        return [AggregationMethods.mean,
                AggregationMethods.min,
                AggregationMethods.max][agg_method](keywords)

    @staticmethod
    def mean(
            keywords: List[List[Tuple[str, float]]]
    ) -> List[Tuple[str, float]]:
        """
        'mean' aggregation function.

        The summed score of each word is divided by the number of documents
        (``len(keywords)``), so a document that does not contain the word
        contributes zero to its mean.

        Parameters
        ----------
        keywords : list
            List of keywords for each document.

        Returns
        -------
        Aggregated keyword scores, in order of first appearance.
        """
        totals = defaultdict(float)
        for word, score in chain.from_iterable(keywords):
            totals[word] += score
        n_docs = len(keywords)
        return [(word, total / n_docs) for word, total in totals.items()]

    @staticmethod
    def min(
            keywords: List[List[Tuple[str, float]]]
    ) -> List[Tuple[str, float]]:
        """
        'min' aggregation function.

        Only documents in which a word occurs are considered. Scores are not
        required to lie in any particular range.

        Parameters
        ----------
        keywords : list
            List of keywords for each document.

        Returns
        -------
        Aggregated keyword scores, in order of first appearance.
        """
        lowest = {}
        for word, score in chain.from_iterable(keywords):
            # No sentinel start value: the first score seen for a word is
            # its minimum until a smaller one shows up.
            if word not in lowest or score < lowest[word]:
                lowest[word] = score
        return list(lowest.items())

    @staticmethod
    def max(
            keywords: List[List[Tuple[str, float]]]
    ) -> List[Tuple[str, float]]:
        """
        'max' aggregation function.

        Only documents in which a word occurs are considered. Scores are not
        required to lie in any particular range.

        Parameters
        ----------
        keywords : list
            List of keywords for each document.

        Returns
        -------
        Aggregated keyword scores, in order of first appearance.
        """
        highest = {}
        for word, score in chain.from_iterable(keywords):
            # No sentinel start value: the first score seen for a word is
            # its maximum until a larger one shows up.
            if word not in highest or score > highest[word]:
                highest[word] = score
        return list(highest.items())
test_single_letter_documents(self): + keywords = yake_keywords(["foo", "", "too"]) + self.assertEqual(len(keywords), 3) + self.assertEqual(len(keywords[0]), 1) + self.assertEqual(len(keywords[1]), 0) + self.assertEqual(len(keywords[2]), 0) + + +class TestAggregationMethods(unittest.TestCase): + def test_aggregate_mean(self): + keywords = [[("foo", 0.1)], + [("foo", 0.3), ("bar", 0.6)], + [("foo", 0.5)]] + scores = AggregationMethods.mean(keywords) + self.assertEqual(scores[0][0], "foo") + self.assertEqual(scores[1][0], "bar") + self.assertAlmostEqual(scores[0][1], 0.3) + self.assertAlmostEqual(scores[1][1], 0.2) + + def test_aggregate_min(self): + keywords = [[("foo", 0.1)], + [("foo", 0.3), ("bar", 0.6)], + [("foo", 0.5)]] + scores = AggregationMethods.min(keywords) + self.assertEqual(scores[0], ("foo", 0.1)) + self.assertEqual(scores[1], ("bar", 0.6)) + + def test_aggregate_max(self): + keywords = [[("foo", 0.1)], + [("foo", 0.3), ("bar", 0.6)], + [("foo", 0.5)]] + scores = AggregationMethods.max(keywords) + self.assertEqual(scores[0], ("foo", 0.5)) + self.assertEqual(scores[1], ("bar", 0.6)) + + def test_aggregate(self): + keywords = [[("foo", 0.1)], + [("foo", 0.3), ("bar", 0.6)], + [("foo", 0.5)]] + scores = AggregationMethods.aggregate(keywords, AggregationMethods.MEAN) + self.assertEqual(scores[0][0], "foo") + self.assertEqual(scores[1][0], "bar") + self.assertAlmostEqual(scores[0][1], 0.3) + self.assertAlmostEqual(scores[1][1], 0.2) + + scores = AggregationMethods.aggregate(keywords, AggregationMethods.MIN) + self.assertEqual(scores[0], ("foo", 0.1)) + self.assertEqual(scores[1], ("bar", 0.6)) + + scores = AggregationMethods.aggregate(keywords, AggregationMethods.MAX) + self.assertEqual(scores[0], ("foo", 0.5)) + self.assertEqual(scores[1], ("bar", 0.6)) + + +if __name__ == "__main__": + unittest.main() diff --git a/orangecontrib/text/widgets/owkeywords.py b/orangecontrib/text/widgets/owkeywords.py new file mode 100644 index 000000000..18f7d62bc --- 
def run(
        corpus: Optional[Corpus],
        words: Optional[List],
        cached_keywords: Dict,
        scoring_methods: Set,
        scoring_methods_kwargs: Dict,
        agg_method: int,
        state: TaskState
) -> Results:
    """
    Compute aggregated keyword scores for the selected scoring methods.

    Parameters
    ----------
    corpus : Corpus or None
        Source of ``tokens`` and ``documents``. A falsy corpus yields empty
        results.
    words : list or None
        If non-empty, only these words are kept in the scores. They are
        first passed through every normalizer among the corpus
        preprocessors.
    cached_keywords : dict
        ``{method_name: per-document keywords}``. Mutated in place, so
        keywords already computed survive even if the task does not finish.
    scoring_methods : set
        Names of the methods to use. Only names listed in
        ``ScoringMethods.ITEMS`` are evaluated.
    scoring_methods_kwargs : dict
        ``{method_name: kwargs}`` with extra arguments for the scoring
        functions.
    agg_method : int
        One of ``AggregationMethods.MEAN``, ``MIN``, ``MAX``.
    state : TaskState
        Used for reporting progress and status and for checking whether an
        interruption was requested.

    Returns
    -------
    results : Results
        ``scores`` holds rows ``[word, score_1, ...]`` sorted by the first
        score column (descending) and then by word (ascending). ``labels``
        holds the evaluated method names. ``all_keywords`` is
        ``cached_keywords``.

    Raises
    ------
    Exception
        Raised from the progress callback when ``state`` reports that an
        interruption was requested.
    """
    results = Results(scores=[], labels=[], all_keywords={})
    if not corpus:
        return results

    # passed by reference (and not copied) - to save partial results
    results.all_keywords = cached_keywords
    if not scoring_methods:
        return results

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            # NOTE(review): the exception type is kept as plain Exception
            # because callers and tests may rely on it - confirm before
            # narrowing.
            raise Exception

    callback(0, "Calculating...")
    scores = {}
    tokens = corpus.tokens
    documents = corpus.documents
    step = 1 / len(scoring_methods)
    for method_name, func in ScoringMethods.ITEMS:
        if method_name not in scoring_methods:
            continue

        if method_name not in results.all_keywords:
            # Each method reports into its own slice of the progress range.
            i = len(results.labels)
            cb = wrap_callback(callback, start=i * step,
                               end=(i + 1) * step)

            needs_tokens = method_name in ScoringMethods.TOKEN_METHODS
            kw = {"progress_callback": cb}
            kw.update(scoring_methods_kwargs.get(method_name, {}))

            keywords = func(tokens if needs_tokens else documents, **kw)
            results.all_keywords[method_name] = keywords

        keywords = results.all_keywords[method_name]
        scores[method_name] = \
            dict(AggregationMethods.aggregate(keywords, agg_method))

        results.labels.append(method_name)

    if not results.labels:
        # None of the requested methods is available in
        # ScoringMethods.ITEMS, so there is no score column to sort by.
        return results

    scores = pd.DataFrame(scores)
    if words:

        # Normalize words
        for preprocessor in corpus.used_preprocessor.preprocessors:
            if isinstance(preprocessor, BaseNormalizer):
                words = [preprocessor.normalizer(w) for w in words]

        # Filter scores using words. Keep only words that were actually
        # scored: a list-like .loc lookup raises KeyError if any label is
        # missing. Dropping duplicates avoids repeated rows when several
        # input words normalize to the same form.
        known_words = [w for w in dict.fromkeys(words) if w in scores.index]
        scores = scores.loc[known_words] if known_words else scores.iloc[:0]

    results.scores = scores.reset_index().sort_values(
        by=[results.labels[0], "index"],
        ascending=[False, True]
    ).values.tolist()

    return results
class SortFilterProxyModel(QSortFilterProxyModel):
    """Sort/filter proxy that keeps NaN scores at the end in either order."""

    def sort(self, column: int, order: Qt.SortOrder = Qt.AscendingOrder):
        """Sort by ``column``, remembering ``order`` for the comparison."""
        # Rebind lessThan on the instance so the comparison knows the
        # requested order and can place NaNs accordingly.
        self.lessThan = lambda *args: self.__nan_less_than(*args, order=order)
        super().sort(column, order)

    def __nan_less_than(self, left_ind: QModelIndex, right_ind: QModelIndex,
                        order: Qt.SortOrder = Qt.AscendingOrder) -> bool:
        """
        Compare two source-model cells, treating NaN as the value that
        sorts last for the given ``order``.

        Non-float cells fall back to the base class comparison.
        """
        left = self.sourceModel().data(left_ind, role=Qt.EditRole)
        right = self.sourceModel().data(right_ind, role=Qt.EditRole)
        if isinstance(right, float) and isinstance(left, float):
            # NaNs always at the end: +inf is last when ascending, -inf is
            # last when descending. Infinity (rather than a fixed 0 or 1)
            # keeps this true for scores of any magnitude.
            nan_value = np.inf if order == Qt.AscendingOrder else -np.inf
            if np.isnan(right):
                right = nan_value
            if np.isnan(left):
                left = nan_value
            return left < right
        return super().lessThan(left_ind, right_ind)
+ icon = "icons/Keywords.svg" + priority = 1100 + keywords = ["characteristic", "term"] + + DEFAULT_SORTING = (1, Qt.DescendingOrder) + + settingsHandler = DomainContextHandler() + selected_scoring_methods: Set[str] = Setting({ScoringMethods.TF_IDF}) + yake_lang_index: int = Setting(YAKE_LANGUAGES.index("English")) + agg_method: int = Setting(AggregationMethods.MEAN) + sel_method: int = ContextSetting(SelectionMethods.N_BEST) + n_selected: int = ContextSetting(3) + sort_column_order: Tuple[int, int] = Setting(DEFAULT_SORTING) + selected_words = ContextSetting([], schema_only=True) + auto_apply: bool = Setting(True) + + class Inputs: + corpus = Input("Corpus", Corpus) + words = Input("Words", Table) + + class Outputs: + words = Output("Words", Corpus) + + class Warning(OWWidget.Warning): + no_words_column = Msg("Input is missing 'Words' column.") + + def __init__(self): + OWWidget.__init__(self) + ConcurrentWidgetMixin.__init__(self) + self.corpus: Optional[Corpus] = None + self.words: Optional[List] = None + self.__cached_keywords = {} + self.model = KeywordsTableModel(parent=self) + self._setup_gui() + + def _setup_gui(self): + grid = QGridLayout() + box = gui.widgetBox(self.controlArea, "Scoring Methods", grid) + + yake_cb = gui.comboBox( + self.controlArea, self, "yake_lang_index", items=YAKE_LANGUAGES, + callback=self.__on_yake_lang_changed + ) + + for i, (method_name, _) in enumerate(ScoringMethods.ITEMS): + check_box = QCheckBox(method_name, self) + check_box.setChecked(method_name in self.selected_scoring_methods) + check_box.stateChanged.connect( + lambda state, name=method_name: + self.__on_scoring_method_state_changed(state, name) + ) + box.layout().addWidget(check_box, i, 0) + if method_name == ScoringMethods.YAKE: + box.layout().addWidget(yake_cb, i, 1) + + box = gui.vBox(self.controlArea, "Aggregation") + gui.comboBox( + box, self, "agg_method", items=AggregationMethods.ITEMS, + callback=self.update_scores + ) + + box = gui.vBox(self.controlArea, 
"Select Words") + grid = QGridLayout() + grid.setContentsMargins(0, 0, 0, 0) + box.layout().addLayout(grid) + + self.__sel_method_buttons = QButtonGroup() + for method, label in enumerate(SelectionMethods.ITEMS): + button = QRadioButton(label) + button.setChecked(method == self.sel_method) + grid.addWidget(button, method, 0) + self.__sel_method_buttons.addButton(button, method) + self.__sel_method_buttons.buttonClicked[int].connect( + self._set_selection_method + ) + + spin = gui.spin( + box, self, "n_selected", 1, 999, addToLayout=False, + callback=lambda: self._set_selection_method( + SelectionMethods.N_BEST) + ) + grid.addWidget(spin, 3, 1) + + gui.rubber(self.controlArea) + gui.auto_send(self.buttonsArea, self, "auto_apply") + + self.__filter_line_edit = QLineEdit( + textChanged=self.__on_filter_changed, + placeholderText="Filter..." + ) + self.mainArea.layout().addWidget(self.__filter_line_edit) + + def select_manual(): + self._set_selection_method(SelectionMethods.MANUAL) + + self.view = KeywordsTableView() + self.view.pressedAny.connect(select_manual) + self.view.horizontalHeader().setSortIndicator(*self.DEFAULT_SORTING) + self.view.horizontalHeader().sectionClicked.connect( + self.__on_horizontal_header_clicked) + self.view.verticalHeader().sectionClicked.connect(select_manual) + self.mainArea.layout().addWidget(self.view) + + proxy = SortFilterProxyModel() + proxy.setFilterKeyColumn(0) + proxy.setFilterCaseSensitivity(False) + self.view.setModel(proxy) + self.view.model().setSourceModel(self.model) + self.view.selectionModel().selectionChanged.connect( + self.__on_selection_changed + ) + + def __on_scoring_method_state_changed(self, state: int, method_name: str): + if state == Qt.Checked: + self.selected_scoring_methods.add(method_name) + elif method_name in self.selected_scoring_methods: + self.selected_scoring_methods.remove(method_name) + self.update_scores() + + def __on_yake_lang_changed(self): + if ScoringMethods.YAKE in 
self.selected_scoring_methods: + if ScoringMethods.YAKE in self.__cached_keywords: + del self.__cached_keywords[ScoringMethods.YAKE] + self.update_scores() + + def __on_filter_changed(self): + model = self.view.model() + model.setFilterFixedString(self.__filter_line_edit.text().strip()) + self._select_rows() + + def __on_horizontal_header_clicked(self, index: int): + header = self.view.horizontalHeader() + self.sort_column_order = (index, header.sortIndicatorOrder()) + self._select_rows() + # explicitly call commit, because __on_selection_changed will not be + # invoked, since selection is actually the same, only order is not + if self.sel_method == SelectionMethods.MANUAL and self.selected_words \ + or self.sel_method == SelectionMethods.ALL: + self.commit() + + def __on_selection_changed(self): + selected_rows = self.view.selectionModel().selectedRows(0) + model = self.view.model() + self.selected_words = [model.data(model.index(i.row(), 0)) + for i in selected_rows] + self.commit() + + @Inputs.corpus + def set_corpus(self, corpus: Optional[Corpus]): + self.closeContext() + self._clear() + self.corpus = corpus + self.openContext(self.corpus) + self.__sel_method_buttons.button(self.sel_method).setChecked(True) + + def _clear(self): + self.clear_messages() + self.cancel() + self.selected_words = [] + self.model.clear() + self.__cached_keywords = {} + + @Inputs.words + def set_words(self, words: Optional[Table]): + self.words = None + self.Warning.no_words_column.clear() + if words: + if WORDS_COLUMN_NAME in words.domain and words.domain[ + WORDS_COLUMN_NAME].attributes.get("type") == "words": + self.words = list(words.get_column_view(WORDS_COLUMN_NAME)[0]) + else: + self.Warning.no_words_column() + + def handleNewSignals(self): + self.update_scores() + + def update_scores(self): + kwargs = { + ScoringMethods.YAKE: { + "language": YAKE_LANGUAGES[self.yake_lang_index], + "max_len": self.corpus.ngram_range[1] if self.corpus else 1 + } + } + self.start(run, 
self.corpus, self.words, self.__cached_keywords, + self.selected_scoring_methods, kwargs, self.agg_method) + + def _set_selection_method(self, method: int): + self.sel_method = method + self.__sel_method_buttons.button(method).setChecked(True) + self._select_rows() + + def _select_rows(self): + model = self.view.model() + n_rows, n_columns = model.rowCount(), model.columnCount() + if self.sel_method == SelectionMethods.NONE: + selection = QItemSelection() + elif self.sel_method == SelectionMethods.ALL: + selection = QItemSelection( + model.index(0, 0), + model.index(n_rows - 1, n_columns - 1) + ) + elif self.sel_method == SelectionMethods.MANUAL: + selection = QItemSelection() + for i in range(n_rows): + word = model.data(model.index(i, 0)) + if word in self.selected_words: + _selection = QItemSelection(model.index(i, 0), + model.index(i, n_columns - 1)) + selection.merge(_selection, QItemSelectionModel.Select) + elif self.sel_method == SelectionMethods.N_BEST: + n_sel = min(self.n_selected, n_rows) + selection = QItemSelection( + model.index(0, 0), + model.index(n_sel - 1, n_columns - 1) + ) + else: + raise NotImplementedError + + self.view.selectionModel().select( + selection, QItemSelectionModel.ClearAndSelect + ) + + def on_exception(self, ex: Exception): + raise ex + + def on_partial_result(self, _: Any): + pass + + # pylint: disable=arguments-differ + def on_done(self, results: Results): + self.__cached_keywords = results.all_keywords + self.model.wrap(results.scores) + self.model.setHorizontalHeaderLabels(["Words"] + results.labels) + self._apply_sorting() + if self.model.rowCount() > 0: + self._select_rows() + else: + self.__on_selection_changed() + + def _apply_sorting(self): + if self.model.columnCount() <= self.sort_column_order[0]: + self.sort_column_order = self.DEFAULT_SORTING + + header = self.view.horizontalHeader() + current_sorting = (header.sortIndicatorSection(), + header.sortIndicatorOrder()) + if current_sorting != self.sort_column_order: + 
header.setSortIndicator(*self.sort_column_order) + # needed to sort nans; 1. column has strings + # if self.sort_column_order[0] > 0: + # self.model.sort(*self.sort_column_order) + + def onDeleteWidget(self): + self.shutdown() + super().onDeleteWidget() + + def commit(self): + words = None + if self.selected_words: + words_var = StringVariable(WORDS_COLUMN_NAME) + words_var.attributes = {"type": "words"} + model = self.model + attrs = [ContinuousVariable(model.headerData(i + 1, Qt.Horizontal)) + for i in range(len(self.selected_scoring_methods))] + domain = Domain(attrs, metas=[words_var]) + + sort_column, reverse = self.sort_column_order + data = sorted(model, key=lambda a: a[sort_column], reverse=reverse) + data = [s[1:] + s[:1] for s in data if s[0] in self.selected_words] + words = Table.from_list(domain, data) + words.name = "Words" + + self.Outputs.words.send(words) + + def send_report(self): + if not self.corpus: + return + self.report_data("Corpus", self.corpus) + if self.words is not None: + self.report_paragraph("Words", ", ".join(self.words)) + self.report_table("Keywords", self.view, num_format="{:.3f}") + + +if __name__ == "__main__": + # pylint: disable=ungrouped-imports + from Orange.widgets.utils.widgetpreview import WidgetPreview + + words_var_ = StringVariable(WORDS_COLUMN_NAME) + words_var_.attributes = {"type": "words"} + lists = [[w] for w in ["human", "graph", "minors", "trees"]] + words_ = Table.from_list(Domain([], metas=[words_var_]), lists) + words_.name = "Words" + WidgetPreview(OWKeywords).run( + set_corpus=Corpus.from_file("deerwester"), # deerwester book-excerpts + # set_words=words_ + ) diff --git a/orangecontrib/text/widgets/tests/test_owkeywords.py b/orangecontrib/text/widgets/tests/test_owkeywords.py new file mode 100644 index 000000000..d11566860 --- /dev/null +++ b/orangecontrib/text/widgets/tests/test_owkeywords.py @@ -0,0 +1,195 @@ +# pylint: disable=missing-docstring +from typing import List +import unittest +from 
unittest.mock import Mock, patch + +import numpy as np + +from Orange.data import StringVariable, Table, Domain +from Orange.widgets.tests.base import WidgetTest + +from orangecontrib.text import Corpus +from orangecontrib.text.preprocess import * +from orangecontrib.text.widgets.owkeywords import OWKeywords, run, \ + AggregationMethods, ScoringMethods + + +def create_words_table(words: List) -> Table: + words_var = StringVariable("Words") + words_var.attributes = {"type": "words"} + domain = Domain([], metas=[words_var]) + data = [[w] for w in words] + words = Table.from_list(domain, data) + words.name = "Words" + return words + + +class TestRunner(unittest.TestCase): + def setUp(self): + self.corpus = Corpus.from_file("deerwester") + self.state = Mock() + self.state.is_interruption_requested = Mock(return_value=False) + + def test_run_default(self): + results = run(self.corpus, None, {}, {ScoringMethods.TF_IDF}, {}, + AggregationMethods.MEAN, self.state) + self.assertEqual(results.scores[0][0], "of") + self.assertAlmostEqual(results.scores[0][1], 0.16, 2) + self.assertEqual(results.labels, ["TF-IDF"]) + + def test_run_multiple_methods(self): + results = run(self.corpus, None, {}, + {ScoringMethods.TF_IDF, ScoringMethods.YAKE}, {}, + AggregationMethods.MEAN, self.state) + self.assertEqual(results.scores[0][0], "of") + self.assertAlmostEqual(results.scores[0][1], 0.16, 2) + self.assertTrue(np.isnan(np.nan)) + self.assertEqual(results.labels, ["TF-IDF", "YAKE!"]) + + def test_run_no_data(self): + results = run(None, None, {}, {ScoringMethods.TF_IDF}, {}, + AggregationMethods.MEAN, Mock()) + self.assertEqual(results.scores, []) + self.assertEqual(results.labels, []) + self.assertEqual(results.all_keywords, {}) + + def test_run_no_methods(self): + cached_keywords = Mock() + results = run(self.corpus, None, cached_keywords, set(), {}, + AggregationMethods.MEAN, Mock()) + self.assertEqual(results.scores, []) + self.assertEqual(results.labels, []) + 
self.assertIs(results.all_keywords, cached_keywords) + + def test_run_with_words(self): + words = ["human", "graph", "minors", "trees"] + results = run(self.corpus, words, {}, {ScoringMethods.TF_IDF}, {}, + AggregationMethods.MEAN, self.state) + self.assertEqual(len(results.scores), 4) + + words = ["foo", "bar"] + results = run(self.corpus, words, {}, {ScoringMethods.TF_IDF}, {}, + AggregationMethods.MEAN, self.state) + self.assertEqual(len(results.scores), 0) + + words = [] + results = run(self.corpus, words, {}, {ScoringMethods.TF_IDF}, {}, + AggregationMethods.MEAN, self.state) + self.assertEqual(len(results.scores), 42) + + words = None + results = run(self.corpus, words, {}, {ScoringMethods.TF_IDF}, {}, + AggregationMethods.MEAN, self.state) + self.assertEqual(len(results.scores), 42) + + def test_run_normalize_words(self): + normalizer = WordNetLemmatizer() + corpus = normalizer(self.corpus) + + words = ["minor", "tree"] + results = run(corpus, words, {}, {ScoringMethods.TF_IDF}, {}, + AggregationMethods.MEAN, self.state) + self.assertEqual(len(results.scores), 2) + + words = ["minors", "trees"] + results = run(corpus, words, {}, {ScoringMethods.TF_IDF}, {}, + AggregationMethods.MEAN, self.state) + self.assertEqual(len(results.scores), 2) + + def test_run_with_cached_results(self): + results1 = run(self.corpus, None, {}, + {ScoringMethods.TF_IDF, ScoringMethods.YAKE}, {}, + AggregationMethods.MEAN, self.state) + + with patch("orangecontrib.text.keywords.tfidf_keywords") as mock: + results2 = run(self.corpus, None, results1.all_keywords, + {ScoringMethods.TF_IDF, ScoringMethods.YAKE}, {}, + AggregationMethods.MEAN, self.state) + mock.assert_not_called() + self.assertNanEqual(results1.scores, results2.scores) + self.assertNanEqual(results1.labels, results2.labels) + self.assertNanEqual(results1.all_keywords, results2.all_keywords) + + def test_run_interrupt(self): + state = Mock() + state.is_interruption_requested = Mock(return_value=True) + 
class TestOWWordList(WidgetTest):
    """Widget-level tests for OWKeywords using the "deerwester" corpus."""

    def setUp(self):
        self.widget = self.create_widget(OWKeywords)
        self.corpus = Corpus.from_file("deerwester")

    def test_default(self):
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        self.wait_until_finished()
        output = self.get_output(self.widget.Outputs.words)
        self.assertEqual(len(output), 3)
        # assertEqual, not assertTrue: with assertTrue the second argument
        # is only the failure message, so the name was never checked.
        self.assertEqual(output.domain.metas[0].name, "Words")
        self.assertDictEqual(output.domain.metas[0].attributes,
                             {"type": "words"})
        self.assertListEqual(list(output.metas[:, 0]),
                             ["of", "system", "graph"])

    def test_input_words(self):
        words = create_words_table(["human", "graph", "minors", "trees"])
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        self.send_signal(self.widget.Inputs.words, words)
        self.wait_until_finished()
        output = self.get_output(self.widget.Outputs.words)
        self.assertListEqual(list(output.metas[:, 0]),
                             ["graph", "trees", "minors"])

    def test_input_words_no_type(self):
        # A table without a "Words" column marked as type "words" must
        # raise the warning; removing the input must clear it.
        words = Table("zoo")
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        self.send_signal(self.widget.Inputs.words, words)
        self.assertTrue(self.widget.Warning.no_words_column.is_shown())
        self.send_signal(self.widget.Inputs.words, None)
        self.assertFalse(self.widget.Warning.no_words_column.is_shown())

    def test_sort_nans_desc(self):
        settings = {"selected_scoring_methods": {"TF-IDF", "YAKE!"},
                    "sort_column_order": (2, 1)}
        widget = self.create_widget(OWKeywords, stored_settings=settings)
        self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
        self.wait_until_finished(widget=widget)
        output = self.get_output(widget.Outputs.words, widget=widget)
        self.assertListEqual(list(output.metas[:, 0]),
                             ["user", "graph", "trees"])

    def test_sort_nans_asc(self):
        settings = {"selected_scoring_methods": {"TF-IDF", "YAKE!"},
                    "sort_column_order": (2, 0)}
        widget = self.create_widget(OWKeywords, stored_settings=settings)
        self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
        self.wait_until_finished(widget=widget)
        output = self.get_output(widget.Outputs.words, widget=widget)
        self.assertListEqual(list(output.metas[:, 0]),
                             ["widths", "opinion", "applications"])

    def test_send_report(self):
        # The report must not fail with a corpus only, with corpus and
        # words, or after the corpus has been removed.
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        self.wait_until_finished()
        self.widget.send_report()
        words = create_words_table(["human", "graph", "minors", "trees"])
        self.send_signal(self.widget.Inputs.words, words)
        self.wait_until_finished()
        self.widget.send_report()
        self.send_signal(self.widget.Inputs.corpus, None)
        self.widget.send_report()