From d4b7b39873a0e09fc4f6746ecd4aa559bdb507ff Mon Sep 17 00:00:00 2001
From: Nikola Djukic
Date: Fri, 6 Mar 2020 16:07:35 +0100
Subject: [PATCH] Document embedding script and widget

---
 orangecontrib/text/vectorization/dense.py     | 199 +++++++++++++
 .../text/widgets/owdocumentembedding.py       | 253 ++++++++++++++++++
 2 files changed, 452 insertions(+)
 create mode 100644 orangecontrib/text/vectorization/dense.py
 create mode 100644 orangecontrib/text/widgets/owdocumentembedding.py

diff --git a/orangecontrib/text/vectorization/dense.py b/orangecontrib/text/vectorization/dense.py
new file mode 100644
index 000000000..a40ccc62c
--- /dev/null
+++ b/orangecontrib/text/vectorization/dense.py
@@ -0,0 +1,199 @@
+from Orange.misc.server_embedder import ServerEmbedderCommunicator
+from orangecontrib.text import Corpus
+
+import zlib
+import base64
+import json
+import sys
+import numpy as np
+import warnings
+
+from typing import List, Tuple, Any, Optional
+
+AGGREGATORS = ['mean', 'sum', 'max', 'min']
+LANGS_TO_ISO = {'English': 'en', 'Slovenian': 'sl', 'German': 'de'}
+LANGUAGES = list(LANGS_TO_ISO.values())
+EMB_DIM = 300
+
+
+class PretrainedEmbedder:
+    """Embed the documents of a corpus with pretrained fastText models.
+
+    The models are described in:
+
+        E. Grave, P. Bojanowski, P. Gupta, A. Joulin, T. Mikolov,
+        Learning Word Vectors for 157 Languages.
+        Proceedings of the International Conference on Language
+        Resources and Evaluation, 2018.
+
+    Embedding is performed on a server, so an internet connection is a
+    prerequisite for using the class. Currently supported languages are:
+        - English (en)
+        - Slovenian (sl)
+        - German (de)
+
+    Attributes:
+        language (str): ISO 639-1 (two-letter) code of desired language.
+        aggregator (str): Aggregator which creates document embedding
+            (single vector) from word embeddings (multiple vectors).
+            Allowed values are mean, sum, max, min.
+    """
+
+    def __init__(self, language: str = 'en',
+                 aggregator: str = 'mean') -> None:
+        # Defined before validation so that __del__ (which ends up in
+        # set_cancelled) does not fail with AttributeError when one of
+        # the checks below raises.
+        self._embedder = None
+
+        lang_error = '{} is not a valid language. Allowed values: {}'
+        agg_error = '{} is not a valid aggregator. Allowed values: {}'
+
+        if language.lower() not in LANGUAGES:
+            raise ValueError(
+                lang_error.format(language, ', '.join(LANGUAGES)))
+        self.language = language.lower()
+
+        if aggregator.lower() not in AGGREGATORS:
+            raise ValueError(
+                agg_error.format(aggregator, ', '.join(AGGREGATORS)))
+        self.aggregator = aggregator.lower()
+
+        self._dim = EMB_DIM
+        self._embedder = _ServerEmbedder(self.aggregator,
+                                         model_name=self.language,
+                                         max_parallel_requests=0,
+                                         server_url='',
+                                         # TODO set proper url
+                                         embedder_type='text')
+
+    def transform(self, corpus: Corpus, copy: bool = True,
+                  processed_callback=None) -> Corpus:
+        """Add a matrix of document embeddings to a corpus.
+
+        Args:
+            corpus (Corpus): Corpus on which transform is performed.
+            copy (bool): If set to True, a copy of corpus is made.
+            processed_callback: Optional callable that is passed on
+                unchanged to the server embedder.
+
+        Returns:
+            Corpus (original or a copy) with new features added.
+
+        Raises:
+            ValueError: If corpus is not instance of Corpus.
+            RuntimeError: If document in corpus is larger than
+                50 KB after compression.
+        """
+        if not isinstance(corpus, Corpus):
+            raise ValueError("Input should be instance of Corpus.")
+
+        corpus = corpus.copy() if copy else corpus
+        embs = self._embedder.embedd_table(
+            corpus.tokens, processed_callback=processed_callback)
+
+        # Documents that were not embedded come back as None. This is a
+        # very rare case; such documents are stored as vectors of NaNs
+        # so that np.array(embs) below stays rectangular.
+        failed = [i for i, emb in enumerate(embs) if emb is None]
+        for i in failed:
+            embs[i] = np.full(self._dim, np.nan)
+
+        # Show the warning on every call, not only on the first one.
+        warnings.simplefilter('always', RuntimeWarning)
+        if failed:
+            # One warning per call instead of one per failed document.
+            warnings.warn("Some documents were not embedded for "
+                          "unknown reason. Those documents are "
+                          "represented as vectors of nans",
+                          RuntimeWarning)
+
+        variable_attrs = {
+            'hidden': True,
+            'skip-normalization': True,
+            'dense-embedding-feature': True
+        }
+
+        corpus.extend_attributes(
+            np.array(embs),
+            ['Dim{}'.format(i) for i in range(self._dim)],
+            var_attrs=variable_attrs)
+        return corpus
+
+    def report(self) -> Tuple[Tuple[str, str], Tuple[str, str]]:
+        """Report the current parameters of PretrainedEmbedder.
+
+        Returns:
+            Tuple of (name, value) pairs.
+        """
+        return (('Language', self.language),
+                ('Aggregator', self.aggregator))
+
+    def set_cancelled(self):
+        """Cancel the server embedder, if one has been created."""
+        if self._embedder:
+            self._embedder.set_cancelled()
+
+    def clear_cache(self):
+        """Clear the server embedder's cache, if one has been created."""
+        if self._embedder:
+            self._embedder._cache.clear_cache()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.set_cancelled()
+
+    def __del__(self):
+        self.__exit__(None, None, None)
+
+
+class _ServerEmbedder(ServerEmbedderCommunicator):
+    """Server communicator that sends documents as JSON payloads."""
+
+    # Largest accepted length of the compressed, base64-encoded document.
+    MAX_ENCODED_SIZE = 50000
+
+    def __init__(self, aggregator: str, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.content_type = 'application/json'
+        self.aggregator = aggregator
+
+    async def _encode_data_instance(
+            self, data_instance: Any) -> Optional[bytes]:
+        """Encode one document as a JSON request body.
+
+        The document is dumped to JSON, zlib-compressed and
+        base64-encoded, then wrapped together with the aggregator name.
+
+        Raises:
+            RuntimeError: If the encoded document is longer than
+                MAX_ENCODED_SIZE.
+        """
+        data_string = json.dumps(data_instance)
+        data = base64.b64encode(zlib.compress(
+            data_string.encode('utf-8', 'replace'),
+            level=-1)).decode('utf-8', 'replace')
+
+        # len() measures the payload itself; sys.getsizeof() would also
+        # count the interpreter's per-object overhead.
+        if len(data) > self.MAX_ENCODED_SIZE:
+            raise RuntimeError(
+                "Document in corpus is too large. "
+                "Size limit is 50 KB (after compression).")
+
+        data_dict = {
+            "data": data,
+            "aggregator": self.aggregator
+        }
+
+        json_string = json.dumps(data_dict)
+        return json_string.encode('utf-8', 'replace')
+
+
+if __name__ == '__main__':
+    corpus = Corpus.from_file('deerwester')
+    embedder = PretrainedEmbedder(language='en', aggregator='max')
+    embedder.clear_cache()
+    embedder.transform(corpus)
\ No newline at end of file
diff --git a/orangecontrib/text/widgets/owdocumentembedding.py b/orangecontrib/text/widgets/owdocumentembedding.py
new file mode 100644
index 000000000..884e330e1
--- /dev/null
+++ b/orangecontrib/text/widgets/owdocumentembedding.py
@@ -0,0 +1,253 @@
+from AnyQt.QtWidgets import QApplication, QLayout, QPushButton, QStyle, QLabel
+from AnyQt.QtCore import Qt
+
+import numpy as np
+
+from orangecontrib.text.widgets.utils import owbasevectorizer, widgets
+from Orange.widgets.gui import widgetBox, comboBox, spin, auto_commit, hBox
+from Orange.widgets.settings import Setting
+from Orange.widgets.widget import OWWidget, Msg, Input, Output
+from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
+
+
+from orangecontrib.text.corpus import Corpus
+
+from orangecontrib.text.vectorization.dense import PretrainedEmbedder
+from orangecontrib.text.vectorization.dense import LANGS_TO_ISO, AGGREGATORS, EMB_DIM
+from Orange.misc.utils.embedder_utils import EmbeddingConnectionError
+from typing import Any, Optional
+
+
+def run_pretrained_embedder(
+        corpus: Corpus,
+        language: str,
+        aggregator: str,
+        state: TaskState) -> Optional[Corpus]:
+    """Run PretrainedEmbedder on a corpus and report progress to state.
+
+    Args:
+        corpus (Corpus): Corpus on which transform is performed.
+        language (str): ISO 639-1 (two-letter) code of desired language.
+        aggregator (str): Aggregator which creates document embedding
+            (single vector) from word embeddings (multiple vectors).
+            Allowed values are mean, sum, max, min.
+        state (TaskState): State object.
+
+    Returns:
+        New corpus with additional features, or None if an
+        EmbeddingConnectionError was raised while embedding.
+    """
+    embedder = PretrainedEmbedder(language=language,
+                                  aggregator=aggregator)
+
+    ticks = iter(np.linspace(0., 100., len(corpus)))
+
+    def advance(success=True):
+        if state.is_interruption_requested():
+            embedder.set_cancelled()
+        if success:
+            # Fall back to 100 rather than raising StopIteration should
+            # the callback fire more often than there are documents.
+            state.set_progress_value(next(ticks, 100.))
+
+    try:
+        return embedder.transform(corpus, processed_callback=advance)
+    except EmbeddingConnectionError:
+        return None
+
+
+class OWPretrainedEmbedder(OWWidget, ConcurrentWidgetMixin):
+    """Widget that adds pretrained document embeddings to a corpus."""
+
+    name = "Document Embedding"
+    description = "Document embedding using pretrained models."
+    keywords = ['embedding', 'document embedding', 'text']
+    icon = 'icons/TextEmbedding.svg'
+    priority = 300
+
+    want_main_area = False
+    _auto_apply = Setting(default=True)
+
+    class Inputs:
+        corpus = Input('Corpus', Corpus)
+
+    class Outputs:
+        new_corpus = Output('Corpus', Corpus)
+
+    class Error(OWWidget.Error):
+        no_connection = Msg("No internet connection. "
+                            "Please establish a connection or "
+                            "use another vectorizer.")
+        unexpected_error = Msg('Embedding error: {}')
+
+    class Warning(OWWidget.Warning):
+        unsuccessful_embeddings = Msg('Some embeddings were unsuccessful.')
+
+    language = Setting(default='English')
+    aggregator = Setting(default='mean')
+
+    def __init__(self):
+        OWWidget.__init__(self)
+        ConcurrentWidgetMixin.__init__(self)
+
+        self.languages = list(LANGS_TO_ISO.keys())
+        self.aggregators = AGGREGATORS
+        self.corpus = None
+        self.new_corpus = None
+        self._setup_layout()
+
+    def _setup_layout(self):
+        """Build the settings box, the Apply control and Cancel button."""
+        self.controlArea.setMinimumWidth(self.controlArea.sizeHint().width())
+
+        widget_box = widgetBox(self.controlArea, 'Settings')
+
+        self.language_cb = comboBox(widget=widget_box,
+                                    master=self,
+                                    value='language',
+                                    label='Language: ',
+                                    orientation=Qt.Horizontal,
+                                    items=self.languages,
+                                    callback=self._option_changed)
+
+        self.aggregator_cb = comboBox(widget=widget_box,
+                                      master=self,
+                                      value='aggregator',
+                                      label='Aggregator: ',
+                                      orientation=Qt.Horizontal,
+                                      items=self.aggregators,
+                                      callback=self._option_changed)
+
+        self.auto_commit_widget = auto_commit(widget=self.controlArea,
+                                              master=self,
+                                              value='_auto_apply',
+                                              label='Apply',
+                                              commit=self.commit)
+
+        self.cancel_button = QPushButton(
+            'Cancel',
+            icon=self.style().standardIcon(QStyle.SP_DialogCancelButton))
+        self.cancel_button.clicked.connect(self.cancel)
+
+        hbox = hBox(self.controlArea)
+        hbox.layout().addWidget(self.cancel_button)
+        self.cancel_button.setDisabled(True)
+
+    def set_input_corpus_summary(self, corpus):
+        """Set the input summary to the number of documents."""
+        if corpus is None:
+            self.info.set_input_summary(self.info.NoInput)
+        else:
+            self.info.set_input_summary(
+                "{} documents.".format(len(corpus)))
+
+    def set_output_corpus_summary(self, corpus):
+        """Set the output summary and warn about failed embeddings."""
+        if corpus is None:
+            self.info.set_output_summary(self.info.NoOutput)
+            return
+
+        # A failed document contributes EMB_DIM NaN cells. Converted to
+        # int so the summary reads "3" rather than "3.0".
+        unsuccessful = int(np.sum(np.isnan(corpus.X)) / EMB_DIM)
+        if unsuccessful > 0:
+            self.Warning.unsuccessful_embeddings()
+        self.info.set_output_summary(
+            "Successful: {}, Unsuccessful: {}"
+            .format(len(corpus) - unsuccessful, unsuccessful))
+
+    @Inputs.corpus
+    def set_data(self, data):
+        """Store the input corpus and embed it."""
+        self.Warning.clear()
+        self.set_input_corpus_summary(data)
+
+        if not data:
+            self.corpus = None
+            self.clear_outputs()
+            return
+
+        self.corpus = data
+        self.commit()
+
+    def _option_changed(self):
+        self.commit()
+
+    def commit(self):
+        """Start the embedding task for the current corpus."""
+        if self.corpus is None:
+            self.clear_outputs()
+            return
+
+        # Drop messages of the previous run before starting a new one.
+        # Clearing after the try block would immediately erase any error
+        # reported by the handlers below.
+        self.Error.clear()
+        self.Warning.clear()
+        self._set_fields(False)
+
+        try:
+            self.start(run_pretrained_embedder,
+                       self.corpus,
+                       LANGS_TO_ISO[self.language],
+                       self.aggregator)
+        except EmbeddingConnectionError:
+            self.Error.no_connection()
+            self.cancel()
+        except Exception as ex:
+            self.Error.unexpected_error(type(ex).__name__)
+            self.cancel()
+
+    def on_done(self, result: Any) -> None:
+        """Re-enable the controls and send the result to the output."""
+        self._set_fields(True)
+        if result is None:
+            # run_pretrained_embedder returns None when it caught an
+            # EmbeddingConnectionError.
+            self.Error.no_connection()
+        self._send_output_signals(result)
+
+    def on_partial_results(self, result: Any):
+        """Cancel the task and report a connection error."""
+        self.cancel()
+        self.Error.no_connection()
+
+    def on_exception(self, ex: Exception):
+        """Report the exception type and clear the output."""
+        # The task is over, so hand the controls back to the user.
+        self._set_fields(True)
+        self.Error.unexpected_error(type(ex).__name__)
+        self.clear_outputs()
+
+    def cancel(self):
+        """Re-enable the controls and cancel the running task."""
+        self._set_fields(True)
+        super().cancel()
+
+    def _set_fields(self, active):
+        """Enable the controls (and disable Cancel), or the reverse."""
+        self.auto_commit_widget.setDisabled(not active)
+        self.cancel_button.setDisabled(active)
+        self.language_cb.setDisabled(not active)
+        self.aggregator_cb.setDisabled(not active)
+
+    def _send_output_signals(self, result):
+        self.Outputs.new_corpus.send(result)
+        self.set_output_corpus_summary(result)
+
+    def clear_outputs(self):
+        """Send None to the output."""
+        self._send_output_signals(None)
+
+    def onDeleteWidget(self):
+        self.cancel()
+        super().onDeleteWidget()
+
+
+if __name__ == '__main__':
+    app = QApplication([])
+    widget = OWPretrainedEmbedder()
+    widget.show()
+    corpus = Corpus.from_file('book-excerpts')
+    widget.set_data(corpus)
+    app.exec()
\ No newline at end of file