Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Import documents - language dialog and language guessing #918

Merged
merged 1 commit into from
Dec 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 51 additions & 7 deletions orangecontrib/text/widgets/owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
QVBoxLayout, QLabel, QGridLayout, QSizePolicy, QCompleter
)
from numpy import array
from orangewidget.settings import ContextHandler, Context

from orangewidget.utils.itemmodels import PyListModel

Expand All @@ -37,17 +38,19 @@
ThreadExecutor, FutureWatcher, methodinvoke
)
from Orange.widgets.widget import Output
from orangecanvas.preview.previewbrowser import TextLabel

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.import_documents import ImportDocuments, \
NoDocumentsException

try:
from orangecanvas.preview.previewbrowser import TextLabel
except ImportError:
from Orange.canvas.preview.previewbrowser import TextLabel
from orangecontrib.text.import_documents import ImportDocuments, NoDocumentsException
from orangecontrib.text.language import (
ISO2LANG,
detect_language,
LANG2ISO,
LanguageModel,
)

# domain for skipped documents output

SKIPPED_DOMAIN = Domain([], metas=[
StringVariable("name"),
StringVariable("path")
Expand Down Expand Up @@ -85,6 +88,26 @@ class State(enum.IntEnum):
NoState, Processing, Done, Cancelled, Error = range(5)


class ImportDocumentContextHandler(ContextHandler):
    """Context handler that matches contexts by a digest of the documents.

    A context created for a corpus remembers a digest of that corpus'
    documents; a stored context matches a corpus only when the digests
    are equal.
    """

    @staticmethod
    def corpus_hash(corpus: Corpus) -> int:
        """Compute a process-independent hash of all documents in the Corpus.

        The built-in ``hash`` is salted per interpreter process for ``str``
        values, so a value computed with it can never be compared with one
        produced by another Python process (e.g. a context that was saved
        and is reloaded later). A SHA-256 based digest is deterministic.

        Returns:
            A non-negative 64-bit integer derived from the document texts.
        """
        # Local import keeps the block self-contained; hashlib is stdlib.
        import hashlib

        digest = hashlib.sha256()
        for document in corpus.documents:
            # NOTE(review): documents are presumably ``str``; ``str()`` only
            # guards against other value types - confirm against Corpus.
            encoded = str(document).encode("utf-8", "surrogatepass")
            # The length prefix keeps ["ab", "c"] distinct from ["a", "bc"].
            digest.update(len(encoded).to_bytes(8, "big"))
            digest.update(encoded)
        return int.from_bytes(digest.digest()[:8], "big")

    def new_context(self, corpus: Corpus) -> Context:
        """Create a context tagged with the digest of ``corpus`` documents."""
        context = super().new_context()
        context.documents_hash = self.corpus_hash(corpus)
        return context

    # noinspection PyMethodOverriding
    def match(self, context: Context, corpus: Corpus) -> int:
        """Return PERFECT_MATCH when ``context`` was made for these documents.

        A context without a ``documents_hash`` attribute is treated as not
        matching instead of raising ``AttributeError``.
        """
        stored_hash = getattr(context, "documents_hash", None)
        if stored_hash is not None and stored_hash == self.corpus_hash(corpus):
            return self.PERFECT_MATCH
        return self.NO_MATCH


class OWImportDocuments(widget.OWWidget):
name = "Import Documents"
description = "Import text documents from folders."
Expand All @@ -95,6 +118,8 @@ class Outputs:
data = Output("Corpus", Corpus, default=True)
skipped_documents = Output("Skipped documents", Table)

settingsHandler = ImportDocumentContextHandler()

LOCAL_FILE, URL = range(2)
source = settings.Setting(LOCAL_FILE)
#: list of recent paths
Expand All @@ -104,6 +129,7 @@ class Outputs:
lemma_cb = settings.Setting(True)
pos_cb = settings.Setting(False)
ner_cb = settings.Setting(False)
language: str = settings.ContextSetting("English")

want_main_area = False
resizing_enabled = False
Expand Down Expand Up @@ -217,6 +243,17 @@ def __init__(self):
lambda: reloadbutton.setEnabled(reloadaction.isEnabled())
)

gui.comboBox(
self.controlArea,
self,
"language",
box="Language",
model=LanguageModel(),
sendSelectedValue=True,
searchable=True,
callback=self.commit,
)

box = gui.hBox(self.controlArea, "Conllu import options")
gui.checkBox(box, self, "lemma_cb", "Lemma",
callback=self.commit)
Expand Down Expand Up @@ -535,6 +572,7 @@ def start(self):
self.error()
self.Warning.clear()
self.progress_widget.setValue(0)
self.closeContext()

self.__invalidated = False
startdir = self.currentPath if self.source == self.LOCAL_FILE \
Expand Down Expand Up @@ -624,6 +662,10 @@ def __onRunFinished(self):
self.n_text_data = len(corpus)
self.n_text_categories = len(corpus.domain.class_var.values) \
if corpus.domain.class_var else 0
self.language = ISO2LANG[corpus.language or detect_language(corpus)]
self.openContext(corpus)
else:
self.language = None

self.base_corpus = self.corpus = corpus
self.is_conllu = is_conllu
Expand Down Expand Up @@ -681,6 +723,8 @@ def commit(self):
"""
if self.is_conllu:
self.add_features()
if self.corpus:
self.corpus.attributes["language"] = LANG2ISO[self.language]
self.Outputs.data.send(self.corpus)
if self.skipped_documents:
skipped_table = (
Expand Down
33 changes: 33 additions & 0 deletions orangecontrib/text/widgets/tests/test_owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from unittest.mock import patch, Mock

from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate

from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments


Expand Down Expand Up @@ -117,6 +119,37 @@ def test_load_empty_folder(self):
self.wait_until_finished(widget=widget)
self.assertIsNone(self.get_output(widget.Outputs.data))

def tests_context(self):
    """Check language guessing, manual override and context restoring."""
    self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
    widget = self.widget

    here = os.path.dirname(__file__)
    path = os.path.join(here, "data/documents", "good")
    path1 = os.path.join(here, "data/conllu")

    def import_from(folder):
        # point the widget to the folder and wait for the import to end
        widget.setCurrentPath(folder)
        widget.reload()
        self.wait_until_finished()

    def output_language():
        # language attribute of the corpus currently on the output
        return self.get_output(widget.Outputs.data).language

    # start from a non-default language so that a change is observable
    widget.language = "Slovenian"
    import_from(path)

    # the loaded documents are recognized as English
    self.assertEqual(widget.language, "English")
    self.assertEqual("en", output_language())

    # a manual selection in the combo box is propagated to the output
    simulate.combobox_activate_item(widget.controls.language, "Dutch")
    self.assertEqual(widget.language, "Dutch")
    self.assertEqual("nl", output_language())

    # load different data in between
    import_from(path1)

    # loading the first folder again must restore the stored selection
    import_from(path)
    self.assertEqual(widget.language, "Dutch")
    self.assertEqual("nl", output_language())


if __name__ == "__main__":
unittest.main()