Skip to content

Commit

Permalink
Corpus widget - language dropdown
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Nov 4, 2022
1 parent ed37e83 commit e69636c
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 6 deletions.
96 changes: 90 additions & 6 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import numpy as np

import numpy as np
from AnyQt.QtCore import Qt
from Orange.data import Table, StringVariable, Variable
from Orange.data.io import FileFormat
from Orange.widgets import gui
Expand All @@ -10,10 +11,65 @@
DomainContextHandler
from Orange.widgets.widget import OWWidget, Msg, Input, Output
from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin
from orangecanvas.gui.utils import disconnected
from orangewidget.settings import ContextHandler

from orangecontrib.text.corpus import Corpus, get_sample_corpora_dir
from orangecontrib.text.language import (
LANG2ISO,
detect_language,
ISO2LANG,
LanguageModel,
)
from orangecontrib.text.widgets.utils import widgets, QSize


class CorpusContextHandler(DomainContextHandler):
    """
    Context handler that matches on the corpus documents as well as the domain.

    Corpus enables language selection, and language depends on the documents
    rather than on the domain, so a specific handler is required. A context
    matches when the selected attributes are the same and the hash of the
    documents is the same.

    Note: With this modification context matching is stricter. It was discussed
    that in this case two contexts would be required: one for attributes and
    one for language. The idea is that in the future context handlers are
    implemented such that a specific matcher can be set for a specific setting
    (e.g. language).
    """

    @staticmethod
    def _documents_hash(corpus):
        """
        Return a hex digest identifying the documents of ``corpus``.

        The built-in ``hash()`` of strings is salted per interpreter process
        (hash randomization), so a value computed with it cannot be compared
        with one computed in another session. A ``hashlib`` digest is stable,
        which lets a stored context be recognised again later.

        NOTE(review): assumes ``corpus.documents`` yields strings; other
        objects are hashed through their ``str()`` representation.
        """
        # local import keeps this block self-contained
        import hashlib

        digest = hashlib.sha256()
        for document in corpus.documents:
            encoded = str(document).encode("utf-8", errors="surrogatepass")
            # length prefix keeps document boundaries unambiguous, so
            # ["ab", "c"] and ["a", "bc"] produce different digests
            digest.update(b"%d:" % len(encoded))
            digest.update(encoded)
        return digest.hexdigest()

    def open_context(self, widget, corpus):
        """
        Open a context for ``corpus``.

        Calls ``ContextHandler.open_context`` directly so that the complete
        corpus, not only its domain, is propagated - required for hash
        computation. Does nothing when ``corpus`` is None.
        """
        if corpus is None:
            return
        ContextHandler.open_context(
            self, widget, corpus, *self.encode_domain(corpus.domain)
        )

    def new_context(self, corpus, attributes, metas):
        """Create a new context and add the hash of the documents to it."""
        context = super().new_context(corpus, attributes, metas)
        context.documents_hash = self._documents_hash(corpus)
        return context

    def match(self, context, corpus, attrs, metas):
        """
        Return the match score of ``context`` for ``corpus``.

        For a match, documents in the corpus must have the same hash value as
        the one stored in the context and the attributes should match.
        Contexts without a stored hash are matched on the domain only.
        """
        if (
            hasattr(context, "documents_hash")
            and context.documents_hash != self._documents_hash(corpus)
        ):
            return self.NO_MATCH
        return super().match(context, corpus.domain, attrs, metas)

    def decode_setting(self, setting, value, corpus=None, *args):
        """
        Decode a setting using the corpus' domain instead of a domain.

        ``corpus`` defaults to None; in that case None is forwarded as the
        domain instead of failing on ``corpus.domain``.
        NOTE(review): presumably the base handler accepts ``domain=None`` the
        same way its own default does - confirm.
        """
        domain = None if corpus is None else corpus.domain
        return super().decode_setting(setting, value, domain, *args)


class OWCorpus(OWWidget, ConcurrentWidgetMixin):
name = "Corpus"
description = "Load a corpus of text documents."
Expand All @@ -37,7 +93,7 @@ class Outputs:
for f in sorted(set(FileFormat.readers.values()),
key=list(FileFormat.readers.values()).index)))

settingsHandler = DomainContextHandler()
settingsHandler = CorpusContextHandler()

recent_files = Setting([
"book-excerpts.tab",
Expand All @@ -48,6 +104,7 @@ class Outputs:
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")
language: str = ContextSetting("English")

class Error(OWWidget.Error):
read_file = Msg("Can't read file ({})")
Expand All @@ -74,10 +131,29 @@ def __init__(self):
# dropdown to select title variable
self.title_model = DomainModel(
valid_types=(StringVariable,), placeholder="(no title)")
box = gui.vBox(self.controlArea, "Corpus settings")
common_settings = dict(
labelWidth=100,
searchable=True,
orientation=Qt.Horizontal,
callback=self.update_feature_selection,
)
gui.comboBox(
self.controlArea, self, "title_variable",
box="Title variable", model=self.title_model,
callback=self.update_feature_selection
box,
self,
"title_variable",
label="Title variable",
model=self.title_model,
**common_settings
)
gui.comboBox(
box,
self,
"language",
label="Language",
model=LanguageModel(),
sendSelectedValue=True,
**common_settings
)

# Used Text Features
Expand Down Expand Up @@ -145,7 +221,10 @@ def open_file(self, path=None, data=None):
self.Error.clear()
self.cancel()
self.unused_attrs_model[:] = []
self.used_attrs_model[:] = []
with disconnected(
self.used_attrs_model.rowsRemoved, self.update_feature_selection
):
self.used_attrs_model[:] = []
self.start(self._load_corpus, path, data)

def on_done(self, corpus: Corpus) -> None:
Expand All @@ -160,6 +239,9 @@ def on_done(self, corpus: Corpus) -> None:
self.Error.corpus_without_text_features()
self.Outputs.corpus.send(None)
return
# set language on Corpus's language (when corpus with already defined
# language opened) or guess language
self.language = ISO2LANG[corpus.language or detect_language(corpus)]
self.openContext(self.corpus)
self.used_attrs_model.extend(self.used_attrs)
self.unused_attrs_model.extend(
Expand Down Expand Up @@ -225,6 +307,7 @@ def _setup_title_dropdown(self):

def update_feature_selection(self):
self.Error.no_text_features_used.clear()

# duplicated data when reordering inside a single window
def remove_duplicates(l):
unique = []
Expand All @@ -242,6 +325,7 @@ def remove_duplicates(l):
self.Error.no_text_features_used()

self.corpus.set_title_variable(self.title_variable)
self.corpus.attributes["language"] = LANG2ISO[self.language]
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
Expand Down
71 changes: 71 additions & 0 deletions orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
from Orange.data import Table, Domain, StringVariable, ContinuousVariable
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate

from orangecontrib.text import Corpus
from orangecontrib.text.widgets.owcorpus import OWCorpus
Expand Down Expand Up @@ -249,6 +250,76 @@ def test_corpus_without_text_features(self):
self.widget.Error.corpus_without_text_features.is_shown()
)

def test_context(self):
    """
    The language chosen by the user is restored from the context when the
    same corpus is sent again, and is not reused for different documents.
    """

    def load(table):
        # send the table to the widget and wait for loading to finish
        self.send_signal(self.widget.Inputs.data, table)
        self.wait_until_finished()

    def expect_language(name, iso):
        # the widget setting and the output corpus must agree on the language
        self.assertEqual(name, self.widget.language)
        self.assertEqual(iso, self.get_output(self.widget.Outputs.corpus).language)

    books = Table(Corpus.from_file("book-excerpts"))
    books.attributes["language"] = "sl"
    load(books)
    expect_language("Slovenian", "sl")

    # change the language to check later that the context is restored
    simulate.combobox_activate_item(self.widget.controls.language, "Dutch")
    expect_language("Dutch", "nl")

    deerwester = Table(Corpus.from_file("deerwester"))
    load(deerwester)
    expect_language("English", "en")

    # same documents again: the language stored in the context (Dutch) wins
    # over the language attribute of the incoming table
    books.attributes["language"] = "sk"
    load(books)
    expect_language("Dutch", "nl")

    # different documents in corpus (should not match the context)
    subset = books[:10]
    subset.attributes["language"] = "sl"
    load(subset)
    expect_language("Slovenian", "sl")

def test_guess_language(self):
    """
    When the input carries no language the widget guesses it; the context
    still restores a language the user selected for the same documents.
    """

    def load(table):
        # send the table to the widget and wait for loading to finish
        self.send_signal(self.widget.Inputs.data, table)
        self.wait_until_finished()

    def expect_language(name, iso):
        # the widget setting and the output corpus must agree on the language
        self.assertEqual(name, self.widget.language)
        self.assertEqual(iso, self.get_output(self.widget.Outputs.corpus).language)

    books = Table(Corpus.from_file("book-excerpts"))
    # the Table is made from a Corpus, so the language attribute is present
    # in attributes - drop it
    books.attributes = {}
    # change the default to something that is not the corpus's language
    self.widget.language = "Slovenian"
    load(books)
    expect_language("English", "en")

    # change the language to check later that the context is restored
    simulate.combobox_activate_item(self.widget.controls.language, "Dutch")
    expect_language("Dutch", "nl")

    deerwester = Table(Corpus.from_file("deerwester"))
    load(deerwester)
    expect_language("English", "en")

    # same documents again: the language stored in the context (Dutch) wins
    # over the language attribute of the incoming table
    books.attributes["language"] = "sk"
    load(books)
    expect_language("Dutch", "nl")

    # different documents in corpus (should not match the context)
    subset = books[:10]
    subset.attributes["language"] = None
    load(subset)
    expect_language("English", "en")


# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit e69636c

Please sign in to comment.