Skip to content

Commit

Permalink
Merge pull request #560 from PrimozGodec/list-nor-read
Browse files Browse the repository at this point in the history
[ENH] List files that are not loaded
  • Loading branch information
ajdapretnar authored Sep 4, 2020
2 parents 4027f0b + 49d24f1 commit 4207270
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 14 deletions.
49 changes: 35 additions & 14 deletions orangecontrib/text/widgets/owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from types import SimpleNamespace as namespace
from concurrent.futures._base import TimeoutError
from typing import List, Optional

from AnyQt.QtCore import Qt, QEvent, QFileInfo, QThread
from AnyQt.QtCore import pyqtSlot as Slot
Expand All @@ -24,6 +25,7 @@
QVBoxLayout, QLabel
)

from Orange.data import Table, Domain, StringVariable
from Orange.widgets import widget, gui, settings
from Orange.widgets.utils.filedialogs import RecentPath
from Orange.widgets.utils.concurrent import (
Expand All @@ -40,6 +42,13 @@
from Orange.canvas.preview.previewbrowser import TextLabel


# Domain of the "Skipped documents" output table: no features, only two
# string metas holding each skipped document's name and its path.
SKIPPED_DOMAIN = Domain([], metas=[
    StringVariable("name"),
    StringVariable("path")
])


def prettifypath(path):
home = os.path.expanduser("~/")
if path.startswith(home): # case sensitivity!
Expand Down Expand Up @@ -79,31 +88,29 @@ class OWImportDocuments(widget.OWWidget):

class Outputs:
    # the corpus held in self.corpus, sent by commit()
    data = Output("Corpus", Corpus)
    # table built from self.skipped_documents, sent by commit()
    skipped_documents = Output("Skipped documents", Table)

#: list of recent paths
recent_paths = settings.Setting([]) # type: List[RecentPath]
currentPath = settings.Setting(None)
recent_paths: List[RecentPath] = settings.Setting([])
currentPath: Optional[str] = settings.Setting(None)

want_main_area = False
resizing_enabled = False

Modality = Qt.ApplicationModal

MaxRecentItems = 20


class Warning(widget.OWWidget.Warning):
    # "{}" is filled with "Some files" or "One file" when the run finishes
    # with at least one unreadable document.
    read_error = widget.Msg("{} couldn't be read.")


def __init__(self):
super().__init__()
#: widget's runtime state
self.__state = State.NoState
self.corpus = None
self.n_text_categories = 0
self.n_text_data = 0
self.n_skipped = 0
self.skipped_documents = []

self.__invalidated = False
self.__pendingTask = None
Expand Down Expand Up @@ -169,7 +176,8 @@ def __init__(self):
minimum=0, maximum=100
)
self.cancel_button = QPushButton(
"Cancel", icon=self.style().standardIcon(QStyle.SP_DialogCancelButton),
"Cancel",
icon=self.style().standardIcon(QStyle.SP_DialogCancelButton),
)
self.cancel_button.clicked.connect(self.cancel)

Expand Down Expand Up @@ -286,7 +294,7 @@ def __updateInfo(self):
elif self.__state == State.Done:
nvalid = self.n_text_data
ncategories = self.n_text_categories
n_skipped = self.n_skipped
n_skipped = len(self.skipped_documents)
if ncategories < 2:
text = "{} document{}".format(nvalid, "s" if nvalid != 1 else "")
else:
Expand Down Expand Up @@ -536,10 +544,13 @@ def __onRunFinished(self):
if corpus.domain.class_var else 0

self.corpus = corpus
self.n_skipped = len(errors)
self.corpus.name = "Documents"
self.skipped_documents = errors

if len(errors):
self.Warning.read_error("Some files" if len(errors) > 1 else "One file")
self.Warning.read_error(
"Some files" if len(errors) > 1 else "One file"
)

self.__setRuntimeState(state)
self.commit()
Expand All @@ -561,14 +572,23 @@ def __onReportProgress(self, arg):
assert QThread.currentThread() is self.thread()
if self.__state == State.Processing:
self.pathlabel.setText(prettifypath(arg.lastpath))
self.progress_widget.setValue(arg.progress)
self.progress_widget.setValue(100 * arg.progress)
self.progress_widget.setValue(int(100 * arg.progress))

def commit(self):
    """
    Send the loaded corpus and the table of skipped documents.

    ``self.corpus`` is sent on the ``data`` output. When
    ``self.skipped_documents`` is non-empty, a table over
    ``SKIPPED_DOMAIN`` is built with one row per skipped document (its
    name and the name joined onto ``self.currentPath``) and sent on the
    ``skipped_documents`` output; otherwise ``None`` is sent there.
    """
    self.Outputs.data.send(self.corpus)
    if self.skipped_documents:
        skipped_table = Table.from_list(
            SKIPPED_DOMAIN,
            [[x, os.path.join(self.currentPath, x)]
             for x in self.skipped_documents]
        )
        # Name the table only when it exists: assigning ``.name`` on None
        # raised AttributeError whenever no document was skipped.
        skipped_table.name = "Skipped documents"
    else:
        skipped_table = None
    self.Outputs.skipped_documents.send(skipped_table)

def onDeleteWidget(self):
self.cancel()
Expand Down Expand Up @@ -615,8 +635,8 @@ def send_report(self):
('Number of documents', self.n_text_data)]
if self.n_text_categories:
items += [('Categories', self.n_text_categories)]
if self.n_skipped:
items += [('Number of skipped', self.n_skipped)]
if self.skipped_documents:
items += [('Number of skipped', len(self.skipped_documents))]
self.report_items(items, )


Expand Down Expand Up @@ -646,5 +666,6 @@ def main(argv=sys.argv):
w.onDeleteWidget()
return 0


if __name__ == "__main__":
sys.exit(main())
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions orangecontrib/text/widgets/tests/data/sample_txt.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a test txt file
75 changes: 75 additions & 0 deletions orangecontrib/text/widgets/tests/test_owimportdocuments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os
import unittest

from Orange.widgets.tests.base import WidgetTest
from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments


class TestOWImportDocuments(WidgetTest):
    """Tests for OWImportDocuments run against the bundled ``data`` folder."""

    def setUp(self) -> None:
        # Point a fresh widget at the sample directory and wait for the
        # import to finish so every test starts from a loaded state.
        data_dir = os.path.join(os.path.dirname(__file__), "data")
        self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
        self.widget.setCurrentPath(data_dir)
        self.widget.reload()
        self.wait_until_finished()

    def test_current_path(self):
        expected = os.path.join(os.path.dirname(__file__), "data")
        self.assertEqual(expected, self.widget.currentPath)

    def test_output(self):
        # Corpus output: the four readable sample documents.
        corpus = self.get_output(self.widget.Outputs.data)
        self.assertEqual(4, len(corpus))
        self.assertEqual(3, len(corpus.domain.metas))

        loaded_names = corpus.get_column_view("name")[0].tolist()
        self.assertListEqual(
            ["sample_docx", "sample_odt", "sample_pdf", "sample_txt"],
            sorted(loaded_names),
        )

        contents = corpus.get_column_view("content")[0].tolist()
        expected_texts = [
            f"This is a test {ext} file"
            for ext in ["docx", "odt", "pdf", "txt"]
        ]
        self.assertListEqual(
            expected_texts,
            sorted([text.strip() for text in contents]),
        )
        self.assertEqual("content", corpus.text_features[0].name)

        # Skipped output: the single corrupted PDF.
        skipped = self.get_output(self.widget.Outputs.skipped_documents)
        self.assertEqual(1, len(skipped))
        self.assertEqual(2, len(skipped.domain.metas))
        skipped_names = skipped.get_column_view("name")[0].tolist()
        self.assertListEqual(
            ["sample_pdf_corrupted.pdf"],
            sorted(skipped_names),
        )

    def test_could_not_be_read_warning(self):
        """
        sample_pdf_corrupted.pdf is a corrupted file and cannot be loaded
        correctly, so the widget must show the warning.
        """
        warning = self.widget.Warning.read_error
        self.assertTrue(warning.is_shown())
        self.assertEqual("One file couldn't be read.", str(warning))

    def test_send_report(self):
        # Smoke test: building the report must not raise.
        self.widget.send_report()

    def test_info_box(self):
        loaded_text = self.widget.info_area.text()
        self.assertEqual("4 documents, 1 skipped", loaded_text)

        # empty widget
        self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
        empty_text = self.widget.info_area.text()
        self.assertEqual("No document set selected", empty_text)


if __name__ == "__main__":
unittest.main()

0 comments on commit 4207270

Please sign in to comment.