Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] List files that are not loaded #560

Merged
merged 2 commits into from
Sep 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 35 additions & 14 deletions orangecontrib/text/widgets/owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from types import SimpleNamespace as namespace
from concurrent.futures._base import TimeoutError
from typing import List, Optional

from AnyQt.QtCore import Qt, QEvent, QFileInfo, QThread
from AnyQt.QtCore import pyqtSlot as Slot
Expand All @@ -24,6 +25,7 @@
QVBoxLayout, QLabel
)

from Orange.data import Table, Domain, StringVariable
from Orange.widgets import widget, gui, settings
from Orange.widgets.utils.filedialogs import RecentPath
from Orange.widgets.utils.concurrent import (
Expand All @@ -40,6 +42,13 @@
from Orange.canvas.preview.previewbrowser import TextLabel


# Domain of the "Skipped documents" output table: no regular attributes,
# only two string metas holding the skipped file's name and its path.
SKIPPED_DOMAIN = Domain([], metas=[
StringVariable("name"),
StringVariable("path")
])


def prettifypath(path):
home = os.path.expanduser("~/")
if path.startswith(home): # case sensitivity!
Expand Down Expand Up @@ -79,31 +88,29 @@ class OWImportDocuments(widget.OWWidget):

class Outputs:
data = Output("Corpus", Corpus)
skipped_documents = Output("Skipped documents", Table)

#: list of recent paths
recent_paths = settings.Setting([]) # type: List[RecentPath]
currentPath = settings.Setting(None)
recent_paths: List[RecentPath] = settings.Setting([])
currentPath: Optional[str] = settings.Setting(None)

want_main_area = False
resizing_enabled = False

Modality = Qt.ApplicationModal

MaxRecentItems = 20


class Warning(widget.OWWidget.Warning):
# Shown when at least one file failed to load; the placeholder is filled
# with "One file" or "Some files" depending on how many were skipped.
read_error = widget.Msg("{} couldn't be read.")


def __init__(self):
super().__init__()
#: widget's runtime state
self.__state = State.NoState
self.corpus = None
self.n_text_categories = 0
self.n_text_data = 0
self.n_skipped = 0
self.skipped_documents = []

self.__invalidated = False
self.__pendingTask = None
Expand Down Expand Up @@ -169,7 +176,8 @@ def __init__(self):
minimum=0, maximum=100
)
self.cancel_button = QPushButton(
"Cancel", icon=self.style().standardIcon(QStyle.SP_DialogCancelButton),
"Cancel",
icon=self.style().standardIcon(QStyle.SP_DialogCancelButton),
)
self.cancel_button.clicked.connect(self.cancel)

Expand Down Expand Up @@ -286,7 +294,7 @@ def __updateInfo(self):
elif self.__state == State.Done:
nvalid = self.n_text_data
ncategories = self.n_text_categories
n_skipped = self.n_skipped
n_skipped = len(self.skipped_documents)
if ncategories < 2:
text = "{} document{}".format(nvalid, "s" if nvalid != 1 else "")
else:
Expand Down Expand Up @@ -536,10 +544,13 @@ def __onRunFinished(self):
if corpus.domain.class_var else 0

self.corpus = corpus
self.n_skipped = len(errors)
self.corpus.name = "Documents"
self.skipped_documents = errors

if len(errors):
self.Warning.read_error("Some files" if len(errors) > 1 else "One file")
self.Warning.read_error(
"Some files" if len(errors) > 1 else "One file"
)

self.__setRuntimeState(state)
self.commit()
Expand All @@ -561,14 +572,23 @@ def __onReportProgress(self, arg):
assert QThread.currentThread() is self.thread()
if self.__state == State.Processing:
self.pathlabel.setText(prettifypath(arg.lastpath))
self.progress_widget.setValue(arg.progress)
self.progress_widget.setValue(100 * arg.progress)
self.progress_widget.setValue(int(100 * arg.progress))

def commit(self):
    """
    Send the loaded corpus and the table of skipped documents.

    The corpus is sent on the ``data`` output as is. When some files were
    skipped, a table with one row per skipped file (its name and that name
    joined onto ``currentPath``) is sent on the ``skipped_documents``
    output; when nothing was skipped, ``None`` is sent on that output.
    """
    self.Outputs.data.send(self.corpus)

    skipped_table = None
    if self.skipped_documents:
        skipped_table = Table.from_list(
            SKIPPED_DOMAIN,
            [[x, os.path.join(self.currentPath, x)]
             for x in self.skipped_documents]
        )
        # Name the table only when it exists: with nothing skipped the
        # value stays None, and setting ``name`` on None would raise
        # AttributeError.
        skipped_table.name = "Skipped documents"
    self.Outputs.skipped_documents.send(skipped_table)

def onDeleteWidget(self):
self.cancel()
Expand Down Expand Up @@ -615,8 +635,8 @@ def send_report(self):
('Number of documents', self.n_text_data)]
if self.n_text_categories:
items += [('Categories', self.n_text_categories)]
if self.n_skipped:
items += [('Number of skipped', self.n_skipped)]
if self.skipped_documents:
items += [('Number of skipped', len(self.skipped_documents))]
self.report_items(items, )


Expand Down Expand Up @@ -646,5 +666,6 @@ def main(argv=sys.argv):
w.onDeleteWidget()
return 0


if __name__ == "__main__":
sys.exit(main())
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions orangecontrib/text/widgets/tests/data/sample_txt.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a test txt file
75 changes: 75 additions & 0 deletions orangecontrib/text/widgets/tests/test_owimportdocuments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os
import unittest

from Orange.widgets.tests.base import WidgetTest
from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments


class TestOWImportDocuments(WidgetTest):
    """Tests of OWImportDocuments run on the ``data`` folder beside this file."""

    @staticmethod
    def _data_dir() -> str:
        """Return the path of the ``data`` folder next to this module."""
        return os.path.join(os.path.dirname(__file__), "data")

    def setUp(self) -> None:
        """Create the widget, load the data folder and wait for the result."""
        self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
        self.widget.setCurrentPath(self._data_dir())
        self.widget.reload()
        self.wait_until_finished()

    def test_current_path(self):
        """``currentPath`` equals the folder passed to ``setCurrentPath``."""
        self.assertEqual(self._data_dir(), self.widget.currentPath)

    def test_output(self):
        """Check the corpus output and the skipped-documents output."""
        widget_outputs = self.widget.Outputs

        # Corpus of the documents that were read.
        corpus = self.get_output(widget_outputs.data)
        self.assertEqual(4, len(corpus))
        self.assertEqual(3, len(corpus.domain.metas))

        expected_names = [
            "sample_docx", "sample_odt", "sample_pdf", "sample_txt"
        ]
        loaded_names = corpus.get_column_view("name")[0].tolist()
        self.assertListEqual(expected_names, sorted(loaded_names))

        expected_texts = [
            f"This is a test {kind} file"
            for kind in ["docx", "odt", "pdf", "txt"]
        ]
        loaded_texts = corpus.get_column_view("content")[0].tolist()
        self.assertListEqual(
            expected_texts,
            sorted(text.strip() for text in loaded_texts),
        )
        self.assertEqual("content", corpus.text_features[0].name)

        # Table listing the one file that was skipped.
        skipped = self.get_output(widget_outputs.skipped_documents)
        self.assertEqual(1, len(skipped))
        self.assertEqual(2, len(skipped.domain.metas))
        skipped_names = skipped.get_column_view("name")[0].tolist()
        self.assertListEqual(
            ["sample_pdf_corrupted.pdf"],
            sorted(skipped_names),
        )

    def test_could_not_be_read_warning(self):
        """
        sample_pdf_corrupted.pdf is a corrupted file and cannot be loaded
        correctly, so the widget must show the read-error warning.
        """
        warning = self.widget.Warning.read_error
        self.assertTrue(warning.is_shown())
        self.assertEqual("One file couldn't be read.", str(warning))

    def test_send_report(self):
        """Calling ``send_report`` on the loaded widget must not raise."""
        self.widget.send_report()

    def test_info_box(self):
        """The info area text for a loaded widget and for a fresh one."""
        self.assertEqual(
            "4 documents, 1 skipped", self.widget.info_area.text()
        )

        # A freshly created widget has no folder selected yet.
        self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
        self.assertEqual(
            "No document set selected", self.widget.info_area.text()
        )


if __name__ == "__main__":
unittest.main()