Skip to content

Commit

Permalink
Import Documents: Import from URL
Browse files Browse the repository at this point in the history
  • Loading branch information
VesnaT committed Apr 28, 2021
1 parent bcc8042 commit f1d0dcf
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 30 deletions.
9 changes: 6 additions & 3 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,7 @@ def __init__(self, path, *args):
Reader.__init__(self, self.filename, *args)

def read_file(self):
path, name = os.path.split(self.filename)
self.filename = os.path.join(path, quote(name))
self.filename = quote(self.filename, safe="/:")
self.filename = self._trim(self._resolve_redirects(self.filename))
with contextlib.closing(self.urlopen(self.filename)) as response:
name = self._suggest_filename(
Expand All @@ -216,6 +215,10 @@ def __init__(self, startdir: str,
is_url: bool = False,
formats: Tuple[str] = DefaultFormats,
report_progress: Callable = None):
if is_url and not startdir.endswith("/"):
startdir += "/"
elif not is_url:
startdir = os.path.join(startdir, "")
self.startdir = startdir
self.formats = formats
self._report_progress = report_progress
Expand Down Expand Up @@ -394,7 +397,7 @@ def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",),
include_patterns = include_patterns or ("*",)
paths = []
for filename in files:
path = os.path.join(topdir, os.path.join(*filename))
path = topdir + "/".join(filename)
if matches_any(path, include_patterns) and \
not matches_any(path, exclude_patterns):
paths.append(path)
Expand Down
46 changes: 37 additions & 9 deletions orangecontrib/text/tests/test_import_documents.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest
from unittest.mock import patch

import numpy as np
import pandas as pd

from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \
Expand Down Expand Up @@ -56,19 +57,19 @@ def test_scan_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path)
self.assertEqual(len(paths), 101)
self.assertGreater(len(paths), 0)

def test_scan_url_txt(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path, include_patterns=["*.txt"])
self.assertEqual(len(paths), 100)
self.assertGreater(len(paths), 0)

def test_scan_url_csv(self):
path = "http://file.biolab.si/text-semantics/data/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path, include_patterns=["*.csv"])
self.assertEqual(len(paths), 6)
self.assertGreater(len(paths), 0)

def test_read_meta_data_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
Expand All @@ -77,8 +78,8 @@ def test_read_meta_data_url(self):
self.assertIsInstance(data1, pd.DataFrame)
self.assertEqual(len(err), 0)

@patch("orangecontrib.text.import_documents.ImportDocuments."
"META_DATA_FILE_KEY", "File")
# @patch("orangecontrib.text.import_documents.ImportDocuments."
# "META_DATA_FILE_KEY", "File")
def test_merge_metadata_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
Expand All @@ -89,24 +90,51 @@ def test_merge_metadata_url(self):
importer._meta_data = meta_data[:50]
corpus = importer._create_corpus()
corpus = importer._add_metadata(corpus)
self.assertEqual(len(corpus), 4)
columns = ["name", "path", "content", "Content", "File", "Keywords"]
self.assertGreater(len(corpus), 0)
columns = ["name", "path", "content", "Content",
"Text file", "Keywords"]
self.assertEqual([v.name for v in corpus.domain.metas], columns)

importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18'
importer._meta_data = None
corpus = importer._create_corpus()
corpus = importer._add_metadata(corpus)
self.assertEqual(len(corpus), 4)
self.assertGreater(len(corpus), 0)
columns = ["name", "path", "content"]
self.assertEqual([v.name for v in corpus.domain.metas], columns)

def test_run_url(self):
    """Importing from a URL must not depend on its trailing slashes.

    The same remote folder is imported three times: with one trailing
    slash, with several, and with none. Each import must produce a
    non-empty corpus, and the metas must agree across all three.
    """
    path = "http://file.biolab.si/text-semantics/data" \
           "/predlogi-vladi-sample/"
    importer = ImportDocuments(path, True)
    corpus1, _ = importer.run()
    self.assertGreater(len(corpus1), 0)

    # Meta column 1 is left out of the comparison.
    # NOTE(review): presumably this is the "path" column, which differs
    # between the runs because the start URL differs -- confirm.
    mask = np.ones_like(corpus1.metas, dtype=bool)
    mask[:, 1] = False

    path = "http://file.biolab.si/text-semantics/data" \
           "/predlogi-vladi-sample////"
    importer = ImportDocuments(path, True)
    corpus2, _ = importer.run()
    # Check the corpus that was just built (previously re-checked corpus1).
    self.assertGreater(len(corpus2), 0)
    self.assertEqual(corpus1.metas[mask].tolist(),
                     corpus2.metas[mask].tolist())

    path = "http://file.biolab.si/text-semantics/data" \
           "/predlogi-vladi-sample"
    importer = ImportDocuments(path, True)
    corpus3, _ = importer.run()
    # Check the corpus that was just built (previously checked corpus2).
    self.assertGreater(len(corpus3), 0)
    self.assertEqual(corpus1.metas[mask].tolist(),
                     corpus3.metas[mask].tolist())

def test_run_url_special_characters(self):
path = "http://file.biolab.si/text-semantics/data/" \
"elektrotehniski-vestnik-clanki/"
importer = ImportDocuments(path, True)
corpus, errors = importer.run()
self.assertEqual(len(corpus), 382)
self.assertGreater(len(corpus), 0)


if __name__ == "__main__":
Expand Down
91 changes: 74 additions & 17 deletions orangecontrib/text/widgets/owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import warnings
import logging
import traceback
from urllib.parse import urlparse

from types import SimpleNamespace as namespace
from concurrent.futures._base import TimeoutError
Expand All @@ -22,11 +23,14 @@
from AnyQt.QtWidgets import (
QAction, QPushButton, QComboBox, QApplication, QStyle, QFileDialog,
QFileIconProvider, QStackedWidget, QProgressBar, QWidget, QHBoxLayout,
QVBoxLayout, QLabel
QVBoxLayout, QLabel, QGridLayout, QSizePolicy, QCompleter
)

from orangewidget.utils.itemmodels import PyListModel

from Orange.data import Table, Domain, StringVariable
from Orange.widgets import widget, gui, settings
from Orange.widgets.data.owfile import LineEditSelectOnFocus
from Orange.widgets.utils.filedialogs import RecentPath
from Orange.widgets.utils.concurrent import (
ThreadExecutor, FutureWatcher, methodinvoke
Expand Down Expand Up @@ -91,9 +95,12 @@ class Outputs:
data = Output("Corpus", Corpus)
skipped_documents = Output("Skipped documents", Table)

LOCAL_FILE, URL = range(2)
source = settings.Setting(LOCAL_FILE)
#: list of recent paths
recent_paths: List[RecentPath] = settings.Setting([])
currentPath: Optional[str] = settings.Setting(None)
recent_urls: List[str] = settings.Setting([])

want_main_area = False
resizing_enabled = False
Expand All @@ -116,8 +123,18 @@ def __init__(self):
self.__invalidated = False
self.__pendingTask = None

vbox = gui.vBox(self.controlArea)
hbox = gui.hBox(vbox)
layout = QGridLayout()
layout.setSpacing(4)
gui.widgetBox(self.controlArea, orientation=layout, box='Source')
source_box = gui.radioButtons(None, self, "source", box=True,
callback=self.start, addToLayout=False)
rb_button = gui.appendRadioButton(source_box, "Folder:",
addToLayout=False)
layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

box = gui.hBox(None, addToLayout=False, margin=0)
box.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)

self.recent_cb = QComboBox(
sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLengthWithIcon,
minimumContentsLength=16,
Expand Down Expand Up @@ -148,25 +165,50 @@ def __init__(self):
browseaction.iconText(),
icon=browseaction.icon(),
toolTip=browseaction.toolTip(),
clicked=browseaction.trigger
clicked=browseaction.trigger,
default=False,
autoDefault=False,
)
reloadbutton = QPushButton(
reloadaction.iconText(),
icon=reloadaction.icon(),
clicked=reloadaction.trigger,
default=True,
default=False,
autoDefault=False,
)

hbox.layout().addWidget(self.recent_cb)
hbox.layout().addWidget(browsebutton)
hbox.layout().addWidget(reloadbutton)
box.layout().addWidget(self.recent_cb)
layout.addWidget(box, 0, 1)
layout.addWidget(browsebutton, 0, 2)
layout.addWidget(reloadbutton, 0, 3)

rb_button = gui.appendRadioButton(source_box, "URL:", addToLayout=False)
layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter)

self.url_combo = url_combo = QComboBox()
url_model = PyListModel()
url_model.wrap(self.recent_urls)
url_combo.setLineEdit(LineEditSelectOnFocus())
url_combo.setModel(url_model)
url_combo.setSizePolicy(QSizePolicy.Ignored, QSizePolicy.Fixed)
url_combo.setEditable(True)
url_combo.setInsertPolicy(url_combo.InsertAtTop)
url_edit = url_combo.lineEdit()
l, t, r, b = url_edit.getTextMargins()
url_edit.setTextMargins(l + 5, t, r, b)
layout.addWidget(url_combo, 3, 1, 1, 3)
url_combo.activated.connect(self._url_set)
# with a completer we make the combo box case sensitive when
# matching the history
completer = QCompleter()
completer.setCaseSensitivity(Qt.CaseSensitive)
url_combo.setCompleter(completer)

self.addActions([browseaction, reloadaction])

reloadaction.changed.connect(
lambda: reloadbutton.setEnabled(reloadaction.isEnabled())
)
box = gui.vBox(vbox, "Info")
box = gui.vBox(self.controlArea, "Info")
self.infostack = QStackedWidget()

self.info_area = QLabel(
Expand All @@ -179,6 +221,8 @@ def __init__(self):
self.cancel_button = QPushButton(
"Cancel",
icon=self.style().standardIcon(QStyle.SP_DialogCancelButton),
default=False,
autoDefault=False,
)
self.cancel_button.clicked.connect(self.cancel)

Expand Down Expand Up @@ -210,6 +254,17 @@ def __init__(self):

QApplication.postEvent(self, QEvent(RuntimeEvent.Init))

# Slot connected to the URL combo box's `activated` signal: normalise the
# entered URL, write it back to the combo box and the URL history, switch
# the source to URL and call start().
def _url_set(self):
url = self.url_combo.currentText()
# Look the text up before stripping so it matches the stored entry;
# list.index raises ValueError if the text is not in recent_urls.
pos = self.recent_urls.index(url)
url = url.strip()
# No scheme in the entered text: default to plain http.
if not urlparse(url).scheme:
url = "http://" + url
# NOTE(review): indentation is lost in this view, so it is unclear whether
# the next two lines belong to the `if` body above -- confirm in the file.
self.url_combo.setItemText(pos, url)
self.recent_urls[pos] = url
self.source = self.URL
self.start()

def __initRecentItemsModel(self):
if self.currentPath is not None and \
not os.path.isdir(self.currentPath):
Expand Down Expand Up @@ -336,7 +391,8 @@ def setCurrentPath(self, path):
"""
if self.currentPath is not None and path is not None and \
os.path.isdir(self.currentPath) and os.path.isdir(path) and \
os.path.samefile(self.currentPath, path):
os.path.samefile(self.currentPath, path) and \
self.source == self.LOCAL_FILE:
return True

success = True
Expand Down Expand Up @@ -370,7 +426,7 @@ def setCurrentPath(self, path):

if self.__state == State.Processing:
self.cancel()

self.source = self.LOCAL_FILE
return success

def addRecentPath(self, path):
Expand Down Expand Up @@ -447,7 +503,7 @@ def reload(self):
"""
if self.__state == State.Processing:
self.cancel()

self.source = self.LOCAL_FILE
self.corpus = None
self.start()

Expand All @@ -460,7 +516,9 @@ def start(self):
self.progress_widget.setValue(0)

self.__invalidated = False
if self.currentPath is None:
startdir = self.currentPath if self.source == self.LOCAL_FILE \
else self.url_combo.currentText().strip()
if not startdir:
return

if self.__state == State.Processing:
Expand All @@ -470,14 +528,13 @@ def start(self):
.format(self.__pendingTask.startdir))
self.cancel()

startdir = self.currentPath

self.__setRuntimeState(State.Processing)

report_progress = methodinvoke(
self, "__onReportProgress", (object,))

task = ImportDocuments(startdir, report_progress=report_progress)
task = ImportDocuments(startdir, self.source == self.URL,
report_progress=report_progress)

# collect the task state in one convenient place
self.__pendingTask = taskstate = namespace(
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ numpy
python-dateutil<3.0.0 # dependency for botocore
gensim>=0.12.3 # LDA's show topics unified in 0.12.3
setuptools-git
Orange3 >=3.25.0
Orange3 >=3.28.0
tweepy
beautifulsoup4
simhash >=1.11
Expand Down

0 comments on commit f1d0dcf

Please sign in to comment.