Skip to content

Commit

Permalink
Import Documents: Import from URL
Browse files Browse the repository at this point in the history
  • Loading branch information
VesnaT committed Apr 8, 2021
1 parent 8982a5b commit d58b7e0
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 18 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def __init__(self, path, *args):

def read_file(self):
path, name = os.path.split(self.filename)
self.filename = os.path.join(path, quote(name))
self.filename = f"{path}/{quote(name)}"
self.filename = self._trim(self._resolve_redirects(self.filename))
with contextlib.closing(self.urlopen(self.filename)) as response:
name = self._suggest_filename(
Expand Down
88 changes: 72 additions & 16 deletions orangecontrib/text/widgets/owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import warnings
import logging
import traceback
from urllib.parse import urlparse

from types import SimpleNamespace as namespace
from concurrent.futures._base import TimeoutError
Expand All @@ -22,11 +23,14 @@
from AnyQt.QtWidgets import (
QAction, QPushButton, QComboBox, QApplication, QStyle, QFileDialog,
QFileIconProvider, QStackedWidget, QProgressBar, QWidget, QHBoxLayout,
QVBoxLayout, QLabel
QVBoxLayout, QLabel, QGridLayout, QSizePolicy, QCompleter
)

from orangewidget.utils.itemmodels import PyListModel

from Orange.data import Table, Domain, StringVariable
from Orange.widgets import widget, gui, settings
from Orange.widgets.data.owfile import LineEditSelectOnFocus
from Orange.widgets.utils.filedialogs import RecentPath
from Orange.widgets.utils.concurrent import (
ThreadExecutor, FutureWatcher, methodinvoke
Expand Down Expand Up @@ -91,9 +95,12 @@ class Outputs:
data = Output("Corpus", Corpus)
skipped_documents = Output("Skipped documents", Table)

LOCAL_FILE, URL = range(2)
source = settings.Setting(LOCAL_FILE)
#: list of recent paths
recent_paths: List[RecentPath] = settings.Setting([])
currentPath: Optional[str] = settings.Setting(None)
recent_urls: List[str] = settings.Setting([])

want_main_area = False
resizing_enabled = False
Expand All @@ -116,8 +123,18 @@ def __init__(self):
self.__invalidated = False
self.__pendingTask = None

vbox = gui.vBox(self.controlArea)
hbox = gui.hBox(vbox)
layout = QGridLayout()
layout.setSpacing(4)
gui.widgetBox(self.controlArea, orientation=layout, box='Source')
source_box = gui.radioButtons(None, self, "source", box=True,
callback=self.start, addToLayout=False)
rb_button = gui.appendRadioButton(source_box, "File:",
addToLayout=False)
layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter)

box = gui.hBox(None, addToLayout=False, margin=0)
box.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)

self.recent_cb = QComboBox(
sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLengthWithIcon,
minimumContentsLength=16,
Expand Down Expand Up @@ -148,25 +165,50 @@ def __init__(self):
browseaction.iconText(),
icon=browseaction.icon(),
toolTip=browseaction.toolTip(),
clicked=browseaction.trigger
clicked=browseaction.trigger,
default=False,
autoDefault=False,
)
reloadbutton = QPushButton(
reloadaction.iconText(),
icon=reloadaction.icon(),
clicked=reloadaction.trigger,
default=True,
default=False,
autoDefault=False,
)

hbox.layout().addWidget(self.recent_cb)
hbox.layout().addWidget(browsebutton)
hbox.layout().addWidget(reloadbutton)
box.layout().addWidget(self.recent_cb)
layout.addWidget(box, 0, 1)
layout.addWidget(browsebutton, 0, 2)
layout.addWidget(reloadbutton, 0, 3)

rb_button = gui.appendRadioButton(source_box, "URL:", addToLayout=False)
layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter)

self.url_combo = url_combo = QComboBox()
url_model = PyListModel()
url_model.wrap(self.recent_urls)
url_combo.setLineEdit(LineEditSelectOnFocus())
url_combo.setModel(url_model)
url_combo.setSizePolicy(QSizePolicy.Ignored, QSizePolicy.Fixed)
url_combo.setEditable(True)
url_combo.setInsertPolicy(url_combo.InsertAtTop)
url_edit = url_combo.lineEdit()
l, t, r, b = url_edit.getTextMargins()
url_edit.setTextMargins(l + 5, t, r, b)
layout.addWidget(url_combo, 3, 1, 1, 3)
url_combo.activated.connect(self._url_set)
# with a custom completer we make the combo box case sensitive when
# matching against the history
completer = QCompleter()
completer.setCaseSensitivity(Qt.CaseSensitive)
url_combo.setCompleter(completer)

self.addActions([browseaction, reloadaction])

reloadaction.changed.connect(
lambda: reloadbutton.setEnabled(reloadaction.isEnabled())
)
box = gui.vBox(vbox, "Info")
box = gui.vBox(self.controlArea, "Info")
self.infostack = QStackedWidget()

self.info_area = QLabel(
Expand All @@ -179,6 +221,8 @@ def __init__(self):
self.cancel_button = QPushButton(
"Cancel",
icon=self.style().standardIcon(QStyle.SP_DialogCancelButton),
default=False,
autoDefault=False,
)
self.cancel_button.clicked.connect(self.cancel)

Expand Down Expand Up @@ -210,6 +254,17 @@ def __init__(self):

QApplication.postEvent(self, QEvent(RuntimeEvent.Init))

def _url_set(self):
    """Normalize the URL entered in the URL combo box and start importing.

    The text currently shown in ``self.url_combo`` is stripped of
    surrounding whitespace and prefixed with ``http://`` when it has no
    scheme. If normalization changed the text, the matching entry in
    ``self.recent_urls`` and the corresponding combo box item are
    rewritten so the history shows the URL that is actually loaded.
    Finally the source is switched to ``self.URL`` and ``self.start()``
    is called.

    Input consisting only of whitespace is ignored: nothing is modified
    and no import is started.
    """
    raw = self.url_combo.currentText()
    url = raw.strip()
    if not url:
        # Without this guard a blank entry would become the bare string
        # "http://" and an import of it would be started.
        return

    if not urlparse(url).scheme:
        url = "http://" + url

    if url != raw:
        # The current text is not guaranteed to be in the history (e.g.
        # when this method is called programmatically), so a missing entry
        # must not abort the load with a ValueError.
        try:
            pos = self.recent_urls.index(raw)
        except ValueError:
            pass
        else:
            self.url_combo.setItemText(pos, url)
            self.recent_urls[pos] = url

    self.source = self.URL
    self.start()

def __initRecentItemsModel(self):
if self.currentPath is not None and \
not os.path.isdir(self.currentPath):
Expand Down Expand Up @@ -370,7 +425,7 @@ def setCurrentPath(self, path):

if self.__state == State.Processing:
self.cancel()

self.source = self.LOCAL_FILE
return success

def addRecentPath(self, path):
Expand Down Expand Up @@ -447,7 +502,7 @@ def reload(self):
"""
if self.__state == State.Processing:
self.cancel()

self.source = self.LOCAL_FILE
self.corpus = None
self.start()

Expand All @@ -460,7 +515,9 @@ def start(self):
self.progress_widget.setValue(0)

self.__invalidated = False
if self.currentPath is None:
startdir = self.currentPath if self.source == self.LOCAL_FILE \
else self.url_combo.currentText().strip()
if not startdir:
return

if self.__state == State.Processing:
Expand All @@ -470,14 +527,13 @@ def start(self):
.format(self.__pendingTask.startdir))
self.cancel()

startdir = self.currentPath

self.__setRuntimeState(State.Processing)

report_progress = methodinvoke(
self, "__onReportProgress", (object,))

task = ImportDocuments(startdir, report_progress=report_progress)
task = ImportDocuments(startdir, self.source == self.URL,
report_progress=report_progress)

# collect the task state in one convenient place
self.__pendingTask = taskstate = namespace(
Expand Down
13 changes: 13 additions & 0 deletions orangecontrib/text/widgets/tests/test_owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ def test_load_empty_folder(self):
self.wait_until_finished(widget=widget)
self.assertIsNone(self.get_output(widget.Outputs.data))

# Loads documents from a remote folder; skipped because it runs into the
# timeout (see the skip reason below).
@unittest.skip("Due to timeout")
def test_load_from_url(self):
# Put the same address into the URL history and the combo box:
# _url_set looks the combo box text up in recent_urls.
url = "http://file.biolab.si/text-semantics/data/semeval/"
self.widget.recent_urls = [url]
self.widget.url_combo.setCurrentText(url)
# _url_set switches the widget to the URL source and starts the import.
self.widget._url_set()
self.wait_until_finished(timeout=20000)
# The folder is expected to yield 100 documents ...
corpus = self.get_output(self.widget.Outputs.data)
self.assertEqual(len(corpus), 100)
# ... and no skipped documents.
skipped = self.get_output(self.widget.Outputs.skipped_documents)
self.assertIsNone(skipped)



if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ numpy
python-dateutil<3.0.0 # dependency for botocore
gensim>=0.12.3 # LDA's show topics unified in 0.12.3
setuptools-git
Orange3 >=3.25.0
Orange3 >=3.28.0
tweepy
beautifulsoup4
simhash >=1.11
Expand Down

0 comments on commit d58b7e0

Please sign in to comment.