From 48e08f815abdea6e133122950b5bee2efdab362f Mon Sep 17 00:00:00 2001 From: Vesna Tanko Date: Mon, 29 Mar 2021 09:12:14 +0200 Subject: [PATCH 1/3] Import Documents: Add URL reader --- orangecontrib/text/import_documents.py | 75 +++++++++++++--- .../text/tests/test_import_documents.py | 85 +++++++++++++++++++ 2 files changed, 149 insertions(+), 11 deletions(-) create mode 100644 orangecontrib/text/tests/test_import_documents.py diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 4d2343683..58820fae6 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -1,11 +1,15 @@ +import contextlib import fnmatch import logging import os import pathlib import re +from urllib.parse import quote from collections import namedtuple +from tempfile import NamedTemporaryFile from types import SimpleNamespace as namespace +from typing import List, Tuple, Callable from unicodedata import normalize import numpy as np @@ -20,13 +24,14 @@ from pdfminer.layout import LAParams, LTTextBox, LTTextLine from bs4 import BeautifulSoup +import serverfiles + from Orange.data import DiscreteVariable, Domain, StringVariable -from Orange.data.io import detect_encoding +from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader from Orange.util import Registry from orangecontrib.text.corpus import Corpus - DefaultFormats = ("docx", "odt", "txt", "pdf", "xml") TextData = namedtuple( @@ -156,19 +161,53 @@ def read_file(self): self.content = soup.get_text() +class UrlReader(Reader, CoreUrlReader): + ext = [".url"] + + def __init__(self, path, *args): + CoreUrlReader.__init__(self, path) + Reader.__init__(self, self.filename, *args) + + def read_file(self): + path, name = os.path.split(self.filename) + self.filename = os.path.join(path, quote(name)) + self.filename = self._trim(self._resolve_redirects(self.filename)) + with contextlib.closing(self.urlopen(self.filename)) as response: + name = 
self._suggest_filename( + response.headers["content-disposition"]) + extension = "".join(pathlib.Path(name).suffixes) + with NamedTemporaryFile(suffix=extension, delete=False) as f: + f.write(response.read()) + reader = Reader.get_reader(f.name) + reader.read_file() + self.content = reader.content + os.remove(f.name) + + def make_text_data(self): + text_data = super().make_text_data() + ext = pathlib.Path(self.path).suffix + return TextData(text_data.name, text_data.path, [ext], + text_data.category, text_data.content) + + class ImportDocuments: - def __init__(self, startdir, formats=DefaultFormats, report_progress=None): + def __init__(self, startdir: str, + is_url: bool = False, + formats: Tuple[str] = DefaultFormats, + report_progress: Callable = None): self.startdir = startdir self.formats = formats self._report_progress = report_progress self.cancelled = False self._text_data = [] + self._is_url = is_url - def run(self): + def run(self) -> Tuple[Corpus, List]: text_data = [] errors = [] patterns = ["*.{}".format(fmt.lower()) for fmt in self.formats] - paths = self.scan(self.startdir, include_patterns=patterns) + scan = self.scan_url if self._is_url else self.scan + paths = scan(self.startdir, include_patterns=patterns) n_paths = len(paths) batch = [] @@ -183,7 +222,8 @@ def run(self): batch=batch)) batch = [] - reader = Reader.get_reader(path) + reader = Reader.get_reader(path) if not self._is_url \ + else UrlReader(path) text, error = reader.read() if text is not None: text_data.append(text) @@ -197,7 +237,7 @@ def run(self): self._text_data = text_data return self._create_corpus(), errors - def _create_corpus(self): + def _create_corpus(self) -> Corpus: corpus = None names = ["name", "path", "content"] data = [] @@ -258,10 +298,6 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)): if include_patterns is None: include_patterns = ["*"] - def matches_any(fname, patterns): - return any(fnmatch.fnmatch(fname.lower(), pattern) - for pattern in 
patterns) - paths = [] for dirpath, dirnames, filenames in os.walk(topdir): @@ -275,3 +311,20 @@ def matches_any(fname, patterns): and not matches_any(fname, exclude_patterns)] paths = paths + [os.path.join(dirpath, fname) for fname in filenames] return paths + + @staticmethod + def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",), + exclude_patterns: Tuple[str] = (".*",)) -> List[str]: + include_patterns = include_patterns or ("*",) + paths = [] + for filenames in serverfiles.ServerFiles(topdir).listfiles(): + path = os.path.join(topdir, os.path.join(*filenames)) + if matches_any(path, include_patterns) and \ + not matches_any(path, exclude_patterns): + paths.append(path) + return paths + + +def matches_any(fname: str, patterns: Tuple[str]) -> bool: + return any(fnmatch.fnmatch(fname.lower(), pattern) + for pattern in patterns) diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py new file mode 100644 index 000000000..9131029b5 --- /dev/null +++ b/orangecontrib/text/tests/test_import_documents.py @@ -0,0 +1,85 @@ +import unittest + +from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \ + TxtReader, TextData + + +class TestUrlReader(unittest.TestCase): + def test_init(self): + path = "http://dummy.server.com/data/foo.txt" + reader = UrlReader(path) + self.assertEqual(reader.filename, path) + self.assertEqual(reader.path, path) + + def test_get_reader(self): + path = "http://dummy.server.com/data/foo.txt" + reader = UrlReader.get_reader(path) + self.assertIsInstance(reader, TxtReader) + + def test_read(self): + path = "http://file.biolab.si/text-semantics/data/semeval/C-1.txt" + reader = UrlReader(path) + textdata, error = reader.read() + self.assertIsInstance(textdata, TextData) + self.assertEqual(textdata.name, "C-1") + self.assertEqual(textdata.path, path) + self.assertEqual(textdata.ext, [".txt"]) + self.assertEqual(textdata.category, "semeval") + 
self.assertTrue(textdata.content.startswith("On The Complexity of Co")) + self.assertEqual(error, "") + + def test_read_file(self): + path = "http://file.biolab.si/text-semantics/data/elektrotehniski-" \ + "vestnik-clanki/detektiranje-utrdb-v-šahu-.txt" + reader = UrlReader(path) + reader.read_file() + self.assertIsInstance(reader.content, str) + + def test_name_text_data(self): + path = "http://dummy.server.com/data/foo.txt" + reader = UrlReader(path) + reader.content = "text" + text_data = reader.make_text_data() + self.assertIsInstance(text_data, TextData) + self.assertEqual(text_data.name, "foo") + self.assertEqual(text_data.path, path) + self.assertEqual(text_data.ext, [".txt"]) + self.assertEqual(text_data.category, "data") + self.assertEqual(text_data.content, "text") + + +class TestImportDocuments(unittest.TestCase): + def test_scan_url(self): + path = "http://file.biolab.si/text-semantics/data/semeval/" + importer = ImportDocuments(path, True) + paths = importer.scan_url(path) + print(paths) + + def test_scan_url_txt(self): + path = "http://file.biolab.si/text-semantics/data/semeval/" + importer = ImportDocuments(path, True) + paths = importer.scan_url(path, include_patterns=["*.txt"]) + print(paths) + + def test_scan_url_csv(self): + path = "http://file.biolab.si/text-semantics/data/" + importer = ImportDocuments(path, True) + paths = importer.scan_url(path, include_patterns=["*.csv"]) + print(paths) + + def test_run_url(self): + path = "http://file.biolab.si/text-semantics/data/semeval/" + importer = ImportDocuments(path, True) + res, err = importer.run() + print(res) + + def test_run_url_metadata(self): + path = "http://file.biolab.si/text-semantics/data/semeval/" + importer = ImportDocuments(path, True, formats=["csv"]) + res, err = importer.run() + print(res) + print(err) + + +if __name__ == "__main__": + unittest.main() From bcc8042a6a14ab2c580e3bd9c44b8d095e2a9172 Mon Sep 17 00:00:00 2001 From: Vesna Tanko Date: Tue, 30 Mar 2021 10:01:09 +0200 
Subject: [PATCH 2/3] Import Documents: Read metadata --- orangecontrib/text/import_documents.py | 86 +++++++++++++++++-- .../text/tests/test_import_documents.py | 50 ++++++++--- 2 files changed, 120 insertions(+), 16 deletions(-) diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 58820fae6..89ad7128a 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -4,7 +4,9 @@ import os import pathlib import re +import yaml from urllib.parse import quote +from requests.exceptions import ConnectionError from collections import namedtuple from tempfile import NamedTemporaryFile @@ -13,6 +15,7 @@ from unicodedata import normalize import numpy as np +import pandas as pd import docx2txt from odf.opendocument import load @@ -28,6 +31,7 @@ from Orange.data import DiscreteVariable, Domain, StringVariable from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader +from Orange.data.util import get_unique_names from Orange.util import Registry from orangecontrib.text.corpus import Corpus @@ -161,6 +165,21 @@ def read_file(self): self.content = soup.get_text() +class CsvMetaReader(Reader): + ext = [".csv"] + + def read_file(self): + self.content = pd.read_csv(self.path) + + +class YamlMetaReader(Reader): + ext = [".yaml"] + + def read_file(self): + with open(self.path, "r") as f: + self.content = yaml.safe_load(f) + + class UrlReader(Reader, CoreUrlReader): ext = [".url"] @@ -191,6 +210,8 @@ def make_text_data(self): class ImportDocuments: + META_DATA_FILE_KEY = "Text file" + def __init__(self, startdir: str, is_url: bool = False, formats: Tuple[str] = DefaultFormats, @@ -199,10 +220,18 @@ def __init__(self, startdir: str, self.formats = formats self._report_progress = report_progress self.cancelled = False - self._text_data = [] self._is_url = is_url + self._text_data = [] + self._meta_data: pd.DataFrame = None def run(self) -> Tuple[Corpus, List]: + self._text_data, errors_text = 
self._read_text_data() + self._meta_data, errors_meta = self._read_meta_data() + corpus = self._create_corpus() + corpus = self._add_metadata(corpus) + return corpus, errors_text + errors_meta + + def _read_text_data(self): text_data = [] errors = [] patterns = ["*.{}".format(fmt.lower()) for fmt in self.formats] @@ -234,8 +263,29 @@ def run(self) -> Tuple[Corpus, List]: if self.cancelled: return - self._text_data = text_data - return self._create_corpus(), errors + return text_data, errors + + def _read_meta_data(self): + scan = self.scan_url if self._is_url else self.scan + patterns = ["*.csv", "*.yaml", "*.yml"] + paths = scan(self.startdir, include_patterns=patterns) + meta_dfs, errors = [], [] + for path in paths: + reader = Reader.get_reader(path) if not self._is_url \ + else UrlReader(path) + data, error = reader.read() + if data is not None: + content = data.content + if isinstance(content, dict): + content = pd.DataFrame(content, index=[0]) + meta_dfs.append(content) + else: + errors.append(error) + + if self.cancelled: + return + + return pd.concat(meta_dfs) if meta_dfs else None, errors def _create_corpus(self) -> Corpus: corpus = None @@ -277,6 +327,27 @@ def _create_corpus(self) -> Corpus: return corpus + def _add_metadata(self, corpus: Corpus) -> Corpus: + if "path" not in corpus.domain or self._meta_data is None \ + or self.META_DATA_FILE_KEY not in self._meta_data.columns: + return corpus + + df = self._meta_data.set_index( + self.startdir + self._meta_data[self.META_DATA_FILE_KEY] + ) + path_column = corpus.get_column_view("path")[0] + if len(df.index.drop_duplicates()) != len(df.index): + df = df[~df.index.duplicated(keep='first')] + filtered = df.reindex(path_column) + for column in filtered.columns: + corpus = corpus.add_column( + StringVariable(get_unique_names(corpus.domain, column)), + filtered[column].to_numpy(), + to_metas=True + ) + + return corpus + @staticmethod def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)): """ @@ 
-315,10 +386,15 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)): @staticmethod def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",), exclude_patterns: Tuple[str] = (".*",)) -> List[str]: + try: + files = serverfiles.ServerFiles(topdir).listfiles() + except ConnectionError: + return [] + include_patterns = include_patterns or ("*",) paths = [] - for filenames in serverfiles.ServerFiles(topdir).listfiles(): - path = os.path.join(topdir, os.path.join(*filenames)) + for filename in files: + path = os.path.join(topdir, os.path.join(*filename)) if matches_any(path, include_patterns) and \ not matches_any(path, exclude_patterns): paths.append(path) diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index 9131029b5..9f8a553d2 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -1,4 +1,7 @@ import unittest +from unittest.mock import patch + +import pandas as pd from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \ TxtReader, TextData @@ -53,32 +56,57 @@ def test_scan_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) paths = importer.scan_url(path) - print(paths) + self.assertEqual(len(paths), 101) def test_scan_url_txt(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) paths = importer.scan_url(path, include_patterns=["*.txt"]) - print(paths) + self.assertEqual(len(paths), 100) def test_scan_url_csv(self): path = "http://file.biolab.si/text-semantics/data/" importer = ImportDocuments(path, True) paths = importer.scan_url(path, include_patterns=["*.csv"]) - print(paths) + self.assertEqual(len(paths), 6) - def test_run_url(self): + def test_read_meta_data_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) - res, err 
= importer.run() - print(res) + data1, err = importer._read_meta_data() + self.assertIsInstance(data1, pd.DataFrame) + self.assertEqual(len(err), 0) - def test_run_url_metadata(self): + @patch("orangecontrib.text.import_documents.ImportDocuments." + "META_DATA_FILE_KEY", "File") + def test_merge_metadata_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" - importer = ImportDocuments(path, True, formats=["csv"]) - res, err = importer.run() - print(res) - print(err) + importer = ImportDocuments(path, True) + text_data, _ = importer._read_text_data() + meta_data, _ = importer._read_meta_data() + + importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18' + importer._meta_data = meta_data[:50] + corpus = importer._create_corpus() + corpus = importer._add_metadata(corpus) + self.assertEqual(len(corpus), 4) + columns = ["name", "path", "content", "Content", "File", "Keywords"] + self.assertEqual([v.name for v in corpus.domain.metas], columns) + + importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18' + importer._meta_data = None + corpus = importer._create_corpus() + corpus = importer._add_metadata(corpus) + self.assertEqual(len(corpus), 4) + columns = ["name", "path", "content"] + self.assertEqual([v.name for v in corpus.domain.metas], columns) + + def test_run_url(self): + path = "http://file.biolab.si/text-semantics/data/" \ + "elektrotehniski-vestnik-clanki/" + importer = ImportDocuments(path, True) + corpus, errors = importer.run() + self.assertEqual(len(corpus), 382) if __name__ == "__main__": From f1d0dcfaa6fdbc1081a8ff21b2b2be24b802fe13 Mon Sep 17 00:00:00 2001 From: Vesna Tanko Date: Tue, 30 Mar 2021 13:32:16 +0200 Subject: [PATCH 3/3] Import Documents: Import from URL --- orangecontrib/text/import_documents.py | 9 +- .../text/tests/test_import_documents.py | 46 ++++++++-- .../text/widgets/owimportdocuments.py | 91 +++++++++++++++---- requirements.txt | 2 +- 4 files changed, 118 insertions(+), 30 deletions(-) diff --git 
a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 89ad7128a..9ab427438 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -188,8 +188,7 @@ def __init__(self, path, *args): Reader.__init__(self, self.filename, *args) def read_file(self): - path, name = os.path.split(self.filename) - self.filename = os.path.join(path, quote(name)) + self.filename = quote(self.filename, safe="/:") self.filename = self._trim(self._resolve_redirects(self.filename)) with contextlib.closing(self.urlopen(self.filename)) as response: name = self._suggest_filename( @@ -216,6 +215,10 @@ def __init__(self, startdir: str, is_url: bool = False, formats: Tuple[str] = DefaultFormats, report_progress: Callable = None): + if is_url and not startdir.endswith("/"): + startdir += "/" + elif not is_url: + startdir = os.path.join(startdir, "") self.startdir = startdir self.formats = formats self._report_progress = report_progress @@ -394,7 +397,7 @@ def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",), include_patterns = include_patterns or ("*",) paths = [] for filename in files: - path = os.path.join(topdir, os.path.join(*filename)) + path = topdir + "/".join(filename) if matches_any(path, include_patterns) and \ not matches_any(path, exclude_patterns): paths.append(path) diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index 9f8a553d2..179388827 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -1,6 +1,7 @@ import unittest from unittest.mock import patch +import numpy as np import pandas as pd from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \ @@ -56,19 +57,19 @@ def test_scan_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) paths = importer.scan_url(path) - 
self.assertEqual(len(paths), 101) + self.assertGreater(len(paths), 0) def test_scan_url_txt(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) paths = importer.scan_url(path, include_patterns=["*.txt"]) - self.assertEqual(len(paths), 100) + self.assertGreater(len(paths), 0) def test_scan_url_csv(self): path = "http://file.biolab.si/text-semantics/data/" importer = ImportDocuments(path, True) paths = importer.scan_url(path, include_patterns=["*.csv"]) - self.assertEqual(len(paths), 6) + self.assertGreater(len(paths), 0) def test_read_meta_data_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" @@ -77,8 +78,8 @@ def test_read_meta_data_url(self): self.assertIsInstance(data1, pd.DataFrame) self.assertEqual(len(err), 0) - @patch("orangecontrib.text.import_documents.ImportDocuments." - "META_DATA_FILE_KEY", "File") + # @patch("orangecontrib.text.import_documents.ImportDocuments." + # "META_DATA_FILE_KEY", "File") def test_merge_metadata_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" importer = ImportDocuments(path, True) @@ -89,24 +90,51 @@ def test_merge_metadata_url(self): importer._meta_data = meta_data[:50] corpus = importer._create_corpus() corpus = importer._add_metadata(corpus) - self.assertEqual(len(corpus), 4) - columns = ["name", "path", "content", "Content", "File", "Keywords"] + self.assertGreater(len(corpus), 0) + columns = ["name", "path", "content", "Content", + "Text file", "Keywords"] self.assertEqual([v.name for v in corpus.domain.metas], columns) importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18' importer._meta_data = None corpus = importer._create_corpus() corpus = importer._add_metadata(corpus) - self.assertEqual(len(corpus), 4) + self.assertGreater(len(corpus), 0) columns = ["name", "path", "content"] self.assertEqual([v.name for v in corpus.domain.metas], columns) def test_run_url(self): + path = 
"http://file.biolab.si/text-semantics/data" \ + "/predlogi-vladi-sample/" + importer = ImportDocuments(path, True) + corpus1, _ = importer.run() + self.assertGreater(len(corpus1), 0) + + mask = np.ones_like(corpus1.metas, dtype=bool) + mask[:, 1] = False + + path = "http://file.biolab.si/text-semantics/data" \ + "/predlogi-vladi-sample////" + importer = ImportDocuments(path, True) + corpus2, _ = importer.run() + self.assertGreater(len(corpus1), 0) + self.assertEqual(corpus1.metas[mask].tolist(), + corpus2.metas[mask].tolist()) + + path = "http://file.biolab.si/text-semantics/data" \ + "/predlogi-vladi-sample" + importer = ImportDocuments(path, True) + corpus3, _ = importer.run() + self.assertGreater(len(corpus2), 0) + self.assertEqual(corpus1.metas[mask].tolist(), + corpus3.metas[mask].tolist()) + + def test_run_url_special_characters(self): path = "http://file.biolab.si/text-semantics/data/" \ "elektrotehniski-vestnik-clanki/" importer = ImportDocuments(path, True) corpus, errors = importer.run() - self.assertEqual(len(corpus), 382) + self.assertGreater(len(corpus), 0) if __name__ == "__main__": diff --git a/orangecontrib/text/widgets/owimportdocuments.py b/orangecontrib/text/widgets/owimportdocuments.py index 97656c543..3a7624e14 100644 --- a/orangecontrib/text/widgets/owimportdocuments.py +++ b/orangecontrib/text/widgets/owimportdocuments.py @@ -10,6 +10,7 @@ import warnings import logging import traceback +from urllib.parse import urlparse from types import SimpleNamespace as namespace from concurrent.futures._base import TimeoutError @@ -22,11 +23,14 @@ from AnyQt.QtWidgets import ( QAction, QPushButton, QComboBox, QApplication, QStyle, QFileDialog, QFileIconProvider, QStackedWidget, QProgressBar, QWidget, QHBoxLayout, - QVBoxLayout, QLabel + QVBoxLayout, QLabel, QGridLayout, QSizePolicy, QCompleter ) +from orangewidget.utils.itemmodels import PyListModel + from Orange.data import Table, Domain, StringVariable from Orange.widgets import widget, gui, settings 
+from Orange.widgets.data.owfile import LineEditSelectOnFocus from Orange.widgets.utils.filedialogs import RecentPath from Orange.widgets.utils.concurrent import ( ThreadExecutor, FutureWatcher, methodinvoke @@ -91,9 +95,12 @@ class Outputs: data = Output("Corpus", Corpus) skipped_documents = Output("Skipped documents", Table) + LOCAL_FILE, URL = range(2) + source = settings.Setting(LOCAL_FILE) #: list of recent paths recent_paths: List[RecentPath] = settings.Setting([]) currentPath: Optional[str] = settings.Setting(None) + recent_urls: List[str] = settings.Setting([]) want_main_area = False resizing_enabled = False @@ -116,8 +123,18 @@ def __init__(self): self.__invalidated = False self.__pendingTask = None - vbox = gui.vBox(self.controlArea) - hbox = gui.hBox(vbox) + layout = QGridLayout() + layout.setSpacing(4) + gui.widgetBox(self.controlArea, orientation=layout, box='Source') + source_box = gui.radioButtons(None, self, "source", box=True, + callback=self.start, addToLayout=False) + rb_button = gui.appendRadioButton(source_box, "Folder:", + addToLayout=False) + layout.addWidget(rb_button, 0, 0, Qt.AlignVCenter) + + box = gui.hBox(None, addToLayout=False, margin=0) + box.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed) + self.recent_cb = QComboBox( sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLengthWithIcon, minimumContentsLength=16, @@ -148,25 +165,50 @@ def __init__(self): browseaction.iconText(), icon=browseaction.icon(), toolTip=browseaction.toolTip(), - clicked=browseaction.trigger + clicked=browseaction.trigger, + default=False, + autoDefault=False, ) reloadbutton = QPushButton( reloadaction.iconText(), icon=reloadaction.icon(), clicked=reloadaction.trigger, - default=True, + default=False, + autoDefault=False, ) - - hbox.layout().addWidget(self.recent_cb) - hbox.layout().addWidget(browsebutton) - hbox.layout().addWidget(reloadbutton) + box.layout().addWidget(self.recent_cb) + layout.addWidget(box, 0, 1) + layout.addWidget(browsebutton, 
0, 2) + layout.addWidget(reloadbutton, 0, 3) + + rb_button = gui.appendRadioButton(source_box, "URL:", addToLayout=False) + layout.addWidget(rb_button, 3, 0, Qt.AlignVCenter) + + self.url_combo = url_combo = QComboBox() + url_model = PyListModel() + url_model.wrap(self.recent_urls) + url_combo.setLineEdit(LineEditSelectOnFocus()) + url_combo.setModel(url_model) + url_combo.setSizePolicy(QSizePolicy.Ignored, QSizePolicy.Fixed) + url_combo.setEditable(True) + url_combo.setInsertPolicy(url_combo.InsertAtTop) + url_edit = url_combo.lineEdit() + l, t, r, b = url_edit.getTextMargins() + url_edit.setTextMargins(l + 5, t, r, b) + layout.addWidget(url_combo, 3, 1, 1, 3) + url_combo.activated.connect(self._url_set) + # with completer we set that combo box is case sensitive when + # matching the history + completer = QCompleter() + completer.setCaseSensitivity(Qt.CaseSensitive) + url_combo.setCompleter(completer) self.addActions([browseaction, reloadaction]) reloadaction.changed.connect( lambda: reloadbutton.setEnabled(reloadaction.isEnabled()) ) - box = gui.vBox(vbox, "Info") + box = gui.vBox(self.controlArea, "Info") self.infostack = QStackedWidget() self.info_area = QLabel( @@ -179,6 +221,8 @@ def __init__(self): self.cancel_button = QPushButton( "Cancel", icon=self.style().standardIcon(QStyle.SP_DialogCancelButton), + default=False, + autoDefault=False, ) self.cancel_button.clicked.connect(self.cancel) @@ -210,6 +254,17 @@ def __init__(self): QApplication.postEvent(self, QEvent(RuntimeEvent.Init)) + def _url_set(self): + url = self.url_combo.currentText() + pos = self.recent_urls.index(url) + url = url.strip() + if not urlparse(url).scheme: + url = "http://" + url + self.url_combo.setItemText(pos, url) + self.recent_urls[pos] = url + self.source = self.URL + self.start() + def __initRecentItemsModel(self): if self.currentPath is not None and \ not os.path.isdir(self.currentPath): @@ -336,7 +391,8 @@ def setCurrentPath(self, path): """ if self.currentPath is not None and 
path is not None and \ os.path.isdir(self.currentPath) and os.path.isdir(path) and \ - os.path.samefile(self.currentPath, path): + os.path.samefile(self.currentPath, path) and \ + self.source == self.LOCAL_FILE: return True success = True @@ -370,7 +426,7 @@ def setCurrentPath(self, path): if self.__state == State.Processing: self.cancel() - + self.source = self.LOCAL_FILE return success def addRecentPath(self, path): @@ -447,7 +503,7 @@ def reload(self): """ if self.__state == State.Processing: self.cancel() - + self.source = self.LOCAL_FILE self.corpus = None self.start() @@ -460,7 +516,9 @@ def start(self): self.progress_widget.setValue(0) self.__invalidated = False - if self.currentPath is None: + startdir = self.currentPath if self.source == self.LOCAL_FILE \ + else self.url_combo.currentText().strip() + if not startdir: return if self.__state == State.Processing: @@ -470,14 +528,13 @@ def start(self): .format(self.__pendingTask.startdir)) self.cancel() - startdir = self.currentPath - self.__setRuntimeState(State.Processing) report_progress = methodinvoke( self, "__onReportProgress", (object,)) - task = ImportDocuments(startdir, report_progress=report_progress) + task = ImportDocuments(startdir, self.source == self.URL, + report_progress=report_progress) # collect the task state in one convenient place self.__pendingTask = taskstate = namespace( diff --git a/requirements.txt b/requirements.txt index ceb496590..50a3ee91c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ numpy python-dateutil<3.0.0 # denpendency for botocore gensim>=0.12.3 # LDA's show topics unified in 0.12.3 setuptools-git -Orange3 >=3.25.0 +Orange3 >=3.28.0 tweepy beautifulsoup4 simhash >=1.11