From 48e08f815abdea6e133122950b5bee2efdab362f Mon Sep 17 00:00:00 2001
From: Vesna Tanko
Date: Mon, 29 Mar 2021 09:12:14 +0200
Subject: [PATCH] Import Documents: Add URL reader

---
NOTE(review): the line structure of this patch was reconstructed from a
whitespace-collapsed copy. Hunk line counts match the @@ headers, but blank
context lines and leading whitespace of context lines are inferred --
verify against blob 4d2343683 before applying.

 orangecontrib/text/import_documents.py        | 75 +++++++++++++---
 .../text/tests/test_import_documents.py       | 85 +++++++++++++++++++
 2 files changed, 149 insertions(+), 11 deletions(-)
 create mode 100644 orangecontrib/text/tests/test_import_documents.py

diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py
index 4d2343683..58820fae6 100644
--- a/orangecontrib/text/import_documents.py
+++ b/orangecontrib/text/import_documents.py
@@ -1,11 +1,15 @@
+import contextlib
 import fnmatch
 import logging
 import os
 import pathlib
 import re
+from urllib.parse import quote
 
 from collections import namedtuple
+from tempfile import NamedTemporaryFile
 from types import SimpleNamespace as namespace
+from typing import List, Tuple, Callable
 from unicodedata import normalize
 
 import numpy as np
@@ -20,13 +24,14 @@
 from pdfminer.layout import LAParams, LTTextBox, LTTextLine
 from bs4 import BeautifulSoup
 
+import serverfiles
+
 from Orange.data import DiscreteVariable, Domain, StringVariable
-from Orange.data.io import detect_encoding
+from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader
 from Orange.util import Registry
 
 from orangecontrib.text.corpus import Corpus
 
-
 DefaultFormats = ("docx", "odt", "txt", "pdf", "xml")
 
 TextData = namedtuple(
@@ -156,19 +161,53 @@ def read_file(self):
         self.content = soup.get_text()
 
 
+class UrlReader(Reader, CoreUrlReader):
+    ext = [".url"]
+
+    def __init__(self, path, *args):
+        CoreUrlReader.__init__(self, path)
+        Reader.__init__(self, self.filename, *args)
+
+    def read_file(self):
+        path, name = os.path.split(self.filename)
+        self.filename = os.path.join(path, quote(name))
+        self.filename = self._trim(self._resolve_redirects(self.filename))
+        with contextlib.closing(self.urlopen(self.filename)) as response:
+            name = self._suggest_filename(
+                response.headers["content-disposition"])
+            extension = "".join(pathlib.Path(name).suffixes)
+            with NamedTemporaryFile(suffix=extension, delete=False) as f:
+                f.write(response.read())
+            reader = Reader.get_reader(f.name)
+            reader.read_file()
+            self.content = reader.content
+            os.remove(f.name)
+
+    def make_text_data(self):
+        text_data = super().make_text_data()
+        ext = pathlib.Path(self.path).suffix
+        return TextData(text_data.name, text_data.path, [ext],
+                        text_data.category, text_data.content)
+
+
 class ImportDocuments:
-    def __init__(self, startdir, formats=DefaultFormats, report_progress=None):
+    def __init__(self, startdir: str,
+                 is_url: bool = False,
+                 formats: Tuple[str] = DefaultFormats,
+                 report_progress: Callable = None):
         self.startdir = startdir
         self.formats = formats
         self._report_progress = report_progress
         self.cancelled = False
         self._text_data = []
+        self._is_url = is_url
 
-    def run(self):
+    def run(self) -> Tuple[Corpus, List]:
         text_data = []
         errors = []
         patterns = ["*.{}".format(fmt.lower()) for fmt in self.formats]
-        paths = self.scan(self.startdir, include_patterns=patterns)
+        scan = self.scan_url if self._is_url else self.scan
+        paths = scan(self.startdir, include_patterns=patterns)
         n_paths = len(paths)
         batch = []
 
@@ -183,7 +222,8 @@ def run(self):
                               batch=batch))
                 batch = []
 
-            reader = Reader.get_reader(path)
+            reader = Reader.get_reader(path) if not self._is_url \
+                else UrlReader(path)
             text, error = reader.read()
             if text is not None:
                 text_data.append(text)
@@ -197,7 +237,7 @@ def run(self):
         self._text_data = text_data
         return self._create_corpus(), errors
 
-    def _create_corpus(self):
+    def _create_corpus(self) -> Corpus:
         corpus = None
         names = ["name", "path", "content"]
         data = []
@@ -258,10 +298,6 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
         if include_patterns is None:
             include_patterns = ["*"]
 
-        def matches_any(fname, patterns):
-            return any(fnmatch.fnmatch(fname.lower(), pattern)
-                       for pattern in patterns)
-
         paths = []
 
         for dirpath, dirnames, filenames in os.walk(topdir):
@@ -275,3 +311,20 @@ def matches_any(fname, patterns):
                             and not matches_any(fname, exclude_patterns)]
             paths = paths + [os.path.join(dirpath, fname) for fname in filenames]
         return paths
+
+    @staticmethod
+    def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",),
+                 exclude_patterns: Tuple[str] = (".*",)) -> List[str]:
+        include_patterns = include_patterns or ("*",)
+        paths = []
+        for filenames in serverfiles.ServerFiles(topdir).listfiles():
+            path = os.path.join(topdir, os.path.join(*filenames))
+            if matches_any(path, include_patterns) and \
+                    not matches_any(path, exclude_patterns):
+                paths.append(path)
+        return paths
+
+
+def matches_any(fname: str, patterns: Tuple[str]) -> bool:
+    return any(fnmatch.fnmatch(fname.lower(), pattern)
+               for pattern in patterns)
diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py
new file mode 100644
index 000000000..9131029b5
--- /dev/null
+++ b/orangecontrib/text/tests/test_import_documents.py
@@ -0,0 +1,85 @@
+import unittest
+
+from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \
+    TxtReader, TextData
+
+
+class TestUrlReader(unittest.TestCase):
+    def test_init(self):
+        path = "http://dummy.server.com/data/foo.txt"
+        reader = UrlReader(path)
+        self.assertEqual(reader.filename, path)
+        self.assertEqual(reader.path, path)
+
+    def test_get_reader(self):
+        path = "http://dummy.server.com/data/foo.txt"
+        reader = UrlReader.get_reader(path)
+        self.assertIsInstance(reader, TxtReader)
+
+    def test_read(self):
+        path = "http://file.biolab.si/text-semantics/data/semeval/C-1.txt"
+        reader = UrlReader(path)
+        textdata, error = reader.read()
+        self.assertIsInstance(textdata, TextData)
+        self.assertEqual(textdata.name, "C-1")
+        self.assertEqual(textdata.path, path)
+        self.assertEqual(textdata.ext, [".txt"])
+        self.assertEqual(textdata.category, "semeval")
+        self.assertTrue(textdata.content.startswith("On The Complexity of Co"))
+        self.assertEqual(error, "")
+
+    def test_read_file(self):
+        path = "http://file.biolab.si/text-semantics/data/elektrotehniski-" \
+               "vestnik-clanki/detektiranje-utrdb-v-šahu-.txt"
+        reader = UrlReader(path)
+        reader.read_file()
+        self.assertIsInstance(reader.content, str)
+
+    def test_name_text_data(self):
+        path = "http://dummy.server.com/data/foo.txt"
+        reader = UrlReader(path)
+        reader.content = "text"
+        text_data = reader.make_text_data()
+        self.assertIsInstance(text_data, TextData)
+        self.assertEqual(text_data.name, "foo")
+        self.assertEqual(text_data.path, path)
+        self.assertEqual(text_data.ext, [".txt"])
+        self.assertEqual(text_data.category, "data")
+        self.assertEqual(text_data.content, "text")
+
+
+class TestImportDocuments(unittest.TestCase):
+    def test_scan_url(self):
+        path = "http://file.biolab.si/text-semantics/data/semeval/"
+        importer = ImportDocuments(path, True)
+        paths = importer.scan_url(path)
+        print(paths)
+
+    def test_scan_url_txt(self):
+        path = "http://file.biolab.si/text-semantics/data/semeval/"
+        importer = ImportDocuments(path, True)
+        paths = importer.scan_url(path, include_patterns=["*.txt"])
+        print(paths)
+
+    def test_scan_url_csv(self):
+        path = "http://file.biolab.si/text-semantics/data/"
+        importer = ImportDocuments(path, True)
+        paths = importer.scan_url(path, include_patterns=["*.csv"])
+        print(paths)
+
+    def test_run_url(self):
+        path = "http://file.biolab.si/text-semantics/data/semeval/"
+        importer = ImportDocuments(path, True)
+        res, err = importer.run()
+        print(res)
+
+    def test_run_url_metadata(self):
+        path = "http://file.biolab.si/text-semantics/data/semeval/"
+        importer = ImportDocuments(path, True, formats=["csv"])
+        res, err = importer.run()
+        print(res)
+        print(err)
+
+
+if __name__ == "__main__":
+    unittest.main()