Skip to content

Commit

Permalink
Import Documents: Add URL reader
Browse files Browse the repository at this point in the history
  • Loading branch information
VesnaT committed Apr 28, 2021
1 parent 25fe179 commit 48e08f8
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 11 deletions.
75 changes: 64 additions & 11 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import contextlib
import fnmatch
import logging
import os
import pathlib
import re
from urllib.parse import quote

from collections import namedtuple
from tempfile import NamedTemporaryFile
from types import SimpleNamespace as namespace
from typing import List, Tuple, Callable
from unicodedata import normalize

import numpy as np
Expand All @@ -20,13 +24,14 @@
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from bs4 import BeautifulSoup

import serverfiles

from Orange.data import DiscreteVariable, Domain, StringVariable
from Orange.data.io import detect_encoding
from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader
from Orange.util import Registry

from orangecontrib.text.corpus import Corpus


DefaultFormats = ("docx", "odt", "txt", "pdf", "xml")

TextData = namedtuple(
Expand Down Expand Up @@ -156,19 +161,53 @@ def read_file(self):
self.content = soup.get_text()


class UrlReader(Reader, CoreUrlReader):
    """Reader for documents addressed by a URL instead of a local path.

    Combines the import-documents ``Reader`` interface with Orange's
    ``CoreUrlReader``: the remote document is downloaded into a temporary
    file, which is then dispatched to the file-based reader matching its
    extension.
    """

    ext = [".url"]

    def __init__(self, path, *args):
        # CoreUrlReader.__init__ stores the URL as self.filename, which
        # Reader.__init__ then records as this document's path.
        CoreUrlReader.__init__(self, path)
        Reader.__init__(self, self.filename, *args)

    def read_file(self):
        """Download the URL and extract its text into ``self.content``."""
        # Percent-quote only the final name component so non-ASCII file
        # names survive the HTTP request; the scheme/host part is kept.
        path, name = os.path.split(self.filename)
        self.filename = os.path.join(path, quote(name))
        self.filename = self._trim(self._resolve_redirects(self.filename))
        with contextlib.closing(self.urlopen(self.filename)) as response:
            name = self._suggest_filename(
                response.headers["content-disposition"])
            extension = "".join(pathlib.Path(name).suffixes)
            # delete=False so the dispatched reader can reopen the file
            # by name (required on Windows while the handle is closed).
            with NamedTemporaryFile(suffix=extension, delete=False) as f:
                f.write(response.read())
            try:
                # Delegate to the reader registered for this extension.
                reader = Reader.get_reader(f.name)
                reader.read_file()
                self.content = reader.content
            finally:
                # Remove the temp file even if reading fails; previously
                # an exception here leaked the downloaded file.
                os.remove(f.name)

    def make_text_data(self):
        """Build TextData with the extension taken from the original URL
        rather than from the temporary download file."""
        text_data = super().make_text_data()
        ext = pathlib.Path(self.path).suffix
        return TextData(text_data.name, text_data.path, [ext],
                        text_data.category, text_data.content)


class ImportDocuments:
def __init__(self, startdir, formats=DefaultFormats, report_progress=None):
def __init__(self, startdir: str,
             is_url: bool = False,
             formats: Tuple[str] = DefaultFormats,
             report_progress: Callable = None):
    """Collect text documents from a directory tree or a remote URL.

    :param startdir: root directory to scan, or a base URL when
        ``is_url`` is True.
    :param is_url: when True, ``run`` scans ``startdir`` via ``scan_url``
        and reads each document with ``UrlReader``.
    :param formats: file extensions (without dot) to include.
    :param report_progress: optional callback invoked with progress
        state during ``run``; may be None.
    """
    self.startdir = startdir
    self.formats = formats
    self._report_progress = report_progress
    # Set by callers to abort a running import loop.
    self.cancelled = False
    # Populated by run(); list of TextData for successfully read files.
    self._text_data = []
    self._is_url = is_url

def run(self):
def run(self) -> Tuple[Corpus, List]:
text_data = []
errors = []
patterns = ["*.{}".format(fmt.lower()) for fmt in self.formats]
paths = self.scan(self.startdir, include_patterns=patterns)
scan = self.scan_url if self._is_url else self.scan
paths = scan(self.startdir, include_patterns=patterns)
n_paths = len(paths)
batch = []

Expand All @@ -183,7 +222,8 @@ def run(self):
batch=batch))
batch = []

reader = Reader.get_reader(path)
reader = Reader.get_reader(path) if not self._is_url \
else UrlReader(path)
text, error = reader.read()
if text is not None:
text_data.append(text)
Expand All @@ -197,7 +237,7 @@ def run(self):
self._text_data = text_data
return self._create_corpus(), errors

def _create_corpus(self):
def _create_corpus(self) -> Corpus:
corpus = None
names = ["name", "path", "content"]
data = []
Expand Down Expand Up @@ -258,10 +298,6 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
if include_patterns is None:
include_patterns = ["*"]

def matches_any(fname, patterns):
return any(fnmatch.fnmatch(fname.lower(), pattern)
for pattern in patterns)

paths = []

for dirpath, dirnames, filenames in os.walk(topdir):
Expand All @@ -275,3 +311,20 @@ def matches_any(fname, patterns):
and not matches_any(fname, exclude_patterns)]
paths = paths + [os.path.join(dirpath, fname) for fname in filenames]
return paths

@staticmethod
def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",),
             exclude_patterns: Tuple[str] = (".*",)) -> List[str]:
    """Return URLs of files listed under the remote directory *topdir*.

    Entries are filtered with glob-style *include_patterns* and
    *exclude_patterns* (matched case-insensitively via ``matches_any``).
    """
    if not include_patterns:
        include_patterns = ("*",)
    remote_listing = serverfiles.ServerFiles(topdir).listfiles()
    # Each listing entry is a tuple of path components relative to topdir.
    candidates = (os.path.join(topdir, *components)
                  for components in remote_listing)
    return [url for url in candidates
            if matches_any(url, include_patterns)
            and not matches_any(url, exclude_patterns)]


def matches_any(fname: str, patterns: Tuple[str]) -> bool:
    """Return True if the lower-cased *fname* matches at least one
    glob-style pattern in *patterns* (``fnmatch`` semantics)."""
    lowered = fname.lower()
    for pattern in patterns:
        if fnmatch.fnmatch(lowered, pattern):
            return True
    return False
85 changes: 85 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import unittest

from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \
TxtReader, TextData


class TestUrlReader(unittest.TestCase):
    """Exercises UrlReader; the read tests fetch real documents from
    file.biolab.si and therefore need network access."""

    def test_init(self):
        url = "http://dummy.server.com/data/foo.txt"
        url_reader = UrlReader(url)
        # Both the core-reader filename and the Reader path keep the URL.
        self.assertEqual(url_reader.filename, url)
        self.assertEqual(url_reader.path, url)

    def test_get_reader(self):
        url = "http://dummy.server.com/data/foo.txt"
        # Dispatch by extension works for URL paths too.
        self.assertIsInstance(UrlReader.get_reader(url), TxtReader)

    def test_read(self):
        url = "http://file.biolab.si/text-semantics/data/semeval/C-1.txt"
        textdata, error = UrlReader(url).read()
        self.assertIsInstance(textdata, TextData)
        self.assertEqual(textdata.name, "C-1")
        self.assertEqual(textdata.path, url)
        self.assertEqual(textdata.ext, [".txt"])
        self.assertEqual(textdata.category, "semeval")
        self.assertTrue(textdata.content.startswith("On The Complexity of Co"))
        self.assertEqual(error, "")

    def test_read_file(self):
        # Non-ASCII characters in the remote file name must be handled.
        url = "http://file.biolab.si/text-semantics/data/elektrotehniski-" \
              "vestnik-clanki/detektiranje-utrdb-v-šahu-.txt"
        url_reader = UrlReader(url)
        url_reader.read_file()
        self.assertIsInstance(url_reader.content, str)

    def test_name_text_data(self):
        url = "http://dummy.server.com/data/foo.txt"
        url_reader = UrlReader(url)
        url_reader.content = "text"
        text_data = url_reader.make_text_data()
        self.assertIsInstance(text_data, TextData)
        self.assertEqual(text_data.name, "foo")
        self.assertEqual(text_data.path, url)
        self.assertEqual(text_data.ext, [".txt"])
        self.assertEqual(text_data.category, "data")
        self.assertEqual(text_data.content, "text")


class TestImportDocuments(unittest.TestCase):
    """Network-dependent tests for URL-mode ImportDocuments
    (fetches listings and documents from file.biolab.si).

    The original versions only ``print``-ed results and could never
    fail; they now assert on the returned values.
    """

    def test_scan_url(self):
        path = "http://file.biolab.si/text-semantics/data/semeval/"
        importer = ImportDocuments(path, True)
        paths = importer.scan_url(path)
        # The remote directory is non-empty and every entry is under it.
        self.assertTrue(paths)
        self.assertTrue(all(p.startswith(path) for p in paths))

    def test_scan_url_txt(self):
        path = "http://file.biolab.si/text-semantics/data/semeval/"
        importer = ImportDocuments(path, True)
        paths = importer.scan_url(path, include_patterns=["*.txt"])
        self.assertTrue(paths)
        # Only .txt entries survive the include filter.
        self.assertTrue(all(p.lower().endswith(".txt") for p in paths))

    def test_scan_url_csv(self):
        path = "http://file.biolab.si/text-semantics/data/"
        importer = ImportDocuments(path, True)
        paths = importer.scan_url(path, include_patterns=["*.csv"])
        self.assertTrue(all(p.lower().endswith(".csv") for p in paths))

    def test_run_url(self):
        path = "http://file.biolab.si/text-semantics/data/semeval/"
        importer = ImportDocuments(path, True)
        corpus, errors = importer.run()
        # The semeval directory contains .txt documents (see
        # TestUrlReader.test_read), so a corpus must be produced.
        self.assertIsNotNone(corpus)
        self.assertIsInstance(errors, list)

    def test_run_url_metadata(self):
        path = "http://file.biolab.si/text-semantics/data/semeval/"
        importer = ImportDocuments(path, True, formats=["csv"])
        corpus, errors = importer.run()
        # run() must complete and report errors as a list even when the
        # format filter matches few or no documents.
        self.assertIsInstance(errors, list)


if __name__ == "__main__":
unittest.main()

0 comments on commit 48e08f8

Please sign in to comment.