Merge pull request #637 from VesnaT/import_documents_url
[ENH] Import documents: Import from URL
PrimozGodec authored Apr 28, 2021
2 parents 25fe179 + f1d0dcf commit f2d7b44
Showing 4 changed files with 361 additions and 31 deletions.
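
For orientation before the diff: the change adds an is_url switch to ImportDocuments, so a remote folder can be imported much like a local one. A minimal sketch of the new entry point (the biolab URL is the one exercised in the new tests below):

from orangecontrib.text.import_documents import ImportDocuments

importer = ImportDocuments(
    "http://file.biolab.si/text-semantics/data/semeval/",  # remote folder
    is_url=True,  # scan the folder over HTTP instead of os.walk
)
corpus, errors = importer.run()  # a Corpus plus a list of per-file error messages
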
158 changes: 145 additions & 13 deletions orangecontrib/text/import_documents.py
@@ -1,14 +1,21 @@
import contextlib
import fnmatch
import logging
import os
import pathlib
import re
import yaml
from urllib.parse import quote
from requests.exceptions import ConnectionError

from collections import namedtuple
from tempfile import NamedTemporaryFile
from types import SimpleNamespace as namespace
from typing import List, Tuple, Callable
from unicodedata import normalize

import numpy as np
import pandas as pd

import docx2txt
from odf.opendocument import load
@@ -20,13 +27,15 @@
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from bs4 import BeautifulSoup

import serverfiles

from Orange.data import DiscreteVariable, Domain, StringVariable
from Orange.data.io import detect_encoding
from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader
from Orange.data.util import get_unique_names
from Orange.util import Registry

from orangecontrib.text.corpus import Corpus


DefaultFormats = ("docx", "odt", "txt", "pdf", "xml")

TextData = namedtuple(
@@ -156,19 +165,81 @@ def read_file(self):
self.content = soup.get_text()


class CsvMetaReader(Reader):
ext = [".csv"]

def read_file(self):
self.content = pd.read_csv(self.path)


class YamlMetaReader(Reader):
ext = [".yaml"]

def read_file(self):
with open(self.path, "r") as f:
self.content = yaml.safe_load(f)


class UrlReader(Reader, CoreUrlReader):
ext = [".url"]

def __init__(self, path, *args):
CoreUrlReader.__init__(self, path)
Reader.__init__(self, self.filename, *args)

def read_file(self):
self.filename = quote(self.filename, safe="/:")
self.filename = self._trim(self._resolve_redirects(self.filename))
with contextlib.closing(self.urlopen(self.filename)) as response:
name = self._suggest_filename(
response.headers["content-disposition"])
extension = "".join(pathlib.Path(name).suffixes)
with NamedTemporaryFile(suffix=extension, delete=False) as f:
f.write(response.read())
reader = Reader.get_reader(f.name)
reader.read_file()
self.content = reader.content
os.remove(f.name)

def make_text_data(self):
text_data = super().make_text_data()
ext = pathlib.Path(self.path).suffix
return TextData(text_data.name, text_data.path, [ext],
text_data.category, text_data.content)
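
Taken together, UrlReader quotes the URL, streams the resource into a temporary file, and hands that file to whichever Reader matches its suffix. Mirroring test_read below, a sketch of the round trip:

path = "http://file.biolab.si/text-semantics/data/semeval/C-1.txt"
reader = UrlReader(path)
text_data, error = reader.read()  # download, dispatch to TxtReader, parse
# on success: text_data.name == "C-1", text_data.ext == [".txt"], error == ""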


class ImportDocuments:
def __init__(self, startdir, formats=DefaultFormats, report_progress=None):
META_DATA_FILE_KEY = "Text file"

def __init__(self, startdir: str,
is_url: bool = False,
formats: Tuple[str] = DefaultFormats,
report_progress: Callable = None):
if is_url and not startdir.endswith("/"):
startdir += "/"
elif not is_url:
startdir = os.path.join(startdir, "")
self.startdir = startdir
self.formats = formats
self._report_progress = report_progress
self.cancelled = False
self._is_url = is_url
self._text_data = []
self._meta_data: pd.DataFrame = None

def run(self) -> Tuple[Corpus, List]:
self._text_data, errors_text = self._read_text_data()
self._meta_data, errors_meta = self._read_meta_data()
corpus = self._create_corpus()
corpus = self._add_metadata(corpus)
return corpus, errors_text + errors_meta

def run(self):
def _read_text_data(self):
text_data = []
errors = []
patterns = ["*.{}".format(fmt.lower()) for fmt in self.formats]
paths = self.scan(self.startdir, include_patterns=patterns)
scan = self.scan_url if self._is_url else self.scan
paths = scan(self.startdir, include_patterns=patterns)
n_paths = len(paths)
batch = []

@@ -183,7 +254,8 @@ def run(self):
batch=batch))
batch = []

reader = Reader.get_reader(path)
reader = Reader.get_reader(path) if not self._is_url \
else UrlReader(path)
text, error = reader.read()
if text is not None:
text_data.append(text)
@@ -194,10 +266,31 @@
if self.cancelled:
return

self._text_data = text_data
return self._create_corpus(), errors
return text_data, errors

def _create_corpus(self):
def _read_meta_data(self):
scan = self.scan_url if self._is_url else self.scan
patterns = ["*.csv", "*.yaml", "*.yml"]
paths = scan(self.startdir, include_patterns=patterns)
meta_dfs, errors = [], []
for path in paths:
reader = Reader.get_reader(path) if not self._is_url \
else UrlReader(path)
data, error = reader.read()
if data is not None:
content = data.content
if isinstance(content, dict):
content = pd.DataFrame(content, index=[0])
meta_dfs.append(content)
else:
errors.append(error)

if self.cancelled:
return

return pd.concat(meta_dfs) if meta_dfs else None, errors
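
The dict branch above exists because YamlMetaReader yields a mapping while CsvMetaReader yields a DataFrame; the mapping is flattened into a one-row frame so both can be concatenated. A sketch (both column names appear in the semeval metadata used by the tests; the values are made up):

import pandas as pd

yaml_content = {"Text file": "C-1.txt", "Keywords": "complexity"}  # hypothetical values
df = pd.DataFrame(yaml_content, index=[0])  # one row, columns = mapping keys
meta = pd.concat([df])  # stacked with the frames read from any .csv files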

def _create_corpus(self) -> Corpus:
corpus = None
names = ["name", "path", "content"]
data = []
@@ -237,6 +330,27 @@ def _create_corpus(self):

return corpus

def _add_metadata(self, corpus: Corpus) -> Corpus:
if "path" not in corpus.domain or self._meta_data is None \
or self.META_DATA_FILE_KEY not in self._meta_data.columns:
return corpus

df = self._meta_data.set_index(
self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
)
path_column = corpus.get_column_view("path")[0]
if len(df.index.drop_duplicates()) != len(df.index):
df = df[~df.index.duplicated(keep='first')]
filtered = df.reindex(path_column)
for column in filtered.columns:
corpus = corpus.add_column(
StringVariable(get_unique_names(corpus.domain, column)),
filtered[column].to_numpy(),
to_metas=True
)

return corpus
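
_add_metadata therefore joins on absolute location: the metadata frame is indexed by startdir plus the Text file column, de-duplicated, and reindexed against the corpus's path column so each document picks up its row (or NaN). Condensed to the pandas core, with made-up values:

import pandas as pd

startdir = "http://file.biolab.si/text-semantics/data/semeval/"
meta = pd.DataFrame({"Text file": ["C-1.txt"], "Keywords": ["complexity"]})
df = meta.set_index(startdir + meta["Text file"])
df = df[~df.index.duplicated(keep="first")]  # first row wins on duplicate paths
aligned = df.reindex([startdir + "C-1.txt"])  # one aligned row per document path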

@staticmethod
def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
"""
@@ -258,10 +372,6 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
if include_patterns is None:
include_patterns = ["*"]

def matches_any(fname, patterns):
return any(fnmatch.fnmatch(fname.lower(), pattern)
for pattern in patterns)

paths = []

for dirpath, dirnames, filenames in os.walk(topdir):
@@ -275,3 +385,25 @@ def matches_any(fname, patterns):
and not matches_any(fname, exclude_patterns)]
paths = paths + [os.path.join(dirpath, fname) for fname in filenames]
return paths

@staticmethod
def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",),
exclude_patterns: Tuple[str] = (".*",)) -> List[str]:
try:
files = serverfiles.ServerFiles(topdir).listfiles()
except ConnectionError:
return []

include_patterns = include_patterns or ("*",)
paths = []
for filename in files:
path = topdir + "/".join(filename)
if matches_any(path, include_patterns) and \
not matches_any(path, exclude_patterns):
paths.append(path)
return paths


def matches_any(fname: str, patterns: Tuple[str]) -> bool:
return any(fnmatch.fnmatch(fname.lower(), pattern)
for pattern in patterns)
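
scan_url mirrors the local scan: serverfiles.ServerFiles(topdir).listfiles() returns each remote file as a tuple of path segments, hence the "/".join(filename) when the full URL is rebuilt. The filtering itself is plain fnmatch; a sketch under that assumption about the listing shape:

import fnmatch

topdir = "http://file.biolab.si/text-semantics/data/semeval/"
files = [("C-1.txt",), ("metadata.yaml",)]  # assumed shape of a listfiles() result
paths = [topdir + "/".join(f) for f in files
         if fnmatch.fnmatch((topdir + "/".join(f)).lower(), "*.txt")]
# paths == ["http://file.biolab.si/text-semantics/data/semeval/C-1.txt"]
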
141 changes: 141 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
@@ -0,0 +1,141 @@
import unittest
from unittest.mock import patch

import numpy as np
import pandas as pd

from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \
TxtReader, TextData


class TestUrlReader(unittest.TestCase):
def test_init(self):
path = "http://dummy.server.com/data/foo.txt"
reader = UrlReader(path)
self.assertEqual(reader.filename, path)
self.assertEqual(reader.path, path)

def test_get_reader(self):
path = "http://dummy.server.com/data/foo.txt"
reader = UrlReader.get_reader(path)
self.assertIsInstance(reader, TxtReader)

def test_read(self):
path = "http://file.biolab.si/text-semantics/data/semeval/C-1.txt"
reader = UrlReader(path)
textdata, error = reader.read()
self.assertIsInstance(textdata, TextData)
self.assertEqual(textdata.name, "C-1")
self.assertEqual(textdata.path, path)
self.assertEqual(textdata.ext, [".txt"])
self.assertEqual(textdata.category, "semeval")
self.assertTrue(textdata.content.startswith("On The Complexity of Co"))
self.assertEqual(error, "")

def test_read_file(self):
path = "http://file.biolab.si/text-semantics/data/elektrotehniski-" \
"vestnik-clanki/detektiranje-utrdb-v-šahu-.txt"
reader = UrlReader(path)
reader.read_file()
self.assertIsInstance(reader.content, str)

def test_name_text_data(self):
path = "http://dummy.server.com/data/foo.txt"
reader = UrlReader(path)
reader.content = "text"
text_data = reader.make_text_data()
self.assertIsInstance(text_data, TextData)
self.assertEqual(text_data.name, "foo")
self.assertEqual(text_data.path, path)
self.assertEqual(text_data.ext, [".txt"])
self.assertEqual(text_data.category, "data")
self.assertEqual(text_data.content, "text")


class TestImportDocuments(unittest.TestCase):
def test_scan_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path)
self.assertGreater(len(paths), 0)

def test_scan_url_txt(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path, include_patterns=["*.txt"])
self.assertGreater(len(paths), 0)

def test_scan_url_csv(self):
path = "http://file.biolab.si/text-semantics/data/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path, include_patterns=["*.csv"])
self.assertGreater(len(paths), 0)

def test_read_meta_data_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
data1, err = importer._read_meta_data()
self.assertIsInstance(data1, pd.DataFrame)
self.assertEqual(len(err), 0)

# @patch("orangecontrib.text.import_documents.ImportDocuments."
# "META_DATA_FILE_KEY", "File")
def test_merge_metadata_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
text_data, _ = importer._read_text_data()
meta_data, _ = importer._read_meta_data()

importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18'
importer._meta_data = meta_data[:50]
corpus = importer._create_corpus()
corpus = importer._add_metadata(corpus)
self.assertGreater(len(corpus), 0)
columns = ["name", "path", "content", "Content",
"Text file", "Keywords"]
self.assertEqual([v.name for v in corpus.domain.metas], columns)

importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18'
importer._meta_data = None
corpus = importer._create_corpus()
corpus = importer._add_metadata(corpus)
self.assertGreater(len(corpus), 0)
columns = ["name", "path", "content"]
self.assertEqual([v.name for v in corpus.domain.metas], columns)

def test_run_url(self):
path = "http://file.biolab.si/text-semantics/data" \
"/predlogi-vladi-sample/"
importer = ImportDocuments(path, True)
corpus1, _ = importer.run()
self.assertGreater(len(corpus1), 0)

mask = np.ones_like(corpus1.metas, dtype=bool)
mask[:, 1] = False

path = "http://file.biolab.si/text-semantics/data" \
"/predlogi-vladi-sample////"
importer = ImportDocuments(path, True)
corpus2, _ = importer.run()
self.assertGreater(len(corpus1), 0)
self.assertEqual(corpus1.metas[mask].tolist(),
corpus2.metas[mask].tolist())

path = "http://file.biolab.si/text-semantics/data" \
"/predlogi-vladi-sample"
importer = ImportDocuments(path, True)
corpus3, _ = importer.run()
self.assertGreater(len(corpus2), 0)
self.assertEqual(corpus1.metas[mask].tolist(),
corpus3.metas[mask].tolist())

def test_run_url_special_characters(self):
path = "http://file.biolab.si/text-semantics/data/" \
"elektrotehniski-vestnik-clanki/"
importer = ImportDocuments(path, True)
corpus, errors = importer.run()
self.assertGreater(len(corpus), 0)


if __name__ == "__main__":
unittest.main()
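
Note that most of these tests fetch live data from file.biolab.si, so they need network access; the suite runs with the stock runner, e.g. python -m unittest orangecontrib.text.tests.test_import_documents.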