[ENH] Import documents: Import from URL #637

Merged: 3 commits, Apr 28, 2021
158 changes: 145 additions & 13 deletions orangecontrib/text/import_documents.py
@@ -1,14 +1,21 @@
import contextlib
import fnmatch
import logging
import os
import pathlib
import re
import yaml
from urllib.parse import quote
from requests.exceptions import ConnectionError

from collections import namedtuple
from tempfile import NamedTemporaryFile
from types import SimpleNamespace as namespace
from typing import List, Tuple, Callable
from unicodedata import normalize

import numpy as np
import pandas as pd

import docx2txt
from odf.opendocument import load
@@ -20,13 +27,15 @@
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from bs4 import BeautifulSoup

import serverfiles

from Orange.data import DiscreteVariable, Domain, StringVariable
from Orange.data.io import detect_encoding
from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader
from Orange.data.util import get_unique_names
from Orange.util import Registry

from orangecontrib.text.corpus import Corpus


DefaultFormats = ("docx", "odt", "txt", "pdf", "xml")
Collaborator:
Should .yaml also be specified here?

Contributor (author):
No, it should not. These are document formats: each file in these formats represents a single document. Metadata is handled differently, since it appends data to each document; its formats are hardcoded in _read_meta_data().
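
To make the distinction concrete, here is a minimal sketch of how the two kinds of files would sit side by side. The folder layout and the Keywords column are hypothetical examples; the "Text file" column name matches META_DATA_FILE_KEY introduced in this PR.

# Hypothetical local folder, for illustration only (not part of this PR):
#   reviews/
#       good.txt    document file, matched by DefaultFormats
#       bad.txt     document file
#       meta.csv    metadata file, picked up by _read_meta_data()
#
# meta.csv -- its "Text file" column matches ImportDocuments.META_DATA_FILE_KEY:
#   Text file,Keywords
#   good.txt,positive
#   bad.txt,negative

from orangecontrib.text.import_documents import ImportDocuments

importer = ImportDocuments("reviews/")
corpus, errors = importer.run()
# corpus.domain.metas now holds name, path and content plus the CSV columns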


TextData = namedtuple(
@@ -156,19 +165,81 @@ def read_file(self):
self.content = soup.get_text()


class CsvMetaReader(Reader):
ext = [".csv"]

def read_file(self):
self.content = pd.read_csv(self.path)


class YamlMetaReader(Reader):
ext = [".yaml"]

def read_file(self):
with open(self.path, "r") as f:
self.content = yaml.safe_load(f)


class UrlReader(Reader, CoreUrlReader):
ext = [".url"]

def __init__(self, path, *args):
CoreUrlReader.__init__(self, path)
Reader.__init__(self, self.filename, *args)

def read_file(self):
self.filename = quote(self.filename, safe="/:")
self.filename = self._trim(self._resolve_redirects(self.filename))
with contextlib.closing(self.urlopen(self.filename)) as response:
name = self._suggest_filename(
response.headers["content-disposition"])
extension = "".join(pathlib.Path(name).suffixes)
with NamedTemporaryFile(suffix=extension, delete=False) as f:
f.write(response.read())
reader = Reader.get_reader(f.name)
reader.read_file()
self.content = reader.content
os.remove(f.name)

def make_text_data(self):
text_data = super().make_text_data()
ext = pathlib.Path(self.path).suffix
return TextData(text_data.name, text_data.path, [ext],
text_data.category, text_data.content)


class ImportDocuments:
def __init__(self, startdir, formats=DefaultFormats, report_progress=None):
META_DATA_FILE_KEY = "Text file"

def __init__(self, startdir: str,
is_url: bool = False,
formats: Tuple[str] = DefaultFormats,
report_progress: Callable = None):
if is_url and not startdir.endswith("/"):
startdir += "/"
elif not is_url:
startdir = os.path.join(startdir, "")
self.startdir = startdir
self.formats = formats
self._report_progress = report_progress
self.cancelled = False
self._is_url = is_url
self._text_data = []
self._meta_data: pd.DataFrame = None

def run(self) -> Tuple[Corpus, List]:
self._text_data, errors_text = self._read_text_data()
self._meta_data, errors_meta = self._read_meta_data()
corpus = self._create_corpus()
corpus = self._add_metadata(corpus)
return corpus, errors_text + errors_meta

def run(self):
def _read_text_data(self):
text_data = []
errors = []
patterns = ["*.{}".format(fmt.lower()) for fmt in self.formats]
paths = self.scan(self.startdir, include_patterns=patterns)
scan = self.scan_url if self._is_url else self.scan
paths = scan(self.startdir, include_patterns=patterns)
n_paths = len(paths)
batch = []

@@ -183,7 +254,8 @@ def run(self):
batch=batch))
batch = []

reader = Reader.get_reader(path)
reader = Reader.get_reader(path) if not self._is_url \
else UrlReader(path)
text, error = reader.read()
if text is not None:
text_data.append(text)
@@ -194,10 +266,31 @@
if self.cancelled:
return

self._text_data = text_data
return self._create_corpus(), errors
return text_data, errors

def _create_corpus(self):
def _read_meta_data(self):
scan = self.scan_url if self._is_url else self.scan
patterns = ["*.csv", "*.yaml", "*.yml"]
paths = scan(self.startdir, include_patterns=patterns)
meta_dfs, errors = [], []
for path in paths:
reader = Reader.get_reader(path) if not self._is_url \
else UrlReader(path)
data, error = reader.read()
if data is not None:
content = data.content
if isinstance(content, dict):
content = pd.DataFrame(content, index=[0])
meta_dfs.append(content)
else:
errors.append(error)

if self.cancelled:
return

return pd.concat(meta_dfs) if meta_dfs else None, errors

def _create_corpus(self) -> Corpus:
corpus = None
names = ["name", "path", "content"]
data = []
@@ -237,6 +330,27 @@ def _create_corpus(self):

return corpus

def _add_metadata(self, corpus: Corpus) -> Corpus:
if "path" not in corpus.domain or self._meta_data is None \
or self.META_DATA_FILE_KEY not in self._meta_data.columns:
return corpus

df = self._meta_data.set_index(
self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
)
path_column = corpus.get_column_view("path")[0]
if len(df.index.drop_duplicates()) != len(df.index):
df = df[~df.index.duplicated(keep='first')]
filtered = df.reindex(path_column)
for column in filtered.columns:
corpus = corpus.add_column(
StringVariable(get_unique_names(corpus.domain, column)),
filtered[column].to_numpy(),
to_metas=True
)

return corpus

@staticmethod
def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
"""
@@ -258,10 +372,6 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
if include_patterns is None:
include_patterns = ["*"]

def matches_any(fname, patterns):
return any(fnmatch.fnmatch(fname.lower(), pattern)
for pattern in patterns)

paths = []

for dirpath, dirnames, filenames in os.walk(topdir):
@@ -275,3 +385,25 @@ def matches_any(fname, patterns):
and not matches_any(fname, exclude_patterns)]
paths = paths + [os.path.join(dirpath, fname) for fname in filenames]
return paths

@staticmethod
def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",),
exclude_patterns: Tuple[str] = (".*",)) -> List[str]:
try:
files = serverfiles.ServerFiles(topdir).listfiles()
except ConnectionError:
return []

include_patterns = include_patterns or ("*",)
paths = []
for filename in files:
path = topdir + "/".join(filename)
if matches_any(path, include_patterns) and \
not matches_any(path, exclude_patterns):
paths.append(path)
return paths


def matches_any(fname: str, patterns: Tuple[str]) -> bool:
return any(fnmatch.fnmatch(fname.lower(), pattern)
for pattern in patterns)
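
For reference, a minimal usage sketch of the URL variant added here; the server URL is the one used in the tests below, and is_url=True routes the importer through scan_url() and UrlReader.

from orangecontrib.text.import_documents import ImportDocuments

path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, is_url=True)
corpus, errors = importer.run()  # each file is fetched with UrlReader; metadata is merged
print(len(corpus), "documents imported,", len(errors), "errors")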
141 changes: 141 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
@@ -0,0 +1,141 @@
import unittest
from unittest.mock import patch

import numpy as np
import pandas as pd

from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \
TxtReader, TextData


class TestUrlReader(unittest.TestCase):
def test_init(self):
path = "http://dummy.server.com/data/foo.txt"
reader = UrlReader(path)
self.assertEqual(reader.filename, path)
self.assertEqual(reader.path, path)

def test_get_reader(self):
path = "http://dummy.server.com/data/foo.txt"
reader = UrlReader.get_reader(path)
self.assertIsInstance(reader, TxtReader)

def test_read(self):
path = "http://file.biolab.si/text-semantics/data/semeval/C-1.txt"
reader = UrlReader(path)
textdata, error = reader.read()
self.assertIsInstance(textdata, TextData)
self.assertEqual(textdata.name, "C-1")
self.assertEqual(textdata.path, path)
self.assertEqual(textdata.ext, [".txt"])
self.assertEqual(textdata.category, "semeval")
self.assertTrue(textdata.content.startswith("On The Complexity of Co"))
self.assertEqual(error, "")

def test_read_file(self):
path = "http://file.biolab.si/text-semantics/data/elektrotehniski-" \
"vestnik-clanki/detektiranje-utrdb-v-šahu-.txt"
reader = UrlReader(path)
reader.read_file()
self.assertIsInstance(reader.content, str)

def test_name_text_data(self):
path = "http://dummy.server.com/data/foo.txt"
reader = UrlReader(path)
reader.content = "text"
text_data = reader.make_text_data()
self.assertIsInstance(text_data, TextData)
self.assertEqual(text_data.name, "foo")
self.assertEqual(text_data.path, path)
self.assertEqual(text_data.ext, [".txt"])
self.assertEqual(text_data.category, "data")
self.assertEqual(text_data.content, "text")


class TestImportDocuments(unittest.TestCase):
def test_scan_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path)
self.assertGreater(len(paths), 0)

def test_scan_url_txt(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path, include_patterns=["*.txt"])
self.assertGreater(len(paths), 0)

def test_scan_url_csv(self):
path = "http://file.biolab.si/text-semantics/data/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path, include_patterns=["*.csv"])
self.assertGreater(len(paths), 0)

def test_read_meta_data_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
data1, err = importer._read_meta_data()
self.assertIsInstance(data1, pd.DataFrame)
self.assertEqual(len(err), 0)

# @patch("orangecontrib.text.import_documents.ImportDocuments."
# "META_DATA_FILE_KEY", "File")
def test_merge_metadata_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
text_data, _ = importer._read_text_data()
meta_data, _ = importer._read_meta_data()

importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18'
importer._meta_data = meta_data[:50]
corpus = importer._create_corpus()
corpus = importer._add_metadata(corpus)
self.assertGreater(len(corpus), 0)
columns = ["name", "path", "content", "Content",
"Text file", "Keywords"]
self.assertEqual([v.name for v in corpus.domain.metas], columns)

importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18'
importer._meta_data = None
corpus = importer._create_corpus()
corpus = importer._add_metadata(corpus)
self.assertGreater(len(corpus), 0)
columns = ["name", "path", "content"]
self.assertEqual([v.name for v in corpus.domain.metas], columns)

def test_run_url(self):
path = "http://file.biolab.si/text-semantics/data" \
"/predlogi-vladi-sample/"
importer = ImportDocuments(path, True)
corpus1, _ = importer.run()
self.assertGreater(len(corpus1), 0)

mask = np.ones_like(corpus1.metas, dtype=bool)
mask[:, 1] = False

path = "http://file.biolab.si/text-semantics/data" \
"/predlogi-vladi-sample////"
importer = ImportDocuments(path, True)
corpus2, _ = importer.run()
self.assertGreater(len(corpus1), 0)
self.assertEqual(corpus1.metas[mask].tolist(),
corpus2.metas[mask].tolist())

path = "http://file.biolab.si/text-semantics/data" \
"/predlogi-vladi-sample"
importer = ImportDocuments(path, True)
corpus3, _ = importer.run()
self.assertGreater(len(corpus2), 0)
self.assertEqual(corpus1.metas[mask].tolist(),
corpus3.metas[mask].tolist())

def test_run_url_special_characters(self):
path = "http://file.biolab.si/text-semantics/data/" \
"elektrotehniski-vestnik-clanki/"
importer = ImportDocuments(path, True)
corpus, errors = importer.run()
self.assertGreater(len(corpus), 0)


if __name__ == "__main__":
unittest.main()