Import Documents: Read metadata
VesnaT committed Apr 8, 2021
1 parent 7fa66da commit 8982a5b
Showing 2 changed files with 120 additions and 16 deletions.
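
This commit teaches ImportDocuments to pick up metadata tables (*.csv, *.yaml, *.yml) found in the start directory and merge their columns into the imported corpus, matching rows to documents through the "Text file" column. A minimal sketch of the resulting API (the folder path below is hypothetical; run() returns the corpus together with a list of read errors):

    from orangecontrib.text.import_documents import ImportDocuments

    # Import documents from a (hypothetical) local folder; any metadata
    # tables found there are joined onto the corpus by document path.
    importer = ImportDocuments("/data/semeval")
    corpus, errors = importer.run()
    print(len(corpus), "documents,", len(errors), "read errors")
    print([var.name for var in corpus.domain.metas])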
86 changes: 81 additions & 5 deletions orangecontrib/text/import_documents.py
@@ -4,7 +4,9 @@
import os
import pathlib
import re
import yaml
from urllib.parse import quote
from requests.exceptions import ConnectionError

from collections import namedtuple
from tempfile import NamedTemporaryFile
@@ -13,6 +15,7 @@
from unicodedata import normalize

import numpy as np
import pandas as pd

import docx2txt
from odf.opendocument import load
@@ -28,6 +31,7 @@

from Orange.data import DiscreteVariable, Domain, StringVariable
from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader
from Orange.data.util import get_unique_names
from Orange.util import Registry

from orangecontrib.text.corpus import Corpus
@@ -161,6 +165,21 @@ def read_file(self):
        self.content = soup.get_text()


class CsvMetaReader(Reader):
    ext = [".csv"]

    def read_file(self):
        self.content = pd.read_csv(self.path)


class YamlMetaReader(Reader):
    ext = [".yaml", ".yml"]  # the metadata scan also matches *.yml

    def read_file(self):
        with open(self.path, "r") as f:
            self.content = yaml.safe_load(f)

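# Illustration (editorial, not part of the commit): a YAML metadata file
# parses to a dict, which _read_meta_data below wraps into a single-row
# DataFrame so it can be concatenated with CSV metadata:
#
#   content = yaml.safe_load("Text file: C-1.txt\nKeywords: chemistry")
#   pd.DataFrame(content, index=[0])  # columns "Text file" and "Keywords"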

class UrlReader(Reader, CoreUrlReader):
ext = [".url"]

@@ -191,6 +210,8 @@ def make_text_data(self):


class ImportDocuments:
    META_DATA_FILE_KEY = "Text file"

    def __init__(self, startdir: str,
                 is_url: bool = False,
                 formats: Tuple[str] = DefaultFormats,
@@ -199,10 +220,18 @@ def __init__(self, startdir: str,
        self.formats = formats
        self._report_progress = report_progress
        self.cancelled = False
        self._is_url = is_url
        self._text_data = []
        self._meta_data: pd.DataFrame = None

    def run(self) -> Tuple[Corpus, List]:
        self._text_data, errors_text = self._read_text_data()
        self._meta_data, errors_meta = self._read_meta_data()
        corpus = self._create_corpus()
        corpus = self._add_metadata(corpus)
        return corpus, errors_text + errors_meta

    def _read_text_data(self):
        text_data = []
        errors = []
        patterns = ["*.{}".format(fmt.lower()) for fmt in self.formats]
@@ -234,8 +263,29 @@
            if self.cancelled:
                return

        return text_data, errors

    def _read_meta_data(self):
        scan = self.scan_url if self._is_url else self.scan
        patterns = ["*.csv", "*.yaml", "*.yml"]
        paths = scan(self.startdir, include_patterns=patterns)
        meta_dfs, errors = [], []
        for path in paths:
            reader = Reader.get_reader(path) if not self._is_url \
                else UrlReader(path)
            data, error = reader.read()
            if data is not None:
                content = data.content
                if isinstance(content, dict):
                    content = pd.DataFrame(content, index=[0])
                meta_dfs.append(content)
            else:
                errors.append(error)

            if self.cancelled:
                return

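        # Editorial note: frames from different metadata files may have
        # different columns; pd.concat aligns them by column name and fills
        # the gaps with NaN, so one combined table is returned below.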
        return pd.concat(meta_dfs) if meta_dfs else None, errors

    def _create_corpus(self) -> Corpus:
        corpus = None
@@ -277,6 +327,27 @@ def _create_corpus(self) -> Corpus:

        return corpus

    def _add_metadata(self, corpus: Corpus) -> Corpus:
        if "path" not in corpus.domain or self._meta_data is None \
                or self.META_DATA_FILE_KEY not in self._meta_data.columns:
            return corpus

        df = self._meta_data.set_index(
            self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
        )
        path_column = corpus.get_column_view("path")[0]
        if len(df.index.drop_duplicates()) != len(df.index):
            df = df[~df.index.duplicated(keep='first')]
        filtered = df.reindex(path_column)
        for column in filtered.columns:
            corpus = corpus.add_column(
                StringVariable(get_unique_names(corpus.domain, column)),
                filtered[column].to_numpy(),
                to_metas=True
            )

        return corpus
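
    # How the join above behaves, with illustrative names (editorial, not
    # part of the commit): the metadata table is re-indexed by absolute
    # document path and aligned to the corpus' "path" column, so documents
    # without a metadata row get NaN in every added column:
    #
    #   df = meta.set_index(startdir + meta["Text file"])
    #   df = df[~df.index.duplicated(keep="first")]  # first row wins on ties
    #   aligned = df.reindex(path_column)            # one row per document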

    @staticmethod
    def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
        """
@@ -315,10 +386,15 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
    @staticmethod
    def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",),
                 exclude_patterns: Tuple[str] = (".*",)) -> List[str]:
        try:
            files = serverfiles.ServerFiles(topdir).listfiles()
        except ConnectionError:
            return []

        include_patterns = include_patterns or ("*",)
        paths = []
        for filename in files:
            path = os.path.join(topdir, os.path.join(*filename))
            if matches_any(path, include_patterns) and \
                    not matches_any(path, exclude_patterns):
                paths.append(path)
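        # Editorial note: listfiles() returns each remote path as a tuple of
        # components, hence os.path.join(*filename) above; returning [] on
        # ConnectionError lets callers degrade gracefully when offline.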
50 changes: 39 additions & 11 deletions orangecontrib/text/tests/test_import_documents.py
@@ -1,4 +1,7 @@
import unittest
from unittest.mock import patch

import pandas as pd

from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \
    TxtReader, TextData
@@ -53,32 +56,57 @@ def test_scan_url(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path)
print(paths)
self.assertEqual(len(paths), 101)

def test_scan_url_txt(self):
path = "http://file.biolab.si/text-semantics/data/semeval/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path, include_patterns=["*.txt"])
print(paths)
self.assertEqual(len(paths), 100)

def test_scan_url_csv(self):
path = "http://file.biolab.si/text-semantics/data/"
importer = ImportDocuments(path, True)
paths = importer.scan_url(path, include_patterns=["*.csv"])
print(paths)
self.assertEqual(len(paths), 6)

    def test_read_meta_data_url(self):
        path = "http://file.biolab.si/text-semantics/data/semeval/"
        importer = ImportDocuments(path, True)
        data1, err = importer._read_meta_data()
        self.assertIsInstance(data1, pd.DataFrame)
        self.assertEqual(len(err), 0)

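    # Editorial note: the semeval metadata table apparently names its path
    # column "File" rather than the default "Text file", so the test patches
    # META_DATA_FILE_KEY for its duration.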
@patch("orangecontrib.text.import_documents.ImportDocuments."
"META_DATA_FILE_KEY", "File")
    def test_merge_metadata_url(self):
        path = "http://file.biolab.si/text-semantics/data/semeval/"
        importer = ImportDocuments(path, True)
        text_data, _ = importer._read_text_data()
        meta_data, _ = importer._read_meta_data()

        importer._text_data = text_data[:4]  # 'C-1', 'C-14', 'C-17', 'C-18'
        importer._meta_data = meta_data[:50]
        corpus = importer._create_corpus()
        corpus = importer._add_metadata(corpus)
        self.assertEqual(len(corpus), 4)
        columns = ["name", "path", "content", "Content", "File", "Keywords"]
        self.assertEqual([v.name for v in corpus.domain.metas], columns)

        importer._text_data = text_data[:4]  # 'C-1', 'C-14', 'C-17', 'C-18'
        importer._meta_data = None
        corpus = importer._create_corpus()
        corpus = importer._add_metadata(corpus)
        self.assertEqual(len(corpus), 4)
        columns = ["name", "path", "content"]
        self.assertEqual([v.name for v in corpus.domain.metas], columns)

    def test_run_url(self):
        path = "http://file.biolab.si/text-semantics/data/" \
               "elektrotehniski-vestnik-clanki/"
        importer = ImportDocuments(path, True)
        corpus, errors = importer.run()
        self.assertEqual(len(corpus), 382)


if __name__ == "__main__":
