diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py
index 58820fae6..89ad7128a 100644
--- a/orangecontrib/text/import_documents.py
+++ b/orangecontrib/text/import_documents.py
@@ -4,7 +4,9 @@
 import os
 import pathlib
 import re
+import yaml
 from urllib.parse import quote
+from requests.exceptions import ConnectionError
 
 from collections import namedtuple
 from tempfile import NamedTemporaryFile
@@ -13,6 +15,7 @@
 from unicodedata import normalize
 
 import numpy as np
+import pandas as pd
 
 import docx2txt
 from odf.opendocument import load
@@ -28,6 +31,7 @@
 from Orange.data import DiscreteVariable, Domain, StringVariable
 from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader
+from Orange.data.util import get_unique_names
 from Orange.util import Registry
 
 from orangecontrib.text.corpus import Corpus
 
@@ -161,6 +165,21 @@ def read_file(self):
         self.content = soup.get_text()
 
 
+class CsvMetaReader(Reader):
+    ext = [".csv"]
+
+    def read_file(self):
+        self.content = pd.read_csv(self.path)
+
+
+class YamlMetaReader(Reader):
+    ext = [".yaml"]
+
+    def read_file(self):
+        with open(self.path, "r") as f:
+            self.content = yaml.safe_load(f)
+
+
 class UrlReader(Reader, CoreUrlReader):
     ext = [".url"]
 
@@ -191,6 +210,8 @@ def make_text_data(self):
 
 
 class ImportDocuments:
+    META_DATA_FILE_KEY = "Text file"
+
     def __init__(self, startdir: str,
                  is_url: bool = False,
                  formats: Tuple[str] = DefaultFormats,
@@ -199,10 +220,18 @@ def __init__(self, startdir: str,
         self.formats = formats
         self._report_progress = report_progress
         self.cancelled = False
-        self._text_data = []
         self._is_url = is_url
+        self._text_data = []
+        self._meta_data: pd.DataFrame = None
 
     def run(self) -> Tuple[Corpus, List]:
+        self._text_data, errors_text = self._read_text_data()
+        self._meta_data, errors_meta = self._read_meta_data()
+        corpus = self._create_corpus()
+        corpus = self._add_metadata(corpus)
+        return corpus, errors_text + errors_meta
+
+    def _read_text_data(self):
         text_data = []
         errors = []
         patterns = ["*.{}".format(fmt.lower()) for fmt in self.formats]
@@ -234,8 +263,29 @@ def run(self) -> Tuple[Corpus, List]:
             if self.cancelled:
                 return
 
-        self._text_data = text_data
-        return self._create_corpus(), errors
+        return text_data, errors
+
+    def _read_meta_data(self):
+        scan = self.scan_url if self._is_url else self.scan
+        patterns = ["*.csv", "*.yaml", "*.yml"]
+        paths = scan(self.startdir, include_patterns=patterns)
+        meta_dfs, errors = [], []
+        for path in paths:
+            reader = Reader.get_reader(path) if not self._is_url \
+                else UrlReader(path)
+            data, error = reader.read()
+            if data is not None:
+                content = data.content
+                if isinstance(content, dict):
+                    content = pd.DataFrame(content, index=[0])
+                meta_dfs.append(content)
+            else:
+                errors.append(error)
+
+            if self.cancelled:
+                return
+
+        return pd.concat(meta_dfs) if meta_dfs else None, errors
 
     def _create_corpus(self) -> Corpus:
         corpus = None
@@ -277,6 +327,27 @@ def _create_corpus(self) -> Corpus:
 
         return corpus
 
+    def _add_metadata(self, corpus: Corpus) -> Corpus:
+        if "path" not in corpus.domain or self._meta_data is None \
+                or self.META_DATA_FILE_KEY not in self._meta_data.columns:
+            return corpus
+
+        df = self._meta_data.set_index(
+            self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
+        )
+        path_column = corpus.get_column_view("path")[0]
+        if len(df.index.drop_duplicates()) != len(df.index):
+            df = df[~df.index.duplicated(keep='first')]
+        filtered = df.reindex(path_column)
+        for column in filtered.columns:
+            corpus = corpus.add_column(
+                StringVariable(get_unique_names(corpus.domain, column)),
+                filtered[column].to_numpy(),
+                to_metas=True
+            )
+
+        return corpus
+
     @staticmethod
     def scan(topdir, include_patterns=("*",),
              exclude_patterns=(".*",)):
         """
@@ -315,10 +386,15 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):
     @staticmethod
     def scan_url(topdir: str, include_patterns: Tuple[str] = ("*",),
                  exclude_patterns: Tuple[str] = (".*",)) -> List[str]:
+        try:
+            files = serverfiles.ServerFiles(topdir).listfiles()
+        except ConnectionError:
+            return []
+
         include_patterns = include_patterns or ("*",)
         paths = []
-        for filenames in serverfiles.ServerFiles(topdir).listfiles():
-            path = os.path.join(topdir, os.path.join(*filenames))
+        for filename in files:
+            path = os.path.join(topdir, os.path.join(*filename))
             if matches_any(path, include_patterns) and \
                     not matches_any(path, exclude_patterns):
                 paths.append(path)
diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py
index 9131029b5..9f8a553d2 100644
--- a/orangecontrib/text/tests/test_import_documents.py
+++ b/orangecontrib/text/tests/test_import_documents.py
@@ -1,4 +1,7 @@
 import unittest
+from unittest.mock import patch
+
+import pandas as pd
 
 from orangecontrib.text.import_documents import ImportDocuments, UrlReader, \
     TxtReader, TextData
@@ -53,32 +56,57 @@ def test_scan_url(self):
         path = "http://file.biolab.si/text-semantics/data/semeval/"
         importer = ImportDocuments(path, True)
         paths = importer.scan_url(path)
-        print(paths)
+        self.assertEqual(len(paths), 101)
 
     def test_scan_url_txt(self):
         path = "http://file.biolab.si/text-semantics/data/semeval/"
         importer = ImportDocuments(path, True)
         paths = importer.scan_url(path, include_patterns=["*.txt"])
-        print(paths)
+        self.assertEqual(len(paths), 100)
 
     def test_scan_url_csv(self):
         path = "http://file.biolab.si/text-semantics/data/"
         importer = ImportDocuments(path, True)
         paths = importer.scan_url(path, include_patterns=["*.csv"])
-        print(paths)
+        self.assertEqual(len(paths), 6)
 
-    def test_run_url(self):
+    def test_read_meta_data_url(self):
         path = "http://file.biolab.si/text-semantics/data/semeval/"
         importer = ImportDocuments(path, True)
-        res, err = importer.run()
-        print(res)
+        data1, err = importer._read_meta_data()
+        self.assertIsInstance(data1, pd.DataFrame)
+        self.assertEqual(len(err), 0)
 
-    def test_run_url_metadata(self):
+    @patch("orangecontrib.text.import_documents.ImportDocuments."
+ "META_DATA_FILE_KEY", "File") + def test_merge_metadata_url(self): path = "http://file.biolab.si/text-semantics/data/semeval/" - importer = ImportDocuments(path, True, formats=["csv"]) - res, err = importer.run() - print(res) - print(err) + importer = ImportDocuments(path, True) + text_data, _ = importer._read_text_data() + meta_data, _ = importer._read_meta_data() + + importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18' + importer._meta_data = meta_data[:50] + corpus = importer._create_corpus() + corpus = importer._add_metadata(corpus) + self.assertEqual(len(corpus), 4) + columns = ["name", "path", "content", "Content", "File", "Keywords"] + self.assertEqual([v.name for v in corpus.domain.metas], columns) + + importer._text_data = text_data[:4] # 'C-1', 'C-14', 'C-17', 'C-18' + importer._meta_data = None + corpus = importer._create_corpus() + corpus = importer._add_metadata(corpus) + self.assertEqual(len(corpus), 4) + columns = ["name", "path", "content"] + self.assertEqual([v.name for v in corpus.domain.metas], columns) + + def test_run_url(self): + path = "http://file.biolab.si/text-semantics/data/" \ + "elektrotehniski-vestnik-clanki/" + importer = ImportDocuments(path, True) + corpus, errors = importer.run() + self.assertEqual(len(corpus), 382) if __name__ == "__main__":