diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 1c8684f65..2bb5fb2b6 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -1,4 +1,5 @@ import contextlib +import datetime import fnmatch import logging import os @@ -29,8 +30,10 @@ import serverfiles -from Orange.data import DiscreteVariable, Domain, StringVariable -from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader +from Orange.data import DiscreteVariable, Domain, StringVariable, \ + guess_data_type +from Orange.data.io import detect_encoding, sanitize_variable,\ + UrlReader as CoreUrlReader from Orange.data.util import get_unique_names from Orange.util import Registry @@ -178,6 +181,9 @@ class YamlMetaReader(Reader): def read_file(self): with open(self.path, "r") as f: self.content = yaml.safe_load(f) + for k in self.content: + if self.content[k] is None: + self.content[k] = "" class UrlReader(Reader, CoreUrlReader): @@ -345,13 +351,18 @@ def _add_metadata(self, corpus: Corpus) -> Corpus: if len(df.index.drop_duplicates()) != len(df.index): df = df[~df.index.duplicated(keep='first')] filtered = df.reindex(path_column) - for column in filtered.columns: + for name, column in filtered.iteritems(): + data = column.astype(str).values + val_map, vals, var_type = guess_data_type(data) + values, variable = sanitize_variable(val_map, vals, data, + var_type, {}, + name=get_unique_names( + corpus.domain, name)) corpus = corpus.add_column( - StringVariable(get_unique_names(corpus.domain, column)), - filtered[column].to_numpy(), + variable, + values, to_metas=True ) - return corpus @staticmethod diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index f1ece308b..4e3676c08 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -130,16 +130,16 @@ def test_run_url(self): importer = ImportDocuments(path, True) corpus2, _ = importer.run() self.assertGreater(len(corpus1), 0) - self.assertEqual(corpus1.metas[mask].tolist(), - corpus2.metas[mask].tolist()) + np.testing.assert_array_equal(corpus1.metas[mask].tolist(), + corpus2.metas[mask].tolist()) path = "http://file.biolab.si/text-semantics/data" \ "/predlogi-vladi-sample" importer = ImportDocuments(path, True) corpus3, _ = importer.run() self.assertGreater(len(corpus2), 0) - self.assertEqual(corpus1.metas[mask].tolist(), - corpus3.metas[mask].tolist()) + np.testing.assert_array_equal(corpus1.metas[mask].tolist(), + corpus3.metas[mask].tolist()) def test_run_url_special_characters(self): path = "http://file.biolab.si/text-semantics/data/" \