From 6024a65f90b0fc50ec4cca702a8119b11e02b172 Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Tue, 6 Jul 2021 15:29:26 +0200 Subject: [PATCH 1/3] Import Documents: Read metas as the right type --- orangecontrib/text/import_documents.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 1c8684f65..d9ef58eec 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -1,4 +1,5 @@ import contextlib +import datetime import fnmatch import logging import os @@ -29,8 +30,10 @@ import serverfiles -from Orange.data import DiscreteVariable, Domain, StringVariable -from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader +from Orange.data import DiscreteVariable, Domain, StringVariable, \ + guess_data_type +from Orange.data.io import detect_encoding, sanitize_variable,\ + UrlReader as CoreUrlReader from Orange.data.util import get_unique_names from Orange.util import Registry @@ -284,6 +287,8 @@ def _read_meta_data(self): content = data.content if isinstance(content, dict): content = pd.DataFrame(content, index=[0]) + # if reader is YamlMetaReader: + # content = content.replace("None", np.nan) meta_dfs.append(content) else: errors.append(error) @@ -345,13 +350,18 @@ def _add_metadata(self, corpus: Corpus) -> Corpus: if len(df.index.drop_duplicates()) != len(df.index): df = df[~df.index.duplicated(keep='first')] filtered = df.reindex(path_column) - for column in filtered.columns: + for name, column in filtered.iteritems(): + data = column.astype(str).values + val_map, vals, var_type = guess_data_type(data) + values, variable = sanitize_variable(val_map, vals, data, + var_type, {}, + name=get_unique_names( + corpus.domain, name)) corpus = corpus.add_column( - StringVariable(get_unique_names(corpus.domain, column)), - filtered[column].to_numpy(), + variable, + values, to_metas=True ) - return corpus @staticmethod From c1d7c8a8bb61362caf5b7339374979823d9533da Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Wed, 21 Jul 2021 09:08:19 +0200 Subject: [PATCH 2/3] Fix tests --- orangecontrib/text/tests/test_import_documents.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index f1ece308b..4e3676c08 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -130,16 +130,16 @@ def test_run_url(self): importer = ImportDocuments(path, True) corpus2, _ = importer.run() self.assertGreater(len(corpus1), 0) - self.assertEqual(corpus1.metas[mask].tolist(), - corpus2.metas[mask].tolist()) + np.testing.assert_array_equal(corpus1.metas[mask].tolist(), + corpus2.metas[mask].tolist()) path = "http://file.biolab.si/text-semantics/data" \ "/predlogi-vladi-sample" importer = ImportDocuments(path, True) corpus3, _ = importer.run() self.assertGreater(len(corpus2), 0) - self.assertEqual(corpus1.metas[mask].tolist(), - corpus3.metas[mask].tolist()) + np.testing.assert_array_equal(corpus1.metas[mask].tolist(), + corpus3.metas[mask].tolist()) def test_run_url_special_characters(self): path = "http://file.biolab.si/text-semantics/data/" \ From 42405e240424458f05bf106e5410a654ceb28e02 Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Wed, 21 Jul 2021 11:24:49 +0200 Subject: [PATCH 3/3] Sanitize Yaml --- orangecontrib/text/import_documents.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index d9ef58eec..2bb5fb2b6 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -181,6 +181,9 @@ class YamlMetaReader(Reader): def read_file(self): with open(self.path, "r") as f: self.content = yaml.safe_load(f) + for k in self.content: + if self.content[k] is None: + self.content[k] = "" class UrlReader(Reader, CoreUrlReader): @@ -287,8 +290,6 @@ def _read_meta_data(self): content = data.content if isinstance(content, dict): content = pd.DataFrame(content, index=[0]) - # if reader is YamlMetaReader: - # content = content.replace("None", np.nan) meta_dfs.append(content) else: errors.append(error)