From 5b2ee48fb394cc1a75a97f96f2c22b43de121820 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Tue, 16 Aug 2022 15:35:55 +0200 Subject: [PATCH] Datasets - add languages --- MANIFEST.in | 2 +- .../datasets/20newsgroups-test.tab.metadata | 1 + .../datasets/20newsgroups-train.tab.metadata | 1 + .../text/datasets/andersen.tab.metadata | 1 + .../text/datasets/book-excerpts.tab.metadata | 1 + .../text/datasets/deerwester.tab.metadata | 1 + .../datasets/election-tweets-2016.tab.metadata | 1 + .../datasets/friends-transcripts.tab.metadata | 1 + .../datasets/grimm-tales-selected.tab.metadata | 1 + .../text/datasets/grimm-tales.tab.metadata | 1 + .../datasets/reuters-r52-test.tab.metadata | 1 + .../datasets/reuters-r52-train.tab.metadata | 1 + .../text/datasets/reuters-r8-test.tab.metadata | 1 + .../datasets/reuters-r8-train.tab.metadata | 1 + .../datasets/slo-opinion-corpus.tab.metadata | 1 + orangecontrib/text/tests/test_datasets.py | 18 ++++++++++++++++++ 16 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 orangecontrib/text/datasets/20newsgroups-test.tab.metadata create mode 100644 orangecontrib/text/datasets/20newsgroups-train.tab.metadata create mode 100644 orangecontrib/text/datasets/andersen.tab.metadata create mode 100644 orangecontrib/text/datasets/book-excerpts.tab.metadata create mode 100644 orangecontrib/text/datasets/deerwester.tab.metadata create mode 100644 orangecontrib/text/datasets/election-tweets-2016.tab.metadata create mode 100644 orangecontrib/text/datasets/friends-transcripts.tab.metadata create mode 100644 orangecontrib/text/datasets/grimm-tales-selected.tab.metadata create mode 100644 orangecontrib/text/datasets/grimm-tales.tab.metadata create mode 100644 orangecontrib/text/datasets/reuters-r52-test.tab.metadata create mode 100644 orangecontrib/text/datasets/reuters-r52-train.tab.metadata create mode 100644 orangecontrib/text/datasets/reuters-r8-test.tab.metadata create mode 100644 orangecontrib/text/datasets/reuters-r8-train.tab.metadata create mode 100644 orangecontrib/text/datasets/slo-opinion-corpus.tab.metadata create mode 100644 orangecontrib/text/tests/test_datasets.py diff --git a/MANIFEST.in b/MANIFEST.in index 44485ac82..cb442d94d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -recursive-include orangecontrib/text/datasets *.tab *.txt +recursive-include orangecontrib/text/datasets *.tab *.txt *.metadata recursive-include orangecontrib/text/models *.ftz recursive-include orangecontrib/text/sentiment *.txt recursive-include orangecontrib/text/tests *.txt *.json *.pkl diff --git a/orangecontrib/text/datasets/20newsgroups-test.tab.metadata b/orangecontrib/text/datasets/20newsgroups-test.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/20newsgroups-test.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/20newsgroups-train.tab.metadata b/orangecontrib/text/datasets/20newsgroups-train.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/20newsgroups-train.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/andersen.tab.metadata b/orangecontrib/text/datasets/andersen.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/andersen.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/book-excerpts.tab.metadata b/orangecontrib/text/datasets/book-excerpts.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/book-excerpts.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/deerwester.tab.metadata b/orangecontrib/text/datasets/deerwester.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/deerwester.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/election-tweets-2016.tab.metadata b/orangecontrib/text/datasets/election-tweets-2016.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/election-tweets-2016.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/friends-transcripts.tab.metadata b/orangecontrib/text/datasets/friends-transcripts.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/friends-transcripts.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/grimm-tales-selected.tab.metadata b/orangecontrib/text/datasets/grimm-tales-selected.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/grimm-tales-selected.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/grimm-tales.tab.metadata b/orangecontrib/text/datasets/grimm-tales.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/grimm-tales.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/reuters-r52-test.tab.metadata b/orangecontrib/text/datasets/reuters-r52-test.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/reuters-r52-test.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/reuters-r52-train.tab.metadata b/orangecontrib/text/datasets/reuters-r52-train.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/reuters-r52-train.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/reuters-r8-test.tab.metadata b/orangecontrib/text/datasets/reuters-r8-test.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/reuters-r8-test.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/reuters-r8-train.tab.metadata b/orangecontrib/text/datasets/reuters-r8-train.tab.metadata new file mode 100644 index 000000000..bfb1a3048 --- /dev/null +++ b/orangecontrib/text/datasets/reuters-r8-train.tab.metadata @@ -0,0 +1 @@ +language: en \ No newline at end of file diff --git a/orangecontrib/text/datasets/slo-opinion-corpus.tab.metadata b/orangecontrib/text/datasets/slo-opinion-corpus.tab.metadata new file mode 100644 index 000000000..2e18e2049 --- /dev/null +++ b/orangecontrib/text/datasets/slo-opinion-corpus.tab.metadata @@ -0,0 +1 @@ +language: sl \ No newline at end of file diff --git a/orangecontrib/text/tests/test_datasets.py b/orangecontrib/text/tests/test_datasets.py new file mode 100644 index 000000000..04836354a --- /dev/null +++ b/orangecontrib/text/tests/test_datasets.py @@ -0,0 +1,18 @@ +import os +import unittest + +from orangecontrib.text import Corpus + + +class TestDatasets(unittest.TestCase): + def test_languages(self): + cur_file = os.path.dirname(os.path.abspath(__file__)) + + for file in os.listdir(os.path.join(cur_file, "..", "datasets")): + if file.endswith((".tab", ".xlsx", ".pkl", ".csv")): + c = Corpus.from_file(file) + self.assertIsNotNone(c.language) + + +if __name__ == "__main__": + unittest.main()