From d8760ac62dc3f547e562e36d4d91bdb76e696866 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Fri, 27 Dec 2019 13:09:16 +0100
Subject: [PATCH] corpus: Store titles to be same in subsample.
---
orangecontrib/text/corpus.py | 93 +++++++++++++++++++++----
orangecontrib/text/tests/test_corpus.py | 63 ++++++++++++++++-
orangecontrib/text/widgets/owcorpus.py | 11 +--
3 files changed, 141 insertions(+), 26 deletions(-)
diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index 0600170e6..3c329365b 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -1,7 +1,9 @@
import os
+from collections import Counter, defaultdict
from copy import copy
from numbers import Integral
from itertools import chain
+from typing import Union, Optional, List
import nltk
import numpy as np
@@ -9,7 +11,7 @@
from gensim import corpora
from Orange.data import ContinuousVariable, DiscreteVariable, \
- Domain, RowInstance, Table
+ Domain, RowInstance, Table, StringVariable
from orangecontrib.text.vectorization import BowVectorizer
@@ -66,6 +68,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
self.attributes = {}
self.pos_tags = None
self.used_preprocessor = None # required for compute values
+ self._titles: Optional[np.ndarray] = None
if domain is not None and text_features is None:
self._infer_text_features()
@@ -76,6 +79,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
self.ids = ids
else:
Table._init_ids(self)
+ self._set_unique_titles()
def set_text_features(self, feats):
"""
@@ -95,6 +99,70 @@ def set_text_features(self, feats):
self._infer_text_features()
self._tokens = None # invalidate tokens
+ def set_title_variable(
+ self, title_variable: Union[StringVariable, str, None]
+ ) -> None:
+ """
+ Set the title attribute. Only one column can be a title attribute.
+
+ Parameters
+ ----------
+ title_variable
+            Variable that needs to be set as the title variable. If it is
+            None, no variable is set as the title.
+ """
+ for a in self.domain.variables + self.domain.metas:
+ a.attributes.pop("title", None)
+
+ if title_variable and title_variable in self.domain:
+ self.domain[title_variable].attributes["title"] = True
+
+ self._set_unique_titles()
+
+ def _set_unique_titles(self):
+ """
+        Define self._titles as an array of titles (a title for each
+        document). It is used to have a unique title for each document.
+        When several documents share the same title, a number is put
+        beside each of them.
+ """
+ if self.domain is None:
+ return
+ attrs = [attr for attr in
+ chain(self.domain.variables, self.domain.metas)
+ if attr.attributes.get('title', False)]
+
+ if attrs:
+ self._titles = np.array(self._unique_titles(
+ self.documents_from_features(attrs)))
+ else:
+ self._titles = np.array([
+ 'Document {}'.format(i + 1) for i in range(len(self))])
+
+ @staticmethod
+ def _unique_titles(titles: List[str]) -> List[str]:
+ """
+        Function adds numbers to the non-unique values of the title.
+
+ Parameters
+ ----------
+ titles
+            List of titles - not necessarily unique
+
+ Returns
+ -------
+ List with unique titles.
+ """
+ counts = Counter(titles)
+ cur_appearances = defaultdict(int)
+ new_titles = []
+ for t in titles:
+ if counts[t] > 1:
+ cur_appearances[t] += 1
+ t += f" ({cur_appearances[t]})"
+ new_titles.append(t)
+ return new_titles
+
def _infer_text_features(self):
"""
Infer which text features to use. If nothing was provided
@@ -137,6 +205,7 @@ def extend_corpus(self, metadata, Y):
Table._init_ids(self)
self._tokens = None # invalidate tokens
+ self._set_unique_titles()
def extend_attributes(self, X, feature_names, feature_values=None,
compute_values=None, var_attrs=None, sparse=False):
@@ -195,13 +264,8 @@ def documents(self):
@property
def titles(self):
""" Returns a list of titles. """
- attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
- if attr.attributes.get('title', False)]
-
- if attrs:
- return self.documents_from_features(attrs)
- else:
- return ['Document {}'.format(i+1) for i in range(len(self))]
+ assert self._titles is not None
+ return self._titles.tolist()
def documents_from_features(self, feats):
"""
@@ -211,8 +275,8 @@ def documents_from_features(self, feats):
Returns: a list of strings constructed by joining feats.
"""
# create a Table where feats are in metas
- data = Table(Domain([], [], [i.name for i in feats],
- source=self.domain), self)
+ data = Table.from_table(Domain([], [], [i.name for i in feats],
+ source=self.domain), self)
# When we use only features coming from sparse X data.metas is sparse.
# Transform it to dense.
@@ -304,6 +368,7 @@ def copy(self):
c.pos_tags = self.pos_tags
c.name = self.name
c.used_preprocessor = self.used_preprocessor
+ c._titles = self._titles
return c
@staticmethod
@@ -386,15 +451,16 @@ def from_file(cls, filename):
filename = abs_path
table = Table.from_file(filename)
- return cls(table.domain, table.X, table.Y, table.metas, table.W)
+ corpus = cls(table.domain, table.X, table.Y, table.metas, table.W)
+ return corpus
@staticmethod
def retain_preprocessing(orig, new, key=...):
""" Set preprocessing of 'new' object to match the 'orig' object. """
if isinstance(orig, Corpus):
+ if isinstance(key, tuple): # get row selection
+ key = key[0]
if orig._tokens is not None: # retain preprocessing
- if isinstance(key, tuple): # get row selection
- key = key[0]
if isinstance(key, Integral):
new._tokens = np.array([orig._tokens[key]])
new.pos_tags = None if orig.pos_tags is None else np.array(
@@ -409,6 +475,7 @@ def retain_preprocessing(orig, new, key=...):
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary
+ new._titles = orig._titles[key]
new_domain_metas = set(new.domain.metas)
new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas]
new.ngram_range = orig.ngram_range
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index edf6d05c3..8ff0709a3 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -164,11 +164,64 @@ def test_titles(self):
self.assertIn('Document ', title)
# title feature set
- c.domain[0].attributes['title'] = True
+ c.set_title_variable(c.domain[0])
titles = c.titles
self.assertEqual(len(titles), len(c))
- for title in titles:
- self.assertIn(title, c.domain.class_var.values)
+
+ # first 50 are children
+ for title, c in zip(titles[:50], range(1, 51)):
+ self.assertEqual(f"children ({c})", title)
+
+ # others are adults
+ for title, a in zip(titles[50:100], range(1, 51)):
+ self.assertEqual(f"adult ({a})", title)
+
+        # next 20 are children again (numbering continues at 51)
+ for title, c in zip(titles[100:120], range(51, 71)):
+ self.assertEqual(f"children ({c})", title)
+
+        # next 20 are adults again (numbering continues at 51)
+ for title, a in zip(titles[120:140], range(51, 71)):
+ self.assertEqual(f"adult ({a})", title)
+
+ def test_titles_no_numbers(self):
+ """
+ The case when no number is used since the title appears only once.
+ """
+ c = Corpus.from_file('andersen')
+ c.set_title_variable(c.domain.metas[0])
+
+ # title feature set
+ self.assertEqual("The Little Match-Seller", c.titles[0])
+
+ def test_titles_read_document(self):
+ """
+ When we read the document with a title marked it should have titles
+ set correctly.
+ """
+ c = Corpus.from_file('election-tweets-2016')
+
+ self.assertEqual(len(c), len(c.titles))
+
+ def test_titles_sample(self):
+ c = Corpus.from_file('book-excerpts')
+ c.set_title_variable(c.domain[0])
+
+ c_sample = c[10:20]
+ for title, i in zip(c_sample.titles, range(11, 21)):
+ self.assertEqual(f"children ({i})", title)
+
+ c_sample = c[60:70]
+ for title, i in zip(c_sample.titles, range(11, 21)):
+ self.assertEqual(f"adult ({i})", title)
+
+ c_sample = c[[10, 11, 12]]
+ for title, i in zip(c_sample.titles, range(11, 14)):
+ self.assertEqual(f"children ({i})", title)
+
+ c_sample = c[np.array([10, 11, 12])]
+ for title, i in zip(c_sample.titles, range(11, 14)):
+ self.assertEqual(f"children ({i})", title)
def test_documents_from_features(self):
c = Corpus.from_file('book-excerpts')
@@ -369,3 +422,7 @@ def test_corpus_remove_text_features(self):
self.assertFalse(len(d.text_features))
# Make sure that copying works.
d.copy()
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index 774a0227e..e7bab5e40 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -262,7 +262,7 @@ def remove_duplicates(l):
if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
self.Error.no_text_features_used()
- self._set_title_attribute()
+ self.corpus.set_title_variable(self.title_variable)
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
@@ -270,15 +270,6 @@ def remove_duplicates(l):
or not self.corpus.text_features
self.Outputs.corpus.send(self.corpus if not empty else None)
- def _set_title_attribute(self):
- # remove all title attributes
- for a in self.corpus.domain.variables + self.corpus.domain.metas:
- a.attributes.pop("title", None)
-
- if self.title_variable and self.title_variable in self.corpus.domain:
- self.corpus.domain[
- self.title_variable].attributes["title"] = True
-
def send_report(self):
def describe(features):
if len(features):