corpus: Store titles to be same in subsample.

biolab · Dec 27, 2019 · 8049f7a · 8049f7a
1 parent b6c23ec
commit 8049f7a
Show file tree

Hide file tree

Showing 3 changed files with 140 additions and 26 deletions.
diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
@@ -1,15 +1,17 @@
 import os
+from collections import Counter, defaultdict
 from copy import copy
 from numbers import Integral
 from itertools import chain
+from typing import Union, Optional, List
 
 import nltk
 import numpy as np
 import scipy.sparse as sp
 from gensim import corpora
 
 from Orange.data import ContinuousVariable, DiscreteVariable, \
-    Domain, RowInstance, Table
+    Domain, RowInstance, Table, StringVariable
 from orangecontrib.text.vectorization import BowVectorizer
 
 
@@ -66,6 +68,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
         self.attributes = {}
         self.pos_tags = None
         self.used_preprocessor = None   # required for compute values
+        self._titles: Optional[np.ndarray] = None
 
         if domain is not None and text_features is None:
             self._infer_text_features()
@@ -76,6 +79,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
             self.ids = ids
         else:
             Table._init_ids(self)
+        self._set_unique_titles()
 
     def set_text_features(self, feats):
         """
@@ -95,6 +99,70 @@ def set_text_features(self, feats):
             self._infer_text_features()
         self._tokens = None     # invalidate tokens
 
+    def set_title_variable(
+            self, title_variable: Union[StringVariable, str, None]
+    ) -> None:
+        """
+        Mark one column as the title column and rebuild the stored titles.
+
+        The "title" flag is first removed from every variable and meta of
+        the domain, so at most one column carries it afterwards.
+
+        Parameters
+        ----------
+        title_variable
+            Variable (or its name) to flag as the title. When it is falsy
+            or not part of the domain, no column is flagged.
+        """
+        domain = self.domain
+        columns = domain.variables + domain.metas
+        # drop whatever title flag was set before
+        for column in columns:
+            column.attributes.pop("title", None)
+
+        flag_column = bool(title_variable) and title_variable in domain
+        if flag_column:
+            domain[title_variable].attributes["title"] = True
+
+        # stored titles depend on the flag, so they are recomputed here
+        self._set_unique_titles()
+
+    def _set_unique_titles(self):
+        """
+        Fill self._titles with one title per document, stored as a numpy
+        array.
+
+        The titles are built from the columns flagged with the "title"
+        attribute and are passed through _unique_titles, which numbers
+        repeated titles. Without a flagged column the documents are named
+        "Document 1", "Document 2", ... Nothing is done when the corpus
+        has no domain.
+        """
+        domain = self.domain
+        if domain is None:
+            return
+
+        title_columns = []
+        for column in chain(domain.variables, domain.metas):
+            if column.attributes.get('title', False):
+                title_columns.append(column)
+
+        if not title_columns:
+            # no title column: fall back to generic, 1-based names
+            generic = ['Document {}'.format(number)
+                       for number in range(1, len(self) + 1)]
+            self._titles = np.array(generic)
+            return
+
+        raw_titles = self.documents_from_features(title_columns)
+        self._titles = np.array(self._unique_titles(raw_titles))
+
+    @staticmethod
+    def _unique_titles(titles: List[str]) -> List[str]:
+        """
+        Append a running number to every title that occurs more than once.
+
+        Titles that occur exactly once are returned unchanged. Repeated
+        titles are numbered in order of appearance ("<title> (1)",
+        "<title> (2)", ...). A number is skipped when the resulting string
+        is already used by another title, so the returned titles are
+        always distinct.
+
+        Parameters
+        ----------
+        titles
+            List of titles - not necessarily unique.
+
+        Returns
+        -------
+        List with unique titles, in the same order as the input.
+        """
+        counts = Counter(titles)
+        # names that must not be generated (again): the input titles plus
+        # every numbered title handed out so far
+        taken = set(counts)
+        cur_appearances = defaultdict(int)
+        new_titles = []
+        for t in titles:
+            if counts[t] > 1:
+                # advance the counter until the numbered title is free,
+                # e.g. ["a", "a", "a (1)"] must not produce "a (1)" twice
+                while True:
+                    cur_appearances[t] += 1
+                    candidate = f"{t} ({cur_appearances[t]})"
+                    if candidate not in taken:
+                        break
+                taken.add(candidate)
+                t = candidate
+            new_titles.append(t)
+        return new_titles
+
     def _infer_text_features(self):
         """
         Infer which text features to use. If nothing was provided
@@ -137,6 +205,7 @@ def extend_corpus(self, metadata, Y):
         Table._init_ids(self)
 
         self._tokens = None     # invalidate tokens
+        self._set_unique_titles()
 
     def extend_attributes(self, X, feature_names, feature_values=None,
                           compute_values=None, var_attrs=None, sparse=False):
@@ -195,13 +264,8 @@ def documents(self):
     @property
     def titles(self):
         """ Returns a list of titles. """
-        attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
-                 if attr.attributes.get('title', False)]
-
-        if attrs:
-            return self.documents_from_features(attrs)
-        else:
-            return ['Document {}'.format(i+1) for i in range(len(self))]
+        # Titles are stored in self._titles. It is still None when
+        # _set_unique_titles() returned early because no domain was set,
+        # so try to build the titles once more before giving up.
+        if self._titles is None:
+            self._set_unique_titles()
+        if self._titles is None:
+            # Raised explicitly instead of via `assert` so that the check
+            # is not stripped under `python -O`; the exception type is
+            # the same as before.
+            raise AssertionError(
+                "Titles are not available: the corpus has no domain.")
+        return self._titles.tolist()
 
     def documents_from_features(self, feats):
         """
@@ -211,8 +275,8 @@ def documents_from_features(self, feats):
         Returns: a list of strings constructed by joining feats.
         """
         # create a Table where feats are in metas
-        data = Table(Domain([], [], [i.name for i in feats],
-                            source=self.domain), self)
+        data = Table.from_table(Domain([], [], [i.name for i in feats],
+                                       source=self.domain), self)
 
         # When we use only features coming from sparse X data.metas is sparse.
         # Transform it to dense.
@@ -304,6 +368,7 @@ def copy(self):
         c.pos_tags = self.pos_tags
         c.name = self.name
         c.used_preprocessor = self.used_preprocessor
+        c._titles = self._titles
         return c
 
     @staticmethod
@@ -386,15 +451,16 @@ def from_file(cls, filename):
                 filename = abs_path
 
         table = Table.from_file(filename)
-        return cls(table.domain, table.X, table.Y, table.metas, table.W)
+        corpus = cls(table.domain, table.X, table.Y, table.metas, table.W)
+        return corpus
 
     @staticmethod
     def retain_preprocessing(orig, new, key=...):
         """ Set preprocessing of 'new' object to match the 'orig' object. """
         if isinstance(orig, Corpus):
+            if isinstance(key, tuple):  # get row selection
+                key = key[0]
             if orig._tokens is not None:  # retain preprocessing
-                if isinstance(key, tuple):  # get row selection
-                    key = key[0]
                 if isinstance(key, Integral):
                     new._tokens = np.array([orig._tokens[key]])
                     new.pos_tags = None if orig.pos_tags is None else np.array(
@@ -409,6 +475,7 @@ def retain_preprocessing(orig, new, key=...):
                 else:
                     raise TypeError('Indexing by type {} not supported.'.format(type(key)))
                 new._dictionary = orig._dictionary
+            new._titles = orig._titles[key]
             new_domain_metas = set(new.domain.metas)
             new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas]
             new.ngram_range = orig.ngram_range

diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
@@ -164,11 +164,67 @@ def test_titles(self):
             self.assertIn('Document ', title)
 
         # title feature set
-        c.domain[0].attributes['title'] = True
+        c.set_title_variable(c.domain[0])
         titles = c.titles
         self.assertEqual(len(titles), len(c))
-        for title in titles:
-            self.assertIn(title, c.domain.class_var.values)
+
+        # first 50 are children
+        for title, c in zip(titles[:50], range(1, 51)):
+            self.assertEqual(f"children ({c})", title)
+
+        # others are adults
+        for title, a in zip(titles[50:100], range(1, 51)):
+            self.assertEqual(f"adult ({a})", title)
+
+        # first 50 are children
+        for title, c in zip(titles[100:120], range(51, 71)):
+            self.assertEqual(f"children ({c})", title)
+
+        # others are adults
+        for title, a in zip(titles[120:140], range(51, 71)):
+            self.assertEqual(f"adult ({a})", title)
+
+    def test_titles_no_numbers(self):
+        """
+        A title that occurs only once must be left without a number.
+        """
+        corpus = Corpus.from_file('book-excerpts')
+        title_var = corpus.domain[0]
+        # add a new value "temp" and assign it (index 2) to the first
+        # document only
+        title_var.values.append("temp")
+        corpus.Y[0] = 2
+
+        corpus.set_title_variable(title_var)
+        titles = corpus.titles
+        self.assertEqual("temp", titles[0])
+        # titles that are still repeated keep their numbers
+        self.assertEqual("children (1)", titles[1])
+
+    def test_titles_read_document(self):
+        """
+        A corpus read from a file that already has a title column marked
+        must come with one title per document.
+        """
+        corpus = Corpus.from_file('election-tweets-2016')
+        n_documents = len(corpus)
+        n_titles = len(corpus.titles)
+
+        self.assertEqual(n_documents, n_titles)
+
+    def test_titles_sample(self):
+        """
+        Titles of a subsample must be the ones the rows had in the
+        original corpus, for every supported kind of row selection.
+        """
+        corpus = Corpus.from_file('book-excerpts')
+        corpus.set_title_variable(corpus.domain[0])
+
+        # (row selection, expected label, expected numbers)
+        cases = [
+            (slice(10, 20), "children", range(11, 21)),
+            (slice(60, 70), "adult", range(11, 21)),
+            ([10, 11, 12], "children", range(11, 14)),
+            (np.array([10, 11, 12]), "children", range(11, 14)),
+        ]
+        for selection, label, numbers in cases:
+            sample = corpus[selection]
+            for title, number in zip(sample.titles, numbers):
+                self.assertEqual(f"{label} ({number})", title)
 
     def test_documents_from_features(self):
         c = Corpus.from_file('book-excerpts')

diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
@@ -262,23 +262,14 @@ def remove_duplicates(l):
             if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
                 self.Error.no_text_features_used()
 
-            self._set_title_attribute()
+            self.corpus.set_title_variable(self.title_variable)
             # prevent sending "empty" corpora
             dom = self.corpus.domain
             empty = not (dom.variables or dom.metas) \
                 or len(self.corpus) == 0 \
                 or not self.corpus.text_features
             self.Outputs.corpus.send(self.corpus if not empty else None)
 
-    def _set_title_attribute(self):
-        # remove all title attributes
-        for a in self.corpus.domain.variables + self.corpus.domain.metas:
-            a.attributes.pop("title", None)
-
-        if self.title_variable and self.title_variable in self.corpus.domain:
-            self.corpus.domain[
-                self.title_variable].attributes["title"] = True
-
     def send_report(self):
         def describe(features):
             if len(features):