From d8760ac62dc3f547e562e36d4d91bdb76e696866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Fri, 27 Dec 2019 13:09:16 +0100 Subject: [PATCH] corpus: Store titles to be same in subsample. --- orangecontrib/text/corpus.py | 93 +++++++++++++++++++++---- orangecontrib/text/tests/test_corpus.py | 63 ++++++++++++++++- orangecontrib/text/widgets/owcorpus.py | 11 +-- 3 files changed, 141 insertions(+), 26 deletions(-) diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py index 0600170e6..3c329365b 100644 --- a/orangecontrib/text/corpus.py +++ b/orangecontrib/text/corpus.py @@ -1,7 +1,9 @@ import os +from collections import Counter, defaultdict from copy import copy from numbers import Integral from itertools import chain +from typing import Union, Optional, List import nltk import numpy as np @@ -9,7 +11,7 @@ from gensim import corpora from Orange.data import ContinuousVariable, DiscreteVariable, \ - Domain, RowInstance, Table + Domain, RowInstance, Table, StringVariable from orangecontrib.text.vectorization import BowVectorizer @@ -66,6 +68,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None, self.attributes = {} self.pos_tags = None self.used_preprocessor = None # required for compute values + self._titles: Optional[np.ndarray] = None if domain is not None and text_features is None: self._infer_text_features() @@ -76,6 +79,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None, self.ids = ids else: Table._init_ids(self) + self._set_unique_titles() def set_text_features(self, feats): """ @@ -95,6 +99,70 @@ def set_text_features(self, feats): self._infer_text_features() self._tokens = None # invalidate tokens + def set_title_variable( + self, title_variable: Union[StringVariable, str, None] + ) -> None: + """ + Set the title attribute. Only one column can be a title attribute. + + Parameters + ---------- + title_variable + Variable that needs to be set as a title variable.
If it is None, + do not set a variable. + """ + for a in self.domain.variables + self.domain.metas: + a.attributes.pop("title", None) + + if title_variable and title_variable in self.domain: + self.domain[title_variable].attributes["title"] = True + + self._set_unique_titles() + + def _set_unique_titles(self): + """ + Define self._titles variable as a list of titles (a title for each + document). It is used to have a unique title for each document. In + case a document has the same title as another document, we + put a number beside it. + """ + if self.domain is None: + return + attrs = [attr for attr in + chain(self.domain.variables, self.domain.metas) + if attr.attributes.get('title', False)] + + if attrs: + self._titles = np.array(self._unique_titles( + self.documents_from_features(attrs))) + else: + self._titles = np.array([ + 'Document {}'.format(i + 1) for i in range(len(self))]) + + @staticmethod + def _unique_titles(titles: List[str]) -> List[str]: + """ + Function adds numbers to the non-unique values of the title. + + Parameters + ---------- + titles + List of titles - not necessarily unique + + Returns + ------- + List with unique titles. + """ + counts = Counter(titles) + cur_appearances = defaultdict(int) + new_titles = [] + for t in titles: + if counts[t] > 1: + cur_appearances[t] += 1 + t += f" ({cur_appearances[t]})" + new_titles.append(t) + return new_titles + def _infer_text_features(self): """ Infer which text features to use. If nothing was provided @@ -137,6 +205,7 @@ def extend_corpus(self, metadata, Y): Table._init_ids(self) self._tokens = None # invalidate tokens + self._set_unique_titles() def extend_attributes(self, X, feature_names, feature_values=None, compute_values=None, var_attrs=None, sparse=False): @@ -195,13 +264,8 @@ def documents(self): @property def titles(self): """ Returns a list of titles.
""" - attrs = [attr for attr in chain(self.domain.variables, self.domain.metas) - if attr.attributes.get('title', False)] - - if attrs: - return self.documents_from_features(attrs) - else: - return ['Document {}'.format(i+1) for i in range(len(self))] + assert self._titles is not None + return self._titles.tolist() def documents_from_features(self, feats): """ @@ -211,8 +275,8 @@ def documents_from_features(self, feats): Returns: a list of strings constructed by joining feats. """ # create a Table where feats are in metas - data = Table(Domain([], [], [i.name for i in feats], - source=self.domain), self) + data = Table.from_table(Domain([], [], [i.name for i in feats], + source=self.domain), self) # When we use only features coming from sparse X data.metas is sparse. # Transform it to dense. @@ -304,6 +368,7 @@ def copy(self): c.pos_tags = self.pos_tags c.name = self.name c.used_preprocessor = self.used_preprocessor + c._titles = self._titles return c @staticmethod @@ -386,15 +451,16 @@ def from_file(cls, filename): filename = abs_path table = Table.from_file(filename) - return cls(table.domain, table.X, table.Y, table.metas, table.W) + corpus = cls(table.domain, table.X, table.Y, table.metas, table.W) + return corpus @staticmethod def retain_preprocessing(orig, new, key=...): """ Set preprocessing of 'new' object to match the 'orig' object. 
""" + if isinstance(orig, Corpus): + if isinstance(key, tuple): # get row selection + key = key[0] if orig._tokens is not None: # retain preprocessing - if isinstance(key, tuple): # get row selection - key = key[0] if isinstance(key, Integral): new._tokens = np.array([orig._tokens[key]]) new.pos_tags = None if orig.pos_tags is None else np.array( @@ -409,6 +475,7 @@ def retain_preprocessing(orig, new, key=...): else: raise TypeError('Indexing by type {} not supported.'.format(type(key))) new._dictionary = orig._dictionary + new._titles = orig._titles[key] new_domain_metas = set(new.domain.metas) new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas] new.ngram_range = orig.ngram_range diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py index edf6d05c3..8ff0709a3 100644 --- a/orangecontrib/text/tests/test_corpus.py +++ b/orangecontrib/text/tests/test_corpus.py @@ -164,11 +164,64 @@ def test_titles(self): self.assertIn('Document ', title) # title feature set - c.domain[0].attributes['title'] = True + c.set_title_variable(c.domain[0]) titles = c.titles self.assertEqual(len(titles), len(c)) - for title in titles: - self.assertIn(title, c.domain.class_var.values) + + # first 50 are children + for title, c in zip(titles[:50], range(1, 51)): + self.assertEqual(f"children ({c})", title) + + # next 50 are adults + for title, a in zip(titles[50:100], range(1, 51)): + self.assertEqual(f"adult ({a})", title) + + # next 20 are children again (numbering continues at 51) + for title, c in zip(titles[100:120], range(51, 71)): + self.assertEqual(f"children ({c})", title) + + # next 20 are adults again (numbering continues at 51) + for title, a in zip(titles[120:140], range(51, 71)): + self.assertEqual(f"adult ({a})", title) + + def test_titles_no_numbers(self): + """ + The case when no number is used since the title appears only once.
+ """ + c = Corpus.from_file('andersen') + c.set_title_variable(c.domain.metas[0]) + + # title feature set + self.assertEqual("The Little Match-Seller", c.titles[0]) + + def test_titles_read_document(self): + """ + When we read the document with a title marked it should have titles + set correctly. + """ + c = Corpus.from_file('election-tweets-2016') + + self.assertEqual(len(c), len(c.titles)) + + def test_titles_sample(self): + c = Corpus.from_file('book-excerpts') + c.set_title_variable(c.domain[0]) + + c_sample = c[10:20] + for title, i in zip(c_sample.titles, range(11, 21)): + self.assertEqual(f"children ({i})", title) + + c_sample = c[60:70] + for title, i in zip(c_sample.titles, range(11, 21)): + self.assertEqual(f"adult ({i})", title) + + c_sample = c[[10, 11, 12]] + for title, i in zip(c_sample.titles, range(11, 14)): + self.assertEqual(f"children ({i})", title) + + c_sample = c[np.array([10, 11, 12])] + for title, i in zip(c_sample.titles, range(11, 14)): + self.assertEqual(f"children ({i})", title) def test_documents_from_features(self): c = Corpus.from_file('book-excerpts') @@ -369,3 +422,7 @@ def test_corpus_remove_text_features(self): self.assertFalse(len(d.text_features)) # Make sure that copying works. 
d.copy() + + +if __name__ == "__main__": + unittest.main() diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py index 774a0227e..e7bab5e40 100644 --- a/orangecontrib/text/widgets/owcorpus.py +++ b/orangecontrib/text/widgets/owcorpus.py @@ -262,7 +262,7 @@ def remove_duplicates(l): if len(self.unused_attrs_model) > 0 and not self.corpus.text_features: self.Error.no_text_features_used() - self._set_title_attribute() + self.corpus.set_title_variable(self.title_variable) # prevent sending "empty" corpora dom = self.corpus.domain empty = not (dom.variables or dom.metas) \ @@ -270,15 +270,6 @@ def remove_duplicates(l): or not self.corpus.text_features self.Outputs.corpus.send(self.corpus if not empty else None) - def _set_title_attribute(self): - # remove all title attributes - for a in self.corpus.domain.variables + self.corpus.domain.metas: - a.attributes.pop("title", None) - - if self.title_variable and self.title_variable in self.corpus.domain: - self.corpus.domain[ - self.title_variable].attributes["title"] = True - def send_report(self): def describe(features): if len(features):