Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Corpus: Store titles in corpus to match between Corpus Viewer even when subsampled #490

Merged
merged 1 commit into from
Jan 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 80 additions & 13 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import os
from collections import Counter, defaultdict
from copy import copy
from numbers import Integral
from itertools import chain
from typing import Union, Optional, List

import nltk
import numpy as np
import scipy.sparse as sp
from gensim import corpora

from Orange.data import ContinuousVariable, DiscreteVariable, \
Domain, RowInstance, Table
Domain, RowInstance, Table, StringVariable
from orangecontrib.text.vectorization import BowVectorizer


Expand Down Expand Up @@ -66,6 +68,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
self.attributes = {}
self.pos_tags = None
self.used_preprocessor = None # required for compute values
self._titles: Optional[np.ndarray] = None

if domain is not None and text_features is None:
self._infer_text_features()
Expand All @@ -76,6 +79,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
self.ids = ids
else:
Table._init_ids(self)
self._set_unique_titles()

def set_text_features(self, feats):
"""
Expand All @@ -95,6 +99,70 @@ def set_text_features(self, feats):
self._infer_text_features()
self._tokens = None # invalidate tokens

def set_title_variable(
        self, title_variable: Union[StringVariable, str, None]
) -> None:
    """
    Mark one variable of the domain as the source of document titles.

    The ``title`` flag is first removed from every variable and meta of
    the domain, so at most one column carries it afterwards. Finally
    ``_set_unique_titles`` is called to refresh the stored titles.

    Parameters
    ----------
    title_variable
        Variable (or its name) that needs to be set as the title
        variable. If it is None (or otherwise falsy), or it is not in
        the domain, no variable is flagged.
    """
    domain = self.domain

    # Clear the flag everywhere before (optionally) setting it again.
    for var in domain.variables + domain.metas:
        var.attributes.pop("title", None)

    if title_variable and title_variable in domain:
        chosen = domain[title_variable]
        chosen.attributes["title"] = True

    self._set_unique_titles()

def _set_unique_titles(self):
    """
    Fill ``self._titles`` with one title per document.

    Does nothing when the corpus has no domain. When some variable or
    meta is flagged with the ``title`` attribute, the titles are built
    from those columns and passed through ``_unique_titles`` so that
    repeated titles get a number beside them. Otherwise the documents
    are named ``Document 1``, ``Document 2``, ...
    """
    domain = self.domain
    if domain is None:
        return

    title_vars = []
    for var in chain(domain.variables, domain.metas):
        if var.attributes.get('title', False):
            title_vars.append(var)

    if not title_vars:
        # No title column: fall back to 1-based generic names.
        n_docs = len(self)
        self._titles = np.array([
            'Document {}'.format(number)
            for number in range(1, n_docs + 1)])
        return

    raw_titles = self.documents_from_features(title_vars)
    self._titles = np.array(self._unique_titles(raw_titles))

@staticmethod
def _unique_titles(titles: List[str]) -> List[str]:
    """
    Make titles unique by numbering the values that repeat.

    A title that occurs only once is kept verbatim. Every occurrence of
    a repeated title gets a running number appended, e.g. ``"a (1)"``,
    ``"a (2)"``. A number is skipped when the resulting string is
    already taken by another document, so the returned titles are
    always pairwise distinct.

    Parameters
    ----------
    titles
        List of titles - not necessarily unique.

    Returns
    -------
    List with unique titles, in the same order as the input.
    """
    counts = Counter(titles)
    cur_appearances = defaultdict(int)
    # Titles occurring once are emitted unchanged, so reserve them up
    # front: a generated "title (n)" must never duplicate one of them.
    used = {title for title, count in counts.items() if count == 1}
    new_titles = []
    for title in titles:
        if counts[title] > 1:
            while True:
                cur_appearances[title] += 1
                candidate = f"{title} ({cur_appearances[title]})"
                if candidate not in used:
                    break
            used.add(candidate)
            title = candidate
        new_titles.append(title)
    return new_titles

def _infer_text_features(self):
"""
Infer which text features to use. If nothing was provided
Expand Down Expand Up @@ -137,6 +205,7 @@ def extend_corpus(self, metadata, Y):
Table._init_ids(self)

self._tokens = None # invalidate tokens
self._set_unique_titles()

def extend_attributes(self, X, feature_names, feature_values=None,
compute_values=None, var_attrs=None, sparse=False):
Expand Down Expand Up @@ -195,13 +264,8 @@ def documents(self):
@property
def titles(self):
""" Returns a list of titles. """
attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
if attr.attributes.get('title', False)]

if attrs:
return self.documents_from_features(attrs)
else:
return ['Document {}'.format(i+1) for i in range(len(self))]
assert self._titles is not None
return self._titles.tolist()

def documents_from_features(self, feats):
"""
Expand All @@ -211,8 +275,8 @@ def documents_from_features(self, feats):
Returns: a list of strings constructed by joining feats.
"""
# create a Table where feats are in metas
data = Table(Domain([], [], [i.name for i in feats],
source=self.domain), self)
data = Table.from_table(Domain([], [], [i.name for i in feats],
source=self.domain), self)

# When we use only features coming from sparse X data.metas is sparse.
# Transform it to dense.
Expand Down Expand Up @@ -304,6 +368,7 @@ def copy(self):
c.pos_tags = self.pos_tags
c.name = self.name
c.used_preprocessor = self.used_preprocessor
c._titles = self._titles
return c

@staticmethod
Expand Down Expand Up @@ -386,15 +451,16 @@ def from_file(cls, filename):
filename = abs_path

table = Table.from_file(filename)
return cls(table.domain, table.X, table.Y, table.metas, table.W)
corpus = cls(table.domain, table.X, table.Y, table.metas, table.W)
return corpus

@staticmethod
def retain_preprocessing(orig, new, key=...):
""" Set preprocessing of 'new' object to match the 'orig' object. """
if isinstance(orig, Corpus):
if isinstance(key, tuple): # get row selection
key = key[0]
if orig._tokens is not None: # retain preprocessing
if isinstance(key, tuple): # get row selection
key = key[0]
if isinstance(key, Integral):
new._tokens = np.array([orig._tokens[key]])
new.pos_tags = None if orig.pos_tags is None else np.array(
Expand All @@ -409,6 +475,7 @@ def retain_preprocessing(orig, new, key=...):
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary
new._titles = orig._titles[key]
new_domain_metas = set(new.domain.metas)
new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas]
new.ngram_range = orig.ngram_range
Expand Down
63 changes: 60 additions & 3 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,64 @@ def test_titles(self):
self.assertIn('Document ', title)

# title feature set
c.domain[0].attributes['title'] = True
c.set_title_variable(c.domain[0])
titles = c.titles
self.assertEqual(len(titles), len(c))
for title in titles:
self.assertIn(title, c.domain.class_var.values)

# first 50 are children
for title, c in zip(titles[:50], range(1, 51)):
self.assertEqual(f"children ({c})", title)

# others are adults
for title, a in zip(titles[50:100], range(1, 51)):
self.assertEqual(f"adult ({a})", title)

# next 20 are children (numbered 51-70)
for title, c in zip(titles[100:120], range(51, 71)):
self.assertEqual(f"children ({c})", title)

# next 20 are adults (numbered 51-70)
for title, a in zip(titles[120:140], range(51, 71)):
self.assertEqual(f"adult ({a})", title)

def test_titles_no_numbers(self):
    """
    A title that appears only once must not get a number beside it.
    """
    corpus = Corpus.from_file('andersen')
    title_var = corpus.domain.metas[0]
    corpus.set_title_variable(title_var)

    # With the title feature set, the first title is kept verbatim.
    first_title = corpus.titles[0]
    self.assertEqual("The Little Match-Seller", first_title)

def test_titles_read_document(self):
    """
    When a corpus with a marked title is read from a file, its titles
    should be set correctly: there must be one title per document.
    """
    corpus = Corpus.from_file('election-tweets-2016')
    titles = corpus.titles

    self.assertEqual(len(corpus), len(titles))

def test_titles_sample(self):
    """
    Titles of a subsampled corpus must match those of the full corpus.

    Rows are selected with a slice, a list and a numpy array; in each
    case the sampled titles must keep the numbers they had before
    sampling.
    """
    corpus = Corpus.from_file('book-excerpts')
    corpus.set_title_variable(corpus.domain[0])

    # (row selection, titles expected for the selected rows)
    expectations = [
        (slice(10, 20), [f"children ({i})" for i in range(11, 21)]),
        (slice(60, 70), [f"adult ({i})" for i in range(11, 21)]),
        ([10, 11, 12], [f"children ({i})" for i in range(11, 14)]),
        (np.array([10, 11, 12]),
         [f"children ({i})" for i in range(11, 14)]),
    ]
    for key, expected in expectations:
        sample_titles = corpus[key].titles
        for title, wanted in zip(sample_titles, expected):
            self.assertEqual(wanted, title)

def test_documents_from_features(self):
c = Corpus.from_file('book-excerpts')
Expand Down Expand Up @@ -369,3 +422,7 @@ def test_corpus_remove_text_features(self):
self.assertFalse(len(d.text_features))
# Make sure that copying works.
d.copy()


if __name__ == "__main__":
unittest.main()
11 changes: 1 addition & 10 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,23 +262,14 @@ def remove_duplicates(l):
if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
self.Error.no_text_features_used()

self._set_title_attribute()
self.corpus.set_title_variable(self.title_variable)
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
or len(self.corpus) == 0 \
or not self.corpus.text_features
self.Outputs.corpus.send(self.corpus if not empty else None)

def _set_title_attribute(self):
    """
    Flag ``self.title_variable`` as the only title column of the corpus.

    The ``title`` attribute is removed from every variable and meta of
    the corpus domain, then set on the selected variable if one is
    chosen and it is part of the domain.
    """
    domain = self.corpus.domain

    # remove all title attributes
    for var in domain.variables + domain.metas:
        var.attributes.pop("title", None)

    chosen = self.title_variable
    if chosen and chosen in domain:
        domain[chosen].attributes["title"] = True

def send_report(self):
def describe(features):
if len(features):
Expand Down