Skip to content

Commit

Permalink
corpus: Store titles to be same in subsample.
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 27, 2019
1 parent b6c23ec commit 8049f7a
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 26 deletions.
93 changes: 80 additions & 13 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import os
from collections import Counter, defaultdict
from copy import copy
from numbers import Integral
from itertools import chain
from typing import Union, Optional, List

import nltk
import numpy as np
import scipy.sparse as sp
from gensim import corpora

from Orange.data import ContinuousVariable, DiscreteVariable, \
Domain, RowInstance, Table
Domain, RowInstance, Table, StringVariable
from orangecontrib.text.vectorization import BowVectorizer


Expand Down Expand Up @@ -66,6 +68,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
self.attributes = {}
self.pos_tags = None
self.used_preprocessor = None # required for compute values
self._titles: Optional[np.ndarray] = None

if domain is not None and text_features is None:
self._infer_text_features()
Expand All @@ -76,6 +79,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
self.ids = ids
else:
Table._init_ids(self)
self._set_unique_titles()

def set_text_features(self, feats):
"""
Expand All @@ -95,6 +99,70 @@ def set_text_features(self, feats):
self._infer_text_features()
self._tokens = None # invalidate tokens

def set_title_variable(
        self, title_variable: Union[StringVariable, str, None]
) -> None:
    """
    Flag one column as the title attribute and rebuild the stored titles.

    The "title" flag is first removed from every variable and meta of the
    domain, so at most one column carries it afterwards.

    Parameters
    ----------
    title_variable
        Variable (or its name) to flag as the title variable. If it is
        None, or it is not part of the domain, no column is flagged.
    """
    domain = self.domain

    # Clear the flag everywhere before (possibly) setting it on one column.
    for column in chain(domain.variables, domain.metas):
        column.attributes.pop("title", None)

    if title_variable and title_variable in domain:
        title_column = domain[title_variable]
        title_column.attributes["title"] = True

    # Stored titles depend on the flag, so they are recomputed on every call.
    self._set_unique_titles()

def _set_unique_titles(self):
    """
    Fill self._titles with one title per document, as a numpy array.

    When some column of the domain is flagged with the "title" attribute,
    the titles are built from the flagged columns and passed through
    self._unique_titles so that repeated titles get a number beside them.
    Otherwise the documents are titled "Document 1", "Document 2", ...
    Nothing is done when the corpus has no domain.
    """
    domain = self.domain
    if domain is None:
        return

    title_attrs = []
    for attr in chain(domain.variables, domain.metas):
        if attr.attributes.get('title', False):
            title_attrs.append(attr)

    if not title_attrs:
        # No title column: fall back to 1-based generic names.
        self._titles = np.array([
            'Document {}'.format(number)
            for number in range(1, len(self) + 1)])
        return

    raw_titles = self.documents_from_features(title_attrs)
    self._titles = np.array(self._unique_titles(raw_titles))

@staticmethod
def _unique_titles(titles: List[str]) -> List[str]:
    """
    Make every title distinct by numbering the repeated ones.

    A title that occurs once is returned unchanged. A title that occurs
    several times gets " (k)" appended, where k counts its appearances
    starting at 1, in input order. A number is skipped when the numbered
    title would equal a title that occurs exactly once in the input;
    without that, the result could still contain duplicates.

    Parameters
    ----------
    titles
        List of titles - not necessarily unique.

    Returns
    -------
    List with unique titles, same length and order as the input.
    """
    counts = Counter(titles)
    # Titles that occur once are kept verbatim, so generated names must
    # not reuse them. Generated names cannot clash with each other: the
    # base title and the number can be read back from the final " (k)".
    reserved = {title for title, count in counts.items() if count == 1}
    cur_appearances = defaultdict(int)
    new_titles = []
    for title in titles:
        if counts[title] > 1:
            while True:
                cur_appearances[title] += 1
                candidate = f"{title} ({cur_appearances[title]})"
                if candidate not in reserved:
                    break
            title = candidate
        new_titles.append(title)
    return new_titles

def _infer_text_features(self):
"""
Infer which text features to use. If nothing was provided
Expand Down Expand Up @@ -137,6 +205,7 @@ def extend_corpus(self, metadata, Y):
Table._init_ids(self)

self._tokens = None # invalidate tokens
self._set_unique_titles()

def extend_attributes(self, X, feature_names, feature_values=None,
compute_values=None, var_attrs=None, sparse=False):
Expand Down Expand Up @@ -195,13 +264,8 @@ def documents(self):
@property
def titles(self):
""" Returns a list of titles. """
attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
if attr.attributes.get('title', False)]

if attrs:
return self.documents_from_features(attrs)
else:
return ['Document {}'.format(i+1) for i in range(len(self))]
assert self._titles is not None
return self._titles.tolist()

def documents_from_features(self, feats):
"""
Expand All @@ -211,8 +275,8 @@ def documents_from_features(self, feats):
Returns: a list of strings constructed by joining feats.
"""
# create a Table where feats are in metas
data = Table(Domain([], [], [i.name for i in feats],
source=self.domain), self)
data = Table.from_table(Domain([], [], [i.name for i in feats],
source=self.domain), self)

# When we use only features coming from sparse X data.metas is sparse.
# Transform it to dense.
Expand Down Expand Up @@ -304,6 +368,7 @@ def copy(self):
c.pos_tags = self.pos_tags
c.name = self.name
c.used_preprocessor = self.used_preprocessor
c._titles = self._titles
return c

@staticmethod
Expand Down Expand Up @@ -386,15 +451,16 @@ def from_file(cls, filename):
filename = abs_path

table = Table.from_file(filename)
return cls(table.domain, table.X, table.Y, table.metas, table.W)
corpus = cls(table.domain, table.X, table.Y, table.metas, table.W)
return corpus

@staticmethod
def retain_preprocessing(orig, new, key=...):
""" Set preprocessing of 'new' object to match the 'orig' object. """
if isinstance(orig, Corpus):
if isinstance(key, tuple): # get row selection
key = key[0]
if orig._tokens is not None: # retain preprocessing
if isinstance(key, tuple): # get row selection
key = key[0]
if isinstance(key, Integral):
new._tokens = np.array([orig._tokens[key]])
new.pos_tags = None if orig.pos_tags is None else np.array(
Expand All @@ -409,6 +475,7 @@ def retain_preprocessing(orig, new, key=...):
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary
new._titles = orig._titles[key]
new_domain_metas = set(new.domain.metas)
new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas]
new.ngram_range = orig.ngram_range
Expand Down
62 changes: 59 additions & 3 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,67 @@ def test_titles(self):
self.assertIn('Document ', title)

# title feature set
c.domain[0].attributes['title'] = True
c.set_title_variable(c.domain[0])
titles = c.titles
self.assertEqual(len(titles), len(c))
for title in titles:
self.assertIn(title, c.domain.class_var.values)

# first 50 are children
for title, c in zip(titles[:50], range(1, 51)):
self.assertEqual(f"children ({c})", title)

# others are adults
for title, a in zip(titles[50:100], range(1, 51)):
self.assertEqual(f"adult ({a})", title)

# next 20 are children (numbering continues at 51)
for title, c in zip(titles[100:120], range(51, 71)):
self.assertEqual(f"children ({c})", title)

# others are adults
for title, a in zip(titles[120:140], range(51, 71)):
self.assertEqual(f"adult ({a})", title)

def test_titles_no_numbers(self):
    """
    A title that appears only once must not get a number appended.
    """
    corpus = Corpus.from_file('book-excerpts')
    title_var = corpus.domain[0]
    # Add a new value and assign it to the first document only.
    title_var.values.append("temp")
    corpus.Y[0] = 2

    # title feature set
    corpus.set_title_variable(title_var)
    titles = corpus.titles
    self.assertEqual("temp", titles[0])
    self.assertEqual("children (1)", titles[1])

def test_titles_read_document(self):
    """
    A corpus read from a file should have its titles set: there must be
    exactly one title per document.
    """
    corpus = Corpus.from_file('election-tweets-2016')
    n_documents = len(corpus)

    self.assertEqual(n_documents, len(corpus.titles))

def test_titles_sample(self):
    """
    Rows selected from a corpus keep the titles (including the numbers)
    they had in the full corpus, for slice, list and array indexing.
    """
    corpus = Corpus.from_file('book-excerpts')
    corpus.set_title_variable(corpus.domain[0])

    children = [f"children ({i})" for i in range(11, 21)]
    adults = [f"adult ({i})" for i in range(11, 21)]

    # (row selection, titles expected for the selected rows)
    cases = (
        (slice(10, 20), children),
        (slice(60, 70), adults),
        ([10, 11, 12], children[:3]),
        (np.array([10, 11, 12]), children[:3]),
    )
    for key, expected_titles in cases:
        sample = corpus[key]
        for title, expected in zip(sample.titles, expected_titles):
            self.assertEqual(expected, title)

def test_documents_from_features(self):
c = Corpus.from_file('book-excerpts')
Expand Down
11 changes: 1 addition & 10 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,23 +262,14 @@ def remove_duplicates(l):
if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
self.Error.no_text_features_used()

self._set_title_attribute()
self.corpus.set_title_variable(self.title_variable)
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
or len(self.corpus) == 0 \
or not self.corpus.text_features
self.Outputs.corpus.send(self.corpus if not empty else None)

def _set_title_attribute(self):
# remove all title attributes
for a in self.corpus.domain.variables + self.corpus.domain.metas:
a.attributes.pop("title", None)

if self.title_variable and self.title_variable in self.corpus.domain:
self.corpus.domain[
self.title_variable].attributes["title"] = True

def send_report(self):
def describe(features):
if len(features):
Expand Down

0 comments on commit 8049f7a

Please sign in to comment.