Skip to content

Commit

Permalink
BoW: use training weights on test data
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Nov 18, 2021
1 parent b99a1c8 commit 134976d
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 2 deletions.
96 changes: 96 additions & 0 deletions orangecontrib/text/tests/test_bowvectorizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest

import numpy as np
from Orange.data import Domain, StringVariable

from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
Expand Down Expand Up @@ -135,6 +136,101 @@ def tests_duplicated_names(self):
# human
self.assertIn("human", [v.name for v in out.domain.attributes[1:]])

def test_compute_values_same_tfidf_regardless_num_documents(self):
    """
    TF-IDF obtained through compute values must not depend on how many
    documents the new corpus contains: the IDF weighting should consider
    only the counts from the corpus the vectorizer was originally applied to.
    """
    documents = Corpus.from_file('deerwester')
    train_part, test_part = documents[:5], documents[5:]

    vectorizer = BowVectorizer(wglobal=BowVectorizer.IDF)
    train_bow = vectorizer.transform(train_part)

    # map the test documents into the train domain twice: once without the
    # first document and once in full
    without_first = Corpus.from_table(train_bow.domain, test_part[1:])
    complete = Corpus.from_table(train_bow.domain, test_part)

    self.assertEqual(without_first.domain, complete.domain)
    self.assertEqual(train_bow.domain, complete.domain)
    # rows present in both results must hold identical values
    mismatches = (without_first.X != complete.X[1:]).nnz
    self.assertEqual(mismatches, 0)

# fmt: off
# Shared fixtures for test_count_correctness and test_tfidf_correctness.
# The domain has no attributes; the only column is a string meta "text".
domain = Domain([], metas=[StringVariable("text")])
# Training corpus: four documents, X has 4 rows and 0 columns.
small_corpus_train = Corpus(
domain,
np.empty((4, 0)),
metas=np.array([
["this is a nice day day"],
["the day is nice"],
["i love a beautiful day"],
["this apple is mine"]
])
)
# Expected feature names. Column i of train_counts and test_counts holds the
# values for terms[i] (this is how assert_bow_same indexes them).
terms = [
"this", "is", "a", "nice", "day", "the", "i", "love", "beautiful",
"apple", "mine"
]
# Expected raw counts: one row per training document, one column per term.
train_counts = np.array([
[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]
])
# Test corpus: three documents, X has 3 rows and 0 columns.
small_corpus_test = Corpus(
domain,
np.empty((3, 0)),
metas=np.array([
["this is a nice day day"],
["day nice summer mine"],
["apple is cool"],
])
)
# Expected raw counts for the test documents over the same term columns.
# "summer" and "cool" are not listed in `terms`, so they have no column here.
test_counts = np.array([
[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
])
# fmt: on

def assert_bow_same(self, corpus, values, terms):
    """
    Assert that the bag-of-words columns of `corpus` match `values`.

    :param corpus: object whose ``domain.attributes`` name the features and
        whose ``get_column_view(name)[0]`` yields the column for a feature
    :param values: 2D array of expected values; column ``i`` belongs to
        ``terms[i]``
    :param terms: expected feature names, in the column order of `values`
    :raises AssertionError: if the feature names differ from `terms`, or if
        any column deviates from the expected values beyond round-off
    """
    self.assertSetEqual(set(terms), {a.name for a in corpus.domain.attributes})
    for i, term in enumerate(terms):
        # Weighted values (e.g. counts * log(n / df)) are floats, so compare
        # with a tolerance: exact list equality would fail on harmless
        # floating point round-off.
        np.testing.assert_allclose(
            corpus.get_column_view(term)[0],
            values[:, i],
            rtol=1e-9,
            atol=1e-12,
            err_msg=f"BOW differ for term {term}",
        )

def test_count_correctness(self):
    """
    Raw term counts must be correct for the train corpus and for the test
    corpus transformed into the train domain.
    """
    vectorizer = BowVectorizer()
    train_bow = vectorizer.transform(self.small_corpus_train)
    self.assert_bow_same(train_bow, self.train_counts, self.terms)

    # going through compute values keeps only the terms of the train dataset
    test_bow = Corpus.from_table(train_bow.domain, self.small_corpus_test)
    self.assert_bow_same(test_bow, self.test_counts, self.terms)

def test_tfidf_correctness(self):
    """
    Check that computed TF-IDF values are correct for the train and the
    test dataset.

    When TF-IDF is computed on the test dataset (through compute values),
    the IDF weights must be based on the numbers from the training dataset.
    """
    vectorizer = BowVectorizer(wglobal=BowVectorizer.IDF)
    train_bow = vectorizer.transform(self.small_corpus_train)

    # expected IDF weight per term: log(N / number of train docs with term)
    n_train_docs = len(self.train_counts)
    docs_with_term = (self.train_counts != 0).sum(0)
    idf_weights = np.log(n_train_docs / docs_with_term)

    expected_train = self.train_counts * idf_weights
    self.assert_bow_same(train_bow, expected_train, self.terms)

    test_bow = Corpus.from_table(train_bow.domain, self.small_corpus_test)
    # the same train-based weights are applied to the test counts
    expected_test = self.test_counts * idf_weights
    self.assert_bow_same(test_bow, expected_test, self.terms)


if __name__ == "__main__":
unittest.main()
4 changes: 2 additions & 2 deletions orangecontrib/text/vectorization/bagofwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,11 @@ def _transform(self, corpus, source_dict=None):
temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
model = models.TfidfModel(temp_corpus, normalize=False,
model = models.TfidfModel(dictionary=dic, normalize=False,
wlocal=self.wlocals[self.wlocal],
wglobal=self.wglobals[self.wglobal])

X = matutils.corpus2csc(model[temp_corpus], dtype=np.float, num_terms=len(dic)).T
X = matutils.corpus2csc(model[temp_corpus], dtype=float, num_terms=len(dic)).T
norm = self.norms[self.norm]
if norm:
X = norm(X)
Expand Down

0 comments on commit 134976d

Please sign in to comment.