diff --git a/orangecontrib/text/tests/test_bowvectorizer.py b/orangecontrib/text/tests/test_bowvectorizer.py
index 0de51e7fe..0f1efe6ae 100644
--- a/orangecontrib/text/tests/test_bowvectorizer.py
+++ b/orangecontrib/text/tests/test_bowvectorizer.py
@@ -1,6 +1,7 @@
 import unittest
 
 import numpy as np
+from Orange.data import Domain, StringVariable
 
 from orangecontrib.text import preprocess
 from orangecontrib.text.corpus import Corpus
@@ -135,6 +136,107 @@ def tests_duplicated_names(self):
         # human
         self.assertIn("human", [v.name for v in out.domain.attributes[1:]])
 
+    def test_compute_values_same_tfidf_regardless_num_documents(self):
+        """
+        When computing TF-IDF from compute values TF-IDF should give same
+        results regardless of length of new corpus - IDF weighting should consider
+        only counts from original corpus.
+        """
+        corpus = Corpus.from_file('deerwester')
+        train_corpus = corpus[:5]
+        test_corpus = corpus[5:]
+        vect = BowVectorizer(wglobal=BowVectorizer.IDF)
+
+        bow = vect.transform(train_corpus)
+        computed1 = Corpus.from_table(bow.domain, test_corpus[1:])
+        computed2 = Corpus.from_table(bow.domain, test_corpus)
+
+        self.assertEqual(computed1.domain, computed2.domain)
+        self.assertEqual(bow.domain, computed2.domain)
+        self.assertEqual((computed1.X != computed2.X[1:]).nnz, 0)
+
+    # fmt: off
+    domain = Domain([], metas=[StringVariable("text")])
+    small_corpus_train = Corpus(
+        domain,
+        np.empty((4, 0)),
+        metas=np.array([
+            ["this is a nice day day"],
+            ["the day is nice"],
+            ["i love a beautiful day"],
+            ["this apple is mine"]
+        ])
+    )
+    # rows of the count matrices are documents; columns follow `terms`
+    terms = [
+        "this", "is", "a", "nice", "day", "the", "i", "love", "beautiful",
+        "apple", "mine"
+    ]
+    train_counts = np.array([
+        [1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
+        [0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
+        [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
+        [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]
+    ])
+    small_corpus_test = Corpus(
+        domain,
+        np.empty((3, 0)),
+        metas=np.array([
+            ["this is a nice day day"],
+            ["day nice summer mine"],
+            ["apple is cool"],
+        ])
+    )
+    test_counts = np.array([
+        [1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
+        [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
+    ])
+    # fmt: on
+
+    def assert_bow_same(self, corpus, values, terms):
+        """
+        Assert that `corpus` has exactly the attributes named in `terms`
+        and that the column of each term matches the respective column of
+        `values` (compared with a tolerance, since tf-idf values are floats).
+        """
+        self.assertSetEqual(set(terms), {a.name for a in corpus.domain.attributes})
+        for i, a in enumerate(terms):
+            np.testing.assert_allclose(
+                corpus.get_column_view(a)[0],
+                values[:, i],
+                err_msg=f"BOW differ for term {a}",
+            )
+
+    def test_count_correctness(self):
+        """Test if computed counts are correct for train and test dataset"""
+        bow = BowVectorizer().transform(self.small_corpus_train)
+        self.assert_bow_same(bow, self.train_counts, self.terms)
+
+        # computed from compute_values - result contains only terms from train dataset
+        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
+        self.assert_bow_same(bow_test, self.test_counts, self.terms)
+
+    def test_tfidf_correctness(self):
+        """
+        Test if computed tf-idf values are correct for train and test dataset.
+        When computing tf-idf on the test dataset (from compute values)
+        weights (idf) must be computed based on numbers on training dataset
+        """
+        bow = BowVectorizer(wglobal=BowVectorizer.IDF).transform(
+            self.small_corpus_train
+        )
+
+        document_appearance = (self.train_counts != 0).sum(0)
+        n = len(self.train_counts)
+        idfs_train = self.train_counts * np.log(n / document_appearance)
+        self.assert_bow_same(bow, idfs_train, self.terms)
+
+        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
+        # weights computed based on numbers from training dataset
+        idfs_test = self.test_counts * np.log(n / document_appearance)
+        self.assert_bow_same(bow_test, idfs_test, self.terms)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/orangecontrib/text/vectorization/bagofwords.py b/orangecontrib/text/vectorization/bagofwords.py
index a9ae061c2..972f021ad 100644
--- a/orangecontrib/text/vectorization/bagofwords.py
+++ b/orangecontrib/text/vectorization/bagofwords.py
@@ -72,11 +72,11 @@ def _transform(self, corpus, source_dict=None):
         temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
         dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
         temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
-        model = models.TfidfModel(temp_corpus, normalize=False,
+        model = models.TfidfModel(dictionary=dic, normalize=False,
                                   wlocal=self.wlocals[self.wlocal],
                                   wglobal=self.wglobals[self.wglobal])
 
-        X = matutils.corpus2csc(model[temp_corpus], dtype=np.float, num_terms=len(dic)).T
+        X = matutils.corpus2csc(model[temp_corpus], dtype=float, num_terms=len(dic)).T
         norm = self.norms[self.norm]
         if norm:
             X = norm(X)