From 4c33e45b4c86cc377a6c194047151b3cee533230 Mon Sep 17 00:00:00 2001 From: djukicn Date: Fri, 24 Jun 2022 15:26:25 +0200 Subject: [PATCH] Corpus: fix ngrams_corpus --- orangecontrib/text/corpus.py | 15 -------- .../text/tests/test_topic_modeling.py | 3 ++ orangecontrib/text/topics/topics.py | 34 ++++++++++++++++--- orangecontrib/text/vectorization/base.py | 2 -- orangecontrib/text/widgets/owtopicmodeling.py | 4 +-- 5 files changed, 35 insertions(+), 23 deletions(-) diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py index 6aedc48bf..982d4676d 100644 --- a/orangecontrib/text/corpus.py +++ b/orangecontrib/text/corpus.py @@ -22,7 +22,6 @@ ) from Orange.preprocess.transformation import Identity from Orange.data.util import get_unique_names -from orangecontrib.text.vectorization import BowVectorizer try: from orangewidget.utils.signals import summarize, PartialSummary @@ -86,7 +85,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None: self.text_features = [] # list of text features for mining self._tokens = None self._dictionary = None - self._ngrams_corpus = None self.ngram_range = (1, 1) self.attributes = {} self._pos_tags = None @@ -459,16 +457,6 @@ def ngrams_iterator(self, join_with=' ', include_postags=False): for n in range(self.ngram_range[0], self.ngram_range[1]+1)))) for doc in data) - @property - def ngrams_corpus(self): - if self._ngrams_corpus is None: - return BowVectorizer().transform(self).ngrams_corpus - return self._ngrams_corpus - - @ngrams_corpus.setter - def ngrams_corpus(self, value): - self._ngrams_corpus = value - @property def ngrams(self): """generator: Ngram representations of documents.""" @@ -487,7 +475,6 @@ def copy(self): c.used_preprocessor = self.used_preprocessor c._titles = self._titles c._pp_documents = self._pp_documents - c._ngrams_corpus = self._ngrams_corpus return c @staticmethod @@ -651,8 +638,6 @@ def retain_preprocessing(orig, new, key=...): new.ngram_range = orig.ngram_range new.attributes = orig.attributes new.used_preprocessor = orig.used_preprocessor - if orig._ngrams_corpus is not None: - new.ngrams_corpus = orig._ngrams_corpus[key] else: # orig is not Corpus new._set_unique_titles() new._infer_text_features() diff --git a/orangecontrib/text/tests/test_topic_modeling.py b/orangecontrib/text/tests/test_topic_modeling.py index 4dca59f41..309697bc0 100644 --- a/orangecontrib/text/tests/test_topic_modeling.py +++ b/orangecontrib/text/tests/test_topic_modeling.py @@ -57,6 +57,9 @@ def test_empty_corpus(self): empty = p(self.corpus) self.assertIsNone(self.model.fit(empty)) + def test_slice(self): + self.model.fit(self.corpus[:len(self.corpus) // 2]) + def test_get_top_words(self): self.model.fit(self.corpus) self.assertRaises(ValueError, self.model.get_topics_table_by_id, 1000) diff --git a/orangecontrib/text/topics/topics.py b/orangecontrib/text/topics/topics.py index 57cfad317..a4e0cd48a 100644 --- a/orangecontrib/text/topics/topics.py +++ b/orangecontrib/text/topics/topics.py @@ -13,6 +13,8 @@ from orangecontrib.text.corpus import Corpus from orangecontrib.text.util import chunkable +from gensim.matutils import Sparse2Corpus +from orangecontrib.text.vectorization import BowVectorizer MAX_WORDS = 1000 @@ -57,6 +59,30 @@ def get_value(self, model, *args, **kwargs): return self.epochs / model.passes +def infer_ngrams_corpus(corpus, return_dict=False): + + bow_features = [ + (i, attribute.name) for i, attribute in enumerate(corpus.domain.attributes) + if 'bow-feature' in attribute.attributes + ] + if len(bow_features) == 0: + corpus = BowVectorizer().transform(corpus) + bow_features = [ + (i, attribute.name) for i, attribute in enumerate(corpus.domain.attributes) + if 'bow-feature' in attribute.attributes + ] + + feature_presence = corpus.X.sum(axis=0) + keep = [(i, a) for i, a in bow_features if feature_presence[0, i] > 0] + # sort features by the order in the dictionary + dictionary = Dictionary(corpus.ngrams_iterator(include_postags=True), prune_at=None) + idx_of_keep = np.argsort([dictionary.token2id[a] for _, a in keep]) + keep = [keep[i][0] for i in idx_of_keep] + result = Sparse2Corpus(corpus.X[:, keep].T) + + return (result, dictionary) if return_dict else result + + class GensimWrapper: name = NotImplemented Model = NotImplemented @@ -96,11 +122,11 @@ def fit(self, corpus, on_progress=dummy_callback, **kwargs): model_kwars, callbacks=[GensimProgressCallback(on_progress)] ) - id2word = Dictionary(corpus.ngrams_iterator(include_postags=True), prune_at=None) + ngrams_corpus, dictionary = infer_ngrams_corpus(corpus, return_dict=True) self.model = self.Model( - corpus=corpus.ngrams_corpus, id2word=id2word, **model_kwars + corpus=ngrams_corpus, id2word=dictionary, **model_kwars ) - self.n_words = len(corpus.dictionary) + self.n_words = ngrams_corpus.sparse.shape[0] self.topic_names = ['Topic {}'.format(i+1) for i in range(self.num_topics)] def dummy_method(self, *args, **kwargs): @@ -129,7 +155,7 @@ def update(self, documents): def transform(self, corpus): """ Create a table with topics representation. """ - topics = self.model[corpus.ngrams_corpus] + topics = self.model[infer_ngrams_corpus(corpus)] self.actual_topics = self.model.get_topics().shape[0] matrix = matutils.corpus2dense( topics, num_docs=len(corpus), num_terms=self.num_topics diff --git a/orangecontrib/text/vectorization/base.py b/orangecontrib/text/vectorization/base.py index 5821356c5..76ba44037 100644 --- a/orangecontrib/text/vectorization/base.py +++ b/orangecontrib/text/vectorization/base.py @@ -2,7 +2,6 @@ from Orange.data.util import SharedComputeValue from Orange.util import dummy_callback -from orangecontrib.text.util import Sparse2CorpusSliceable class BaseVectorizer: @@ -44,7 +43,6 @@ def add_features(corpus, X, dictionary, compute_values=None, var_attrs=None): sparse=True, rename_existing=True ) - corpus.ngrams_corpus = Sparse2CorpusSliceable(X.T) return corpus diff --git a/orangecontrib/text/widgets/owtopicmodeling.py b/orangecontrib/text/widgets/owtopicmodeling.py index 5e27587a2..06e632ef0 100644 --- a/orangecontrib/text/widgets/owtopicmodeling.py +++ b/orangecontrib/text/widgets/owtopicmodeling.py @@ -21,7 +21,7 @@ from orangecontrib.text.corpus import Corpus from orangecontrib.text.topics import Topic, Topics, LdaWrapper, HdpWrapper, \ LsiWrapper, NmfWrapper -from orangecontrib.text.topics.topics import GensimWrapper +from orangecontrib.text.topics.topics import GensimWrapper, infer_ngrams_corpus class TopicWidget(gui.OWComponent, QGroupBox): @@ -262,7 +262,7 @@ def on_done(self, corpus): self.Warning.less_topics_found() if self.model.name == "Latent Dirichlet Allocation": - bound = self.model.model.log_perplexity(corpus.ngrams_corpus) + bound = self.model.model.log_perplexity(infer_ngrams_corpus(corpus)) self.perplexity = "{:.5f}".format(np.exp2(-bound)) cm = CoherenceModel( model=self.model.model, texts=corpus.tokens, corpus=corpus, coherence="c_v"