Skip to content

Commit

Permalink
Corpus: fix ngrams_corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
djukicn committed Jul 8, 2022
1 parent 7f3baed commit 4c33e45
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 23 deletions.
15 changes: 0 additions & 15 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
)
from Orange.preprocess.transformation import Identity
from Orange.data.util import get_unique_names
from orangecontrib.text.vectorization import BowVectorizer

try:
from orangewidget.utils.signals import summarize, PartialSummary
Expand Down Expand Up @@ -86,7 +85,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
self.text_features = [] # list of text features for mining
self._tokens = None
self._dictionary = None
self._ngrams_corpus = None
self.ngram_range = (1, 1)
self.attributes = {}
self._pos_tags = None
Expand Down Expand Up @@ -459,16 +457,6 @@ def ngrams_iterator(self, join_with=' ', include_postags=False):
for n in range(self.ngram_range[0], self.ngram_range[1]+1))))
for doc in data)

@property
def ngrams_corpus(self):
if self._ngrams_corpus is None:
return BowVectorizer().transform(self).ngrams_corpus
return self._ngrams_corpus

@ngrams_corpus.setter
def ngrams_corpus(self, value):
self._ngrams_corpus = value

@property
def ngrams(self):
"""generator: Ngram representations of documents."""
Expand All @@ -487,7 +475,6 @@ def copy(self):
c.used_preprocessor = self.used_preprocessor
c._titles = self._titles
c._pp_documents = self._pp_documents
c._ngrams_corpus = self._ngrams_corpus
return c

@staticmethod
Expand Down Expand Up @@ -651,8 +638,6 @@ def retain_preprocessing(orig, new, key=...):
new.ngram_range = orig.ngram_range
new.attributes = orig.attributes
new.used_preprocessor = orig.used_preprocessor
if orig._ngrams_corpus is not None:
new.ngrams_corpus = orig._ngrams_corpus[key]
else: # orig is not Corpus
new._set_unique_titles()
new._infer_text_features()
Expand Down
3 changes: 3 additions & 0 deletions orangecontrib/text/tests/test_topic_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def test_empty_corpus(self):
empty = p(self.corpus)
self.assertIsNone(self.model.fit(empty))

def test_slice(self):
    # Smoke test: fit the model on the first half of the corpus. There is no
    # assertion; the test passes as long as fitting a sliced corpus does
    # not raise.
    self.model.fit(self.corpus[:len(self.corpus) // 2])

def test_get_top_words(self):
self.model.fit(self.corpus)
self.assertRaises(ValueError, self.model.get_topics_table_by_id, 1000)
Expand Down
34 changes: 30 additions & 4 deletions orangecontrib/text/topics/topics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.util import chunkable
from gensim.matutils import Sparse2Corpus
from orangecontrib.text.vectorization import BowVectorizer

MAX_WORDS = 1000

Expand Down Expand Up @@ -57,6 +59,30 @@ def get_value(self, model, *args, **kwargs):
return self.epochs / model.passes


def infer_ngrams_corpus(corpus, return_dict=False):
    """Build a ``Sparse2Corpus`` of bag-of-words counts for ``corpus``.

    Columns of ``corpus.domain.attributes`` that carry the ``'bow-feature'``
    attribute are used. When the domain has no such column, the corpus is
    first run through ``BowVectorizer().transform`` and the columns are
    looked up again on the result.

    Columns whose sum over all rows is not positive are discarded. The
    remaining columns are ordered by the token ids of a ``Dictionary``
    built from ``corpus.ngrams_iterator(include_postags=True)``, so that
    column positions line up with the dictionary ids.

    Parameters
    ----------
    corpus
        Object exposing ``domain.attributes``, ``X`` and
        ``ngrams_iterator``.
    return_dict : bool
        When true, the dictionary is returned alongside the result.

    Returns
    -------
    The ``Sparse2Corpus`` instance, or a ``(Sparse2Corpus, Dictionary)``
    tuple when ``return_dict`` is true.
    """
    def bow_columns(table):
        # (column index, feature name) for every bag-of-words column.
        return [
            (col, var.name)
            for col, var in enumerate(table.domain.attributes)
            if 'bow-feature' in var.attributes
        ]

    columns = bow_columns(corpus)
    if not columns:
        # No bag-of-words features yet: vectorize and look again.
        corpus = BowVectorizer().transform(corpus)
        columns = bow_columns(corpus)

    # NOTE(review): the 2-D indexing below assumes ``X.sum(axis=0)`` yields
    # a single-row matrix (as scipy sparse matrices do) -- confirm that X
    # is always sparse here.
    column_sums = corpus.X.sum(axis=0)
    used = [(col, name) for col, name in columns if column_sums[0, col] > 0]

    # Order the kept columns by their token id in the dictionary.
    dictionary = Dictionary(
        corpus.ngrams_iterator(include_postags=True), prune_at=None
    )
    token_ids = [dictionary.token2id[name] for _, name in used]
    ordered_columns = [used[pos][0] for pos in np.argsort(token_ids)]

    # Transposed so that documents become columns of the wrapped matrix.
    result = Sparse2Corpus(corpus.X[:, ordered_columns].T)

    if return_dict:
        return result, dictionary
    return result


class GensimWrapper:
name = NotImplemented
Model = NotImplemented
Expand Down Expand Up @@ -96,11 +122,11 @@ def fit(self, corpus, on_progress=dummy_callback, **kwargs):
model_kwars, callbacks=[GensimProgressCallback(on_progress)]
)

id2word = Dictionary(corpus.ngrams_iterator(include_postags=True), prune_at=None)
ngrams_corpus, dictionary = infer_ngrams_corpus(corpus, return_dict=True)
self.model = self.Model(
corpus=corpus.ngrams_corpus, id2word=id2word, **model_kwars
corpus=ngrams_corpus, id2word=dictionary, **model_kwars
)
self.n_words = len(corpus.dictionary)
self.n_words = ngrams_corpus.sparse.shape[0]
self.topic_names = ['Topic {}'.format(i+1) for i in range(self.num_topics)]

def dummy_method(self, *args, **kwargs):
Expand Down Expand Up @@ -129,7 +155,7 @@ def update(self, documents):

def transform(self, corpus):
""" Create a table with topics representation. """
topics = self.model[corpus.ngrams_corpus]
topics = self.model[infer_ngrams_corpus(corpus)]
self.actual_topics = self.model.get_topics().shape[0]
matrix = matutils.corpus2dense(
topics, num_docs=len(corpus), num_terms=self.num_topics
Expand Down
2 changes: 0 additions & 2 deletions orangecontrib/text/vectorization/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from Orange.data.util import SharedComputeValue
from Orange.util import dummy_callback
from orangecontrib.text.util import Sparse2CorpusSliceable


class BaseVectorizer:
Expand Down Expand Up @@ -44,7 +43,6 @@ def add_features(corpus, X, dictionary, compute_values=None, var_attrs=None):
sparse=True,
rename_existing=True
)
corpus.ngrams_corpus = Sparse2CorpusSliceable(X.T)
return corpus


Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/widgets/owtopicmodeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.topics import Topic, Topics, LdaWrapper, HdpWrapper, \
LsiWrapper, NmfWrapper
from orangecontrib.text.topics.topics import GensimWrapper
from orangecontrib.text.topics.topics import GensimWrapper, infer_ngrams_corpus


class TopicWidget(gui.OWComponent, QGroupBox):
Expand Down Expand Up @@ -262,7 +262,7 @@ def on_done(self, corpus):
self.Warning.less_topics_found()

if self.model.name == "Latent Dirichlet Allocation":
bound = self.model.model.log_perplexity(corpus.ngrams_corpus)
bound = self.model.model.log_perplexity(infer_ngrams_corpus(corpus))
self.perplexity = "{:.5f}".format(np.exp2(-bound))
cm = CoherenceModel(
model=self.model.model, texts=corpus.tokens, corpus=corpus, coherence="c_v"
Expand Down

0 comments on commit 4c33e45

Please sign in to comment.