[FIX] Corpus - remove dictionary and fix wrong types count on subsampled corpus #990

Merged: 6 commits, Aug 25, 2023
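Before this change, Corpus cached a gensim corpora.Dictionary that was built once at preprocessing time. Subsampling a corpus (slicing, row selection) carried that cached dictionary along unchanged, so the automated summary reported the types count of the full corpus instead of the subset. This PR drops the cached dictionary, deprecates the public dictionary API, and counts tokens and types directly from the current tokens. A rough sketch of the fixed behaviour, assuming the bundled book-excerpts sample corpus (the file name and exact preprocessing are illustrative, not part of the diff):

```python
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import (
    BASE_TOKENIZER,
    BASE_TRANSFORMER,
    PreprocessorList,
)

# preprocess with the same base preprocessors Corpus._base_tokens uses
corpus = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])(
    Corpus.from_file("book-excerpts")
)
subset = corpus[:10]  # subsampling used to keep the full corpus' dictionary

# the summary now counts from the subset's own tokens
print(subset.count_tokens())         # all (non-unique) tokens in the subset
print(subset.count_unique_tokens())  # distinct types in the subset
```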
90 changes: 44 additions & 46 deletions orangecontrib/text/corpus.py
@@ -1,5 +1,4 @@
 import os
-import warnings
 from collections import Counter, defaultdict
 from copy import copy, deepcopy
 from numbers import Integral
@@ -9,9 +8,6 @@
 
 import nltk
 import numpy as np
-import scipy.sparse as sp
-from gensim import corpora
-
 from Orange.data import (
     Variable,
     ContinuousVariable,
@@ -23,17 +19,12 @@
 )
 from Orange.preprocess.transformation import Identity
 from Orange.data.util import get_unique_names
+from gensim import corpora
+from orangewidget.utils.signals import summarize, PartialSummary
+import scipy.sparse as sp
 
 from orangecontrib.text.language import ISO2LANG
 
-try:
-    from orangewidget.utils.signals import summarize, PartialSummary
-    # import to check if Table summary is available - if summarize_by_name does
-    # not exist Orange (3.28) does not support automated summaries
-    from Orange.widgets.utils.state_summary import summarize_by_name
-except ImportError:
-    summarize, PartialSummary = None, None
-
 
 def get_sample_corpora_dir():
     path = os.path.dirname(__file__)
@@ -88,7 +79,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
         """
         self.text_features = []  # list of text features for mining
         self._tokens = None
-        self._dictionary = None
         self.ngram_range = (1, 1)
         self._pos_tags = None
         from orangecontrib.text.preprocess import PreprocessorList
@@ -397,8 +387,13 @@ def store_tokens(self, tokens, dictionary=None):
         Args:
             tokens (list): List of lists containing tokens.
         """
+        if dictionary is not None:
+            warn(
+                "dictionary argument is deprecated and has no effect. "
+                "It will be removed in orange3-text 1.15.",
+                FutureWarning,
+            )
         self._tokens = np.array(tokens, dtype=object)
-        self._dictionary = dictionary or corpora.Dictionary(self.tokens)
 
     @property
     def tokens(self):
@@ -407,7 +402,7 @@ def tokens(self):
         present, run default preprocessor and return tokens.
         """
         if self._tokens is None:
-            return self._base_tokens()[0]
+            return self._base_tokens()
         return self._tokens
 
     def has_tokens(self):
@@ -419,19 +414,17 @@ def _base_tokens(self):
             BASE_TOKENIZER, PreprocessorList
 
         # don't use anything that requires NLTK data to assure async download
-        base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
-                                               BASE_TOKENIZER])
+        base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
         corpus = base_preprocessors(self)
-        return corpus.tokens, corpus.dictionary
+        return corpus.tokens
 
     @property
     def dictionary(self):
-        """
-        corpora.Dictionary: A token to id mapper.
-        """
-        if self._dictionary is None:
-            return self._base_tokens()[1]
-        return self._dictionary
+        warn(
+            "dictionary is deprecated and will be removed in Orange3-text 1.15",
+            FutureWarning,
+        )
+        return corpora.Dictionary(self.tokens)
 
     @property
     def pos_tags(self):
@@ -468,6 +461,16 @@ def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False):
                     for n in range(self.ngram_range[0], self.ngram_range[1]+1))))
                 for doc in data)
 
+    def count_tokens(self) -> int:
+        """Count all (non-unique) tokens in the corpus."""
+        return sum(map(len, self.tokens))
+
+    def count_unique_tokens(self) -> int:
+        """Count all unique tokens (types) in the corpus."""
+        # this seems fast enough even for very large datasets, so avoid
+        # caching and the potential consistency problems connected with it
+        return len({tk for lst in self.tokens for tk in lst})
+
     @property
     def ngrams(self):
         """generator: Ngram representations of documents."""
@@ -476,10 +479,9 @@ def ngrams(self):
     def copy(self):
         """Return a copy of the table."""
         c = super().copy()
-        # since tokens and dictionary are considered immutable copies are not needed
        c._setup_corpus(text_features=copy(self.text_features))
+        # since tokens are considered immutable, copies are not needed
         c._tokens = self._tokens
-        c._dictionary = self._dictionary
         c.ngram_range = self.ngram_range
         c.pos_tags = self.pos_tags
         c.name = self.name
@@ -640,7 +642,6 @@ def retain_preprocessing(orig, new, key=...):
             new.pos_tags = orig.pos_tags
         else:
            raise TypeError('Indexing by type {} not supported.'.format(type(key)))
-        new._dictionary = orig._dictionary
 
     if isinstance(new, Corpus):
         # _find_identical_feature returns None when the feature is not found
@@ -665,23 +666,20 @@ def retain_preprocessing(orig, new, key=...):
         new._infer_text_features()
 
 
-if summarize:
-    # summarize is not available in older versions of orange-widget-base
-    # skip if not available
-    @summarize.register(Corpus)
-    def summarize_corpus(corpus: Corpus) -> PartialSummary:
-        """
-        Provides automated input and output summaries for Corpus
-        """
-        table_summary = summarize.dispatch(Table)(corpus)
-        extras = (
-            (
-                f"<br/><nobr>Tokens: {sum(map(len, corpus.tokens))}, "
-                f"Types: {len(corpus.dictionary)}</nobr>"
-            )
-            if corpus.has_tokens()
-            else "<br/><nobr>Corpus is not preprocessed</nobr>"
-        )
-        language = ISO2LANG[corpus.language] if corpus.language else "not set"
-        extras += f"<br/><nobr>Language: {language}</nobr>"
-        return PartialSummary(table_summary.summary, table_summary.details + extras)
+@summarize.register(Corpus)
+def summarize_corpus(corpus: Corpus) -> PartialSummary:
+    """
+    Provides automated input and output summaries for Corpus
+    """
+    table_summary = summarize.dispatch(Table)(corpus)
+    extras = (
+        (
+            f"<br/><nobr>Tokens: {corpus.count_tokens()}, "
+            f"Types: {corpus.count_unique_tokens()}</nobr>"
+        )
+        if corpus.has_tokens()
+        else "<br/><nobr>Corpus is not preprocessed</nobr>"
+    )
+    language = ISO2LANG[corpus.language] if corpus.language else "not set"
+    extras += f"<br/><nobr>Language: {language}</nobr>"
+    return PartialSummary(table_summary.summary, table_summary.details + extras)
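The dictionary property survives only as a deprecated shim: it rebuilds the mapping from the current tokens on every access and emits a FutureWarning. A minimal migration sketch, continuing from the corpus built in the example above:

```python
import warnings

from gensim import corpora

# deprecated: still works, but warns and rebuilds the mapping on every access
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    mapping = corpus.dictionary
assert any(issubclass(w.category, FutureWarning) for w in caught)

# replacement: build the mapping explicitly where one is really needed
mapping = corpora.Dictionary(corpus.tokens)
```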
15 changes: 4 additions & 11 deletions orangecontrib/text/preprocess/filter.py
@@ -26,8 +26,7 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
         corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
         return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))
 
-    def _filter_tokens(self, corpus: Corpus, callback: Callable,
-                       dictionary=None) -> Corpus:
+    def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
         callback(0, "Filtering...")
         filtered_tokens = []
         filtered_tags = []
@@ -37,10 +36,7 @@ def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
             if corpus.pos_tags is not None:
                 filtered_tags.append(list(compress(corpus.pos_tags[i],
                                                    filter_map)))
-        if dictionary is None:
-            corpus.store_tokens(filtered_tokens)
-        else:
-            corpus.store_tokens(filtered_tokens, dictionary)
+        corpus.store_tokens(filtered_tokens)
         if filtered_tags:
             corpus.pos_tags = np.array(filtered_tags, dtype=object)
         return corpus
@@ -178,11 +174,8 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     def _fit(self, corpus: Corpus):
        raise NotImplemented
 
-    def _filter_tokens(self, corpus: Corpus, callback: Callable,
-                       dictionary=None) -> Corpus:
-        corpus = super()._filter_tokens(corpus, callback,
-                                        dictionary=self._dictionary)
-        return corpus
+    def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
+        return super()._filter_tokens(corpus, callback)
 
     def _check(self, token):
         assert self._lexicon is not None
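With the dictionary plumbing removed, token filters implement only the two-argument _filter_tokens hook, and simple filters still just override _check. A sketch of what a subclass now looks like, assuming BaseTokenFilter from orangecontrib.text.preprocess.filter (the class below is hypothetical and not part of this PR):

```python
from orangecontrib.text.preprocess.filter import BaseTokenFilter


class ShortTokenFilter(BaseTokenFilter):
    """Hypothetical filter dropping tokens shorter than three characters."""

    name = "Short tokens"

    def _check(self, token: str) -> bool:
        # _filter_tokens() keeps a token when _check() returns True;
        # no dictionary is threaded through the call any more
        return len(token) >= 3
```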