diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py index d838cd051..8eace849f 100644 --- a/orangecontrib/text/corpus.py +++ b/orangecontrib/text/corpus.py @@ -9,7 +9,6 @@ import numpy as np import scipy.sparse as sp from gensim import corpora -import fasttext from Orange.data import ( Variable, @@ -83,7 +82,6 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None, from orangecontrib.text.preprocess import PreprocessorList self.__used_preprocessor = PreprocessorList([]) # required for compute values self._titles: Optional[np.ndarray] = None - self.languages = None self._pp_documents = None # preprocessed documents if domain is not None and text_features is None: @@ -226,22 +224,6 @@ def _unique_titles(titles: List[str]) -> List[str]: new_titles.append(t) return new_titles - def detect_languages(self): - """ - Detects language of each document using fastText language - identification model. - [A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, - Bag of Tricks for Efficient Text Classification], - [A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, - FastText.zip: Compressing text classification models] - """ - path = os.path.join(os.path.dirname(__file__), 'models', 'lid.176.ftz') - model = fasttext.load_model(path) - texts = [' '.join(t.replace('\n', ' ').split(' ')[:2000]) - for t in self.documents] - self.languages = [model.predict(t)[0][0].replace('__label__', '') - for t in texts] - def _infer_text_features(self): """ Infer which text features to use. If nothing was provided diff --git a/orangecontrib/text/models/lid.176.ftz b/orangecontrib/text/models/lid.176.ftz deleted file mode 100644 index 1fb85b357..000000000 Binary files a/orangecontrib/text/models/lid.176.ftz and /dev/null differ diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py index 9bf43ea99..11e86bb6f 100644 --- a/orangecontrib/text/tests/test_corpus.py +++ b/orangecontrib/text/tests/test_corpus.py @@ -573,14 +573,6 @@ def test_pickle_corpus(self): c = pp(c) pickle.dumps(c) - def test_languages(self): - corpus = Corpus.from_file('deerwester') - - self.assertIsNone(corpus.languages) - corpus.detect_languages() - self.assertEqual(len(corpus.languages), len(corpus)) - self.assertListEqual(corpus.languages, ['en' for _ in range(len(corpus))]) - if __name__ == "__main__": unittest.main() diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py index e38c360ac..cd56ee24f 100644 --- a/orangecontrib/text/widgets/owcorpus.py +++ b/orangecontrib/text/widgets/owcorpus.py @@ -1,9 +1,7 @@ import os import numpy as np -from copy import copy -from Orange.data import Table, StringVariable, Variable, DiscreteVariable, \ - Domain +from Orange.data import Table, StringVariable, Variable from Orange.data.io import FileFormat from Orange.widgets import gui from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel @@ -14,7 +12,6 @@ from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin from orangecontrib.text.corpus import Corpus, get_sample_corpora_dir from orangecontrib.text.widgets.utils import widgets, QSize -from orangecontrib.text.vectorization.base import get_unique_names class OWCorpus(OWWidget, ConcurrentWidgetMixin): @@ -51,7 +48,6 @@ class Outputs: ]) used_attrs = ContextSetting([]) title_variable = ContextSetting("") - detect_languages = Setting(False) class Error(OWWidget.Error): read_file = Msg("Can't read file ({})") @@ -104,10 +100,6 @@ def __init__(self): self.unused_attrs_view.setModel(self.unused_attrs_model) ibox.layout().addWidget(self.unused_attrs_view) - gui.checkBox(self.controlArea, self, "detect_languages", - "Detect language automatically", - callback=self.handle_languages) - # Documentation Data Sets & Report box = gui.hBox(self.controlArea) self.browse_documentation = gui.button( @@ -254,7 +246,6 @@ def describe(corpus): if self.corpus is None: self.info.set_output_summary(self.info.NoOutput) else: - self.handle_languages() self.info.set_output_summary( str(len(self.corpus)), describe(self.corpus)) @@ -312,60 +303,6 @@ def describe(features): ('Target', describe(domain.class_vars)), )) - def handle_languages(self): - if self.corpus is not None: - domain = self.corpus.domain - if self.detect_languages: - if self.corpus.languages is None: - self.corpus.detect_languages() - - curr_attributes = list(domain.attributes) - curr_class_var = [domain.class_var] if domain.class_var else [] - curr_metas = list(domain.metas) - curr_variables = curr_attributes + curr_class_var + curr_metas - curr_names = [var.name for var in curr_variables] - new_name = get_unique_names(curr_names, "Language") - - variable_attrs = {'language-feature': True} - new_variable = StringVariable(new_name) - new_variable.attributes.update(variable_attrs) - new_domain = Domain( - attributes=domain.attributes, - class_vars=domain.class_var, - metas=list(domain.metas) + [new_variable] - ) - metas = np.hstack([self.corpus.metas, - np.array(self.corpus.languages).reshape(-1, 1)]) - self.corpus = Corpus(new_domain, - self.corpus.X.copy(), - self.corpus.Y.copy(), - metas, - self.corpus.W.copy(), - copy(self.corpus.text_features)) - else: - lang_feat_idx = None - for i, f in enumerate(domain.metas): - if ('language-feature' in f.attributes and - f.attributes['language-feature']): - lang_feat_idx = i - break - if lang_feat_idx is not None: - new_domain = Domain( - attributes=domain.attributes, - class_vars=domain.class_var, - metas=list(np.delete(list(domain.metas), - lang_feat_idx)) - ) - self.corpus = Corpus( - new_domain, - self.corpus.X.copy(), - self.corpus.Y.copy(), - np.delete(self.corpus.metas, lang_feat_idx, axis=1), - self.corpus.W.copy(), - copy(self.corpus.text_features) - ) - self.Outputs.corpus.send(self.corpus) - if __name__ == '__main__': from AnyQt.QtWidgets import QApplication diff --git a/requirements.txt b/requirements.txt index 6eab952fa..5f26f69be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,3 @@ docx2txt>=0.6 lxml biopython # Enables Pubmed widget. ufal.udpipe >=1.2.0.3 -fasttext