Skip to content

Commit

Permalink
Revert "Corpus: Language Detection"
Browse files Browse the repository at this point in the history
  • Loading branch information
ajdapretnar authored Jan 22, 2021
1 parent ce3fda1 commit 5a48672
Show file tree
Hide file tree
Showing 5 changed files with 1 addition and 91 deletions.
18 changes: 0 additions & 18 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import numpy as np
import scipy.sparse as sp
from gensim import corpora
import fasttext

from Orange.data import (
Variable,
Expand Down Expand Up @@ -83,7 +82,6 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
from orangecontrib.text.preprocess import PreprocessorList
self.__used_preprocessor = PreprocessorList([]) # required for compute values
self._titles: Optional[np.ndarray] = None
self.languages = None
self._pp_documents = None # preprocessed documents

if domain is not None and text_features is None:
Expand Down Expand Up @@ -226,22 +224,6 @@ def _unique_titles(titles: List[str]) -> List[str]:
new_titles.append(t)
return new_titles

def detect_languages(self):
    """
    Identify the language of every document with the fastText language
    identification model and store the result in ``self.languages``.

    Each document is flattened to a single line (newlines become spaces)
    and truncated to its first 2000 space-separated tokens before it is
    passed to the model. The stored value is a list with one label per
    document, with the ``__label__`` prefix removed.

    References:
    [A. Joulin, E. Grave, P. Bojanowski, T. Mikolov,
    Bag of Tricks for Efficient Text Classification],
    [A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov,
    FastText.zip: Compressing text classification models]
    """
    # The compressed model is looked up next to this module.
    model_file = os.path.join(
        os.path.dirname(__file__), 'models', 'lid.176.ftz')
    lid_model = fasttext.load_model(model_file)

    detected = []
    for document in self.documents:
        tokens = document.replace('\n', ' ').split(' ')
        snippet = ' '.join(tokens[:2000])
        # predict() returns (labels, probabilities); keep the top label.
        top_label = lid_model.predict(snippet)[0][0]
        detected.append(top_label.replace('__label__', ''))
    self.languages = detected

def _infer_text_features(self):
"""
Infer which text features to use. If nothing was provided
Expand Down
Binary file removed orangecontrib/text/models/lid.176.ftz
Binary file not shown.
8 changes: 0 additions & 8 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,14 +573,6 @@ def test_pickle_corpus(self):
c = pp(c)
pickle.dumps(c)

def test_languages(self):
    """Every document of the deerwester corpus is detected as 'en'."""
    corpus = Corpus.from_file('deerwester')

    # Nothing is detected until detection is requested explicitly.
    self.assertIsNone(corpus.languages)

    corpus.detect_languages()
    n_documents = len(corpus)
    self.assertEqual(len(corpus.languages), n_documents)
    self.assertListEqual(corpus.languages, ['en'] * n_documents)


if __name__ == "__main__":
unittest.main()
65 changes: 1 addition & 64 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import os
import numpy as np
from copy import copy

from Orange.data import Table, StringVariable, Variable, DiscreteVariable, \
Domain
from Orange.data import Table, StringVariable, Variable
from Orange.data.io import FileFormat
from Orange.widgets import gui
from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel
Expand All @@ -14,7 +12,6 @@
from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin
from orangecontrib.text.corpus import Corpus, get_sample_corpora_dir
from orangecontrib.text.widgets.utils import widgets, QSize
from orangecontrib.text.vectorization.base import get_unique_names


class OWCorpus(OWWidget, ConcurrentWidgetMixin):
Expand Down Expand Up @@ -51,7 +48,6 @@ class Outputs:
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")
detect_languages = Setting(False)

class Error(OWWidget.Error):
read_file = Msg("Can't read file ({})")
Expand Down Expand Up @@ -104,10 +100,6 @@ def __init__(self):
self.unused_attrs_view.setModel(self.unused_attrs_model)
ibox.layout().addWidget(self.unused_attrs_view)

gui.checkBox(self.controlArea, self, "detect_languages",
"Detect language automatically",
callback=self.handle_languages)

# Documentation Data Sets & Report
box = gui.hBox(self.controlArea)
self.browse_documentation = gui.button(
Expand Down Expand Up @@ -254,7 +246,6 @@ def describe(corpus):
if self.corpus is None:
self.info.set_output_summary(self.info.NoOutput)
else:
self.handle_languages()
self.info.set_output_summary(
str(len(self.corpus)), describe(self.corpus))

Expand Down Expand Up @@ -312,60 +303,6 @@ def describe(features):
('Target', describe(domain.class_vars)),
))

def handle_languages(self):
    """
    Add or remove the detected-language meta column and send the corpus.

    When the ``detect_languages`` setting is on, the languages of the
    documents are detected (unless ``corpus.languages`` is already set)
    and appended to the corpus as a string meta variable whose
    ``attributes`` carry ``'language-feature': True``. When the setting is
    off, a meta variable carrying that flag is removed again. The
    resulting corpus replaces ``self.corpus`` and is sent to the output.

    The method is idempotent: if the language column is already present
    (or already absent) the corpus is left as it is and only re-sent.
    Nothing happens when no corpus is loaded.
    """
    if self.corpus is None:
        return

    corpus = self.corpus
    domain = corpus.domain

    # Index of an existing language column among the metas, if any.
    lang_idx = next(
        (idx for idx, var in enumerate(domain.metas)
         if var.attributes.get('language-feature')),
        None
    )

    new_metas_vars = None
    new_metas = None
    if self.detect_languages:
        # Only add the column once; repeated calls must not stack
        # "Language", "Language (1)", ... columns.
        if lang_idx is None:
            if corpus.languages is None:
                corpus.detect_languages()

            # Consider *all* class variables, not just a single one,
            # when looking for a free name.
            taken_names = [
                var.name
                for var in (list(domain.attributes)
                            + list(domain.class_vars)
                            + list(domain.metas))
            ]
            language_var = StringVariable(
                get_unique_names(taken_names, "Language"))
            language_var.attributes.update({'language-feature': True})

            new_metas_vars = list(domain.metas) + [language_var]
            # Object dtype keeps the stacking independent of the dtype
            # of the existing metas.
            language_column = np.array(
                corpus.languages, dtype=object).reshape(-1, 1)
            new_metas = np.hstack([corpus.metas, language_column])
    elif lang_idx is not None:
        new_metas_vars = [var for idx, var in enumerate(domain.metas)
                          if idx != lang_idx]
        new_metas = np.delete(corpus.metas, lang_idx, axis=1)

    if new_metas_vars is not None:
        new_domain = Domain(
            attributes=domain.attributes,
            # class_vars (plural) also preserves multi-target domains;
            # class_var is None unless there is exactly one target.
            class_vars=domain.class_vars,
            metas=new_metas_vars
        )
        rebuilt = Corpus(new_domain,
                         corpus.X.copy(),
                         corpus.Y.copy(),
                         new_metas,
                         corpus.W.copy(),
                         copy(corpus.text_features))
        # A fresh Corpus starts with languages = None; carry the detected
        # values over so the model is not run again on the next toggle.
        rebuilt.languages = corpus.languages
        self.corpus = rebuilt

    self.Outputs.corpus.send(self.corpus)


if __name__ == '__main__':
from AnyQt.QtWidgets import QApplication
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,3 @@ docx2txt>=0.6
lxml
biopython # Enables Pubmed widget.
ufal.udpipe >=1.2.0.3
fasttext

0 comments on commit 5a48672

Please sign in to comment.