Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Add language to corpus #916

Merged
merged 4 commits into from
Nov 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
recursive-include orangecontrib/text/datasets *.tab *.txt
recursive-include orangecontrib/text/datasets *.tab *.txt *.metadata
recursive-include orangecontrib/text/models *.ftz
recursive-include orangecontrib/text/sentiment *.txt
recursive-include orangecontrib/text/tests *.txt *.json *.pkl
Expand Down
34 changes: 27 additions & 7 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import warnings
from collections import Counter, defaultdict
from copy import copy
from copy import copy, deepcopy
from numbers import Integral
from itertools import chain
from typing import Union, Optional, List, Tuple
Expand All @@ -24,6 +24,8 @@
from Orange.preprocess.transformation import Identity
from Orange.data.util import get_unique_names

from orangecontrib.text.language import ISO2LANG

try:
from orangewidget.utils.signals import summarize, PartialSummary
# import to check if Table summary is available - if summarize_by_name does
Expand Down Expand Up @@ -87,7 +89,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
self._tokens = None
self._dictionary = None
self.ngram_range = (1, 1)
self.attributes = {}
self._pos_tags = None
from orangecontrib.text.preprocess import PreprocessorList
self.__used_preprocessor = PreprocessorList([]) # required for compute values
Expand All @@ -100,6 +101,8 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
self.set_text_features(text_features)

self._set_unique_titles()
if "language" not in self.attributes:
self.attributes["language"] = None

@property
def used_preprocessor(self):
Expand Down Expand Up @@ -361,6 +364,10 @@ def titles(self):
assert self._titles is not None
return self._titles

@property
def language(self):
return self.attributes["language"]

def documents_from_features(self, feats):
"""
Args:
Expand Down Expand Up @@ -465,7 +472,7 @@ def copy(self):
"""Return a copy of the table."""
c = super().copy()
# since tokens and dictionary are considered immutable copies are not needed
c._setup_corpus(copy(self.text_features))
c._setup_corpus(text_features=copy(self.text_features))
c._tokens = self._tokens
c._dictionary = self._dictionary
c.ngram_range = self.ngram_range
Expand All @@ -478,7 +485,7 @@ def copy(self):

@staticmethod
def from_documents(documents, name, attributes=None, class_vars=None, metas=None,
title_indices=None):
title_indices=None, language=None):
"""
Create corpus from documents.

Expand All @@ -490,6 +497,7 @@ def from_documents(documents, name, attributes=None, class_vars=None, metas=None
metas (list): List of tuples (Variable, getter) for metas.
title_indices (list): List of indices into domain corresponding to features which will
be used as titles.
language (str): Resulting corpus's language

Returns:
Corpus.
Expand Down Expand Up @@ -527,6 +535,7 @@ def to_val(attr, val):
domain=domain, X=X, Y=Y, metas=metas, text_features=[]
)
corpus.name = name
corpus.attributes["language"] = language
return corpus

def __getitem__(self, key):
Expand All @@ -540,6 +549,8 @@ def from_table(cls, domain, source, row_indices=...):
c = super().from_table(domain, source, row_indices)
c._setup_corpus()
Corpus.retain_preprocessing(source, c, row_indices)
# temp fix: remove when oldest Orange >= 3.34
c.attributes = deepcopy(c.attributes)
return c

@classmethod
Expand All @@ -553,19 +564,24 @@ def from_numpy(
attributes=None,
ids=None,
text_features=None,
language=None
):
t = super().from_numpy(
domain, X, Y=Y, metas=metas, W=W, attributes=attributes, ids=ids
)
# t is corpus but corpus specific attributes were not set yet
t._setup_corpus(text_features=text_features)
# language can already be set in attributes if they were provided
if language is not None or "language" not in t.attributes:
t.attributes["language"] = language
return t

@classmethod
def from_list(cls, domain, rows, weights=None):
def from_list(cls, domain, rows, weights=None, language=None):
t = super().from_list(domain, rows, weights)
# t is corpus but corpus specific attributes were not set yet
t._setup_corpus()
t.attributes["language"] = language
return t

@classmethod
Expand All @@ -576,18 +592,20 @@ def from_table_rows(cls, source, row_indices):
if hasattr(source, "_titles"):
# covering case when from_table_rows called by from_table
c._titles = source._titles[row_indices]
# temp fix: remove when oldest Orange >= 3.34
c.attributes = deepcopy(c.attributes)
return c

@classmethod
def from_file(cls, filename):
def from_file(cls, filename, sheet=None):
if not os.path.exists(filename): # check the default location
abs_path = os.path.join(get_sample_corpora_dir(), filename)
if not abs_path.endswith('.tab'):
abs_path += '.tab'
if os.path.exists(abs_path):
filename = abs_path

table = super().from_file(filename)
table = super().from_file(filename, sheet=sheet)
if not isinstance(table, Corpus):
# when loading regular file result of super().from_file is Table - need
# to be transformed to Corpus, when loading pickle it is Corpus already
Expand Down Expand Up @@ -659,4 +677,6 @@ def summarize_(corpus: Corpus) -> PartialSummary:
if corpus.has_tokens()
else "<br/><nobr>Corpus is not preprocessed</nobr>"
)
language = ISO2LANG[corpus.language] if corpus.language else "not set"
extras += f"<br/><nobr>Language: {language}</nobr>"
return PartialSummary(table_summary.summary, table_summary.details + extras)
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/20newsgroups-test.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/andersen.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/book-excerpts.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/deerwester.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/grimm-tales.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/reuters-r52-test.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/reuters-r52-train.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/reuters-r8-test.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/reuters-r8-train.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: sl
165 changes: 165 additions & 0 deletions orangecontrib/text/language.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
from collections import Counter
from typing import Optional

from AnyQt.QtCore import Qt
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from Orange.data import DiscreteVariable
from orangewidget.utils.itemmodels import PyListModel

# languages supported by at least one method in Orange3-text
# language dependent methods: YAKE!, nltk - stopwords, sentiment methods,
# normalizers, embedding
# Mapping from ISO 639 language code to human-readable language name.
# NOTE: the dict deliberately contains a None -> None entry so that an
# unset corpus language round-trips through code<->name lookups; beware
# that this also makes `None in ISO2LANG` evaluate to True.
ISO2LANG = {
    "af": "Afrikaans",
    "am": "Amharic",
    "ar": "Arabic",
    "az": "Azerbaijani",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "bo": "Tibetan",
    "br": "Breton",
    "bs": "Bosnian",
    "ca": "Catalan",
    "ckb": "Central Kurdish",
    "cop": "Coptic",
    "cs": "Czech",
    "cu": "Old Church Slavonic",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "dv": "Divehi",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "ga": "Irish",
    "gl": "Galician",
    "got": "Gothic",
    "grc": "Ancient greek",
    "gu": "Gujarati",
    "he": "Hebrew",
    "hi": "Hindi",
    "hi-Latn": "Hindi (latin)",
    "hr": "Croatian",
    "ht": "Haitian",
    "hu": "Hungarian",
    "hy": "Armenian",
    "id": "Indonesian",
    "is": "Icelandic",
    "it": "Italian",
    "ja": "Japanese",
    "ka": "Georgian",
    "kk": "Kazakh",
    "km": "Central Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "la": "Latin",
    "lo": "Lao",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mr": "Marathi",
    "my": "Burmese",
    "nb": "Norwegian Bokmål",
    "ne": "Nepali",
    "nl": "Dutch",
    "nn": "Norwegian Nynorsk",
    "no": "Norwegian",
    "or": "Oriya",
    "pa": "Punjabi",
    "pl": "Polish",
    "ps": "Pashto",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sa": "Sanskrit",
    "sd": "Sindhi",
    "si": "Sinhala",
    "sk": "Slovak",
    "sl": "Slovenian",
    "sr": "Serbian",
    "sv": "Swedish",
    "ta": "Tamil",
    "te": "Telugu",
    "tg": "Tajik",
    "th": "Thai",
    "tl": "Tagalog",
    "tr": "Turkish",
    "ug": "Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "zh": "Chinese",
    None: None,
}
# Inverse mapping: human-readable language name -> ISO code
# (includes the None -> None sentinel from ISO2LANG).
LANG2ISO = {lang: code for code, lang in ISO2LANG.items()}
DEFAULT_LANGUAGE = "English"


class LanguageModel(PyListModel):
    """List model feeding the language-selection dropdowns in widgets.

    Row 0 is a ``None`` sentinel shown as "(no language)"; the remaining
    rows are the supported language names in alphabetical order.
    """

    def __init__(self):
        # drop the None placeholder from ISO2LANG, then sort display names
        names = sorted(name for name in ISO2LANG.values() if name)
        super().__init__(iterable=[None] + names)

    def data(self, index, role=Qt.DisplayRole):
        # give the leading None entry a human-readable label
        if role == Qt.DisplayRole and index.row() == 0:
            return "(no language)"
        return super().data(index, role)


# Seed langdetect's factory: without a fixed seed repeated runs on the
# same text may return different languages, so detection would be
# non-deterministic.
DetectorFactory.seed = 0
MAX_DOCS = 50  # max number of documents considered for language detection
MAX_WORDS = 2000  # max number of words in document considered for lang detection


def detect_language(corpus: "Corpus") -> Optional[str]:
    """
    Detect the most likely language of the documents in the corpus.

    Only the first MAX_DOCS documents, and the first MAX_WORDS words of
    each, are considered, to keep detection fast on large corpora.

    Parameters
    ----------
    corpus
        Corpus to detect the language

    Returns
    -------
    Detected language ISO code or None if no supported language is detected
    """
    texts = corpus.documents[:MAX_DOCS]
    texts = [" ".join(t.replace("\n", " ").split(" ")[:MAX_WORDS]) for t in texts]
    languages = []
    for text in texts:
        try:
            languages.append(detect(text))
        except LangDetectException:
            # detection can fail, e.g. on empty or non-linguistic documents
            languages.append(None)
    # Keep only languages supported by Orange, ordered by frequency.
    # None must be skipped explicitly: ISO2LANG contains a None key, so a
    # plain membership test would let failed detections (None) outrank
    # genuinely detected languages.
    candidates = [
        lang
        for lang, _ in Counter(languages).most_common()
        if lang is not None and lang in ISO2LANG
    ]
    return candidates[0] if candidates else None


def infer_language_from_variable(variable: DiscreteVariable) -> Optional[str]:
    """
    Infer the corpus language from a DiscreteVariable holding the
    documents' language labels.

    Parameters
    ----------
    variable
        The DiscreteVariable to infer language from it's values

    Returns
    -------
    Language ISO code if all documents have the same language, None otherwise
    """
    values = variable.values
    if len(values) != 1:
        # zero or several distinct languages: no single language to report
        return None
    return values[0]
Loading