Skip to content

Commit

Permalink
Merge pull request #916 from PrimozGodec/language-owcorpus
Browse files Browse the repository at this point in the history
[ENH] Add language to corpus
  • Loading branch information
VesnaT authored Nov 7, 2022
2 parents f9d92fa + f6c0f09 commit 5c55009
Show file tree
Hide file tree
Showing 29 changed files with 533 additions and 106 deletions.
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
recursive-include orangecontrib/text/datasets *.tab *.txt
recursive-include orangecontrib/text/datasets *.tab *.txt *.metadata
recursive-include orangecontrib/text/models *.ftz
recursive-include orangecontrib/text/sentiment *.txt
recursive-include orangecontrib/text/tests *.txt *.json *.pkl
Expand Down
34 changes: 27 additions & 7 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import warnings
from collections import Counter, defaultdict
from copy import copy
from copy import copy, deepcopy
from numbers import Integral
from itertools import chain
from typing import Union, Optional, List, Tuple
Expand All @@ -24,6 +24,8 @@
from Orange.preprocess.transformation import Identity
from Orange.data.util import get_unique_names

from orangecontrib.text.language import ISO2LANG

try:
from orangewidget.utils.signals import summarize, PartialSummary
# import to check if Table summary is available - if summarize_by_name does
Expand Down Expand Up @@ -87,7 +89,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
self._tokens = None
self._dictionary = None
self.ngram_range = (1, 1)
self.attributes = {}
self._pos_tags = None
from orangecontrib.text.preprocess import PreprocessorList
self.__used_preprocessor = PreprocessorList([]) # required for compute values
Expand All @@ -100,6 +101,8 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
self.set_text_features(text_features)

self._set_unique_titles()
if "language" not in self.attributes:
self.attributes["language"] = None

@property
def used_preprocessor(self):
Expand Down Expand Up @@ -361,6 +364,10 @@ def titles(self):
assert self._titles is not None
return self._titles

@property
def language(self):
    """Language ISO code (e.g. "en") stored in the corpus's attributes,
    or None when no language has been set."""
    return self.attributes["language"]

def documents_from_features(self, feats):
"""
Args:
Expand Down Expand Up @@ -465,7 +472,7 @@ def copy(self):
"""Return a copy of the table."""
c = super().copy()
# since tokens and dictionary are considered immutable copies are not needed
c._setup_corpus(copy(self.text_features))
c._setup_corpus(text_features=copy(self.text_features))
c._tokens = self._tokens
c._dictionary = self._dictionary
c.ngram_range = self.ngram_range
Expand All @@ -478,7 +485,7 @@ def copy(self):

@staticmethod
def from_documents(documents, name, attributes=None, class_vars=None, metas=None,
title_indices=None):
title_indices=None, language=None):
"""
Create corpus from documents.
Expand All @@ -490,6 +497,7 @@ def from_documents(documents, name, attributes=None, class_vars=None, metas=None
metas (list): List of tuples (Variable, getter) for metas.
title_indices (list): List of indices into domain corresponding to features which will
be used as titles.
language (str): Resulting corpus's language
Returns:
Corpus.
Expand Down Expand Up @@ -527,6 +535,7 @@ def to_val(attr, val):
domain=domain, X=X, Y=Y, metas=metas, text_features=[]
)
corpus.name = name
corpus.attributes["language"] = language
return corpus

def __getitem__(self, key):
Expand All @@ -540,6 +549,8 @@ def from_table(cls, domain, source, row_indices=...):
c = super().from_table(domain, source, row_indices)
c._setup_corpus()
Corpus.retain_preprocessing(source, c, row_indices)
# temp fix: remove when oldest Orange >= 3.34
c.attributes = deepcopy(c.attributes)
return c

@classmethod
Expand All @@ -553,19 +564,24 @@ def from_numpy(
attributes=None,
ids=None,
text_features=None,
language=None
):
t = super().from_numpy(
domain, X, Y=Y, metas=metas, W=W, attributes=attributes, ids=ids
)
# t is corpus but corpus specific attributes were not set yet
t._setup_corpus(text_features=text_features)
# language may already be set in attributes if they were provided
if language is not None or "language" not in t.attributes:
t.attributes["language"] = language
return t

@classmethod
def from_list(cls, domain, rows, weights=None):
def from_list(cls, domain, rows, weights=None, language=None):
t = super().from_list(domain, rows, weights)
# t is corpus but corpus specific attributes were not set yet
t._setup_corpus()
t.attributes["language"] = language
return t

@classmethod
Expand All @@ -576,18 +592,20 @@ def from_table_rows(cls, source, row_indices):
if hasattr(source, "_titles"):
# covering case when from_table_rows called by from_table
c._titles = source._titles[row_indices]
# temp fix: remove when oldest Orange >= 3.34
c.attributes = deepcopy(c.attributes)
return c

@classmethod
def from_file(cls, filename):
def from_file(cls, filename, sheet=None):
if not os.path.exists(filename): # check the default location
abs_path = os.path.join(get_sample_corpora_dir(), filename)
if not abs_path.endswith('.tab'):
abs_path += '.tab'
if os.path.exists(abs_path):
filename = abs_path

table = super().from_file(filename)
table = super().from_file(filename, sheet=sheet)
if not isinstance(table, Corpus):
# when loading regular file result of super().from_file is Table - need
# to be transformed to Corpus, when loading pickle it is Corpus already
Expand Down Expand Up @@ -659,4 +677,6 @@ def summarize_(corpus: Corpus) -> PartialSummary:
if corpus.has_tokens()
else "<br/><nobr>Corpus is not preprocessed</nobr>"
)
language = ISO2LANG[corpus.language] if corpus.language else "not set"
extras += f"<br/><nobr>Language: {language}</nobr>"
return PartialSummary(table_summary.summary, table_summary.details + extras)
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/20newsgroups-test.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/andersen.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/book-excerpts.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/deerwester.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/grimm-tales.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/reuters-r52-test.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/reuters-r52-train.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/reuters-r8-test.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
1 change: 1 addition & 0 deletions orangecontrib/text/datasets/reuters-r8-train.tab.metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: en
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
language: sl
165 changes: 165 additions & 0 deletions orangecontrib/text/language.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
from collections import Counter
from typing import Optional

from AnyQt.QtCore import Qt
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from Orange.data import DiscreteVariable
from orangewidget.utils.itemmodels import PyListModel

# Languages supported by at least one method in Orange3-text.
# Language-dependent methods: YAKE!, nltk - stopwords, sentiment methods,
# normalizers, embedding.
# Maps ISO language code -> human-readable language name.
ISO2LANG = {
    "af": "Afrikaans",
    "am": "Amharic",
    "ar": "Arabic",
    "az": "Azerbaijani",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "bo": "Tibetan",
    "br": "Breton",
    "bs": "Bosnian",
    "ca": "Catalan",
    "ckb": "Central Kurdish",
    "cop": "Coptic",
    "cs": "Czech",
    "cu": "Old Church Slavonic",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "dv": "Divehi",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "ga": "Irish",
    "gl": "Galician",
    "got": "Gothic",
    "grc": "Ancient greek",
    "gu": "Gujarati",
    "he": "Hebrew",
    "hi": "Hindi",
    "hi-Latn": "Hindi (latin)",
    "hr": "Croatian",
    "ht": "Haitian",
    "hu": "Hungarian",
    "hy": "Armenian",
    "id": "Indonesian",
    "is": "Icelandic",
    "it": "Italian",
    "ja": "Japanese",
    "ka": "Georgian",
    "kk": "Kazakh",
    "km": "Central Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "la": "Latin",
    "lo": "Lao",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mr": "Marathi",
    "my": "Burmese",
    "nb": "Norwegian Bokmål",
    "ne": "Nepali",
    "nl": "Dutch",
    "nn": "Norwegian Nynorsk",
    "no": "Norwegian",
    "or": "Oriya",
    "pa": "Punjabi",
    "pl": "Polish",
    "ps": "Pashto",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sa": "Sanskrit",
    "sd": "Sindhi",
    "si": "Sinhala",
    "sk": "Slovak",
    "sl": "Slovenian",
    "sr": "Serbian",
    "sv": "Swedish",
    "ta": "Tamil",
    "te": "Telugu",
    "tg": "Tajik",
    "th": "Thai",
    "tl": "Tagalog",
    "tr": "Turkish",
    "ug": "Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "zh": "Chinese",
    # NOTE: the None entry lets "no language set" round-trip through both
    # ISO2LANG and LANG2ISO lookups without a KeyError.
    None: None,
}
# Reverse mapping: human-readable language name -> ISO code (includes None -> None).
LANG2ISO = {lang: code for code, lang in ISO2LANG.items()}
DEFAULT_LANGUAGE = "English"


class LanguageModel(PyListModel):
    """Model for language selection dropdowns in the widgets.

    The first row is a ``None`` placeholder rendered as "(no language)";
    the remaining rows are the supported language names, alphabetically.
    """

    def __init__(self):
        names = sorted(name for name in ISO2LANG.values() if name)
        super().__init__(iterable=[None] + names)

    def data(self, index, role=Qt.DisplayRole):
        # The None placeholder in row 0 gets a human-readable display text.
        if role == Qt.DisplayRole and index.row() == 0:
            return "(no language)"
        return super().data(index, role)


# Pin langdetect's internal RNG so repeated runs on the same text give the
# same detection result (langdetect is otherwise non-deterministic).
DetectorFactory.seed = 0
MAX_DOCS = 50  # max number of documents considered for language detection
MAX_WORDS = 2000  # max number of words in document considered for lang detection


def detect_language(corpus: "Corpus") -> Optional[str]:
    """
    Detect the language in the corpus.

    Each document (up to MAX_DOCS, truncated to MAX_WORDS words) casts one
    vote via langdetect; the most frequent Orange-supported language wins.

    Parameters
    ----------
    corpus
        Corpus to detect the language

    Returns
    -------
    Detected language ISO code or None if language not detected
    """
    texts = corpus.documents[:MAX_DOCS]
    texts = [" ".join(t.replace("\n", " ").split(" ")[:MAX_WORDS]) for t in texts]
    languages = []
    for text in texts:
        try:
            languages.append(detect(text))
        except LangDetectException:
            # a document too short/ambiguous to classify casts no vote
            pass
    # Keep only languages supported by Orange, most frequent first.
    # The explicit None check matters: None is a key of ISO2LANG, so without
    # it a plain `lang in ISO2LANG` would let failed detections outvote a
    # genuinely detected language and wrongly return None.
    candidates = [
        lang
        for lang, _ in Counter(languages).most_common()
        if lang is not None and lang in ISO2LANG
    ]
    return candidates[0] if candidates else None


def infer_language_from_variable(variable: DiscreteVariable) -> Optional[str]:
    """
    Infer language from a DiscreteVariable that holds documents' language
    information. If documents have different languages return None.

    Parameters
    ----------
    variable
        The DiscreteVariable to infer language from its values

    Returns
    -------
    Language ISO code if all documents have the same language, None otherwise
    """
    values = variable.values
    if len(values) != 1:
        return None
    return values[0]
Loading

0 comments on commit 5c55009

Please sign in to comment.