Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add infer_lang function (Issue number #3) #79

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ install_requires =
numpy>=1.17
scikit-learn>=0.22
spacy>=2.2.2
langdetect>=1.0.7
tqdm>=4.3
nltk>=3.3
plotly>=4.2.0
Expand Down
4 changes: 3 additions & 1 deletion tests/test_indexes.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import pandas as pd
from parameterized import parameterized

from texthero import nlp, visualization, preprocessing, representation

from . import PandasTestCase
import unittest
import string
from parameterized import parameterized


# Define valid inputs for different functions.
Expand All @@ -25,6 +26,7 @@
# Cases for the functions of the `nlp` module. Each entry appears to be
# [case name, function under test, tuple of positional arguments];
# NOTE(review): the consumer of this list is not visible here - confirm the layout.
test_cases_nlp = [
    ["named_entities", nlp.named_entities, (s_text,)],
    ["noun_chunks", nlp.noun_chunks, (s_text,)],
    ["infer_lang", nlp.infer_lang, (s_text,)],
]

test_cases_preprocessing = [
Expand Down
124 changes: 124 additions & 0 deletions tests/test_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,127 @@ def test_count_sentences_wrong_index(self):
t_different_index = pd.Series(["", ""], index=[5, 7])

self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))

def test_infer_lang(self):
    # One (sample text, expected language code) pair per language.
    # Keeping text and expectation side by side makes it obvious which
    # sample belongs to which code.
    #
    # Three languages are left out because no sample was found that the
    # function detects correctly:
    #   "धन्यवाद"               -> hi
    #   "Lijepa naša domovino"  -> hr
    #   "Italiano"              -> it
    #
    # NOTE(review): the infer_lang docstring states the function is
    # nondeterministic, so an exact comparison on such short samples may
    # be flaky - confirm whether the detector should be seeded here.
    samples = [
        ("Wêreld", "af"),
        ("مرحبا بالعالم", "ar"),
        ("български", "bg"),
        ("ওহে বিশ্ব", "bn"),
        ("català", "ca"),
        ("Ahoj světe", "cs"),
        ("Helo Byd", "cy"),
        ("dansk", "da"),
        ("Deutsch", "de"),
        ("Γειά σου Κόσμε", "el"),
        ("fox", "en"),
        ("Hola Mundo", "es"),
        ("Tere, Maailm", "et"),
        ("فارسی", "fa"),
        ("Hei maailma", "fi"),
        ("Bonjour le monde", "fr"),
        ("હેલો વર્લ્ડ", "gu"),
        ("שלום עולם", "he"),
        ("Helló Világ", "hu"),
        ("Bahasa", "id"),
        ("こんにちは世界", "ja"),
        ("ಹಲೋ ವರ್ಲ್ಡ್", "kn"),
        ("안녕하세요 세계", "ko"),
        ("lietuvių kalba", "lt"),
        ("Sveika pasaule", "lv"),
        ("Здраво свету", "mk"),
        ("ഹലോ വേൾഡ്", "ml"),
        ("मराठी", "mr"),
        ("नेपाली", "ne"),
        ("Vlaams", "nl"),
        ("Norsk", "no"),
        ("ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ ਦੁਨਿਆ", "pa"),
        ("Witaj świecie", "pl"),
        ("Olá Mundo", "pt"),
        ("Română", "ro"),
        ("русский", "ru"),
        ("Slovenský", "sk"),
        ("Pozdravljen, svet", "sl"),
        ("Soomaaliga", "so"),
        ("Përshendetje Botë", "sq"),
        ("Hej världen", "sv"),
        ("Kiswahili", "sw"),
        ("வணக்கம் உலகம்", "ta"),
        ("హలో ప్రపంచ", "te"),
        ("สวัสดีชาวโลก", "th"),
        ("Wikang Tagalog", "tl"),
        ("Selam Dünya", "tr"),
        ("Привіт Світ", "uk"),
        ("ہیلو دنیا", "ur"),
        ("Chào thế giới", "vi"),
        ("中文", "zh-cn"),
        ("中華民國國歌", "zh-tw"),
    ]

    texts = [text for text, _ in samples]
    expected_codes = [code for _, code in samples]

    s_result = nlp.infer_lang(pd.Series(texts))
    self.assertEqual(s_result, pd.Series(expected_codes))
71 changes: 71 additions & 0 deletions texthero/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

import spacy
import pandas as pd
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
from langdetect.language import Language


def named_entities(s, package="spacy"):
Expand Down Expand Up @@ -129,3 +132,71 @@ def count_sentences(s: pd.Series) -> pd.Series:
number_of_sentences.append(sentences)

return pd.Series(number_of_sentences, index=s.index)


def _Language_to_tuple(lang: Language):
return (str(lang.lang), "%.5f" % float(lang.prob))


def _detect_language_probability(s):
"""
gured out appling detect_langs function on sentence
:param s
"""
try:
detected_language = list(map(_Language_to_tuple, detect_langs(s)))
return detected_language
except LangDetectException:
return ("UNKNOWN", 0.0)


def _detect_language(s):
"""
gured out appling detect_langs function on sentence
:param s
"""
try:
detected_language = str(detect_langs(s)[0].lang)
return detected_language
except LangDetectException:
return "UNKNOWN"


def infer_lang(s, probability=False):
    """
    Infer the language of every text in a Pandas Series.

    By default each row of the result holds the language code of the
    detected language. With ``probability=True`` each row instead holds the
    candidates as ``(language, probability)`` tuples. Rows for which
    detection fails are marked ``"UNKNOWN"``.

    Parameters
    ----------
    s : Pandas Series
        The texts to analyse.
    probability : bool, optional
        When true, return the candidates with their probabilities instead
        of a single language code. Defaults to False.

    Returns
    -------
    Pandas Series
        The result of applying the detection to every element of ``s``.

    Notes
    -----
    The detection is nondeterministic: the same text may yield different
    results between runs.

    55 languages are supported out of the box (ISO 639-1 codes):

    af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
    hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
    pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi,
    zh-cn, zh-tw

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is an English text!.")
    >>> hero.infer_lang(s)
    0    en
    dtype: object

    """
    # Choose the per-row detector once, then map it over the Series.
    detector = _detect_language_probability if probability else _detect_language
    return s.apply(detector)