jbesomi · tmankita · Jul 14, 2020 · Jul 14, 2020 · Jul 14, 2020 · Jul 14, 2020
diff --git a/setup.cfg b/setup.cfg
@@ -30,6 +30,7 @@ install_requires =
     numpy>=1.17
     scikit-learn>=0.22
     spacy>=2.2.2
+    langdetect>=1.0.7
     tqdm>=4.3
     nltk>=3.3
     plotly>=4.2.0

diff --git a/tests/test_indexes.py b/tests/test_indexes.py
@@ -1,10 +1,11 @@
 import pandas as pd
+from parameterized import parameterized
+
 from texthero import nlp, visualization, preprocessing, representation
 
 from . import PandasTestCase
 import unittest
 import string
-from parameterized import parameterized
 
 
 # Define valid inputs for different functions.
@@ -25,6 +26,7 @@
 test_cases_nlp = [
     ["named_entities", nlp.named_entities, (s_text,)],
     ["noun_chunks", nlp.noun_chunks, (s_text,)],
+    ["infer_lang", nlp.infer_lang, (s_text,)],
 ]
 
 test_cases_preprocessing = [

diff --git a/tests/test_nlp.py b/tests/test_nlp.py
@@ -68,3 +68,127 @@ def test_count_sentences_wrong_index(self):
         t_different_index = pd.Series(["", ""], index=[5, 7])
 
         self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))
+
+    def test_infer_lang(self):
+        # no found words in the following languages it, hr and hi that the function succeeds to detect.
+        s = pd.Series(
+            [
+                "Wêreld",
+                "مرحبا بالعالم",
+                "български",
+                "ওহে বিশ্ব",
+                "català",
+                "Ahoj světe",
+                "Helo Byd",
+                "dansk",
+                "Deutsch",
+                "Γειά σου Κόσμε",
+                "fox",
+                "Hola Mundo",
+                "Tere, Maailm",
+                "فارسی",
+                "Hei maailma",
+                "Bonjour le monde",
+                "હેલો વર્લ્ડ",
+                "שלום עולם",
+                "Helló Világ",
+                "Bahasa",
+                "こんにちは世界",
+                "ಹಲೋ ವರ್ಲ್ಡ್",
+                "안녕하세요 세계",
+                "lietuvių kalba",
+                "Sveika pasaule",
+                "Здраво свету",
+                "ഹലോ വേൾഡ്",
+                "मराठी",
+                "नेपाली",
+                "Vlaams",
+                "Norsk",
+                "ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ ਦੁਨਿਆ",
+                "Witaj świecie",
+                "Olá Mundo",
+                "Română",
+                "русский",
+                "Slovenský",
+                "Pozdravljen, svet",
+                "Soomaaliga",
+                "Përshendetje Botë",
+                "Hej världen",
+                "Kiswahili",
+                "வணக்கம் உலகம்",
+                "హలో ప్రపంచ",
+                "สวัสดีชาวโลก",
+                "Wikang Tagalog",
+                "Selam Dünya",
+                "Привіт Світ",
+                "ہیلو دنیا",
+                "Chào thế giới",
+                "中文",
+                "中華民國國歌",
+                # "धन्यवाद",
+                # "Lijepa naša domovino",
+                # "Italiano",
+            ]
+        )
+
+        s_true = pd.Series(
+            [
+                "af",
+                "ar",
+                "bg",
+                "bn",
+                "ca",
+                "cs",
+                "cy",
+                "da",
+                "de",
+                "el",
+                "en",
+                "es",
+                "et",
+                "fa",
+                "fi",
+                "fr",
+                "gu",
+                "he",
+                "hu",
+                "id",
+                "ja",
+                "kn",
+                "ko",
+                "lt",
+                "lv",
+                "mk",
+                "ml",
+                "mr",
+                "ne",
+                "nl",
+                "no",
+                "pa",
+                "pl",
+                "pt",
+                "ro",
+                "ru",
+                "sk",
+                "sl",
+                "so",
+                "sq",
+                "sv",
+                "sw",
+                "ta",
+                "te",
+                "th",
+                "tl",
+                "tr",
+                "uk",
+                "ur",
+                "vi",
+                "zh-cn",
+                "zh-tw",
+                # 'hi',
+                # 'hr',
+                # 'it'
+            ]
+        )
+        s_result = nlp.infer_lang(s)
+        self.assertEqual(s_result, s_true)
diff --git a/texthero/nlp.py b/texthero/nlp.py
@@ -4,6 +4,9 @@
 
 import spacy
 import pandas as pd
+from langdetect import detect_langs
+from langdetect.lang_detect_exception import LangDetectException
+from langdetect.language import Language
 
 
 def named_entities(s, package="spacy"):
@@ -129,3 +132,71 @@ def count_sentences(s: pd.Series) -> pd.Series:
         number_of_sentences.append(sentences)
 
     return pd.Series(number_of_sentences, index=s.index)
+
+
+def _Language_to_tuple(lang: Language):
+    return (str(lang.lang), "%.5f" % float(lang.prob))
+
+
+def _detect_language_probability(s):
+    """
+    gured out appling detect_langs function on sentence
+    :param s
+    """
+    try:
+        detected_language = list(map(_Language_to_tuple, detect_langs(s)))
+        return detected_language
+    except LangDetectException:
+        return ("UNKNOWN", 0.0)
+
+
+def _detect_language(s):
+    """
+    gured out appling detect_langs function on sentence
+    :param s
+    """
+    try:
+        detected_language = str(detect_langs(s)[0].lang)
+        return detected_language
+    except LangDetectException:
+        return "UNKNOWN"
+
+
+def infer_lang(s, probability=False):
+    """
+    Return languages and their probabilities.
+
+    Return a Pandas Series where each row contains a ISO nomenclature of the "average" infer language.
+
+    If probability = True then each row contains a list of tuples
+
+    Tuple : (language, probability)
+
+    Note: infer_lang is nondeterministic function
+
+    Parameters
+    ----------
+    s : Pandas Series
+    probability (optional) : boolean
+
+    supports 55 languages out of the box (ISO 639-1 codes)
+    ------------------------------------------------------
+    af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
+    hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
+    pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series("This is an English text!.")
+    >>> hero.infer_lang(s)
+    0    en
+    dtype: object
+
+    """
+
+    if probability:
+        return s.apply(_detect_language_probability)
+    else:
+        return s.apply(_detect_language)