From a18361e8886c4019143386ba853eca7738e4eabd Mon Sep 17 00:00:00 2001 From: Tomer Mankita Date: Tue, 14 Jul 2020 03:31:15 +0300 Subject: [PATCH 1/6] Add infer_lang function --- tests/test_nlp.py | 5 +++ texthero/nlp.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 2df9db61..3e3f9225 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -68,3 +68,8 @@ def test_count_sentences_wrong_index(self): t_different_index = pd.Series(["", ""], index=[5, 7]) self.assertFalse(counted_sentences_s.index.equals(t_different_index.index)) + + def test_infer_lang(self): + s = pd.Series("This is an English text!.") + s_true = pd.Series([("en", 0.9999980507990403)]) + self.assertEqual(nlp.infer_lang(s), s_true) diff --git a/texthero/nlp.py b/texthero/nlp.py index 52956d5c..ece427d5 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -4,6 +4,11 @@ import spacy import pandas as pd +from spacy_langdetect import LanguageDetector +from langdetect import detect_langs +from langdetect.lang_detect_exception import LangDetectException +import functools +import operator def named_entities(s, package="spacy"): @@ -129,3 +134,85 @@ def count_sentences(s: pd.Series) -> pd.Series: number_of_sentences.append(sentences) return pd.Series(number_of_sentences, index=s.index) + + +def foldl(func, acc, xs): + """ + func(func(func(acc,xs[0]),xs[1])....xs[n]) + + :param func: (T, T) -> T + :param acc: T + :param xs: list of T + """ + return functools.reduce(func, xs, acc) + + +def padding(l, size): + """ + all the tuples in the list will be None padding (size - len(l)) times + :param l: list of tuples + :param size: target size + :return: + """ + curr_size = len(l) + diff = size - curr_size + for t in l: + for i in range(2 * diff): + t += None + + +def detect_language(spacy_object): + """ + gured out appling detect_langs function on spacy_object + :param spacy_object + """ + try: + detected_language = detect_langs(spacy_object.text) + res = {} + for it in detected_language: + res[str(it.lang)] = float(it.prob) + return {"result": res} + except LangDetectException: + return {"UNKNOWN": 0.0} + + +def infer_lang(s): + """ + Return languages and their probabilities. + + Return a Pandas Series where each row contains a tuple that has information regarding to the infer languages. + + Tuple: ( `language_1`, `probability_1`, ...) + + Note: If exist row that has more then one language the return Pandas Series will be pad with None + + Parameters + ---------- + input : Pandas Series + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series("This is an English text!.") + >>> hero.infer_lang(s) + 0 (en, 0.9999980507990403) + dtype: object + """ + + infer_languages = [] + max_list_size = 0 + + nlp = spacy.load("en_core_web_sm") + nlp.add_pipe(LanguageDetector(detect_language), name="language_detector", last=True) + + for doc in nlp.pipe(s.values, batch_size=32): + l = list(doc._.language["result"].items()) + curr_size = len(l) + l = foldl(operator.add, (), l) + if max_list_size < curr_size: + padding(infer_languages, curr_size) + max_list_size = curr_size + infer_languages.append(l) + + return pd.Series(infer_languages, index=s.index) From e00706ed8d3c2af25ca9191bd350753bd64559f5 Mon Sep 17 00:00:00 2001 From: Tomer Mankita Date: Tue, 14 Jul 2020 12:37:31 +0300 Subject: [PATCH 2/6] Add infer_lang to test_cases_nlp in test_indexes Update detect_language function Add padding_tuple function --- tests/test_indexes.py | 4 +++- tests/test_nlp.py | 6 ++++-- texthero/nlp.py | 35 +++++++++++++++++++++++++++-------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index adc08008..67900051 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -1,10 +1,11 @@ import pandas as pd +from parameterized import parameterized + from texthero import nlp, visualization, preprocessing, representation from . import PandasTestCase import unittest import string -from parameterized import parameterized # Define valid inputs for different functions. @@ -25,6 +26,7 @@ test_cases_nlp = [ ["named_entities", nlp.named_entities, (s_text,)], ["noun_chunks", nlp.noun_chunks, (s_text,)], + ["infer_lang", nlp.infer_lang, (s_text,)], ] test_cases_preprocessing = [ diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 3e3f9225..139161fa 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -71,5 +71,7 @@ def test_count_sentences_wrong_index(self): def test_infer_lang(self): s = pd.Series("This is an English text!.") - s_true = pd.Series([("en", 0.9999980507990403)]) - self.assertEqual(nlp.infer_lang(s), s_true) + s_true = pd.Series([("en", "0.99999")]) + s_result = nlp.infer_lang(s) + self.assertEqual(s_result, s_true) + diff --git a/texthero/nlp.py b/texthero/nlp.py index ece427d5..52e5a03b 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -147,7 +147,7 @@ def foldl(func, acc, xs): return functools.reduce(func, xs, acc) -def padding(l, size): +def padding_list(l, size): """ all the tuples in the list will be None padding (size - len(l)) times :param l: list of tuples @@ -157,8 +157,21 @@ def padding(l, size): curr_size = len(l) diff = size - curr_size for t in l: - for i in range(2 * diff): - t += None + padding_tuple(t, 2*diff) + + +def padding_tuple(t, size): + """ + The tuple will be None padding size times + :param t: list of tuples + :param size: target size + :return: + """ + curr_size = len(t) + if curr_size < size: + while curr_size != size: + t+=(None) + curr_size+=1 def detect_language(spacy_object): @@ -170,7 +183,11 @@ def detect_language(spacy_object): detected_language = detect_langs(spacy_object.text) res = {} for it in detected_language: - res[str(it.lang)] = float(it.prob) + prob_str = str(it.prob) + parts = prob_str.split('.') + integer = parts[0] + digits = parts[1][0:5] + res[str(it.lang)] = integer + '.' + digits return {"result": res} except LangDetectException: return {"UNKNOWN": 0.0} @@ -196,7 +213,7 @@ def infer_lang(s): >>> import pandas as pd >>> s = pd.Series("This is an English text!.") >>> hero.infer_lang(s) - 0 (en, 0.9999980507990403) + 0 (en, 0.99999) dtype: object """ @@ -209,10 +226,12 @@ def infer_lang(s): for doc in nlp.pipe(s.values, batch_size=32): l = list(doc._.language["result"].items()) curr_size = len(l) - l = foldl(operator.add, (), l) + t = foldl(operator.add, (), l) if max_list_size < curr_size: - padding(infer_languages, curr_size) + padding_list(infer_languages, curr_size) max_list_size = curr_size - infer_languages.append(l) + elif curr_size < max_list_size: + padding_tuple(t,max_list_size) + infer_languages.append(t) return pd.Series(infer_languages, index=s.index) From fa35af000f3cc6b14ca893563f762965feadd57b Mon Sep 17 00:00:00 2001 From: Tomer Mankita Date: Tue, 14 Jul 2020 12:39:24 +0300 Subject: [PATCH 3/6] reformat by format.sh --- tests/test_nlp.py | 1 - texthero/nlp.py | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 139161fa..606b9cd6 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -74,4 +74,3 @@ def test_infer_lang(self): s_true = pd.Series([("en", "0.99999")]) s_result = nlp.infer_lang(s) self.assertEqual(s_result, s_true) - diff --git a/texthero/nlp.py b/texthero/nlp.py index 52e5a03b..333e6db5 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -157,7 +157,7 @@ def padding_list(l, size): curr_size = len(l) diff = size - curr_size for t in l: - padding_tuple(t, 2*diff) + padding_tuple(t, 2 * diff) def padding_tuple(t, size): @@ -170,8 +170,8 @@ def padding_tuple(t, size): curr_size = len(t) if curr_size < size: while curr_size != size: - t+=(None) - curr_size+=1 + t += None + curr_size += 1 def detect_language(spacy_object): @@ -184,10 +184,10 @@ def detect_language(spacy_object): res = {} for it in detected_language: prob_str = str(it.prob) - parts = prob_str.split('.') + parts = prob_str.split(".") integer = parts[0] digits = parts[1][0:5] - res[str(it.lang)] = integer + '.' + digits + res[str(it.lang)] = integer + "." + digits return {"result": res} except LangDetectException: return {"UNKNOWN": 0.0} @@ -231,7 +231,7 @@ def infer_lang(s): padding_list(infer_languages, curr_size) max_list_size = curr_size elif curr_size < max_list_size: - padding_tuple(t,max_list_size) + padding_tuple(t, max_list_size) infer_languages.append(t) return pd.Series(infer_languages, index=s.index) From c5b28691ec48db697e78f28e500b125712a46b31 Mon Sep 17 00:00:00 2001 From: Tomer Mankita Date: Tue, 14 Jul 2020 18:20:36 +0300 Subject: [PATCH 4/6] -Remove for and while loops -Add more extensively test -Remove unnecessary code -Update infer_lang documentation --- setup.cfg | 2 + tests/test_nlp.py | 19 ++++++- texthero/nlp.py | 123 +++++++++++++++++++++------------------------- 3 files changed, 76 insertions(+), 68 deletions(-) diff --git a/setup.cfg b/setup.cfg index dff1260d..a29ff034 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,8 @@ install_requires = numpy>=1.17 scikit-learn>=0.22 spacy>=2.2.2 + spacy-langdetect>=0.1.2 + langdetect>=1.0.7 tqdm>=4.3 nltk>=3.3 plotly>=4.2.0 diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 606b9cd6..4baa64c7 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -70,7 +70,22 @@ def test_count_sentences_wrong_index(self): self.assertFalse(counted_sentences_s.index.equals(t_different_index.index)) def test_infer_lang(self): - s = pd.Series("This is an English text!.") - s_true = pd.Series([("en", "0.99999")]) + + s = pd.Series( + [ + "This is English text.", + " Er lebt mit seinen Eltern und seiner Schwester in Berlin.", + " Yo me divierto todos los días en el parque. ", + "Je m'appelle Angélica Summer, j'ai 12 ans et je suis canadienne.", + ] + ) + s_true = pd.Series( + [ + ("en", "%.5f" % float(1)), + ("de", "%.5f" % float(1)), + ("es", "%.5f" % float(1)), + ("fr", "%.5f" % float(1)), + ] + ) s_result = nlp.infer_lang(s) self.assertEqual(s_result, s_true) diff --git a/texthero/nlp.py b/texthero/nlp.py index 333e6db5..dfd15c11 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -7,8 +7,7 @@ from spacy_langdetect import LanguageDetector from langdetect import detect_langs from langdetect.lang_detect_exception import LangDetectException -import functools -import operator +from langdetect.language import Language def named_entities(s, package="spacy"): @@ -136,76 +135,78 @@ def count_sentences(s: pd.Series) -> pd.Series: return pd.Series(number_of_sentences, index=s.index) -def foldl(func, acc, xs): - """ - func(func(func(acc,xs[0]),xs[1])....xs[n]) - - :param func: (T, T) -> T - :param acc: T - :param xs: list of T - """ - return functools.reduce(func, xs, acc) - +def _Language_to_dict(lang: Language): + return (str(lang.lang), "%.5f" % float(lang.prob)) -def padding_list(l, size): - """ - all the tuples in the list will be None padding (size - len(l)) times - :param l: list of tuples - :param size: target size - :return: - """ - curr_size = len(l) - diff = size - curr_size - for t in l: - padding_tuple(t, 2 * diff) - -def padding_tuple(t, size): +def _detect_language_list(spaCy_object): """ - The tuple will be None padding size times - :param t: list of tuples - :param size: target size - :return: + gured out appling detect_langs function on spacy_object + :param spacy_object """ - curr_size = len(t) - if curr_size < size: - while curr_size != size: - t += None - curr_size += 1 + try: + detected_language = list( + map(_Language_to_dict, detect_langs(spaCy_object.text)) + ) + return detected_language + except LangDetectException: + return ("UNKNOWN", 0.0) -def detect_language(spacy_object): +def _detect_language(spaCy_object): """ gured out appling detect_langs function on spacy_object :param spacy_object """ try: - detected_language = detect_langs(spacy_object.text) - res = {} - for it in detected_language: - prob_str = str(it.prob) - parts = prob_str.split(".") - integer = parts[0] - digits = parts[1][0:5] - res[str(it.lang)] = integer + "." + digits - return {"result": res} + detected_language = _Language_to_dict(detect_langs(spaCy_object.text)[0]) + return detected_language except LangDetectException: - return {"UNKNOWN": 0.0} + return ("UNKNOWN", 0.0) + + +def _infer_lang_ret_list(s, nlp, infer_languages): + nlp.add_pipe( + LanguageDetector(_detect_language_list), name="language_detector", last=True + ) + for doc in nlp.pipe(s.values, batch_size=32): + infer_languages.append(doc._.language) + return pd.Series(infer_languages, index=s.index) + + +def _infer_lang(s, nlp, infer_languages): + nlp.add_pipe( + LanguageDetector(_detect_language), name="language_detector", last=True + ) + for doc in nlp.pipe(s.values, batch_size=32): + infer_languages.append(doc._.language) -def infer_lang(s): + return pd.Series(infer_languages, index=s.index) + + +def infer_lang(s, ret_list=False): """ Return languages and their probabilities. - Return a Pandas Series where each row contains a tuple that has information regarding to the infer languages. + Return a Pandas Series where each row contains a tuple that has information regarding to the "average" infer language. + + Tuple : (language, probability) - Tuple: ( `language_1`, `probability_1`, ...) + If ret_list = True then each row contains a list of tuples - Note: If exist row that has more then one language the return Pandas Series will be pad with None + Note: infer_lang is nondeterministic function Parameters ---------- - input : Pandas Series + s : Pandas Series + ret_list (optional) : boolean + + supports 55 languages out of the box (ISO 639-1 codes) + ------------------------------------------------------ + af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he, + hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl, + pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw Examples -------- @@ -213,25 +214,15 @@ def infer_lang(s): >>> import pandas as pd >>> s = pd.Series("This is an English text!.") >>> hero.infer_lang(s) - 0 (en, 0.99999) + 0 (en, 1.00000) dtype: object + """ infer_languages = [] - max_list_size = 0 - nlp = spacy.load("en_core_web_sm") - nlp.add_pipe(LanguageDetector(detect_language), name="language_detector", last=True) - for doc in nlp.pipe(s.values, batch_size=32): - l = list(doc._.language["result"].items()) - curr_size = len(l) - t = foldl(operator.add, (), l) - if max_list_size < curr_size: - padding_list(infer_languages, curr_size) - max_list_size = curr_size - elif curr_size < max_list_size: - padding_tuple(t, max_list_size) - infer_languages.append(t) - - return pd.Series(infer_languages, index=s.index) + if ret_list: + return _infer_lang_ret_list(s, nlp, infer_languages) + else: + return _infer_lang(s, nlp, infer_languages) From c284cda835b3ac071c2bb1ee8781e34dadc79e99 Mon Sep 17 00:00:00 2001 From: Tomer Mankita Date: Fri, 17 Jul 2020 14:51:33 +0300 Subject: [PATCH 5/6] - Change implementation to use pandas apply -Add test for each single ISO code -Change name ret_list to probability - Change name _Language_to_dict to _Language_to_tuple --- setup.cfg | 1 - tests/test_nlp.py | 121 ++++++++++++++++++++++++++++++++++++++++++---- texthero/nlp.py | 63 ++++++++---------------- 3 files changed, 131 insertions(+), 54 deletions(-) diff --git a/setup.cfg b/setup.cfg index a29ff034..835f55b1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,6 @@ install_requires = numpy>=1.17 scikit-learn>=0.22 spacy>=2.2.2 - spacy-langdetect>=0.1.2 langdetect>=1.0.7 tqdm>=4.3 nltk>=3.3 diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 4baa64c7..7a2c3066 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -70,21 +70,124 @@ def test_count_sentences_wrong_index(self): self.assertFalse(counted_sentences_s.index.equals(t_different_index.index)) def test_infer_lang(self): - + # no found words in the following languages it, hr and hi that the function succeeds to detect. s = pd.Series( [ - "This is English text.", - " Er lebt mit seinen Eltern und seiner Schwester in Berlin.", - " Yo me divierto todos los días en el parque. ", - "Je m'appelle Angélica Summer, j'ai 12 ans et je suis canadienne.", + "Wêreld", + "مرحبا بالعالم", + "български", + "ওহে বিশ্ব", + "català", + "Ahoj světe", + "Helo Byd", + "dansk", + "Deutsch", + "Γειά σου Κόσμε", + "fox", + "Hola Mundo", + "Tere, Maailm", + "فارسی", + "Hei maailma", + "Bonjour le monde", + "હેલો વર્લ્ડ", + "שלום עולם", + "Helló Világ", + "Bahasa", + "こんにちは世界", + "ಹಲೋ ವರ್ಲ್ಡ್", + "안녕하세요 세계", + "lietuvių kalba", + "Sveika pasaule", + "Здраво свету", + "ഹലോ വേൾഡ്", + "मराठी", + "नेपाली", + "Vlaams", + "Norsk", + "ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ ਦੁਨਿਆ", + "Witaj świecie", + "Olá Mundo", + "Română", + "русский", + "Slovenský", + "Pozdravljen, svet", + "Soomaaliga", + "Përshendetje Botë", + "Hej världen", + "Kiswahili", + "வணக்கம் உலகம்", + "హలో ప్రపంచ", + "สวัสดีชาวโลก", + "Wikang Tagalog", + "Selam Dünya", + "Привіт Світ", + "ہیلو دنیا", + "Chào thế giới", + "中文", + "中華民國國歌", + # "धन्यवाद", + # "Lijepa naša domovino", + # "Italiano", ] ) + s_true = pd.Series( [ - ("en", "%.5f" % float(1)), - ("de", "%.5f" % float(1)), - ("es", "%.5f" % float(1)), - ("fr", "%.5f" % float(1)), + "af", + "ar", + "bg", + "bn", + "ca", + "cs", + "cy", + "da", + "de", + "el", + "en", + "es", + "et", + "fa", + "fi", + "fr", + "gu", + "he", + "hu", + "id", + "ja", + "kn", + "ko", + "lt", + "lv", + "mk", + "ml", + "mr", + "ne", + "nl", + "no", + "pa", + "pl", + "pt", + "ro", + "ru", + "sk", + "sl", + "so", + "sq", + "sv", + "sw", + "ta", + "te", + "th", + "tl", + "tr", + "uk", + "ur", + "vi", + "zh-cn", + "zh-tw", + # 'hi', + # 'hr', + # 'it' ] ) s_result = nlp.infer_lang(s) diff --git a/texthero/nlp.py b/texthero/nlp.py index dfd15c11..3ed7ce01 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -135,72 +135,50 @@ def count_sentences(s: pd.Series) -> pd.Series: return pd.Series(number_of_sentences, index=s.index) -def _Language_to_dict(lang: Language): +def _Language_to_tuple(lang: Language): return (str(lang.lang), "%.5f" % float(lang.prob)) -def _detect_language_list(spaCy_object): +def _detect_language_probability(s): """ - gured out appling detect_langs function on spacy_object - :param spacy_object + gured out appling detect_langs function on sentence + :param s """ try: - detected_language = list( - map(_Language_to_dict, detect_langs(spaCy_object.text)) - ) + detected_language = list(map(_Language_to_tuple, detect_langs(s))) return detected_language except LangDetectException: return ("UNKNOWN", 0.0) -def _detect_language(spaCy_object): +def _detect_language(s): """ - gured out appling detect_langs function on spacy_object - :param spacy_object + gured out appling detect_langs function on sentence + :param s """ try: - detected_language = _Language_to_dict(detect_langs(spaCy_object.text)[0]) + detected_language = str(detect_langs(s)[0].lang) return detected_language except LangDetectException: - return ("UNKNOWN", 0.0) + return "UNKNOWN" -def _infer_lang_ret_list(s, nlp, infer_languages): - nlp.add_pipe( - LanguageDetector(_detect_language_list), name="language_detector", last=True - ) - for doc in nlp.pipe(s.values, batch_size=32): - infer_languages.append(doc._.language) - - return pd.Series(infer_languages, index=s.index) - - -def _infer_lang(s, nlp, infer_languages): - nlp.add_pipe( - LanguageDetector(_detect_language), name="language_detector", last=True - ) - for doc in nlp.pipe(s.values, batch_size=32): - infer_languages.append(doc._.language) - - return pd.Series(infer_languages, index=s.index) - - -def infer_lang(s, ret_list=False): +def infer_lang(s, probability=False): """ Return languages and their probabilities. - Return a Pandas Series where each row contains a tuple that has information regarding to the "average" infer language. + Return a Pandas Series where each row contains a ISO nomenclature of the "average" infer language. - Tuple : (language, probability) + If probability = True then each row contains a list of tuples - If ret_list = True then each row contains a list of tuples + Tuple : (language, probability) Note: infer_lang is nondeterministic function Parameters ---------- s : Pandas Series - ret_list (optional) : boolean + probability (optional) : boolean supports 55 languages out of the box (ISO 639-1 codes) ------------------------------------------------------ @@ -214,15 +192,12 @@ def infer_lang(s, ret_list=False): >>> import pandas as pd >>> s = pd.Series("This is an English text!.") >>> hero.infer_lang(s) - 0 (en, 1.00000) + 0 en dtype: object """ - infer_languages = [] - nlp = spacy.load("en_core_web_sm") - - if ret_list: - return _infer_lang_ret_list(s, nlp, infer_languages) + if probability: + return s.apply(_detect_language_probability) else: - return _infer_lang(s, nlp, infer_languages) + return s.apply(_detect_language) From 91c17a086436c3c87dd0725d5a9479b7c601a79f Mon Sep 17 00:00:00 2001 From: Tomer Mankita Date: Fri, 17 Jul 2020 14:59:47 +0300 Subject: [PATCH 6/6] Remove unnecessary imports --- texthero/nlp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/texthero/nlp.py b/texthero/nlp.py index 3ed7ce01..85404b34 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -4,7 +4,6 @@ import spacy import pandas as pd -from spacy_langdetect import LanguageDetector from langdetect import detect_langs from langdetect.lang_detect_exception import LangDetectException from langdetect.language import Language