From ca97cd958c1cf311a2e52300683d016062df1851 Mon Sep 17 00:00:00 2001 From: AudayBerro Date: Tue, 31 Aug 2021 19:37:00 +0200 Subject: [PATCH 1/7] Added multi_pivot_paraphrases_generation transformation --- .../README.md | 44 +++ .../__init__.py | 1 + .../constants.py | 16 + .../easy_nmt.py | 22 ++ .../requirements.txt | 5 + .../test.json | 206 ++++++++++++ .../transformation.py | 308 ++++++++++++++++++ .../use_filter.py | 51 +++ 8 files changed, 653 insertions(+) create mode 100644 transformations/multi_pivot_paraphrases_generation/README.md create mode 100644 transformations/multi_pivot_paraphrases_generation/__init__.py create mode 100644 transformations/multi_pivot_paraphrases_generation/constants.py create mode 100644 transformations/multi_pivot_paraphrases_generation/easy_nmt.py create mode 100644 transformations/multi_pivot_paraphrases_generation/requirements.txt create mode 100644 transformations/multi_pivot_paraphrases_generation/test.json create mode 100644 transformations/multi_pivot_paraphrases_generation/transformation.py create mode 100644 transformations/multi_pivot_paraphrases_generation/use_filter.py diff --git a/transformations/multi_pivot_paraphrases_generation/README.md b/transformations/multi_pivot_paraphrases_generation/README.md new file mode 100644 index 000000000..4e521c053 --- /dev/null +++ b/transformations/multi_pivot_paraphrases_generation/README.md @@ -0,0 +1,44 @@ +# From one English Snetnece to a list of paraphrases 🦎 + ⌨️ → 🐍 +This transformation generates a list of paraphrases for an English sentence by leveraging Pivot-Transaltion approach. +Pivot-Transaltion is an approach where a sentence in a source language is translated to a foreign language called the pivot language then translated back to the source language to get a paraprhase candidate, e.g. translate an English sentence to French, then translate back to English. 
+ +The paraphrases generation is divided into two steps: +- Step 1: paraphrases Candidate Over-generation by leveraging Pivot-Translation. At this step, we generate a Pool of possible paraphrases. +- Step 2: apply a candidate selection over the Pool of paraphrases, since the pool can contain semantically unrelated or duplicate paraphrases. + We leverage an Embedding Model such as Universal Sentence Encoder (USE) to disqualify candidate paraphrases from the pool, by computing the Cosine Similarity scores of the + USE Embeddings between the reference sentence and the candidate paraphrase. Let R = USE_Embedding(reference_english_sentence) and P = USE_Embedding(candidate): + - if Cosine(R,P) < alpha => the candidate is semantically unrelated and then removed from the final list of paraphrases + - if Cosine(R,P) > beta => the candidate is a duplication and then removed from the final list of paraphrases + - By default Alpha=0.5 and Beta=0.95, we set the values as suggested by the work of [Parikh et al.](https://arxiv.org/pdf/2004.03484.pdf) + +Please refer to the test.json for all of the test cases covered. + +This transformation translates an English sentence to a list of predefined languages using Huggingface MarianMT and EasyNMT as Machine Translation models. +- The transformation supports two Pivot-Translation levels. + - If Pivot-level = 1 => Translate to only one foreign language. e.g. English -> French -> English || English -> Arabic -> English || English -> Japanese -> English + - If Pivot-level = 2 => Translate to two foreign languages. e.g. English -> French -> Arabic -> English || English -> Russian -> Chinese -> English + +Author name: Auday Berro (audayberro@gmail.com) + +## What type of a transformation is this? +This transformation is a paraphrase generation for Natural English Sentences by leveraging Pivot-Translation techniques. The Pivot-Translation technique yields lexically and syntactically diverse paraphrases. + +## What tasks does it intend to benefit?
+This transformation would benefit all tasks with a sentence as input like question generation, sentence generation, etc. + +## What are the limitations of this transformation? + +1. The transformation does not generate paraphrases for non-English sentences, e.g. it cannot generate paraphrases for German or Chinese sentences. + +2. This transformation only generates paraphrases for Natural Language English sentences. + +## Previous Work + + +1) This work is partly inspired by the following work on automated utterance paraphrasing: +```bibtex +@article{berroextensible, + title={An Extensible and Reusable Pipeline for Automated Utterance Paraphrases}, + author={Berro, Auday and Zade, Mohammad-Ali Yaghub and Baez, Marcos and Benatallah, Boualem and Benabdeslem, Khalid} +} +``` \ No newline at end of file diff --git a/transformations/multi_pivot_paraphrases_generation/__init__.py b/transformations/multi_pivot_paraphrases_generation/__init__.py new file mode 100644 index 000000000..930cdce0b --- /dev/null +++ b/transformations/multi_pivot_paraphrases_generation/__init__.py @@ -0,0 +1 @@ +from .transformation import * diff --git a/transformations/multi_pivot_paraphrases_generation/constants.py b/transformations/multi_pivot_paraphrases_generation/constants.py new file mode 100644 index 000000000..301271550 --- /dev/null +++ b/transformations/multi_pivot_paraphrases_generation/constants.py @@ -0,0 +1,16 @@ +# Hugging Face Marian Machine Translator Model to load.
Set of Tuples in the form: tuple=(Source-2-target languages pairs, Huggingface MarianMT Helsinki-NLP model) +HUGGINGFACE_MARIANMT_MODELS_TO_LOAD = { + ('en2romance','Helsinki-NLP/opus-mt-en-ROMANCE'), + ('romance2en','Helsinki-NLP/opus-mt-ROMANCE-en'), + ('de2en','Helsinki-NLP/opus-mt-de-en'), + ('ru2en','Helsinki-NLP/opus-mt-ru-en'), + ('en2ar','Helsinki-NLP/opus-mt-en-ar'), + ('en2zh','Helsinki-NLP/opus-mt-en-zh'), + ('en2jap','Helsinki-NLP/opus-mt-en-jap'), + ('en2ru','Helsinki-NLP/opus-mt-en-ru'), + ('en2de','Helsinki-NLP/opus-mt-en-de'), + ('zh2en','Helsinki-NLP/opus-mt-zh-en') + } + + +EASYNMT_MODEL_NAME = 'm2m_100_418M' \ No newline at end of file diff --git a/transformations/multi_pivot_paraphrases_generation/easy_nmt.py b/transformations/multi_pivot_paraphrases_generation/easy_nmt.py new file mode 100644 index 000000000..97937464d --- /dev/null +++ b/transformations/multi_pivot_paraphrases_generation/easy_nmt.py @@ -0,0 +1,22 @@ +""" EasyNMT - Easy to use, state-of-the-art Neural Machine Translation - https://github.com/UKPLab/EasyNMT """ +from easynmt import EasyNMT + +def load_easynmt_model(model_name='m2m_100_418M'): + """ + EasyNMT model to load + :param model_name: name of the model to load - List of supported model visit: https://github.com/UKPLab/EasyNMT#available-models + :return EasyNMT Machine translation model + """ + + return EasyNMT(model_name) + +def get_easynmt_translation(sentence,model,target_lang,source_lang=None): + """ + Translate a sentence + :param sentence: sentence to translate + :param model: EasyNMT model + :param trg: Target language for the translation + :param source_lang: Source language for the translation. If None, determines the source languages automatically. 
+ :return Translated sentence + """ + return model.translate(sentence, source_lang=source_lang, target_lang=target_lang) \ No newline at end of file diff --git a/transformations/multi_pivot_paraphrases_generation/requirements.txt b/transformations/multi_pivot_paraphrases_generation/requirements.txt new file mode 100644 index 000000000..41ff4a0b5 --- /dev/null +++ b/transformations/multi_pivot_paraphrases_generation/requirements.txt @@ -0,0 +1,5 @@ +EasyNMT==2.0.1 +numpy==1.18.5 +scikit-learn==0.20.4 +tensorflow-hub==0.4.0 +transformers==4.5.1 diff --git a/transformations/multi_pivot_paraphrases_generation/test.json b/transformations/multi_pivot_paraphrases_generation/test.json new file mode 100644 index 000000000..6318d0ba8 --- /dev/null +++ b/transformations/multi_pivot_paraphrases_generation/test.json @@ -0,0 +1,206 @@ +{ + "type": "multi_pivot_paraphrases_generation", + "test_cases": [ + { + "class": "MultiPivotParaphrasesGeneration", + "inputs": { + "Reference sentence": "How does COVID-19 spread?" + }, + "outputs": [ + { + "Paraphrase": "How is COVID-19 disseminated?" + }, + { + "Paraphrase": "How is COVID-19 spread?" + }, + { + "Paraphrase": "How did COVID-19 spread?" + }, + { + "Paraphrase": "How is COVID-19 spreading?" + }, + { + "Paraphrase": "How does COVID-19 spread?" + } + ] + }, + { + "class": "MultiPivotParaphrasesGeneration", + "inputs": { + "Reference sentence": "Book a flight from Lyon to Sydney?" + }, + "outputs": [ + { + "Paraphrase": "To book a flight from Lyon to Sydney?" + }, + { + "Paraphrase": "Have you booked a flight from Lyon to Sydney?" + }, + { + "Paraphrase": "What is the journey from Lyon to Sydney?" + }, + { + "Paraphrase": "Book a flight from Lyon to Sydney?" + }, + { + "Paraphrase": "Are you booking a flight from Lyon to Sydney?" 
+ } + ] + }, + { + "class": "MultiPivotParaphrasesGeneration", + "inputs": { + "Reference sentence": "Reserve an Italian Restaurant near Paris" + }, + "outputs": [ + { + "Paraphrase": "Reserve an Italian restaurant near Paris" + }, + { + "Paraphrase": "Italian restaurants near Paris" + }, + { + "Paraphrase": "Book an Italian restaurant near Paris" + }, + { + "Paraphrase": "It's a reservation at the Italian restaurant near Paris." + }, + { + "Paraphrase": "Save the Italian restaurant near Paris." + } + ] + }, + { + "class": "MultiPivotParaphrasesGeneration", + "inputs": { + "Reference sentence": "how many 10 euros are worth in dollars" + }, + "outputs": [ + { + "Paraphrase": "how many 10 euros are worth in dollars" + }, + { + "Paraphrase": "how much 10 euros are worth in dollars" + }, + { + "Paraphrase": "10 Euros in Dollars." + }, + { + "Paraphrase": "How many Euros are worth in United States dollars?" + }, + { + "Paraphrase": "How much is 10 euros in dollars?" + }, + { + "Paraphrase": "how many 10 euros is worth in dollars" + }, + { + "Paraphrase": "how many 10 euros in dollars are worth" + } + ] + }, + { + "class": "MultiPivotParaphrasesGeneration", + "inputs": { + "Reference sentence": "which company makes the ipod?" + }, + "outputs": [ + { + "Paraphrase": "Which company is making iPods?" + }, + { + "Paraphrase": "What company does the iPod make?" + }, + { + "Paraphrase": "Which company does the ipod?" + }, + { + "Paraphrase": "What kind of company does an iPod?" + }, + { + "Paraphrase": "Which company manufactures ipods?" + }, + { + "Paraphrase": "What company does the iPod do?" + }, + { + "Paraphrase": "Which company makes the iPod?" + }, + { + "Paraphrase": "What company manufactures the ipod?" + } + ] + }, + { + "class": "MultiPivotParaphrasesGeneration", + "inputs": { + "Reference sentence": "what states does the connecticut river flow through?" + }, + "outputs": [ + { + "Paraphrase": "In what states does the connected river flow?" 
+ }, + { + "Paraphrase": "What state is the link to the river?" + }, + { + "Paraphrase": "What states is the connecticut river going through?" + }, + { + "Paraphrase": "Where does the river flow? What is the way the Nile flows?" + }, + { + "Paraphrase": "What are you running through the Connecticut River?" + }, + { + "Paraphrase": "What states does the river connecticut flow through?" + }, + { + "Paraphrase": "In what state does the river connecticut flow?" + }, + { + "Paraphrase": "What states pass through the river Kinkito?" + }, + { + "Paraphrase": "What conditions does the Connecticut River flow through?" + }, + { + "Paraphrase": "What states the river connecticut flows?" + } + ] + }, + { + "class": "MultiPivotParaphrasesGeneration", + "inputs": { + "Reference sentence": "in which tournaments did west indies cricket team win the championship?" + }, + "outputs": [ + { + "Paraphrase": "In which tournaments did Western Indians win the championship?" + }, + { + "Paraphrase": "What tournaments did the West Indies cricket team win the championship?" + }, + { + "Paraphrase": "Which team won the World Cup in West India?" + }, + { + "Paraphrase": "in which tournaments has West India cricket team won the championship?" + }, + { + "Paraphrase": "In which tournaments did the cricket team of the West Indies win the championship?" + }, + { + "Paraphrase": "What game did the Cricket Team of the West Indies win?" + }, + { + "Paraphrase": "In what tournaments did the cricket team of the West Indies win the championship?" + }, + { + "Paraphrase": "What tournament did the West Indies cricket team win?" 
+ } + ] + } + ] + } + \ No newline at end of file diff --git a/transformations/multi_pivot_paraphrases_generation/transformation.py b/transformations/multi_pivot_paraphrases_generation/transformation.py new file mode 100644 index 000000000..d973fc5f5 --- /dev/null +++ b/transformations/multi_pivot_paraphrases_generation/transformation.py @@ -0,0 +1,308 @@ +import random +import string +import concurrent.futures + +from transformers import MarianMTModel,MarianTokenizer + +from .easy_nmt import load_easynmt_model,get_easynmt_translation +from .use_filter import load_use_model,get_use_embedding +from .constants import HUGGINGFACE_MARIANMT_MODELS_TO_LOAD, EASYNMT_MODEL_NAME + +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType + +# methods to print colored text in the terminal +def pr_green(text): + """ Pring text in green color font""" + print("\033[92m{}\033[00m" .format(text)) + +def pr_gray(text): + """ Pring text in gray color font""" + print("\033[7m{}\033[00m" .format(text)) + +class MultiPivotParaphrasesGeneration(SentenceOperation): + """ + This transformation generates a list of paraphrases for an English sentence by leveraging Pivot-Transaltion approach. + Pivot-Transaltion is an approach where a sentence in a source language is translated to a foreign language called the pivot language then + translated back to the source language to get a paraprhase candidate, e.g. translate an English sentence to French, then translate back to English. + + The paraphrases generation is divided into two step: + - Step 1: paraphrases Candidate Over-generation by leveraging Pivot-Transaltion. At this step, we generate a Pool of possible parparhases. + - Step 2: apply a candidate selection over the Pool of paraphrases, since the pool can contain semantically unrelated or duplicate paraphrases. 
+ We leverage Embedding Model such as Universal Sentence Encoder~(USE) to disqualify candidate paraphrases from the pool, by computing the Cosine Similarity socres of the + USE Embeddings between the reference sentence and the candidate paraphrase. Let R = USE_Embeding(reference_english_sentence) and P = USE_Embeding(candidate): + - if Cosine(R,P) < alpha => the candidate is semantically unrelated and then removed from the final list of paraphrases + - if Cosine(R,P) > beta => the candidate is a duplication and then removed from the final list of paraphrases + - By default Alpha=0.5 and Beta=0.95, we set the value as suggested by [Parikh et al.](https://arxiv.org/pdf/2004.03484.pdf) works + + Please refer to the test.json for all of the test cases catered. + + This transformation translates an English sentence to a list of predefined languages using Huggingface MariamMT and EasyNMT as Machine Transaltion models. + - The transformation support Two Pivot-Transaltion Level. + - If Pivot-level = 1 => Transalte to only one foreign language. e.g. English -> French -> English || English -> Arabic -> English || English -> japanese -> English + - If Pivot-level = 2 => Transalte to only Two foreign language. e.g. 
English -> French -> Arabic -> English || English -> Russian -> Chinese -> English + """ + + tasks = [ + TaskType.QUESTION_GENERATION, + TaskType.TEXT_TO_TEXT_GENERATION + ] + languages = ["en"] + + def __init__(self, seed=0 , pivot_level=1): + """ + Generate parpahrases for an English sentence by Leveraging pivot transaltion + :param pivot_level: integer that indicate the pivot language level, single-pivot or multi-pivot range,1 =single-pivot, 2=double-pivot, 0=apply single and double + """ + + super().__init__(seed) + self.pivot_level = pivot_level + self.models = self.concurrent_model_loader() + self.use_embed_model = None + + def generate(self, sentence:str, candidate_selection = True): + """ + Generate a list of paraphrases for sentence + :param sentence: English sentence to be paraprhased + :param candidate_selection: remove semantiically unrelate paraphrases cadidates using USE_Embedding_Cosine_Similarity scores. False: don't apply candidate selction | True: apply + :return list of paraphrases + """ + + paraphrases = self.multi_translate(sentence,self.models) + + if candidate_selection: + #load_use_model + if not self.use_embed_model: + pr_gray("Load Universal Sentence Encoder Model:") + use_model_name = "https://tfhub.dev/google/universal-sentence-encoder-large/5" + self.use_embed_model = load_use_model(use_model_name) + pr_green("... done") + + #compute cosine similarity scores to remove semantically unrelated candidates + pr_gray("Start paraphrases candidate selection:") + paraphrases = get_use_embedding(paraphrases, self.use_embed_model, sentence) + + pr_green("... 
done") + + return paraphrases + + def translate(self,utterance,model,tok,trg="NONE"): + """ + Translate a single sentence + :param utterance: sentence to translate + :param model: transformers Marian Machine Transaltion Model(MarianMTModel) + :param tok: transformers Marian Tokenizer module(MarianTokenizer) + :param trg: target language - set value when using en-ROMANCE model - trg=>>fr<<|>>it<<|>>es<<|>>pt<< + :return Translated utterance + """ + if trg != 'NONE': + utterance = '>>'+trg+'<< '+utterance + # translated = model.generate(**tok.prepare_translation_batch([utterance]))#old version transformers==3.0.0 + translated = model.generate(**tok(utterance, return_tensors="pt", padding=True)) + result = [tok.decode(t, skip_special_tokens=True) for t in translated] + + result = result[0] + + # check token indices sequence length is longer than the specified maximum sequence length max_length=512 + if len(result) > 512: + result = result[:512] + return result + + + def multi_translate(self,utterance,model): + """ + Translate sentence + :param utterance: sentence to translate + :param model_list: dictionary containing marianMT model, key: model name - value: list containing respectively Model and tokenizer. e.g. 
{'en2ROMANCE':[model,tekenizer]} + :return list of utterance translations + """ + response = set() + + if self.pivot_level == 0 or self.pivot_level == 1:#one pivot language + # Translate to Italian + tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="it") + tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])#translate back to English + response.add(tmp) + + # Translate to French + tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="fr") + tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])#translate back to English + response.add(tmp) + + # Translate to Spanish + tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="es") + tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])#translate back to English + response.add(tmp) + + # Translate to Portuguese + tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="pt") + tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])#translate back to English + response.add(tmp) + + # Translate to Romanian + tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="ro") + tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])#translate back to English + response.add(tmp) + + # Translate to German + tmp = self.translate(utterance,model['en2de'][0],model['en2de'][1]) + tmp = self.translate(tmp,model['de2en'][0],model['de2en'][1])#translate back to English + response.add(tmp) + + # Translate to Russian + tmp = self.translate(utterance,model['en2ru'][0],model['en2ru'][1]) + tmp = self.translate(tmp,model['ru2en'][0],model['ru2en'][1])#translate back to English + response.add(tmp) + + # Translate to Arabic + tmp = self.translate(utterance,model['en2ar'][0],model['en2ar'][1]) + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'en', 'ar') # translate back to English with EasyNMt + response.add(tmp) + + # 
Translate to Chinese + tmp = self.translate(utterance,model['en2zh'][0],model['en2zh'][1]) + tmp = self.translate(tmp,model['zh2en'][0],model['zh2en'][1])#translate back to English + response.add(tmp) + + # Translate to Japanese + tmp = self.translate(utterance,model['en2jap'][0],model['en2jap'][1]) + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'en', 'ja') # translate back to English with EasyNMt + response.add(tmp) + + if self.pivot_level == 0 or self.pivot_level == 2:# two pivot language + # Translate Spanish => Russian = > English + tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="es") + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'ru', 'es') # translate to Russian with EasyNMt + tmp = self.translate(tmp,model['ru2en'][0],model['ru2en'][1])#translate back to English + response.add(tmp) + + # Translate Japanese => Spanish = > English + tmp = self.translate(utterance,model['en2jap'][0],model['en2jap'][1])#translate to Japanese + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'es', 'ja') # translate to Spanish with EasyNMt + tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])#translate back to English + response.add(tmp) + + # Translate Japanese => Italian = > English + tmp = self.translate(utterance,model['en2jap'][0],model['en2jap'][1])#translate to Japanese + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'it', 'ja') # translate to Italian with EasyNMt + tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])#translate back to English + response.add(tmp) + + # Translate Arabic => German = > English + tmp = self.translate(utterance,model['en2ar'][0],model['en2ar'][1])#translate to Arabic + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'de', 'ar') # translate to German with EasyNMt + tmp = self.translate(tmp,model['de2en'][0],model['de2en'][1])#translate back to English + response.add(tmp) + + # Translate Chinese => German = > English + tmp = 
self.translate(utterance,model['en2zh'][0],model['en2zh'][1])#translate to Chinese + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'de', 'zh') # translate to German with EasyNMt + tmp = self.translate(tmp,model['de2en'][0],model['de2en'][1])#translate back to English + response.add(tmp) + + # Translate German => Arabic = > English + tmp = self.translate(utterance,model['en2de'][0],model['en2de'][1])#translate to German + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'ar', 'de') # translate to Arabic with EasyNMt + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'en', 'ar') # translate to English with EasyNMt + response.add(tmp) + + # Translate German => Chinese = > English + tmp = self.translate(utterance,model['en2de'][0],model['en2de'][1])#translate to German + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'zh', 'de') # translate to Chinese with EasyNMt + tmp = self.translate(tmp,model['zh2en'][0],model['zh2en'][1])# translate back to English + response.add(tmp) + + # Translate German => Japanese = > English + tmp = self.translate(utterance,model['en2de'][0],model['en2de'][1])#translate to German + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'ja', 'de') # translate to Chinese with EasyNMt + tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'en', 'ja') # translate to English with EasyNMt + response.add(tmp) + + return list(response) + + def translate_list(self,sentences,model): + """ + Translate a List of sentences + :param sentences: reference sentences to paraprhases in Python List, list of refenrence sentences + :param model_list: dictionary containing marianMT model, key: model name - value: list containing respectively Model and tokenizer. e.g. 
{'en2ROMANCE':[model,tokenizer]} + :return Python dictionary containing translations, keys are the initial sentences and values are the lists of translations returned by multi_translate + """ + + paraphrases = dict() + for sentence in sentences: + tmp = self.multi_translate(sentence,model) + paraphrases[sentence]=tmp + + return paraphrases + + def get_model(self,param): + """ + Load Huggingface marian Machine Translator model and tokenizer + :param param: Huggingface MarianMt Helsinki-NLP/{model_name} to load (https://huggingface.co/Helsinki-NLP); param[0]=label - param[1]=model_name + :return a tuple result = (Huggingface MarianMt Model, Marian MT Tokenizer, Marian MT label) + """ + + mt_model = MarianMTModel.from_pretrained(param[1]) #param[0]=label ; param[1]=model_name to load + mt_tokenizer = MarianTokenizer.from_pretrained(param[1]) #load tokenizer + return mt_model,mt_tokenizer,param[0] + + def concurrent_model_loader(self): + """ + Return a List of Huggingface Marian MT model, same as load_model but load concurrently + :return Python dictionary - key: model name - value: list containing respectively MarianModel and MarianTokenizer e.g. {'en2ru':[model,tokenizer]} + """ + response = dict() + + pr_gray("Load Huggingface MarianMT models") + + # load HuggingFace Marian MT model and tokenizer concurrently through thread + with concurrent.futures.ThreadPoolExecutor() as executor: + + # results = [executor.submit(get_model2,model_name) for model_name in models_to_load.values()] + results = executor.map( self.get_model, HUGGINGFACE_MARIANMT_MODELS_TO_LOAD ) + + # unpack and add MarianMT model, MarianMT tokenizer and label + for model,tokenizer,label in results: + response[label] = [model,tokenizer] + + pr_green("... done") + + #load EasyNMT model + pr_gray("Load UKPLab Easy-NMT model") + + easy_model = load_easynmt_model( EASYNMT_MODEL_NAME ) + response['easy_nmt'] = easy_model + + pr_green("...
done") + + return response + + +if __name__ == '__main__': + import json + from TestRunner import convert_to_snake_case + + tf = MultiPivotParaphrasesGeneration() + + sentences = ['How does COVID-19 spread?', + 'Book a flight from Lyon to Sydney?', + 'Reserve an Italian Restaurant near Paris', + 'how many 10 euros are worth in dollars', + 'which company makes the ipod?', + 'what states does the connecticut river flow through?', + 'in which tournaments did west indies cricket team win the championship?'] + + pr_gray("Start paraphrases Generation:") + + test_cases = [] + for sentence in sentences: + test_cases.append({ + "class": tf.name(), + "inputs": {"Reference sentence": sentence}, "outputs": [{"Paraphrase": o} for o in tf.generate(sentence)]} + ) + + pr_green("... done") + + json_file = {"type": convert_to_snake_case(tf.name()), "test_cases": test_cases} + + print(json.dumps(json_file, indent=2)) \ No newline at end of file diff --git a/transformations/multi_pivot_paraphrases_generation/use_filter.py b/transformations/multi_pivot_paraphrases_generation/use_filter.py new file mode 100644 index 000000000..3de150d14 --- /dev/null +++ b/transformations/multi_pivot_paraphrases_generation/use_filter.py @@ -0,0 +1,51 @@ +import tensorflow_hub as hub +from sklearn.metrics.pairwise import cosine_similarity +import numpy as np + +""" Remove semantically unrelated paraphrases by computing Universal Sentence Encoder embeddings cosine similiraity score """ + +def load_use_model(model_name="https://tfhub.dev/google/universal-sentence-encoder-large/5"): + """ + Load Universal Sentence Encoder model + :param model_name: name of the USE model to load + :return an USE model + """ + + model = hub.load(model_name) + return model + + +def get_use_embedding(paraphrases_list, embed, reference_sentence): + """ + Get Universal Sentence Encoder embeddings + :param paraphrases_list: python list on which to apply embedding, Key initial sentence and value is a set of paraphrases + :param 
embed: Universal Sentence Encoder model instance + :param reference_sentence: reference sentence with which the paraphrases are compared + :return a python set holding the candidates whose cosine similarity with the reference sentence is greater than 0.5 + """ + + response = set() + key_embedding = embed([reference_sentence]) #initial sentence USE embedding + a=np.reshape(key_embedding,(1,-1)) + + for candidate in paraphrases_list: + candidate_embedding = embed([candidate]) #candidate paraphrase USE embedding + b=np.reshape(candidate_embedding,(1,-1)) + cos_lib = cosine_similarity(a,b) + b = 0 + if cos_lib > 0.5: + response.add(candidate) + + return response + +def test(): + print("Load USE ") + embed = load_use_model("https://tfhub.dev/google/universal-sentence-encoder-large/5") + print("... done") + + d = {'how does covid-19 spread':["how does it spread","book a flight from lyon to sydney",'i feel cold']} + r = {reference: get_use_embedding(candidates, embed, reference) for reference, candidates in d.items()} + print(r) + +if __name__ == '__main__': + test() \ No newline at end of file From 0fc1aad5b9fc23c7604ea2ee6973381e23e7eaa0 Mon Sep 17 00:00:00 2001 From: AudayBerro Date: Tue, 31 Aug 2021 19:47:29 +0200 Subject: [PATCH 2/7] correction of Snetnece --- transformations/multi_pivot_paraphrases_generation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformations/multi_pivot_paraphrases_generation/README.md b/transformations/multi_pivot_paraphrases_generation/README.md index 4e521c053..14115542b 100644 --- a/transformations/multi_pivot_paraphrases_generation/README.md +++ b/transformations/multi_pivot_paraphrases_generation/README.md @@ -1,4 +1,4 @@ -# From one English Snetnece to a list of paraphrases 🦎 + ⌨️ → 🐍 +# From one English Sentence to a list of paraphrases 🦎 + ⌨️ → 🐍 This transformation generates a list of paraphrases for an English sentence by leveraging Pivot-Transaltion approach.
Pivot-Transaltion is an approach where a sentence in a source language is translated to a foreign language called the pivot language then translated back to the source language to get a paraprhase candidate, e.g. translate an English sentence to French, then translate back to English. From e3c0a3d738e85b8d05dc421451715c9b29d16bf9 Mon Sep 17 00:00:00 2001 From: AudayBerro Date: Thu, 2 Sep 2021 23:56:40 +0200 Subject: [PATCH 3/7] Remove packages verison to resolve dependencies conflict --- .../requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/transformations/multi_pivot_paraphrases_generation/requirements.txt b/transformations/multi_pivot_paraphrases_generation/requirements.txt index 41ff4a0b5..d67109b6d 100644 --- a/transformations/multi_pivot_paraphrases_generation/requirements.txt +++ b/transformations/multi_pivot_paraphrases_generation/requirements.txt @@ -1,5 +1,5 @@ -EasyNMT==2.0.1 -numpy==1.18.5 -scikit-learn==0.20.4 -tensorflow-hub==0.4.0 -transformers==4.5.1 +EasyNMT +numpy +scikit-learn +tensorflow-hub +transformers From 0f47a8fb7d2183f63251c92aec28a4b9ca1a1854 Mon Sep 17 00:00:00 2001 From: AudayBerro Date: Sat, 4 Sep 2021 20:01:46 +0200 Subject: [PATCH 4/7] add tensorflow --- .../multi_pivot_paraphrases_generation/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/transformations/multi_pivot_paraphrases_generation/requirements.txt b/transformations/multi_pivot_paraphrases_generation/requirements.txt index d67109b6d..a548d12d8 100644 --- a/transformations/multi_pivot_paraphrases_generation/requirements.txt +++ b/transformations/multi_pivot_paraphrases_generation/requirements.txt @@ -1,5 +1,6 @@ EasyNMT numpy scikit-learn +tensorflow tensorflow-hub transformers From 08a5dc50c023a080756b01c1d1e283a65d278eab Mon Sep 17 00:00:00 2001 From: AudayBerro Date: Sun, 31 Oct 2021 12:12:52 +0100 Subject: [PATCH 5/7] remove scikit-learn tensorflow tensorflow-hub transformers librairies --- 
.../multi_pivot_paraphrases_generation/requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/transformations/multi_pivot_paraphrases_generation/requirements.txt b/transformations/multi_pivot_paraphrases_generation/requirements.txt index a548d12d8..bf62fefac 100644 --- a/transformations/multi_pivot_paraphrases_generation/requirements.txt +++ b/transformations/multi_pivot_paraphrases_generation/requirements.txt @@ -1,6 +1,2 @@ EasyNMT numpy -scikit-learn -tensorflow -tensorflow-hub -transformers From e48070e232c946527f33d12aecaf8736cba67a80 Mon Sep 17 00:00:00 2001 From: AudayBerro Date: Sun, 31 Oct 2021 14:04:54 +0100 Subject: [PATCH 6/7] add relevant keywords and heavy=True --- .../multi_pivot_paraphrases_generation/transformation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/transformations/multi_pivot_paraphrases_generation/transformation.py b/transformations/multi_pivot_paraphrases_generation/transformation.py index d973fc5f5..eef7488d8 100644 --- a/transformations/multi_pivot_paraphrases_generation/transformation.py +++ b/transformations/multi_pivot_paraphrases_generation/transformation.py @@ -48,6 +48,9 @@ class MultiPivotParaphrasesGeneration(SentenceOperation): TaskType.TEXT_TO_TEXT_GENERATION ] languages = ["en"] + keywords = ["lexical", "rule-based", "syntactic","highly-meaning-preserving","transformer-based","tokenizer-required","high-generations"] + heavy = True + def __init__(self, seed=0 , pivot_level=1): """ @@ -305,4 +308,4 @@ def concurrent_model_loader(self): json_file = {"type": convert_to_snake_case(tf.name()), "test_cases": test_cases} - print(json.dumps(json_file, indent=2)) \ No newline at end of file + print(json.dumps(json_file, indent=2)) From fbd10fdd66e9cb160efa30f31f1cfc913df1d0a1 Mon Sep 17 00:00:00 2001 From: AudayBerro Date: Sun, 31 Oct 2021 14:06:11 +0100 Subject: [PATCH 7/7] comment __name__ == __main__ --- .../transformation.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 
deletions(-) diff --git a/transformations/multi_pivot_paraphrases_generation/transformation.py b/transformations/multi_pivot_paraphrases_generation/transformation.py index eef7488d8..eb4b4844d 100644 --- a/transformations/multi_pivot_paraphrases_generation/transformation.py +++ b/transformations/multi_pivot_paraphrases_generation/transformation.py @@ -281,31 +281,31 @@ def concurrent_model_loader(self): return response -if __name__ == '__main__': - import json - from TestRunner import convert_to_snake_case - - tf = MultiPivotParaphrasesGeneration() - - sentences = ['How does COVID-19 spread?', - 'Book a flight from Lyon to Sydney?', - 'Reserve an Italian Restaurant near Paris', - 'how many 10 euros are worth in dollars', - 'which company makes the ipod?', - 'what states does the connecticut river flow through?', - 'in which tournaments did west indies cricket team win the championship?'] +# if __name__ == '__main__': +# import json +# from TestRunner import convert_to_snake_case + +# tf = MultiPivotParaphrasesGeneration() + +# sentences = ['How does COVID-19 spread?', +# 'Book a flight from Lyon to Sydney?', +# 'Reserve an Italian Restaurant near Paris', +# 'how many 10 euros are worth in dollars', +# 'which company makes the ipod?', +# 'what states does the connecticut river flow through?', +# 'in which tournaments did west indies cricket team win the championship?'] - pr_gray("Start paraphrases Generation:") - - test_cases = [] - for sentence in sentences: - test_cases.append({ - "class": tf.name(), - "inputs": {"Reference sentence": sentence}, "outputs": [{"Paraphrase": o} for o in tf.generate(sentence)]} - ) +# pr_gray("Start paraphrases Generation:") + +# test_cases = [] +# for sentence in sentences: +# test_cases.append({ +# "class": tf.name(), +# "inputs": {"Reference sentence": sentence}, "outputs": [{"Paraphrase": o} for o in tf.generate(sentence)]} +# ) - pr_green("... done") +# pr_green("... 
done") - json_file = {"type": convert_to_snake_case(tf.name()), "test_cases": test_cases} +# json_file = {"type": convert_to_snake_case(tf.name()), "test_cases": test_cases} - print(json.dumps(json_file, indent=2)) +# print(json.dumps(json_file, indent=2))