From ca97cd958c1cf311a2e52300683d016062df1851 Mon Sep 17 00:00:00 2001
From: AudayBerro <audayberro@gmail.com>
Date: Tue, 31 Aug 2021 19:37:00 +0200
Subject: [PATCH 1/7] Added multi_pivot_paraphrases_generation transformation

---
 .../README.md                                 |  44 +++
 .../__init__.py                               |   1 +
 .../constants.py                              |  16 +
 .../easy_nmt.py                               |  22 ++
 .../requirements.txt                          |   5 +
 .../test.json                                 | 206 ++++++++++++
 .../transformation.py                         | 308 ++++++++++++++++++
 .../use_filter.py                             |  51 +++
 8 files changed, 653 insertions(+)
 create mode 100644 transformations/multi_pivot_paraphrases_generation/README.md
 create mode 100644 transformations/multi_pivot_paraphrases_generation/__init__.py
 create mode 100644 transformations/multi_pivot_paraphrases_generation/constants.py
 create mode 100644 transformations/multi_pivot_paraphrases_generation/easy_nmt.py
 create mode 100644 transformations/multi_pivot_paraphrases_generation/requirements.txt
 create mode 100644 transformations/multi_pivot_paraphrases_generation/test.json
 create mode 100644 transformations/multi_pivot_paraphrases_generation/transformation.py
 create mode 100644 transformations/multi_pivot_paraphrases_generation/use_filter.py

diff --git a/transformations/multi_pivot_paraphrases_generation/README.md b/transformations/multi_pivot_paraphrases_generation/README.md
new file mode 100644
index 000000000..4e521c053
--- /dev/null
+++ b/transformations/multi_pivot_paraphrases_generation/README.md
@@ -0,0 +1,44 @@
+# From one English Sentence to a list of paraphrases 🦎  + ⌨️ → 🐍
+This transformation generates a list of paraphrases for an English sentence by leveraging the Pivot-Translation approach.
+Pivot-Translation is an approach where a sentence in a source language is translated to a foreign language, called the pivot language, and then translated back to the source language to get a paraphrase candidate, e.g. translate an English sentence to French, then translate it back to English.
+
+The paraphrase generation is divided into two steps:
+- Step 1: paraphrase candidate over-generation by leveraging Pivot-Translation. At this step, we generate a pool of possible paraphrases.
+- Step 2: apply a candidate selection over the pool of paraphrases, since the pool can contain semantically unrelated or duplicate paraphrases.
+    We leverage an embedding model such as the Universal Sentence Encoder (USE) to disqualify candidate paraphrases from the pool, by computing the cosine similarity scores of the
+    USE embeddings between the reference sentence and the candidate paraphrase. Let R = USE_Embedding(reference_english_sentence) and P = USE_Embedding(candidate):
+    - if Cosine(R,P) < alpha => the candidate is semantically unrelated and is removed from the final list of paraphrases
+    - if Cosine(R,P) > beta => the candidate is a duplicate and is removed from the final list of paraphrases
+    - By default Alpha=0.5 and Beta=0.95; we set these values as suggested by the work of [Parikh et al.](https://arxiv.org/pdf/2004.03484.pdf)
+
+Please refer to the test.json for all of the test cases catered.
+
+This transformation translates an English sentence to a list of predefined languages using Huggingface MarianMT and EasyNMT as Machine Translation models.
+- The transformation supports two Pivot-Translation levels.
+    - If Pivot-level = 1 => translate through only one foreign language, e.g. English -> French -> English  ||  English -> Arabic -> English  ||  English -> Japanese -> English
+    - If Pivot-level = 2 => translate through two foreign languages, e.g. English -> French -> Arabic -> English  ||  English -> Russian -> Chinese -> English
+
+Author name: Auday Berro (audayberro@gmail.com)
+
+## What type of a transformation is this?
+This transformation is a paraphrase generation for natural English sentences by leveraging Pivot-Translation techniques. The Pivot-Translation technique allows getting lexically and syntactically diverse paraphrases.
+
+## What tasks does it intend to benefit?
+This transformation would benefit all tasks with a sentence as input like question generation, sentence generation, etc.
+
+## What are the limitations of this transformation?
+
+1. The transformation does not generate paraphrases for non-English sentences, e.g. Can't generate paraphrases for German or Chinese sentences
+ 
+2. This transformation only generates paraphrases for Natural Language English sentences.
+
+## Previous Work
+
+
+2) This work is partly inspired by the following work on robustness for Machine Translation:
+```bibtex
+@article{berroextensible,
+  title={An Extensible and Reusable Pipeline for Automated Utterance Paraphrases},
+  author={Berro, Auday and Zade, Mohammad-Ali Yaghub and Baez, Marcos and Benatallah, Boualem and Benabdeslem, Khalid}
+}
+```
\ No newline at end of file
diff --git a/transformations/multi_pivot_paraphrases_generation/__init__.py b/transformations/multi_pivot_paraphrases_generation/__init__.py
new file mode 100644
index 000000000..930cdce0b
--- /dev/null
+++ b/transformations/multi_pivot_paraphrases_generation/__init__.py
@@ -0,0 +1 @@
+from .transformation import *
diff --git a/transformations/multi_pivot_paraphrases_generation/constants.py b/transformations/multi_pivot_paraphrases_generation/constants.py
new file mode 100644
index 000000000..301271550
--- /dev/null
+++ b/transformations/multi_pivot_paraphrases_generation/constants.py
@@ -0,0 +1,16 @@
+# Hugging Face Marian Machine Translation models to load. Set of tuples in the form: (source-2-target language pair label, Hugging Face MarianMT Helsinki-NLP model name)
+HUGGINGFACE_MARIANMT_MODELS_TO_LOAD = {
+    ('en2romance','Helsinki-NLP/opus-mt-en-ROMANCE'),
+    ('romance2en','Helsinki-NLP/opus-mt-ROMANCE-en'),
+    ('de2en','Helsinki-NLP/opus-mt-de-en'),
+    ('ru2en','Helsinki-NLP/opus-mt-ru-en'),
+    ('en2ar','Helsinki-NLP/opus-mt-en-ar'),
+    ('en2zh','Helsinki-NLP/opus-mt-en-zh'),
+    ('en2jap','Helsinki-NLP/opus-mt-en-jap'),
+    ('en2ru','Helsinki-NLP/opus-mt-en-ru'),
+    ('en2de','Helsinki-NLP/opus-mt-en-de'),
+    ('zh2en','Helsinki-NLP/opus-mt-zh-en')
+  }
+
+
+EASYNMT_MODEL_NAME = 'm2m_100_418M'  # EasyNMT model name
\ No newline at end of file
diff --git a/transformations/multi_pivot_paraphrases_generation/easy_nmt.py b/transformations/multi_pivot_paraphrases_generation/easy_nmt.py
new file mode 100644
index 000000000..97937464d
--- /dev/null
+++ b/transformations/multi_pivot_paraphrases_generation/easy_nmt.py
@@ -0,0 +1,22 @@
+""" EasyNMT - Easy to use, state-of-the-art Neural Machine Translation - https://github.com/UKPLab/EasyNMT """
+from easynmt import EasyNMT
+
+def load_easynmt_model(model_name='m2m_100_418M'):
+    """
+    Instantiate an EasyNMT machine translation model
+    :param model_name: name of the model to load - for the list of supported models visit: https://github.com/UKPLab/EasyNMT#available-models
+    :return EasyNMT instance built from model_name
+    """
+    translator = EasyNMT(model_name)
+    return translator
+
+def get_easynmt_translation(sentence,model,target_lang,source_lang=None):
+    """ Translate a sentence with an EasyNMT model
+    :param sentence: sentence to translate
+    :param model: EasyNMT model
+    :param target_lang: Target language for the translation
+    :param source_lang: Source language for the translation. If None, determines the source languages automatically.
+    :return Translated sentence
+    """
+    translation = model.translate(sentence, source_lang=source_lang, target_lang=target_lang)
+    return translation
\ No newline at end of file
diff --git a/transformations/multi_pivot_paraphrases_generation/requirements.txt b/transformations/multi_pivot_paraphrases_generation/requirements.txt
new file mode 100644
index 000000000..41ff4a0b5
--- /dev/null
+++ b/transformations/multi_pivot_paraphrases_generation/requirements.txt
@@ -0,0 +1,5 @@
+EasyNMT==2.0.1
+numpy==1.18.5
+scikit-learn==0.20.4
+tensorflow-hub==0.4.0
+transformers==4.5.1
diff --git a/transformations/multi_pivot_paraphrases_generation/test.json b/transformations/multi_pivot_paraphrases_generation/test.json
new file mode 100644
index 000000000..6318d0ba8
--- /dev/null
+++ b/transformations/multi_pivot_paraphrases_generation/test.json
@@ -0,0 +1,206 @@
+{
+    "type": "multi_pivot_paraphrases_generation",
+    "test_cases": [
+      {
+        "class": "MultiPivotParaphrasesGeneration",
+        "inputs": {
+          "Reference sentence": "How does COVID-19 spread?"
+        },
+        "outputs": [
+          {
+            "Paraphrase": "How is COVID-19 disseminated?"
+          },
+          {
+            "Paraphrase": "How is COVID-19 spread?"
+          },
+          {
+            "Paraphrase": "How did COVID-19 spread?"
+          },
+          {
+            "Paraphrase": "How is COVID-19 spreading?"
+          },
+          {
+            "Paraphrase": "How does COVID-19 spread?"
+          }
+        ]
+      },
+      {
+        "class": "MultiPivotParaphrasesGeneration",
+        "inputs": {
+          "Reference sentence": "Book a flight from Lyon to Sydney?"
+        },
+        "outputs": [
+          {
+            "Paraphrase": "To book a flight from Lyon to Sydney?"
+          },
+          {
+            "Paraphrase": "Have you booked a flight from Lyon to Sydney?"
+          },
+          {
+            "Paraphrase": "What is the journey from Lyon to Sydney?"
+          },
+          {
+            "Paraphrase": "Book a flight from Lyon to Sydney?"
+          },
+          {
+            "Paraphrase": "Are you booking a flight from Lyon to Sydney?"
+          }
+        ]
+      },
+      {
+        "class": "MultiPivotParaphrasesGeneration",
+        "inputs": {
+          "Reference sentence": "Reserve an Italian Restaurant near Paris"
+        },
+        "outputs": [
+          {
+            "Paraphrase": "Reserve an Italian restaurant near Paris"
+          },
+          {
+            "Paraphrase": "Italian restaurants near Paris"
+          },
+          {
+            "Paraphrase": "Book an Italian restaurant near Paris"
+          },
+          {
+            "Paraphrase": "It's a reservation at the Italian restaurant near Paris."
+          },
+          {
+            "Paraphrase": "Save the Italian restaurant near Paris."
+          }
+        ]
+      },
+      {
+        "class": "MultiPivotParaphrasesGeneration",
+        "inputs": {
+          "Reference sentence": "how many 10 euros are worth in dollars"
+        },
+        "outputs": [
+          {
+            "Paraphrase": "how many 10 euros are worth in dollars"
+          },
+          {
+            "Paraphrase": "how much 10 euros are worth in dollars"
+          },
+          {
+            "Paraphrase": "10 Euros in Dollars."
+          },
+          {
+            "Paraphrase": "How many Euros are worth in United States dollars?"
+          },
+          {
+            "Paraphrase": "How much is 10 euros in dollars?"
+          },
+          {
+            "Paraphrase": "how many 10 euros is worth in dollars"
+          },
+          {
+            "Paraphrase": "how many 10 euros in dollars are worth"
+          }
+        ]
+      },
+      {
+        "class": "MultiPivotParaphrasesGeneration",
+        "inputs": {
+          "Reference sentence": "which company makes the ipod?"
+        },
+        "outputs": [
+          {
+            "Paraphrase": "Which company is making iPods?"
+          },
+          {
+            "Paraphrase": "What company does the iPod make?"
+          },
+          {
+            "Paraphrase": "Which company does the ipod?"
+          },
+          {
+            "Paraphrase": "What kind of company does an iPod?"
+          },
+          {
+            "Paraphrase": "Which company manufactures ipods?"
+          },
+          {
+            "Paraphrase": "What company does the iPod do?"
+          },
+          {
+            "Paraphrase": "Which company makes the iPod?"
+          },
+          {
+            "Paraphrase": "What company manufactures the ipod?"
+          }
+        ]
+      },
+      {
+        "class": "MultiPivotParaphrasesGeneration",
+        "inputs": {
+          "Reference sentence": "what states does the connecticut river flow through?"
+        },
+        "outputs": [
+          {
+            "Paraphrase": "In what states does the connected river flow?"
+          },
+          {
+            "Paraphrase": "What state is the link to the river?"
+          },
+          {
+            "Paraphrase": "What states is the connecticut river going through?"
+          },
+          {
+            "Paraphrase": "Where does the river flow? What is the way the Nile flows?"
+          },
+          {
+            "Paraphrase": "What are you running through the Connecticut River?"
+          },
+          {
+            "Paraphrase": "What states does the river connecticut flow through?"
+          },
+          {
+            "Paraphrase": "In what state does the river connecticut flow?"
+          },
+          {
+            "Paraphrase": "What states pass through the river Kinkito?"
+          },
+          {
+            "Paraphrase": "What conditions does the Connecticut River flow through?"
+          },
+          {
+            "Paraphrase": "What states the river connecticut flows?"
+          }
+        ]
+      },
+      {
+        "class": "MultiPivotParaphrasesGeneration",
+        "inputs": {
+          "Reference sentence": "in which tournaments did west indies cricket team win the championship?"
+        },
+        "outputs": [
+          {
+            "Paraphrase": "In which tournaments did Western Indians win the championship?"
+          },
+          {
+            "Paraphrase": "What tournaments did the West Indies cricket team win the championship?"
+          },
+          {
+            "Paraphrase": "Which team won the World Cup in West India?"
+          },
+          {
+            "Paraphrase": "in which tournaments has West India cricket team won the championship?"
+          },
+          {
+            "Paraphrase": "In which tournaments did the cricket team of the West Indies win the championship?"
+          },
+          {
+            "Paraphrase": "What game did the Cricket Team of the West Indies win?"
+          },
+          {
+            "Paraphrase": "In what tournaments did the cricket team of the West Indies win the championship?"
+          },
+          {
+            "Paraphrase": "What tournament did the West Indies cricket team win?"
+          }
+        ]
+      }
+    ]
+  }
+  
\ No newline at end of file
diff --git a/transformations/multi_pivot_paraphrases_generation/transformation.py b/transformations/multi_pivot_paraphrases_generation/transformation.py
new file mode 100644
index 000000000..d973fc5f5
--- /dev/null
+++ b/transformations/multi_pivot_paraphrases_generation/transformation.py
@@ -0,0 +1,308 @@
+import random
+import string
+import concurrent.futures
+
+from transformers import MarianMTModel,MarianTokenizer
+
+from .easy_nmt import load_easynmt_model,get_easynmt_translation
+from .use_filter import load_use_model,get_use_embedding
+from .constants import HUGGINGFACE_MARIANMT_MODELS_TO_LOAD, EASYNMT_MODEL_NAME
+
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+# methods to print colored text in the terminal
+def pr_green(text):
+    """ Print text wrapped in the ANSI escape code 92 (green font), then reset the style """
+    print(f"\033[92m{text}\033[00m")
+
+def pr_gray(text):
+    """ Print text wrapped in the ANSI escape code 7 (the "gray" banner style of this module), then reset the style """
+    print(f"\033[7m{text}\033[00m")
+
+class MultiPivotParaphrasesGeneration(SentenceOperation):
+    """
+    This transformation generates a list of paraphrases for an English sentence by leveraging the Pivot-Translation approach.
+    Pivot-Translation is an approach where a sentence in a source language is translated to a foreign language, called the pivot language, and then
+    translated back to the source language to get a paraphrase candidate, e.g. translate an English sentence to French, then translate it back to English.
+
+    The paraphrase generation is divided into two steps:
+    - Step 1: paraphrase candidate over-generation by leveraging Pivot-Translation. At this step, we generate a pool of possible paraphrases.
+    - Step 2: apply a candidate selection over the pool of paraphrases, since the pool can contain semantically unrelated or duplicate paraphrases.
+      We leverage an embedding model such as the Universal Sentence Encoder (USE) to disqualify candidate paraphrases from the pool, by computing the cosine similarity scores of the
+      USE embeddings between the reference sentence and the candidate paraphrase. Let R = USE_Embedding(reference_english_sentence) and P = USE_Embedding(candidate):
+        - if Cosine(R,P) < alpha => the candidate is semantically unrelated and is removed from the final list of paraphrases
+        - if Cosine(R,P) > beta => the candidate is a duplicate and is removed from the final list of paraphrases
+        - By default Alpha=0.5 and Beta=0.95; we set these values as suggested by the work of [Parikh et al.](https://arxiv.org/pdf/2004.03484.pdf)
+    
+    Please refer to the test.json for all of the test cases catered.
+    
+    This transformation translates an English sentence to a list of predefined languages using Huggingface MarianMT and EasyNMT as Machine Translation models.
+    - The transformation supports two Pivot-Translation levels.
+        - If Pivot-level = 1 => translate through only one foreign language, e.g. English -> French -> English  ||  English -> Arabic -> English  ||  English -> Japanese -> English
+        - If Pivot-level = 2 => translate through two foreign languages, e.g. English -> French -> Arabic -> English  ||  English -> Russian -> Chinese -> English
+    """
+    
+    tasks = [
+        TaskType.QUESTION_GENERATION,
+        TaskType.TEXT_TO_TEXT_GENERATION
+    ]
+    languages = ["en"]
+
+    def __init__(self, seed=0 , pivot_level=1):
+        """
+        Set up the pivot-translation paraphraser and load the translation models
+        :param seed: value forwarded to the parent class constructor
+        :param pivot_level: integer that indicates the pivot language level: 1=single-pivot, 2=double-pivot, 0=apply single and double
+        """
+        super().__init__(seed)
+        self.use_embed_model = None  # the USE model is only loaded by generate() when candidate selection is requested
+        self.pivot_level = pivot_level
+        self.models = self.concurrent_model_loader()
+
+    def generate(self, sentence:str, candidate_selection = True):
+        """
+        Generate a list of paraphrases for sentence
+        :param sentence: English sentence to be paraphrased
+        :param candidate_selection: True: drop semantically unrelated candidates using USE embedding cosine similarity scores | False: keep the whole candidate pool
+        :return list of paraphrases (a list with or without candidate selection)
+        """
+        paraphrases = self.multi_translate(sentence,self.models)
+
+        if candidate_selection:
+            # load the Universal Sentence Encoder on first use only
+            if self.use_embed_model is None:
+                pr_gray("Load Universal Sentence Encoder Model:")
+                use_model_name = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
+                self.use_embed_model = load_use_model(use_model_name)
+                pr_green("... done")
+
+            # compute cosine similarity scores to remove semantically unrelated candidates
+            pr_gray("Start paraphrases candidate selection:")
+            paraphrases = get_use_embedding(paraphrases, self.use_embed_model, sentence)
+
+            pr_green("... done")
+
+        # get_use_embedding returns a set; convert so callers always receive the documented list
+        return list(paraphrases)
+
+    def translate(self,utterance,model,tok,trg="NONE"):
+        """
+        Translate a single sentence
+        :param utterance: sentence to translate
+        :param model: transformers Marian Machine Translation model (MarianMTModel)
+        :param tok: transformers Marian tokenizer (MarianTokenizer)
+        :param trg: target language code, set it when using the en-ROMANCE model (fr|it|es|pt|ro); the default "NONE" adds no language tag
+        :return Translated utterance, cut to at most 512 characters
+        """
+        if trg != 'NONE':
+            # prefix the sentence with the >>xx<< tag that names the target language
+            utterance = '>>'+trg+'<<  '+utterance
+
+        # truncation=True: over-long inputs are truncated to the tokenizer's maximum length before reaching the model
+        batch = tok(utterance, return_tensors="pt", padding=True, truncation=True)
+        translated = model.generate(**batch)
+        result = tok.decode(translated[0], skip_special_tokens=True)
+
+        # cap the decoded text at 512 characters (a no-op for shorter results)
+        result = result[:512]
+        return result
+
+
+    def multi_translate(self,utterance,model):
+        """
+        Back-translate a sentence through the pivot routes enabled by self.pivot_level (0: all routes, 1: single-pivot, 2: double-pivot; any other value yields an empty list)
+        :param utterance: sentence to translate
+        :param model: dictionary of loaded models, key: model name - value: list containing respectively model and tokenizer, e.g. {'en2romance':[model,tokenizer]}; key 'easy_nmt' holds the EasyNMT model
+        :return list of unique back-translations of utterance
+        """
+        response = set()
+
+        if self.pivot_level == 0 or self.pivot_level == 1:# one pivot language
+            # English => Italian => English
+            tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="it")
+            tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => French => English
+            tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="fr")
+            tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Spanish => English
+            tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="es")
+            tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Portuguese => English
+            tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="pt")
+            tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Romanian => English
+            tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="ro")
+            tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => German => English
+            tmp = self.translate(utterance,model['en2de'][0],model['en2de'][1])
+            tmp = self.translate(tmp,model['de2en'][0],model['de2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Russian => English
+            tmp = self.translate(utterance,model['en2ru'][0],model['en2ru'][1])
+            tmp = self.translate(tmp,model['ru2en'][0],model['ru2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Arabic => English
+            tmp = self.translate(utterance,model['en2ar'][0],model['en2ar'][1])
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'en', 'ar') # translate back to English with EasyNMT
+            response.add(tmp)
+
+            # English => Chinese => English
+            tmp = self.translate(utterance,model['en2zh'][0],model['en2zh'][1])
+            tmp = self.translate(tmp,model['zh2en'][0],model['zh2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Japanese => English
+            tmp = self.translate(utterance,model['en2jap'][0],model['en2jap'][1])
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'en', 'ja') # translate back to English with EasyNMT
+            response.add(tmp)
+
+        if self.pivot_level == 0 or self.pivot_level == 2:# two pivot languages
+            # English => Spanish => Russian => English
+            tmp = self.translate(utterance,model['en2romance'][0],model['en2romance'][1],trg="es")
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'ru', 'es') # translate to Russian with EasyNMT
+            tmp = self.translate(tmp,model['ru2en'][0],model['ru2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Japanese => Spanish => English
+            tmp = self.translate(utterance,model['en2jap'][0],model['en2jap'][1])# translate to Japanese
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'es', 'ja') # translate to Spanish with EasyNMT
+            tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Japanese => Italian => English
+            tmp = self.translate(utterance,model['en2jap'][0],model['en2jap'][1])# translate to Japanese
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'it', 'ja') # translate to Italian with EasyNMT
+            tmp = self.translate(tmp,model['romance2en'][0],model['romance2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Arabic => German => English
+            tmp = self.translate(utterance,model['en2ar'][0],model['en2ar'][1])# translate to Arabic
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'de', 'ar') # translate to German with EasyNMT
+            tmp = self.translate(tmp,model['de2en'][0],model['de2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => Chinese => German => English
+            tmp = self.translate(utterance,model['en2zh'][0],model['en2zh'][1])# translate to Chinese
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'de', 'zh') # translate to German with EasyNMT
+            tmp = self.translate(tmp,model['de2en'][0],model['de2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => German => Arabic => English
+            tmp = self.translate(utterance,model['en2de'][0],model['en2de'][1])# translate to German
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'ar', 'de') # translate to Arabic with EasyNMT
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'en', 'ar') # translate back to English with EasyNMT
+            response.add(tmp)
+
+            # English => German => Chinese => English
+            tmp = self.translate(utterance,model['en2de'][0],model['en2de'][1])# translate to German
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'zh', 'de') # translate to Chinese with EasyNMT
+            tmp = self.translate(tmp,model['zh2en'][0],model['zh2en'][1])# translate back to English
+            response.add(tmp)
+
+            # English => German => Japanese => English
+            tmp = self.translate(utterance,model['en2de'][0],model['en2de'][1])# translate to German
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'ja', 'de') # translate to Japanese with EasyNMT
+            tmp = get_easynmt_translation( tmp, model['easy_nmt'], 'en', 'ja') # translate back to English with EasyNMT
+            response.add(tmp)
+
+        return list(response)
+
+    def translate_list(self,sentences,model):
+        """
+        Translate a List of sentences
+        :param sentences: Python list of reference sentences to paraphrase
+        :param model: dictionary of loaded models, key: model name - value: list containing respectively model and tokenizer, e.g. {'en2romance':[model,tokenizer]}
+        :return Python dictionary mapping each input sentence to the list of its back-translations
+        """
+
+        paraphrases = dict()
+        for sentence in sentences:
+            # multi_translate takes only (utterance, model); the pivot level is read from self.pivot_level
+            paraphrases[sentence] = self.multi_translate(sentence,model)
+
+        return paraphrases
+
+    def get_model(self,param):
+        """
+        Load a Huggingface Marian Machine Translation model together with its tokenizer
+        :param param: description of the model to load: param[0]=label - param[1]=Helsinki-NLP/{model_name} (https://huggingface.co/Helsinki-NLP)
+        :return a tuple result = (Huggingface MarianMT model, MarianMT tokenizer, label)
+        """
+        label, model_name = param[0], param[1]
+        tokenizer = MarianTokenizer.from_pretrained(model_name)
+        translator = MarianMTModel.from_pretrained(model_name)
+        return translator, tokenizer, label
+
+    def concurrent_model_loader(self):
+        """
+        Load every translation model used by the transformation.
+        The MarianMT models listed in HUGGINGFACE_MARIANMT_MODELS_TO_LOAD are loaded through a thread pool,
+        then the EasyNMT model named by EASYNMT_MODEL_NAME is loaded.
+        :return Python dictionary - key: model name - value: [MarianMTModel, MarianTokenizer], e.g. {'en2ru':[model,tokenizer]}; key 'easy_nmt' holds the EasyNMT model
+        """
+        pr_gray("Load Huggingface MarianMT models")
+
+        # the get_model calls are dispatched to the threads of the executor
+        with concurrent.futures.ThreadPoolExecutor() as pool:
+            loaded = pool.map(
+                self.get_model,
+                HUGGINGFACE_MARIANMT_MODELS_TO_LOAD,
+            )
+
+            # consume the results and index each [model, tokenizer] pair by its label
+            models = {
+                label: [mt_model, mt_tokenizer]
+                for mt_model, mt_tokenizer, label in loaded
+            }
+
+            pr_green("... done")
+
+        # load the EasyNMT model
+        pr_gray("Load UKPLab Easy-NMT model")
+        models['easy_nmt'] = load_easynmt_model( EASYNMT_MODEL_NAME )
+        pr_green("... done")
+
+        return models
+
+
+if __name__ == '__main__':
+    # Paraphrase a few sample sentences and print the test cases as JSON, in the same layout as test.json
+    import json
+    from TestRunner import convert_to_snake_case
+
+    tf = MultiPivotParaphrasesGeneration()
+
+    sentences = [
+        'How does COVID-19 spread?',
+        'Book a flight from Lyon to Sydney?',
+        'Reserve an Italian Restaurant near Paris',
+        'how many 10 euros are worth in dollars',
+        'which company makes the ipod?',
+        'what states does the connecticut river flow through?',
+        'in which tournaments did west indies cricket team win the championship?',
+    ]
+
+    pr_gray("Start paraphrases Generation:")
+    test_cases = [
+        {
+            "class": tf.name(),
+            "inputs": {"Reference sentence": sentence},
+            "outputs": [{"Paraphrase": paraphrase} for paraphrase in tf.generate(sentence)],
+        }
+        for sentence in sentences
+    ]
+    pr_green("... done")
+    print(json.dumps({"type": convert_to_snake_case(tf.name()), "test_cases": test_cases}, indent=2))
\ No newline at end of file
diff --git a/transformations/multi_pivot_paraphrases_generation/use_filter.py b/transformations/multi_pivot_paraphrases_generation/use_filter.py
new file mode 100644
index 000000000..3de150d14
--- /dev/null
+++ b/transformations/multi_pivot_paraphrases_generation/use_filter.py
@@ -0,0 +1,51 @@
+import tensorflow_hub as hub
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+""" Remove semantically unrelated paraphrases by computing Universal Sentence Encoder embeddings cosine similarity score """
+
+def load_use_model(model_name="https://tfhub.dev/google/universal-sentence-encoder-large/5"):
+    """
+    Fetch a Universal Sentence Encoder (USE) model through TensorFlow Hub.
+
+    :param model_name: handle (URL) of the USE model, passed as-is to hub.load
+    :return: the object returned by hub.load
+    """
+    # hub.load resolves the handle and hands back the loaded module
+    return hub.load(model_name)
+
+
+def get_use_embedding(paraphrases_list, embed, reference_sentence):
+    """
+    Keep only the paraphrases whose USE embedding is close to the reference sentence.
+    :param paraphrases_list: iterable of candidate paraphrases (strings)
+    :param embed: Universal Sentence Encoder model instance, called as embed([text])
+    :param reference_sentence: reference sentence with which the paraphrases are compared
+    :return: a python set of the candidates whose cosine similarity with the reference is > 0.5
+    """
+
+    response = set()
+    # embed the reference once and flatten it to a single row for cosine_similarity
+    reference_vector = np.reshape(embed([reference_sentence]), (1, -1))
+
+    for candidate in paraphrases_list:
+        # candidate paraphrase USE embedding, flattened the same way
+        candidate_vector = np.reshape(embed([candidate]), (1, -1))
+        # cosine_similarity returns a 1x1 matrix; read the scalar explicitly
+        score = cosine_similarity(reference_vector, candidate_vector)[0][0]
+        # semantically unrelated candidates (score <= 0.5) are dropped
+        if score > 0.5:
+            response.add(candidate)
+    return response
+
+def test():
+    """Smoke test: load the USE model and filter three candidates against one reference sentence."""
+    print("Load USE ")
+    embed = load_use_model("https://tfhub.dev/google/universal-sentence-encoder-large/5")
+    print("... done")
+    d = {'how does covid-19 spread':["how does it spread","book a flight from lyon to sydney",'i feel cold']}
+    for reference, candidates in d.items():
+        print(get_use_embedding(candidates, embed, reference))
+
+if __name__ == '__main__':  # only when executed as a script, not on import
+    test()
\ No newline at end of file

From 0fc1aad5b9fc23c7604ea2ee6973381e23e7eaa0 Mon Sep 17 00:00:00 2001
From: AudayBerro <audayberro@gmail.com>
Date: Tue, 31 Aug 2021 19:47:29 +0200
Subject: [PATCH 2/7] correction of Snetnece

---
 transformations/multi_pivot_paraphrases_generation/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformations/multi_pivot_paraphrases_generation/README.md b/transformations/multi_pivot_paraphrases_generation/README.md
index 4e521c053..14115542b 100644
--- a/transformations/multi_pivot_paraphrases_generation/README.md
+++ b/transformations/multi_pivot_paraphrases_generation/README.md
@@ -1,4 +1,4 @@
-# From one English Snetnece to a list of paraphrases 🦎  + ⌨️ → 🐍
+# From one English Sentence to a list of paraphrases 🦎  + ⌨️ → 🐍
 This transformation generates a list of paraphrases for an English sentence by leveraging Pivot-Transaltion approach.
 Pivot-Transaltion is an approach where a sentence in a source language is translated to a foreign language called the pivot language then translated back to the source language to get a paraprhase candidate, e.g. translate an English sentence to French, then translate back to English.
 

From e3c0a3d738e85b8d05dc421451715c9b29d16bf9 Mon Sep 17 00:00:00 2001
From: AudayBerro <audayberro@gmail.com>
Date: Thu, 2 Sep 2021 23:56:40 +0200
Subject: [PATCH 3/7] Remove package versions to resolve dependency conflicts

---
 .../requirements.txt                                   | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/transformations/multi_pivot_paraphrases_generation/requirements.txt b/transformations/multi_pivot_paraphrases_generation/requirements.txt
index 41ff4a0b5..d67109b6d 100644
--- a/transformations/multi_pivot_paraphrases_generation/requirements.txt
+++ b/transformations/multi_pivot_paraphrases_generation/requirements.txt
@@ -1,5 +1,5 @@
-EasyNMT==2.0.1
-numpy==1.18.5
-scikit-learn==0.20.4
-tensorflow-hub==0.4.0
-transformers==4.5.1
+EasyNMT
+numpy
+scikit-learn
+tensorflow-hub
+transformers

From 0f47a8fb7d2183f63251c92aec28a4b9ca1a1854 Mon Sep 17 00:00:00 2001
From: AudayBerro <audayberro@gmail.com>
Date: Sat, 4 Sep 2021 20:01:46 +0200
Subject: [PATCH 4/7] add tensorflow

---
 .../multi_pivot_paraphrases_generation/requirements.txt          | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformations/multi_pivot_paraphrases_generation/requirements.txt b/transformations/multi_pivot_paraphrases_generation/requirements.txt
index d67109b6d..a548d12d8 100644
--- a/transformations/multi_pivot_paraphrases_generation/requirements.txt
+++ b/transformations/multi_pivot_paraphrases_generation/requirements.txt
@@ -1,5 +1,6 @@
 EasyNMT
 numpy
 scikit-learn
+tensorflow
 tensorflow-hub
 transformers

From 08a5dc50c023a080756b01c1d1e283a65d278eab Mon Sep 17 00:00:00 2001
From: AudayBerro <audayberro@gmail.com>
Date: Sun, 31 Oct 2021 12:12:52 +0100
Subject: [PATCH 5/7] remove scikit-learn tensorflow tensorflow-hub
 transformers libraries

---
 .../multi_pivot_paraphrases_generation/requirements.txt       | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/transformations/multi_pivot_paraphrases_generation/requirements.txt b/transformations/multi_pivot_paraphrases_generation/requirements.txt
index a548d12d8..bf62fefac 100644
--- a/transformations/multi_pivot_paraphrases_generation/requirements.txt
+++ b/transformations/multi_pivot_paraphrases_generation/requirements.txt
@@ -1,6 +1,2 @@
 EasyNMT
 numpy
-scikit-learn
-tensorflow
-tensorflow-hub
-transformers

From e48070e232c946527f33d12aecaf8736cba67a80 Mon Sep 17 00:00:00 2001
From: AudayBerro <audayberro@gmail.com>
Date: Sun, 31 Oct 2021 14:04:54 +0100
Subject: [PATCH 6/7] add relevant keywords and heavy=True

---
 .../multi_pivot_paraphrases_generation/transformation.py     | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/transformations/multi_pivot_paraphrases_generation/transformation.py b/transformations/multi_pivot_paraphrases_generation/transformation.py
index d973fc5f5..eef7488d8 100644
--- a/transformations/multi_pivot_paraphrases_generation/transformation.py
+++ b/transformations/multi_pivot_paraphrases_generation/transformation.py
@@ -48,6 +48,9 @@ class MultiPivotParaphrasesGeneration(SentenceOperation):
         TaskType.TEXT_TO_TEXT_GENERATION
     ]
     languages = ["en"]
+    keywords = ["lexical", "rule-based", "syntactic","highly-meaning-preserving","transformer-based","tokenizer-required","high-generations"]
+    heavy = True
+
 
     def __init__(self, seed=0 , pivot_level=1):
         """
@@ -305,4 +308,4 @@ def concurrent_model_loader(self):
 
     json_file = {"type": convert_to_snake_case(tf.name()), "test_cases": test_cases}
     
-    print(json.dumps(json_file, indent=2))
\ No newline at end of file
+    print(json.dumps(json_file, indent=2))

From fbd10fdd66e9cb160efa30f31f1cfc913df1d0a1 Mon Sep 17 00:00:00 2001
From: AudayBerro <audayberro@gmail.com>
Date: Sun, 31 Oct 2021 14:06:11 +0100
Subject: [PATCH 7/7] comment  __name__ == __main__

---
 .../transformation.py                         | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/transformations/multi_pivot_paraphrases_generation/transformation.py b/transformations/multi_pivot_paraphrases_generation/transformation.py
index eef7488d8..eb4b4844d 100644
--- a/transformations/multi_pivot_paraphrases_generation/transformation.py
+++ b/transformations/multi_pivot_paraphrases_generation/transformation.py
@@ -281,31 +281,31 @@ def concurrent_model_loader(self):
         return response
 
 
-if __name__ == '__main__':
-    import json
-    from TestRunner import convert_to_snake_case
-
-    tf = MultiPivotParaphrasesGeneration()
-
-    sentences = ['How does COVID-19 spread?',
-        'Book a flight from Lyon to Sydney?',
-        'Reserve an Italian Restaurant near Paris',
-        'how many 10 euros are worth in dollars',
-        'which company makes the ipod?',
-        'what states does the connecticut river flow through?',
-        'in which tournaments did west indies cricket team win the championship?']
+# if __name__ == '__main__':
+#     import json
+#     from TestRunner import convert_to_snake_case
+
+#     tf = MultiPivotParaphrasesGeneration()
+
+#     sentences = ['How does COVID-19 spread?',
+#         'Book a flight from Lyon to Sydney?',
+#         'Reserve an Italian Restaurant near Paris',
+#         'how many 10 euros are worth in dollars',
+#         'which company makes the ipod?',
+#         'what states does the connecticut river flow through?',
+#         'in which tournaments did west indies cricket team win the championship?']
     
-    pr_gray("Start paraphrases Generation:")
-
-    test_cases = []
-    for sentence in sentences:
-        test_cases.append({
-            "class": tf.name(),
-            "inputs": {"Reference sentence": sentence}, "outputs": [{"Paraphrase": o} for o in tf.generate(sentence)]}
-        )
+#     pr_gray("Start paraphrases Generation:")
+
+#     test_cases = []
+#     for sentence in sentences:
+#         test_cases.append({
+#             "class": tf.name(),
+#             "inputs": {"Reference sentence": sentence}, "outputs": [{"Paraphrase": o} for o in tf.generate(sentence)]}
+#         )
     
-    pr_green("... done")
+#     pr_green("... done")
 
-    json_file = {"type": convert_to_snake_case(tf.name()), "test_cases": test_cases}
+#     json_file = {"type": convert_to_snake_case(tf.name()), "test_cases": test_cases}
     
-    print(json.dumps(json_file, indent=2))
+#     print(json.dumps(json_file, indent=2))