GEM-benchmark · marco-digio · Aug 4, 2021 · Sep 9, 2021 · Sep 9, 2021 · Sep 10, 2021
diff --git a/transformations/space_between_characters/README.md b/transformations/space_between_characters/README.md
@@ -0,0 +1,23 @@
+# Space Between Characters
+This perturbation adds noise to all types of text sources (sentence, paragraph, etc.).
+
+Author name: Marco Di Giovanni
+Author email: [email protected]
+Author Affiliation: Politecnico di Milano and University of Bologna
+
+## What type of a transformation is this?
+This transformation acts like a perturbation to test robustness. A few words are picked at random and spaces are added between their characters (e.g., "Marco" -> "M a r c o").
+
+The probability of adding a space between characters can also be set (defaults to 1), allowing transformations like: "house" -> "h ouse" or "h o use".
+
+Generated transformations display high similarity to the source sentences, i.e., the code outputs highly precise and readable generations.
+
+## What tasks does it intend to benefit?
+This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc.
+
+It could also benefit tasks involving data from OCR systems.
+
+## What are the limitations of this transformation?
+- The transformation's outputs are very simple.
+- It is not capable of generating linguistically diverse text.
+- This transformation will mainly affect the performance of token/word-level models, while character-level models should be much more robust.
diff --git a/transformations/space_between_characters/__init__.py b/transformations/space_between_characters/__init__.py
@@ -0,0 +1 @@
+from .transformation import *
diff --git a/transformations/space_between_characters/test.json b/transformations/space_between_characters/test.json
@@ -0,0 +1,50 @@
+{
+  "type": "space_between_characters",
+  "test_cases": [
+    {
+      "class": "SpaceBetweenCharacters",
+      "inputs": {
+        "sentence": "Andrew finally returned the French book to Chris that I bought last week"
+      },
+      "outputs": [{
+        "sentence": "Andrew f i n a l l y returned the French book to C h r i s that I bought last w e e k"
+      }]
+    },
+    {
+      "class": "SpaceBetweenCharacters",
+      "inputs": {
+        "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments."
+      },
+      "outputs": [{
+        "sentence": "Sentences w i t h gapping, such as Paul likes c o f f e e and M a r y tea, lack a n overt predicate to indicate the relation b e t w e e n two or more arguments."
+      }]
+    },
+    {
+      "class": "SpaceBetweenCharacters",
+      "inputs": {
+        "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"
+      },
+      "outputs": [{
+        "sentence": "Alice i n Wonderland is a 2010 American l i v e - a c t i o n / a n i m a t e d dark f a n t a s y adventure film"
+      }]
+    },
+    {
+      "class": "SpaceBetweenCharacters",
+      "inputs": {
+        "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"
+      },
+      "outputs": [{
+        "sentence": "Ujjal D e v Dosanjh served as 33rd Premier o f British C o l u m b i a from 2000 t o 2001"
+      }]
+    },
+    {
+      "class": "SpaceBetweenCharacters",
+      "inputs": {
+        "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."
+      },
+      "outputs": [{
+        "sentence": "Neuroplasticity i s a continuous processing allowing short-term, m e d i u m - t e r m , and l o n g - t e r m remodeling of t h e neuronosynaptic organization."
+      }]
+    }
+  ]
+}
diff --git a/transformations/space_between_characters/transformation.py b/transformations/space_between_characters/transformation.py
@@ -0,0 +1,62 @@
+import random
+from typing import List
+
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+
+def add_spaces(text, prob_token=0.1, prob_char=1.0, seed=0, max_outputs=1):
+    """Return ``max_outputs`` noisy copies of ``text`` with spaced-out words.
+
+    The text is split on single spaces. Each resulting token is selected
+    with probability ``prob_token``; inside a selected token a space is
+    inserted before every character after the first with probability
+    ``prob_char`` (e.g. "Marco" -> "M a r c o" when ``prob_char`` is 1).
+    Empty tokens, which appear when the text is empty or contains
+    leading, trailing, or repeated spaces, are passed through unchanged.
+
+    Args:
+        text: The string to perturb.
+        prob_token: Probability of perturbing each space-separated token.
+        prob_char: Probability of inserting a space between two adjacent
+            characters of a selected token.
+        seed: Seed of the random number generator; identical arguments
+            always produce identical outputs.
+        max_outputs: Number of perturbed strings to generate.
+
+    Returns:
+        A list of ``max_outputs`` perturbed strings.
+    """
+    # A private generator produces the same stream that random.seed(seed)
+    # would, without resetting the process-wide generator for other code.
+    rng = random.Random(seed)
+
+    words = text.split(" ")
+    perturbed_texts = []
+    for _ in range(max_outputs):
+        perturbed_text = []
+        for word in words:
+            # Draw for every token (even empty ones) so the random sequence,
+            # and therefore the output for a given seed, stays stable.
+            selected = rng.random() <= prob_token
+            if not selected or not word:
+                # An empty token has no first character to keep; indexing it
+                # would raise IndexError, so it is copied through untouched.
+                new_word = word
+            elif prob_char == 1:
+                # Shortcut that consumes no per-character draws; keeping it
+                # preserves the outputs produced for a given seed.
+                new_word = " ".join(word)
+            else:
+                pieces = [word[0]]
+                for letter in word[1:]:
+                    if rng.random() <= prob_char:
+                        pieces.append(" ")
+                    pieces.append(letter)
+                new_word = "".join(pieces)
+            perturbed_text.append(new_word)
+        perturbed_texts.append(" ".join(perturbed_text))
+    return perturbed_texts
+
+
+class SpaceBetweenCharacters(SentenceOperation):
+    """Sentence operation that inserts spaces between characters of words.
+
+    The perturbation itself is performed by ``add_spaces``; this class only
+    stores the two probabilities and forwards them on each call.
+    """
+
+    tasks = [
+        TaskType.TEXT_CLASSIFICATION,
+        TaskType.TEXT_TO_TEXT_GENERATION,
+        TaskType.TEXT_TAGGING,
+    ]
+    languages = ["All"]
+    keywords = [
+        "morphological",
+        "noise",
+        "rule-based",
+        "highly-meaning-preserving",
+        "high-precision",
+        "high-coverage",
+    ]
+
+    def __init__(self, seed=42, max_outputs=1, prob_token=0.1, prob_char=1.0):
+        """Store the perturbation probabilities.
+
+        ``seed`` and ``max_outputs`` are handed to the parent initializer;
+        ``prob_token`` and ``prob_char`` are kept on the instance.
+        """
+        super().__init__(seed, max_outputs=max_outputs)
+        self.prob_token, self.prob_char = prob_token, prob_char
+
+    def generate(self, sentence: str) -> List[str]:
+        """Return the perturbed variants of ``sentence`` from ``add_spaces``."""
+        # self.seed and self.max_outputs are not assigned in this class;
+        # presumably SentenceOperation.__init__ sets them -- TODO confirm.
+        return add_spaces(
+            sentence,
+            prob_token=self.prob_token,
+            prob_char=self.prob_char,
+            seed=self.seed,
+            max_outputs=self.max_outputs,
+        )