GEM-benchmark · motiwari · Nov 29, 2021
diff --git a/filters/alphanumeric/README.md b/filters/alphanumeric/README.md
@@ -0,0 +1,26 @@
+## Alphanumeric Characters Filter
+
+## What type of filter is this?
+
+This filter selects text in which every character is either alphanumeric or common punctuation; text containing any other character is filtered out.
+The alphabetical characters are determined by the 26 letters of the English alphabet.
+
+Author: Mo Tiwari
+Author Email: [email protected]
+Author Affiliation: Stanford University
+
+## Why is measuring performance on this split important?
+This filter can be used to a) select text with only characters from a standard alphabet and 
+b) remove text containing characters that are specifically meant to circumvent filters, e.g. text that uses 
+`buy some pi//s` because the string `buy some pills` triggers spam filters.
+
+This is important in domains such as profanity detection and spam, where bad actors may attempt to work around existing filters by using characters that can be easily mistaken for others.
+
+## Related Work
+
+N/A 
+
+## What are the limitations of this filter?
+- Currently, the filter only permits characters as defined by the English alphabet. 
+The filter could be extended to handle the characters from other alphabets via the `args`
+provided.
diff --git a/filters/alphanumeric/__init__.py b/filters/alphanumeric/__init__.py
@@ -0,0 +1 @@
+from .filter import *
diff --git a/filters/alphanumeric/filter.py b/filters/alphanumeric/filter.py
@@ -0,0 +1,22 @@
+from tasks.TaskTypes import TaskType
+from interfaces.SentenceOperation import SentenceOperation
+
+class AlphanumericFilter(SentenceOperation):
+    """
+    Filters sentence that characters which are a) not alphanumeric and b) not common punctuation.
+
+    Inherits SentenceOperation.
+    """
+    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION]
+    languages = ["en"]
+    keywords = ["highly-meaning-preserving", "low-generations", "rule-based"]
+
+    def __init__(self):
+        super().__init__()
+        self.punctuation = ['!', '.', '?', "'", '"', '(', ')', '-', ':', ';', ' ']
+
+    def filter(self, sentence: str = None) -> bool:
+        for c in sentence:
+            if not c.isalnum() and c not in self.punctuation:
+                return False
+        return True
diff --git a/filters/alphanumeric/test.json b/filters/alphanumeric/test.json
@@ -0,0 +1,40 @@
+{
+    "type": "alphanumeric",
+    "test_cases": [
+        {
+            "class": "AlphanumericFilter",
+            "inputs": {
+                "sentence": "Andrew played cricket in India."
+            },
+            "outputs": true
+        },
+        {
+            "class": "AlphanumericFilter",
+            "inputs": {
+                "sentence": "∂ is a Greek letter."
+            },
+            "outputs": false
+        },
+        {
+            "class": "AlphanumericFilter",
+            "inputs": {
+                "sentence": "I love tennis!"
+            },
+            "outputs": true
+        },
+        {
+            "class": "AlphanumericFilter",
+            "inputs": {
+                "sentence": "¿Cómo estás?"
+            },
+            "outputs": false
+        },
+        {
+            "class": "AlphanumericFilter",
+            "inputs": {
+                "sentence": "Some non-alphanumeric characters are ^, *, and ž."
+            },
+            "outputs": false
+        }
+    ]
+}
diff --git a/filters/ambiguouscharacters/README.md b/filters/ambiguouscharacters/README.md
@@ -0,0 +1,28 @@
+## Ambiguous Characters Filter
+
+## What type of a filter is this?
+
+This filter rejects sentences that contain ambiguous characters, i.e. characters that are easily mistaken for others (such as `0`/`O` or `l`/`1`/`I`).
+(Aside: `Buffalo buffalo buffalo Buffalo buffalo Buffalo buffalo buffalo --> Filter filters filter filter filters filter filters filter`?)
+
+Author: Mo Tiwari
+Author Email: [email protected]
+Author Affiliation: Stanford University
+
+## Why is measuring performance on this split important?
+This filter can be used to either a) select text with ambiguous characters, or b) select text that contains only unambiguous characters.
+This is of import in domains such as profanity detection and spam, where bad actors may attempt to work around existing filters by using characters that can be easily mistaken for others.
+
+For example, "Buy some piIIs here" actually contains two capital `I`s in place of the `l`s.
+
+This feature is also common in password managers, e.g. as a setting to avoid ambiguous characters when generating
+passwords.
+
+## Related Work
+
+N/A 
+
+## What are the limitations of this filter?
+- The usefulness of the filter depends on the font in which the initial text was rendered; future work could accept the 
+source font as an argument.
+- The filter is also primarily useful for the English language.
diff --git a/filters/ambiguouscharacters/__init__.py b/filters/ambiguouscharacters/__init__.py
@@ -0,0 +1 @@
+from .filter import *
diff --git a/filters/ambiguouscharacters/filter.py b/filters/ambiguouscharacters/filter.py
@@ -0,0 +1,31 @@
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+
+class AmbiguousCharactersFilter(SentenceOperation):
+    """
+    Filters sentence that contain ambiguous characters. The characters that might be ambiguous are defined below.
+
+    Inherits SentenceOperation.
+    """
+    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION]
+    languages = ["en"]
+    keywords = ["highly-meaning-preserving", "low-generations", "rule-based"]
+
+    def __init__(self):
+        super().__init__()
+        self.ambiguous_chars = [
+                                '0', 'O', 'D', 'o', 'Q',
+                                'l', '1', 'I', 'i', '!', '|',
+                                'B', '8',
+                                'Z', '2',
+                                'S', '5',
+                                'G', '6',
+                                "'", '`',
+                                ]
+
+    def filter(self, sentence: str = None) -> bool:
+        for c in sentence:
+            if c in self.ambiguous_chars:
+                return False
+        return True
diff --git a/filters/ambiguouscharacters/test.json b/filters/ambiguouscharacters/test.json
@@ -0,0 +1,5 @@
+{
+    "type": "ambiguouscharacters",
+    "test_cases": [
+    ]
+}
diff --git a/test/mapper.py b/test/mapper.py
@@ -85,6 +85,8 @@
 # Mapping of light and heavy filters
 map_filter = {
     "light": [
+        "alphanumeric",
+        "ambiguouscharacters",
         "code_mixing",
         "encoding",
         "group_inequity",