From 8420520bbe78b9099bac92edbb380774cfa66512 Mon Sep 17 00:00:00 2001 From: motiwari Date: Sun, 28 Nov 2021 18:22:03 -0800 Subject: [PATCH] Adding ambiguous characters filter Renaming ambiguousfilter folder Adding alphanumeric characters filter Removing unused imports and dead code in alphanumeric filter, renaming class Adding ambiguous characters filter Fixing import error by renaming alphanumeric_filter.py -> filter.py Adding keywords Addressing reviewers comments Adding necessary import Making recommended changes --- filters/alphanumeric/README.md | 26 ++++++++++++++++ filters/alphanumeric/__init__.py | 1 + filters/alphanumeric/filter.py | 22 ++++++++++++++ filters/alphanumeric/test.json | 40 +++++++++++++++++++++++++ filters/ambiguouscharacters/README.md | 28 +++++++++++++++++ filters/ambiguouscharacters/__init__.py | 1 + filters/ambiguouscharacters/filter.py | 31 +++++++++++++++++++ filters/ambiguouscharacters/test.json | 5 ++++ test/mapper.py | 2 ++ 9 files changed, 156 insertions(+) create mode 100644 filters/alphanumeric/README.md create mode 100644 filters/alphanumeric/__init__.py create mode 100644 filters/alphanumeric/filter.py create mode 100644 filters/alphanumeric/test.json create mode 100644 filters/ambiguouscharacters/README.md create mode 100644 filters/ambiguouscharacters/__init__.py create mode 100644 filters/ambiguouscharacters/filter.py create mode 100644 filters/ambiguouscharacters/test.json diff --git a/filters/alphanumeric/README.md b/filters/alphanumeric/README.md new file mode 100644 index 000000000..428794cce --- /dev/null +++ b/filters/alphanumeric/README.md @@ -0,0 +1,26 @@ +## Alphanumeric Characters Filter + +## What type of filter is this? + +This transformation filters text that contains characters which are non-alphanumeric and not common punctuation. +The alphabetical characters are determined by the 26 letters of the English alphabet. 
+ +Author: Mo Tiwari +Author Email: motiwari@stanford.edu +Author Affiliation: Stanford University + +## Why is measuring performance on this split important? +This filter can be used to a) select text with only characters from a standard alphabet and +b) remove characters that are specifically meant to circumvent filters, e.g. text that uses +`buy some pi//s` if the string `buy some pills` triggers spam filters. + +This is of import in domains such as profanity detection and spam, where bad actors may attempt to work around existing filters by using characters that can be easily mistaken for others. + +## Related Work + +N/A + +## What are the limitations of this filter? +- Currently, the filter only permits characters as defined by the English alphabet. +The filter could be extended to handle the characters from other alphabets via the `args` +provided. \ No newline at end of file diff --git a/filters/alphanumeric/__init__.py b/filters/alphanumeric/__init__.py new file mode 100644 index 000000000..1e78c9bed --- /dev/null +++ b/filters/alphanumeric/__init__.py @@ -0,0 +1 @@ +from .filter import * diff --git a/filters/alphanumeric/filter.py b/filters/alphanumeric/filter.py new file mode 100644 index 000000000..e98b02033 --- /dev/null +++ b/filters/alphanumeric/filter.py @@ -0,0 +1,22 @@ +from tasks.TaskTypes import TaskType +from interfaces.SentenceOperation import SentenceOperation + +class AlphanumericFilter(SentenceOperation): + """ + Filters sentences that contain characters which are a) not alphanumeric and b) not common punctuation. + + Inherits SentenceOperation. 
+ """ + tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION] + languages = ["en"] + keywords = ["highly-meaning-preserving", "low-generations", "rule-based"] + + def __init__(self): + super().__init__() + self.punctuation = ['!', '.', '?', "'", '"', '(', ')', '-', ':', ';', ' '] + + def filter(self, sentence: str = None) -> bool: + for c in sentence: + if not c.isalnum() and c not in self.punctuation: + return False + return True diff --git a/filters/alphanumeric/test.json b/filters/alphanumeric/test.json new file mode 100644 index 000000000..42e509320 --- /dev/null +++ b/filters/alphanumeric/test.json @@ -0,0 +1,40 @@ +{ + "type": "keywords", + "test_cases": [ + { + "class": "AlphanumericFilter", + "inputs": { + "sentence": "Andrew played cricket in India." + }, + "outputs": true + }, + { + "class": "AlphanumericFilter", + "inputs": { + "sentence": "∂ is a Greek letter." + }, + "outputs": false + }, + { + "class": "AlphanumericFilter", + "inputs": { + "sentence": "I love tennis!" + }, + "outputs": true + }, + { + "class": "AlphanumericFilter", + "inputs": { + "sentence": "¿Cómo estás?" + }, + "outputs": false + }, + { + "class": "AlphanumericFilter", + "inputs": { + "sentence": "Some non-alphanumeric characters are ^, *, and ž." + }, + "outputs": false + } + ] +} \ No newline at end of file diff --git a/filters/ambiguouscharacters/README.md b/filters/ambiguouscharacters/README.md new file mode 100644 index 000000000..b5f2c4461 --- /dev/null +++ b/filters/ambiguouscharacters/README.md @@ -0,0 +1,28 @@ +## Ambiguous Characters Filter + +## What type of a filter is this? + +This filter filters sentences that contain ambiguous characters. +(Aside: `Buffalo buffalo buffalo Buffalo buffalo Buffalo buffalo buffalo --> Filter filters filter filter filters filter filters filter`?) + +Author: Mo Tiwari +Author Email: motiwari@stanford.edu +Author Affiliation: Stanford University + +## Why is measuring performance on this split important? 
+This filter can be used to either a) select text with ambiguous characters, or b) select text that contains only unambiguous characters. +This is of import in domains such as profanity detection and spam, where bad actors may attempt to work around existing filters by using characters that can be easily mistaken for others. + +For example, "Buy some piIIs here" actually contains two capital `I`s in place of the `l`s. + +This feature is also common in password managers, e.g. as a setting to avoid ambiguous characters when generating +passwords. + +## Related Work + +N/A + +## What are the limitations of this filter? +- The usefulness of the filter depends on the font in which the initial text was rendered; future work could accept the +source font as an argument. +- The filter is also primarily useful for the English language. \ No newline at end of file diff --git a/filters/ambiguouscharacters/__init__.py b/filters/ambiguouscharacters/__init__.py new file mode 100644 index 000000000..1e78c9bed --- /dev/null +++ b/filters/ambiguouscharacters/__init__.py @@ -0,0 +1 @@ +from .filter import * diff --git a/filters/ambiguouscharacters/filter.py b/filters/ambiguouscharacters/filter.py new file mode 100644 index 000000000..1cf3d23a2 --- /dev/null +++ b/filters/ambiguouscharacters/filter.py @@ -0,0 +1,31 @@ +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType + + +class AmbiguousCharactersFilter(SentenceOperation): + """ + Filters sentences that contain ambiguous characters. The characters that might be ambiguous are defined below. + + Inherits SentenceOperation. 
+ """ + tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION] + languages = ["en"] + keywords = ["highly-meaning-preserving", "low-generations", "rule-based"] + + def __init__(self): + super().__init__() + self.ambiguous_chars = [ + '0', 'O', 'D', 'o', 'Q', + 'l', '1', 'I', 'i', '!', '|', + 'B', '8', + 'Z', '2', + 'S', '5', + 'G', '6', + "'", '`', + ] + + def filter(self, sentence: str = None) -> bool: + for c in sentence: + if c in self.ambiguous_chars: + return False + return True diff --git a/filters/ambiguouscharacters/test.json b/filters/ambiguouscharacters/test.json new file mode 100644 index 000000000..e96b978c1 --- /dev/null +++ b/filters/ambiguouscharacters/test.json @@ -0,0 +1,5 @@ +{ + "type": "ambiguouscharacters", + "test_cases": [ + ] +} \ No newline at end of file diff --git a/test/mapper.py b/test/mapper.py index 1238534cd..1ae73a9a4 100644 --- a/test/mapper.py +++ b/test/mapper.py @@ -85,6 +85,8 @@ # Mapping of light and heavy filters map_filter = { "light": [ + "alphanumeric", + "ambiguouscharacters", "code_mixing", "encoding", "group_inequity",