jfilter · peter-sk · May 9, 2023
diff --git a/cleantext/clean.py b/cleantext/clean.py
@@ -11,7 +11,7 @@
 from ftfy import fix_text
 
 from . import constants
-from .specials import save_replace
+from .specials import save_replace, specials_map
 from .utils import remove_substrings
 
 log = logging.getLogger()
@@ -78,13 +78,13 @@ def to_ascii_unicode(text, lang="en", no_emoji=False):
 
     lang = lang.lower()
     # special handling for German text to preserve umlauts
-    if lang == "de":
+    if lang in specials_map:
         text = save_replace(text, lang=lang)
 
     text = unidecode(text)
 
     # important to remove utility characters
-    if lang == "de":
+    if lang in specials_map:
         text = save_replace(text, lang=lang, back=True)
 
     if not no_emoji:
@@ -253,7 +253,7 @@ def clean(
         replace_with_digit (str): special DIGIT token, default "0",
         replace_with_currency_symbol (str): special CURRENCY token, default "<CUR>",
         replace_with_punct (str): replace punctuations with this token, default "",
-        lang (str): special language-depended preprocessing. Besides the default English ('en'), only German ('de') is supported
+        lang (str): special language-dependent preprocessing. Besides the default English ('en'), Danish ('da'), Faroese ('fo'), French ('fr'), German ('de'), Icelandic ('is'), Italian ('it'), Norwegian ('no'), Scandinavian ('sv'), Spanish ('es'), and Swedish ('se') are supported. Note that 'se' (not the ISO 639-1 code 'sv') selects Swedish here
 
     Returns:
         str: input ``text`` processed according to function args

diff --git a/cleantext/specials.py b/cleantext/specials.py
@@ -5,10 +5,46 @@
 import unicodedata
 
 # add new languages here
-specials = {
+specials_map = {  # NOTE: each target is an ASCII placeholder that save_replace(..., back=True) is expected to turn back into its character, so targets must be unique within a language and must not contain "x" (the escape character)
     "de": {
         "case_insensitive": [["ä", "ae"], ["ü", "ue"], ["ö", "oe"]],
         "case_sensitive": [["ß", "ss"]],
+    },
+    "da": {  # Danish; list characters in lower case, the upper-case variant is derived with str.upper()
+        "case_insensitive": [["é", "eacute"], ["æ", "aelig"], ["ø", "oslash"], ["å", "aring"]],
+        "case_sensitive": [],
+    },
+    "es": {  # Spanish
+        "case_insensitive": [["á", "aacute"], ["é", "eacute"], ["í", "iacute"], ["ó", "oacute"], ["ú", "uacute"], ["ñ", "ntilde"]],
+        "case_sensitive": [],
+    },
+    "fo": {  # Faroese
+        "case_insensitive": [["á", "aacute"], ["ð", "eth"], ["í", "iacute"], ["ó", "oacute"], ["ú", "uacute"], ["æ", "aelig"], ["ø", "oslash"]],
+        "case_sensitive": [],
+    },
+    "fr": {  # French
+        "case_insensitive": [["é", "eacute"], ["à", "agrave"], ["è", "egrave"], ["ù", "ugrave"], ["â", "acirc"], ["ê", "ecirc"], ["î", "icirc"], ["ô", "ocirc"], ["û", "ucirc"], ["ë", "euml"], ["ï", "iuml"], ["ü", "uuml"], ["ÿ", "yuml"], ["ç", "ccedil"]],
+        "case_sensitive": [],
+    },
+    "is": {  # Icelandic
+        "case_insensitive": [["á", "aacute"], ["ð", "eth"], ["é", "eacute"], ["í", "iacute"], ["ó", "oacute"], ["ú", "uacute"], ["ý", "yacute"], ["þ", "thorn"], ["æ", "aelig"], ["ö", "ouml"]],
+        "case_sensitive": [],
+    },
+    "it": {  # Italian
+        "case_insensitive": [["á", "aacute"], ["é", "eacute"], ["í", "iacute"], ["ó", "oacute"], ["ú", "uacute"], ["à", "agrave"], ["è", "egrave"], ["ì", "igrave"], ["ò", "ograve"], ["ù", "ugrave"]],
+        "case_sensitive": [],
+    },
+    "no": {  # Norwegian
+        "case_insensitive": [["é", "eacute"], ["ó", "oacute"], ["è", "egrave"], ["ò", "ograve"], ["ù", "ugrave"], ["ê", "ecirc"], ["ô", "ocirc"], ["æ", "aelig"], ["ø", "oslash"], ["å", "aring"]],
+        "case_sensitive": [],
+    },
+    "sv": {  # combined Scandinavian character set (NOTE: "sv" is the ISO 639-1 code for Swedish; Swedish itself is keyed "se" below)
+        "case_insensitive": [["á", "aacute"], ["é", "eacute"], ["í", "iacute"], ["ó", "oacute"], ["ú", "uacute"], ["è", "egrave"], ["ý", "yacute"], ["ò", "ograve"], ["ù", "ugrave"], ["ê", "ecirc"], ["ô", "ocirc"], ["ð", "eth"], ["þ", "thorn"], ["æ", "aelig"], ["ø", "oslash"], ["å", "aring"], ["ä", "auml"], ["ö", "ouml"]],
+        "case_sensitive": [],
+    },
+    "se": {  # Swedish
+        "case_insensitive": [["å", "aring"], ["ä", "auml"], ["ö", "ouml"]],
+        "case_sensitive": [],
+    }
 }
 escape_sequence = "xxxxx"
@@ -26,11 +62,11 @@ def save_replace(text, lang, back=False):
     text = norm(text)
 
     possibilities = (
-        specials[lang]["case_sensitive"]
-        + [[norm(x[0]), x[1]] for x in specials[lang]["case_insensitive"]]
+        specials_map[lang]["case_sensitive"]
+        + [[norm(x[0]), x[1]] for x in specials_map[lang]["case_insensitive"]]
         + [
             [norm(x[0].upper()), x[1].upper()]
-            for x in specials[lang]["case_insensitive"]
+            for x in specials_map[lang]["case_insensitive"]
         ]
     )
     for pattern, target in possibilities: