Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added danish, faroese, icelandic, italian, norwegian, scandinavian, s… #36

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cleantext/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ftfy import fix_text

from . import constants
from .specials import save_replace
from .specials import save_replace, specials_map
from .utils import remove_substrings

log = logging.getLogger()
Expand Down Expand Up @@ -78,13 +78,13 @@ def to_ascii_unicode(text, lang="en", no_emoji=False):

lang = lang.lower()
# special handling for every language listed in specials_map (e.g. German umlauts) so its characters are preserved across unidecode
if lang == "de":
if lang in specials_map:
text = save_replace(text, lang=lang)

text = unidecode(text)

# important to remove utility characters
if lang == "de":
if lang in specials_map:
text = save_replace(text, lang=lang, back=True)

if not no_emoji:
Expand Down Expand Up @@ -253,7 +253,7 @@ def clean(
replace_with_digit (str): special DIGIT token, default "0",
replace_with_currency_symbol (str): special CURRENCY token, default "<CUR>",
replace_with_punct (str): replace punctuations with this token, default "",
lang (str): special language-depended preprocessing. Besides the default English ('en'), only German ('de') is supported
lang (str): special language-dependent preprocessing. Besides the default English ('en'), Danish ('da'), Faroese ('fo'), French ('fr'), German ('de'), Icelandic ('is'), Italian ('it'), Norwegian ('no'), Scandinavian ('sv'), Spanish ('es'), and Swedish ('se') are supported

Returns:
str: input ``text`` processed according to function args
Expand Down
44 changes: 40 additions & 4 deletions cleantext/specials.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,46 @@
import unicodedata

# add new languages here
specials = {
# Per-language table of characters that get special treatment.
#
# Each language code maps to two lists of [character, replacement] pairs:
#   * "case_sensitive": used exactly as written.
#   * "case_insensitive": used as written AND in an upper-cased variant that
#     save_replace derives with ``.upper()`` on both elements. Write these
#     characters in lower case, otherwise the lower-case form never matches.
#
# NOTE(review): several characters of one language share a replacement
# (e.g. "é"/"è"/"ê"/"ë" -> "e"). If save_replace(..., back=True) restores the
# original character from the replacement, such entries cannot round-trip --
# confirm against save_replace.
# NOTE(review): in ISO 639-1 "sv" is Swedish and "se" is Northern Sami; the
# keys are kept unchanged for backward compatibility -- confirm intended codes.
specials_map = {
    "de": {
        "case_insensitive": [["ä", "ae"], ["ü", "ue"], ["ö", "oe"]],
        "case_sensitive": [["ß", "ss"]],
    },
    "da": {
        "case_insensitive": [["é", "e"], ["æ", "ae"], ["ø", "oe"], ["å", "aa"]],
        "case_sensitive": [],
    },
    "es": {
        "case_insensitive": [
            ["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"],
            ["ñ", "n"],
        ],
        "case_sensitive": [],
    },
    "fo": {
        "case_insensitive": [
            ["á", "a"], ["ð", "d"], ["í", "i"], ["ó", "o"], ["ú", "u"],
            ["æ", "ae"], ["ø", "oe"],
        ],
        "case_sensitive": [],
    },
    "fr": {
        "case_insensitive": [
            ["é", "e"], ["à", "a"], ["è", "e"], ["ù", "u"], ["â", "a"],
            ["ê", "e"], ["î", "i"], ["ô", "o"], ["û", "u"], ["ë", "e"],
            ["ï", "i"], ["ü", "u"], ["ÿ", "y"], ["ç", "c"],
        ],
        "case_sensitive": [],
    },
    "is": {
        "case_insensitive": [
            ["á", "a"], ["ð", "d"], ["é", "e"], ["í", "i"], ["ó", "o"],
            ["ú", "u"], ["ý", "y"], ["þ", "th"], ["æ", "ae"], ["ö", "oe"],
        ],
        "case_sensitive": [],
    },
    "it": {
        "case_insensitive": [
            ["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"],
            ["à", "a"], ["è", "e"], ["ì", "i"], ["ò", "o"], ["ù", "u"],
        ],
        "case_sensitive": [],
    },
    "no": {
        "case_insensitive": [
            ["é", "e"], ["ó", "o"], ["è", "e"], ["ò", "o"], ["ù", "u"],
            ["ê", "e"], ["ô", "o"], ["æ", "ae"], ["ø", "oe"], ["å", "aa"],
        ],
        "case_sensitive": [],
    },
    "sv": {
        "case_insensitive": [
            ["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"],
            ["è", "e"], ["ý", "y"], ["ò", "o"], ["ù", "u"], ["ê", "e"],
            ["ô", "o"], ["ð", "d"], ["þ", "th"], ["æ", "ae"], ["ø", "oe"],
            ["å", "aa"], ["ä", "ae"], ["ö", "oe"],
        ],
        "case_sensitive": [],
    },
    "se": {
        "case_insensitive": [["å", "aa"], ["ä", "ae"], ["ö", "oe"]],
        "case_sensitive": [],
    },
}
escape_sequence = "xxxxx"
Expand All @@ -26,11 +62,11 @@ def save_replace(text, lang, back=False):
text = norm(text)

possibilities = (
specials[lang]["case_sensitive"]
+ [[norm(x[0]), x[1]] for x in specials[lang]["case_insensitive"]]
specials_map[lang]["case_sensitive"]
+ [[norm(x[0]), x[1]] for x in specials_map[lang]["case_insensitive"]]
+ [
[norm(x[0].upper()), x[1].upper()]
for x in specials[lang]["case_insensitive"]
for x in specials_map[lang]["case_insensitive"]
]
)
for pattern, target in possibilities:
Expand Down