-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Handle Punctuation in
get_*_text
(#3)
* Handle Punctuation in `get_*_text` * Handle Negative Numbers and Optimize
- Loading branch information
1 parent
ea6a43b
commit 2e411a4
Showing
4 changed files
with
190 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
import re | ||
|
||
punct_dict = dict( | ||
zip( | ||
'''!"'(),-./:;?[]{}~·‐‑‒–—―‘’“”…⋮⋯⸱⸳⸺⸻、。〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟・︐︑︒︓︔︕︖︗︘︙︱︲︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹇﹈﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹣!"'(),-./:;?[]{}~⦅⦆。「」、・''', | ||
'''!"'(),-./:;?[]{}~·------‘’“”………··--,.‘’“”“”‘’[][][][][]~“””·,,.:;!?[]…--(){}[][]“”‘’“”‘’[],,.;:?!-(){}[]-!"'(),-./:;?[]{}~().“”,·''' | ||
) | ||
) | ||
|
||
left_bracket = '([{‘“' | ||
right_bracket = ')]}’”' | ||
left_bracket_to_right = dict(zip(left_bracket, right_bracket)) | ||
left_bracket = {*left_bracket} | ||
right_bracket = {*right_bracket} | ||
left_punct = {*left_bracket} | ||
right_punct = {*'!,.:;?…', *right_bracket} | ||
other_punct = {*'''"'·-~'''} | ||
left_or_other_punct = {' ', *left_punct, *other_punct} | ||
right_or_other_punct = {*right_punct, *other_punct} | ||
|
||
minus_signs = {*'-﹣-'} # U+2212 is unnecessary | ||
decimal_seps = {*'''',.·⸱⸳﹒'.'''} | ||
digits = {*'00𝟎𝟘𝟢𝟬𝟶🯰11𝟏𝟙𝟣𝟭𝟷🯱22𝟐𝟚𝟤𝟮𝟸🯲33𝟑𝟛𝟥𝟯𝟹🯳44𝟒𝟜𝟦𝟰𝟺🯴55𝟓𝟝𝟧𝟱𝟻🯵66𝟔𝟞𝟨𝟲𝟼🯶77𝟕𝟟𝟩𝟳𝟽🯷88𝟖𝟠𝟪𝟴𝟾🯸99𝟗𝟡𝟫𝟵𝟿🯹'} | ||
unknown_or_hyphen = {'', '-'} | ||
|
||
def format_romanization_text(s, conv): | ||
def inner(m): | ||
t = [None] | ||
d = [None] | ||
for k, v in conv(m[0]): | ||
if v: | ||
t += [v] | ||
d += [None] | ||
elif not k.isspace(): | ||
t += [punct_dict.get(k, '')] | ||
d += [k] | ||
t += [None] | ||
d += [None] | ||
l = '' | ||
b = '' | ||
for i, (p, c, n) in enumerate(zip(t, t[1:], t[2:]), 1): | ||
def between(): | ||
nonlocal t, i | ||
j = i - 1 | ||
while j and t[j] in right_bracket: | ||
j -= 1 | ||
f = j and t[j] and len(t[j]) > 1 | ||
j = i + 1 | ||
while j < len(t) - 1 and t[j] in left_bracket: | ||
j += 1 | ||
g = j and t[j] and len(t[j]) > 1 | ||
return f and g | ||
|
||
def lspace(): | ||
nonlocal l | ||
if l and l[-1] not in left_or_other_punct: | ||
l += ' ' | ||
|
||
def rspace(): | ||
nonlocal n, l | ||
if i < len(d) - 2 and d[i + 2] in digits if d[i + 1] in minus_signs else n not in right_or_other_punct: | ||
l += ' ' | ||
|
||
if len(c) > 1: | ||
lspace() | ||
l += c | ||
rspace() | ||
elif not c or d[i] in minus_signs and d[i + 1] in digits and p not in unknown_or_hyphen: | ||
if not l.endswith('[…]'): | ||
l += '[…]' | ||
elif d[i] in decimal_seps and d[i + 1] in digits and d[i - 1] in digits: | ||
continue | ||
elif c in left_punct: | ||
lspace() | ||
l += c | ||
b += left_bracket_to_right[c] | ||
elif c in right_punct: | ||
l += c | ||
rspace() | ||
try: | ||
b = b[:b.rindex(c)] | ||
except ValueError: | ||
pass | ||
elif c == '-': | ||
if p == '-': | ||
continue | ||
if n == '-' or between(): | ||
l += ' – ' | ||
else: | ||
l += c | ||
elif c == '~': | ||
if p == '~' and n != '~' or between(): | ||
l += '~ ' | ||
else: | ||
l += c | ||
elif c == '·': | ||
l += c | ||
else: | ||
j = len(b) - 1 | ||
y = False | ||
while j >= 0 and b[j] not in right_bracket: | ||
if b[j] == c: | ||
y = True | ||
break | ||
j -= 1 | ||
if y: | ||
b = b[:j] | ||
l += c | ||
rspace() | ||
else: | ||
lspace() | ||
l += c | ||
b += c | ||
return ' '.join(l.split()) | ||
|
||
return re.sub(r'[^\0-\x1f\x80-\x9f]+', inner, s) | ||
|
||
major_break = {*'.!?…'} | ||
minor_break = {*',/:;-~()[]{}'} | ||
|
||
def format_ipa_text(s, conv): | ||
def inner(m): | ||
t = [] | ||
d = [] | ||
for k, v in conv(m[0]): | ||
if v: | ||
t += [v] | ||
d += [None] | ||
elif not k.isspace(): | ||
t += [punct_dict.get(k, '')] | ||
d += [k] | ||
d += [None] | ||
l = [] | ||
for i, c in enumerate(t): | ||
if len(c) > 1: | ||
l += [c] | ||
elif not c or d[i] in minus_signs and d[i + 1] in digits and i and t[i - 1] not in unknown_or_hyphen: | ||
if not l or l[-1] != '⸨…⸩': | ||
l += ['⸨…⸩'] | ||
elif l: | ||
if d[i] in decimal_seps and d[i + 1] in digits and i and d[i - 1] in digits: | ||
continue | ||
if c in major_break: | ||
if len(l[-1]) > 1: | ||
l += ['‖'] | ||
else: | ||
l[-1] = '‖' | ||
elif c in minor_break and len(l[-1]) > 1: | ||
l += ['|'] | ||
if len(l[-1]) == 1: | ||
l.pop() | ||
s = '' | ||
for i, c in enumerate(l): | ||
s += c | ||
if i < len(l) - 1: | ||
n = l[i + 1] | ||
if c != '⸨…⸩' and len(c) > 1 and n != '⸨…⸩' and len(n) > 1: | ||
s += '.' | ||
else: | ||
s += ' ' | ||
return s | ||
|
||
return re.sub(r'[^\0-\x1f\x80-\x9f]+', inner, s) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = '0.2.1' | ||
__version__ = '0.2.2' |