From fdd7c9641ea6b14af6dfd0bf21456a7071b33332 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Wed, 20 Sep 2023 16:05:46 +0200 Subject: [PATCH] Ara: improving Adj inflection by identifying fcl patterns from concrete forms --- src/arabic/MorphoAra.gf | 6 +++-- src/arabic/ParadigmsAra.gf | 32 +++++++++++++++++++++--- src/arabic/wiktionary/Makefile | 7 ++++++ src/arabic/wiktionary/read_wiktionary.py | 28 +++++++++++++++++++-- 4 files changed, 65 insertions(+), 8 deletions(-) create mode 100644 src/arabic/wiktionary/Makefile diff --git a/src/arabic/MorphoAra.gf b/src/arabic/MorphoAra.gf index 808223b4d..53f7a2608 100644 --- a/src/arabic/MorphoAra.gf +++ b/src/arabic/MorphoAra.gf @@ -153,7 +153,8 @@ oper w + "ف" + x + "ع" + y + "ل" + z => { h = w ; m1 = x; m2 = y; t = z} ; w + "ف" + x + ("ع"|"ل") + y - => { h = w ; m1 = x; m2 = ""; t = y} + => { h = w ; m1 = x; m2 = ""; t = y} ; + _ => Predef.error("cannot get FCL pattern from" ++ pat) } ; --opers to interdigitize (make words out of roots and patterns: @@ -204,7 +205,8 @@ oper => mkAssimilated pat (mkRoot3 rS) ; ? + ? 
+ _ => mkBilit pat (mkRoot2 rS) ; --2=> _=> error rS ---- AR error "expected 3--6" - } + } ; + _ => Predef.error("cannot get FCL pattern from" ++ pS) }; ----------------------------------------------------------------------------- diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf index 1b3cfc85b..3d1623e14 100644 --- a/src/arabic/ParadigmsAra.gf +++ b/src/arabic/ParadigmsAra.gf @@ -898,12 +898,30 @@ oper = \r -> mkA r.root ; mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A - = \r -> mkA r.root ; + mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A + = \r -> mkA r.root r.sg_patt r.pl_patt ; + mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A + = \r -> mkA r.root r.sg_patt r.pl_patt ; + mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + mkA : {masc_sg, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A + = \r -> mkA r.root ; ---- + mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A + = \r -> mkA r.root ; ---- + mkA : {masc_sg, fem_sg, root : Str} -> A + = \r -> mkA r.root ; ---- + mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A + = \r -> mkA r.masc_sg ; ---- mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A - = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; mkA : {masc_sg : Str; fem_sg : Str} -> A = \r -> mkA r.masc_sg ; ---- mkA : {masc_sg : Str; masc_pl : Str; 
fem_sg : Str; fem_pl : Str} -> A @@ -914,8 +932,14 @@ oper = \r -> mkA r.masc_sg ; ---- mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A = \r -> mkA r.root ; + mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A + = \r -> mkA r.sg_patt r.pl_patt ; mkA : {masc_sg : Str; masc_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A + = \r -> mkA r.masc_sg ; ---- mkA : {masc_sg : Str; root : Str} -> A = \r -> mkA r.root ; mkA : {masc_sg : Str} -> A diff --git a/src/arabic/wiktionary/Makefile b/src/arabic/wiktionary/Makefile new file mode 100644 index 000000000..80e1da791 --- /dev/null +++ b/src/arabic/wiktionary/Makefile @@ -0,0 +1,7 @@ +all: + python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf + python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf + python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl + gf -make MorphoDictAra.gf + python3 read_wiktionary.py eval-funs >1-eval.txt + python3 to_wordnet.py >wornet-arabic.jsonl diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 434617231..960a592d3 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -122,7 +122,7 @@ def get_gzip_json(file, sample=100000, langs=[]): 0x638: 'Z', # ظ 0x639: 'E', # ع 0x63a: 'g', # غ - 0x641: 'f', # ف + 0x641: 'f', # ف 0x642: 'q', # ق 0x643: 'k', # ك 0x644: 'l', # ل @@ -144,6 +144,7 @@ def get_gzip_json(file, sample=100000, langs=[]): 0x671: '{' # ٱ } + buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}} @@ -184,6 +185,24 @@ def get_sound_trigram_root(s): else: return None + +# reverse engineer fcl pattern from a given form, with a sound trigram root +# one more condition: each of the root letters occurs exactly once +# TODO: better
use the given root of the lex entry
+def get_sound_fcl_pattern(s):
+    """Return s with its three root letters replaced by fa-ayn-lam, else None."""
+    root = get_sound_trigram_root(s)
+    # each root letter must occur exactly once, so its position is unambiguous
+    if not root or len(root) != 3 or any(s.count(c) != 1 for c in root):
+        return None
+    p, r = list(s), -1
+    for c, fcl in zip(root, map(chr, (0x641, 0x639, 0x644))):  # fa, ayn, lam
+        r = s.find(c, r + 1)  # search only after the previous root letter
+        if r < 0:
+            return None  # root letters are not in root order within s
+        p[r] = fcl
+    return ''.join(p)
+
 # Wikt uses vowel+shadda which is a Unicode normalization
 # GF uses shadda+vowel which is linguistically correct
 
@@ -324,7 +343,12 @@ def forms_for_pos(obj):
                 'fem_pl': [form for form, descr in forms if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
                 }
-            }
+            }
+        for patt in ['masc_sg', 'masc_pl']:
+            if patt in gf_entry['args']:
+                if form := gf_entry['args'][patt]:
+                    if spatt := get_sound_fcl_pattern(form[0]):
+                        gf_entry['args'][patt[5:]+'_patt'] = [spatt]   # sg_patt, pl_patt
     else:
         gf_entry = {f: d for f, d in forms}