-
Notifications
You must be signed in to change notification settings - Fork 0
/
arabtype.h
227 lines (197 loc) · 7.4 KB
/
arabtype.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/*
** Arabtype Copyright 2012-2015(c) Wael El Oraiby. All Rights Reserved
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
** Under Section 7 of GPL version 3, you are granted additional
** permissions described in the GCC Runtime Library Exception, version
** 3.1, as published by the Free Software Foundation.
** You should have received a copy of the GNU General Public License and
** a copy of the GCC Runtime Library Exception along with this program;
** see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
** <http://www.gnu.org/licenses/>.
**
** see README.md file for more information
*/
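/**
* @brief get_presentation_form_b decodes the UTF-8 text of a line and replaces
*        its Arabic letters with their contextual presentation forms (B)
* @param ar_ln_ptr line descriptor: in_byte / bytes give the input utf8 string
*        and its length, in_utf / utfs give the code point buffer and its size
*        (in bytes); on success utfs is overwritten with the number of code
*        points written to in_utf
* @return number of code points decoded, or 0 if the input is not well-formed utf8
*/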
/*************************************
* ARAB_TEXT_TO_UNICODE_CONVERTER
* AUTHOR OF MODIFICATIONS : alahem monsef
* EMAIL : [email protected]
* VERSION : 0.1
* DATE : 10-05-2019
*
*************************************/
#include <stdbool.h>
#include <stdio.h>
#include "utf8.h"
/* First and last code points covered by the arabic_forms_b table (one row per code point) */
#define ARABIC_LETTER_START 0x621
#define ARABIC_LETTER_END 0x64A
/* Column indices into a char_form_t: the four contextual forms of a letter */
#define ISOLATED 0
#define ENDING 1
#define INITIAL 2
#define MEDIAL 3
/* Code point of lam, the letter that opens a lam-alef ligature */
#define UNICODE_LAM 0x644
//
// Contextual forms of one letter, indexed by its position in the word:
// 0: isolated form
// 1: ending form
// 2: beginning form (if 0, it's a cutting type: the letter has no form that joins the next one)
// 3: middle form
//
typedef int char_form_t[4];
// Table row for one letter: [0] the letter's own forms,
// [1] the forms used when the letter is combined with a preceding lam
typedef char_form_t all_form_t[2];
//
// Presentation form lookup table, indexed by (code point - ARABIC_LETTER_START).
// The number in each row comment is that index.
//
// Row layout: { own forms, lam-alef forms }
//   own forms      : ISOLATED, ENDING, INITIAL, MEDIAL code points; a 0 in the
//                    MEDIAL column is what is_linking_type() uses to detect a
//                    letter that does not join the following one.
//   lam-alef forms : columns 2 and 3 are non-zero only for the alef variants and
//                    hold the lam-alef ligature to emit in place of the lam
//                    (column 3 when the letter before the lam is a linking one,
//                    column 2 otherwise); columns 0 and 1 hold -1, the value
//                    get_presentation_form_b() drops from its output.
//
static all_form_t arabic_forms_b[] = {
{ {0xFE80, 0xFE80, 0, 0}, {-1, -1, 0, 0} }, // hamza (0)
{ {0xFE81, 0xFE82, 0, 0}, {-1, -1, 0xFEF5, 0xFEF6} }, // 2alif madda (1)
{ {0xFE83, 0xFE84, 0, 0}, {-1, -1, 0xFEF7, 0xFEF8} }, // 2alif hamza (2)
{ {0xFE85, 0xFE86, 0, 0}, {-1, -1, 0, 0} }, // waw hamza (3)
{ {0xFE87, 0xFE88, 0, 0}, {-1, -1, 0xFEF9, 0xFEFA} }, // 2alif hamza maksoura (4)
{ {0xFE89, 0xFE8A, 0xFE8B, 0xFE8C}, {-1, -1, 0, 0} }, // 2alif maqsoura hamza (5)
{ {0xFE8D, 0xFE8E, 0, 0}, {-1, -1, 0xFEFB, 0xFEFC} }, // 2alif (6)
{ {0xFE8F, 0xFE90, 0xFE91, 0xFE92}, {-1, -1, 0, 0} }, // ba2 (7)
{ {0xFE93, 0xFE94, 0, 0}, {-1, -1, 0, 0} }, // ta2 marbouta (8)
{ {0xFE95, 0xFE96, 0xFE97, 0xFE98}, {-1, -1, 0, 0} }, // ta2 (9)
{ {0xFE99, 0xFE9A, 0xFE9B, 0xFE9C}, {-1, -1, 0, 0} }, // tha2 (10)
{ {0xFE9D, 0xFE9E, 0xFE9F, 0xFEA0}, {-1, -1, 0, 0} }, // jim (11)
{ {0xFEA1, 0xFEA2, 0xFEA3, 0xFEA4}, {-1, -1, 0, 0} }, // 7a2 (12)
{ {0xFEA5, 0xFEA6, 0xFEA7, 0xFEA8}, {-1, -1, 0, 0} }, // kha2 (13)
{ {0xFEA9, 0xFEAA, 0, 0}, {-1, -1, 0, 0} }, // dal (14)
{ {0xFEAB, 0xFEAC, 0, 0}, {-1, -1, 0, 0} }, // dhal (15)
{ {0xFEAD, 0xFEAE, 0, 0}, {-1, -1, 0, 0} }, // ra2 (16)
{ {0xFEAF, 0xFEB0, 0, 0}, {-1, -1, 0, 0} }, // zayn (17)
{ {0xFEB1, 0xFEB2, 0xFEB3, 0xFEB4}, {-1, -1, 0, 0} }, // syn (18)
{ {0xFEB5, 0xFEB6, 0xFEB7, 0xFEB8}, {-1, -1, 0, 0} }, // shin (19)
{ {0xFEB9, 0xFEBA, 0xFEBB, 0xFEBC}, {-1, -1, 0, 0} }, // sad (20)
{ {0xFEBD, 0xFEBE, 0xFEBF, 0xFEC0}, {-1, -1, 0, 0} }, // dad (21)
{ {0xFEC1, 0xFEC2, 0xFEC3, 0xFEC4}, {-1, -1, 0, 0} }, // tah (22)
{ {0xFEC5, 0xFEC6, 0xFEC7, 0xFEC8}, {-1, -1, 0, 0} }, // thah (23)
{ {0xFEC9, 0xFECA, 0xFECB, 0xFECC}, {-1, -1, 0, 0} }, // 3ayn (24)
{ {0xFECD, 0xFECE, 0xFECF, 0xFED0}, {-1, -1, 0, 0} }, // ghayn (25)
{ { 0, 0, 0, 0}, {-1, -1, 0, 0} }, // U+063B, no presentation form (26)
{ { 0, 0, 0, 0}, {-1, -1, 0, 0} }, // U+063C, no presentation form (27)
{ { 0, 0, 0, 0}, {-1, -1, 0, 0} }, // U+063D, no presentation form (28)
{ { 0, 0, 0, 0}, {-1, -1, 0, 0} }, // U+063E, no presentation form (29)
{ { 0, 0, 0, 0}, {-1, -1, 0, 0} }, // U+063F, no presentation form (30)
{ { 0x640, 0x640, 0x640, 0x640}, {-1, -1, 0, 0} }, // tatweel, U+0640, kept as is in every position (31)
{ {0xFED1, 0xFED2, 0xFED3, 0xFED4}, {-1, -1, 0, 0} }, // fa2 (32)
{ {0xFED5, 0xFED6, 0xFED7, 0xFED8}, {-1, -1, 0, 0} }, // qaf (33)
{ {0xFED9, 0xFEDA, 0xFEDB, 0xFEDC}, {-1, -1, 0, 0} }, // kaf (34)
{ {0xFEDD, 0xFEDE, 0xFEDF, 0xFEE0}, {-1, -1, 0, 0} }, // lam (35)
{ {0xFEE1, 0xFEE2, 0xFEE3, 0xFEE4}, {-1, -1, 0, 0} }, // mim (36)
{ {0xFEE5, 0xFEE6, 0xFEE7, 0xFEE8}, {-1, -1, 0, 0} }, // noon (37)
{ {0xFEE9, 0xFEEA, 0xFEEB, 0xFEEC}, {-1, -1, 0, 0} }, // ha2 (38)
{ {0xFEED, 0xFEEE, 0, 0}, {-1, -1, 0, 0} }, // waw (39)
{ {0xFEEF, 0xFEF0, 0, 0}, {-1, -1, 0, 0} }, // 2alif maksoura (40)
{ {0xFEF1, 0xFEF2, 0xFEF3, 0xFEF4}, {-1, -1, 0, 0} }, // ya2 (41)
};
/**
 * @brief is_arabic_letter tells whether a code point has a row in arabic_forms_b
 * @param cp code point to test
 * @return true when cp lies in [ARABIC_LETTER_START, ARABIC_LETTER_END]
 */
static inline bool is_arabic_letter(uint32_t cp)
{
    if( cp < ARABIC_LETTER_START ) {
        return false;
    }
    return cp <= ARABIC_LETTER_END;
}
/**
 * @brief is_lam_alef tells whether cp and next form a lam-alef ligature
 * @param cp current code point
 * @param next code point that follows cp
 * @return true when cp is lam and next is an Arabic letter whose table row
 *         carries a lam-alef form
 */
static inline bool is_lam_alef(uint32_t cp, uint32_t next)
{
    if( cp != UNICODE_LAM ) {
        return false;
    }
    /* the range test must come first: it guards the table index below */
    if( !is_arabic_letter(next) ) {
        return false;
    }
    return arabic_forms_b[next - ARABIC_LETTER_START][1][INITIAL] != 0;
}
/**
 * @brief is_alef_prev_lam tells whether cp is the second half of a lam-alef ligature
 * @param prev code point that precedes cp
 * @param cp current code point
 * @return true when prev is lam and cp is an Arabic letter whose table row
 *         carries a lam-alef form
 */
static inline bool is_alef_prev_lam(uint32_t prev, uint32_t cp)
{
    /* same predicate as is_lam_alef, looked at from the second letter of the pair */
    return is_lam_alef(prev, cp);
}
/**
 * @brief is_linking_type tells whether a letter joins the letter that follows it
 * @param cp code point to test
 * @return true when cp is an Arabic letter whose table row has a non-zero
 *         MEDIAL form
 */
static inline bool is_linking_type(uint32_t cp)
{
    if( !is_arabic_letter(cp) ) {
        return false; /* no table row to look at */
    }
    return arabic_forms_b[cp - ARABIC_LETTER_START][0][MEDIAL] != 0;
}
/**
 * @brief get_presentation_form_b_of_char maps one code point to the
 *        presentation form that matches its neighbours
 * @param prev code point that precedes cp in the input (the caller passes 0
 *        at the start of the text)
 * @param next code point that follows cp in the input (the caller passes 0
 *        at the end of the text)
 * @param cp code point to convert
 * @return cp itself when it is not an Arabic letter; the lam-alef ligature
 *         when cp is a lam followed by an alef variant; (uint32_t)-1 when cp
 *         is the alef variant already consumed by that ligature; otherwise
 *         the table form of cp selected by whether prev links to it and
 *         whether it links to next
 */
static uint32_t get_presentation_form_b_of_char(uint32_t prev, uint32_t next, uint32_t cp)
{
    if( !is_arabic_letter(cp) ) {
        return cp; /* not an Arabic letter */
    }
    bool is_la = is_lam_alef(cp, next);
    bool is_apl = is_alef_prev_lam(prev, cp);
#ifdef CLEAR_CODE
    // readable reference version of the optimized code below
    if( is_la ) {
        // lam + alef: emit the ligature stored in the row of the alef (next)
        uint32_t index = (is_linking_type(cp) << 1) | is_linking_type(prev);
        return arabic_forms_b[next - ARABIC_LETTER_START][1][index];
    }
    if( is_apl ) {
        // alef of a lam alef that was emitted with the lam: next must not be
        // used as a table index here, it may not be an Arabic letter at all
        return (uint32_t)-1;
    }
    // bit 1: cp joins the next letter, bit 0: the previous letter joins cp
    uint32_t index = ((is_arabic_letter(next) & is_linking_type(cp)) << 1) | is_linking_type(prev);
    return arabic_forms_b[cp - ARABIC_LETTER_START][0][index];
#else
    // optimized code: same selection as above without branches
    bool is_lapl = is_la | is_apl;
    uint32_t index = (((is_lapl | is_arabic_letter(next)) & is_linking_type(cp)) << 1) | is_linking_type(prev);
    // row of next for a lam alef pair, row of cp otherwise
    uint32_t ref = next * is_la + cp * (!is_la) - ARABIC_LETTER_START;
    return arabic_forms_b[ref][is_lapl][index];
#endif
}
/**
 * @brief reverso reverses an array of code points in place
 * @param in array to reverse; must hold at least cp_count elements
 * @param cp_count number of code points to reverse
 */
void reverso(uint32_t *in, const size_t cp_count)
{
    if( cp_count < 2 ) {
        return; /* nothing to swap; also keeps cp_count - 1 from wrapping */
    }
    size_t lo = 0;
    size_t hi = cp_count - 1; /* last valid index, never cp_count itself */
    while( lo < hi ) {
        uint32_t tmp = in[lo];
        in[lo] = in[hi];
        in[hi] = tmp;
        ++lo;
        --hi;
    }
}
/**
 * @brief get_presentation_form_b decodes the utf8 bytes of a line into code
 *        points, then rewrites them in place as presentation forms
 * @param ar_ln_ptr line to convert; reads in_byte / bytes (input utf8 string
 *        and its length) and utfs (size of in_utf in bytes), writes in_utf
 *        and, on success, stores the number of code points kept in utfs
 * @return number of code points decoded, or 0 when the input is not
 *         well-formed utf8 (utfs is left untouched in that case)
 */
int get_presentation_form_b(arab_line_t *ar_ln_ptr) {
    const size_t capacity = ar_ln_ptr->utfs / sizeof(uint32_t);
    uint32_t dec_state = 0;
    uint32_t decoded = 0;
    size_t n_decoded = 0;
    size_t byte_idx = 0;

    // pass 1: decode the bytes, stopping early once in_utf is full
    while( byte_idx < ar_ln_ptr->bytes && n_decoded < capacity ) {
        if( decode(&dec_state, &decoded, ar_ln_ptr->in_byte[byte_idx]) == UTF8_ACCEPT ) {
            ar_ln_ptr->in_utf[n_decoded] = decoded;
            ++n_decoded;
        }
        ++byte_idx;
    }
    if( dec_state != UTF8_ACCEPT ) {
        // the string is not well-formed
        return 0;
    }

    // pass 2: convert in place; the write position never passes the read
    // position, so the look-ahead always sees an unconverted code point
    uint32_t prev = 0;
    size_t kept = 0;
    size_t idx = 0;
    for( idx = 0; idx < n_decoded; ++idx ) {
        const uint32_t cp = ar_ln_ptr->in_utf[idx];
        uint32_t next = 0;
        if( idx + 1 < n_decoded ) {
            next = ar_ln_ptr->in_utf[idx + 1];
        }
        const uint32_t shaped = get_presentation_form_b_of_char(prev, next, cp);
        // dropped without becoming the "previous" code point of what follows
        // NOTE(review): 0x647 does not look reachable from the shaping helper - confirm
        if( shaped == 0xfeff || shaped == 0x647 ) {
            continue;
        }
        // (uint32_t)-1 marks the alef already emitted as part of a lam alef
        if( shaped != (uint32_t)-1 ) {
            ar_ln_ptr->in_utf[kept] = shaped;
            ++kept;
        }
        prev = cp;
    }
    ar_ln_ptr->utfs = kept;
    return (int)n_decoded;
}