#!/usr/bin/env python3

"""usage: ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
"""

import os.path
import sys

# Exactly three input paths are required; otherwise exit with the usage text.
if len(sys.argv) != 4:
    sys.exit(__doc__)

# Opened in command-line order: ArabicShaping.txt, UnicodeData.txt, Blocks.txt.
files = [open(path, encoding='utf-8') for path in sys.argv[1:]]

# Keep the first two lines of ArabicShaping.txt and of Blocks.txt; they are
# echoed into the banner of the generated output.  UnicodeData.txt gets a
# fixed placeholder instead.
headers = [
    [files[0].readline(), files[0].readline()],
    [files[2].readline(), files[2].readline()],
]
headers.append(["UnicodeData.txt does not have a header."])

# Advance ArabicShaping.txt past the line of '#' characters that ends its
# preamble.  readline() returns '' at end of file, so stop with an error
# there instead of spinning forever on a file that lacks the separator.
while True:
    line = files[0].readline()
    if not line:
        sys.exit("%s: separator line of '#' characters not found" % sys.argv[1])
    if '##################' in line:
        break

# Maps every code point covered by a Blocks.txt entry to its block name.
blocks = {}


def read_blocks(f):
    """Fill the module-level ``blocks`` mapping from Blocks.txt-style lines.

    ``f`` is any iterable of text lines.  Everything from a '#' to the end
    of a line is ignored, and lines without a ';' separator are skipped.
    The first field is a hexadecimal code point or a ``START..END`` range
    (inclusive); the second field is the name stored for every code point
    in that range.  Later entries overwrite earlier ones.
    """
    for line in f:
        # Drop a trailing comment, if any.
        line = line.partition('#')[0]

        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            continue

        bounds = fields[0].split('..')
        start = int(bounds[0], 16)
        end = int(bounds[1], 16) if len(bounds) > 1 else start

        # ``blocks`` is only mutated, never rebound, so no ``global`` needed.
        blocks.update(dict.fromkeys(range(start, end + 1), fields[1]))

def print_joining_table(f):
    """Print the C joining table and its lookup function to stdout.

    ``f`` is an iterable of ArabicShaping.txt-style lines: ';'-separated
    fields where field 0 is a hexadecimal code point, field 2 is used as the
    joining type and field 3 as the joining group.  Lines starting with '#'
    and lines without ';' are skipped.  Reads the module-level ``blocks``
    mapping, which must contain every code point listed in ``f``.

    Output, in order:
    * ``#define`` short aliases for every joining value in use (plus X),
    * ``joining_table[]`` with one entry per code point of each range and a
      ``joining_offset_0x....u`` define per range,
    * the ``joining_type()`` function that switches on ``u >> 12``,
    * matching ``#undef`` lines.

    Raises AssertionError if two joining values abbreviate to the same
    alias, and KeyError if a listed code point is missing from ``blocks``.
    """
    values = {}
    for line in f:
        if line.startswith('#'):
            continue

        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            continue

        u = int(fields[0], 16)

        # These two joining groups get their own value instead of a type.
        if fields[3] in ("ALAPH", "DALATH RISH"):
            value = "JOINING_GROUP_" + fields[3].replace(' ', '_')
        else:
            value = "JOINING_TYPE_" + fields[2]
        values[u] = value

    # Alias = initials of the words after the JOINING_TYPE_/JOINING_GROUP_
    # prefix, e.g. JOINING_GROUP_DALATH_RISH -> DR.
    short_value = {}
    for value in sorted(set(values.values()) | {'JOINING_TYPE_X'}):
        short = ''.join(x[0] for x in value.split('_')[2:])
        # Raised explicitly so the check survives ``python -O``.
        if short in short_value.values():
            raise AssertionError("ambiguous abbreviation %r for %s" % (short, value))
        short_value[value] = short

    print()
    for value, short in short_value.items():
        print("#define %s %s" % (short, value))

    uu = sorted(values)
    num = len(values)
    all_blocks = {blocks[u] for u in uu}

    # Merge code points into ranges; a gap of up to 1+16*5 between
    # consecutive listed code points is absorbed into the same range.
    last = -100000
    ranges = []
    for u in uu:
        if u - last <= 1 + 16 * 5:
            ranges[-1][-1] = u
        else:
            ranges.append([u, u])
        last = u

    print()
    print("static const uint8_t joining_table[] =")
    print("{")
    last_block = None
    offset = 0
    for start, end in ranges:

        print()
        print("#define joining_offset_0x%04xu %d" % (start, offset))

        for u in range(start, end + 1):

            block = blocks.get(u, last_block)
            value = values.get(u, "JOINING_TYPE_X")

            if block != last_block or u == start:
                if u != start:
                    print()
                if block in all_blocks:
                    print("\n /* %s */" % block)
                else:
                    print("\n /* FILLER */")
                last_block = block
                # Section starts mid-row: print the row label and pad up
                # to this code point's column.
                if u % 32 != 0:
                    print()
                    print(" /* %04X */" % (u // 32 * 32), " " * (u % 32), end="")

            if u % 32 == 0:
                print()
                print(" /* %04X */ " % u, end="")
            print("%s," % short_value[value], end="")
        print()

        offset += end - start + 1
    print()
    # Guard the division so empty input does not crash.
    occupancy = num * 100. / offset if offset else 0
    print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    print()

    page_bits = 12
    print()
    print("static unsigned int")
    print("joining_type (hb_codepoint_t u)")
    print("{")
    print(" switch (u >> %d)" % page_bits)
    print(" {")
    # Every page a range touches needs a case, including pages strictly
    # between the range's first and last page; otherwise code points there
    # would fall through to the default and be reported as X.
    pages = set()
    for start, end in ranges:
        pages.update(range(start >> page_bits, (end >> page_bits) + 1))
    for p in sorted(pages):
        print(" case 0x%0Xu:" % p)
        for start, end in ranges:
            if not (start >> page_bits) <= p <= (end >> page_bits):
                continue
            offset_name = "joining_offset_0x%04xu" % start
            print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return joining_table[u - 0x%04Xu + %s];" % (start, end, start, offset_name))
        print(" break;")
        print("")
    print(" default:")
    print(" break;")
    print(" }")
    print(" return X;")
    print("}")
    print()
    for short in short_value.values():
        print("#undef %s" % (short))
    print()

# Code points of the multi-component ligatures that print_shaping_table
# keeps; every other ligature decomposition it reads is ignored.
LIGATURES = (
    0xFC08, 0xFC0E, 0xFC12, 0xFC32, 0xFC3F, 0xFC40, 0xFC41, 0xFC42,
    0xFC44, 0xFC4E, 0xFC5E, 0xFC60, 0xFC61, 0xFC62, 0xFC6A, 0xFC6D, 0xFC6F,
    0xFC70, 0xFC73, 0xFC75, 0xFC86, 0xFC8F, 0xFC91, 0xFC94, 0xFC9C, 0xFC9D,
    0xFC9E, 0xFC9F, 0xFCA1, 0xFCA2, 0xFCA3, 0xFCA4, 0xFCA8, 0xFCAA, 0xFCAC,
    0xFCB0, 0xFCC9, 0xFCCA, 0xFCCB, 0xFCCC, 0xFCCD, 0xFCCE, 0xFCCF, 0xFCD0,
    0xFCD1, 0xFCD2, 0xFCD3, 0xFCD5, 0xFCDA, 0xFCDB, 0xFCDC, 0xFCDD, 0xFD30,
    0xFD88, 0xFEF5, 0xFEF6, 0xFEF7, 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC,
    # PUA entries; their synthetic data lines are appended inside
    # print_shaping_table.  0xF2EE used to be listed twice.
    0xF201, 0xF211, 0xF2EE,
)

# Column order of one shaping_table row.
_SHAPE_ORDER = ('initial', 'medial', 'final', 'isolated')

# Contextual form taken by each component of a ligature, keyed by
# (number of components, form of the ligature itself).
_COMPONENT_FORMS = {
    (2, 'isolated'): ('initial', 'final'),
    (2, 'final'): ('medial', 'final'),
    (2, 'initial'): ('initial', 'medial'),
    (3, 'isolated'): ('initial', 'medial', 'final'),
    (3, 'final'): ('medial', 'medial', 'final'),
    (3, 'initial'): ('initial', 'medial', 'medial'),
}


def _print_ligature_table(set_name, entry_name, table_name, n_components,
                          entry_format, table, names):
    """Print one C ligature table to stdout.

    ``table`` maps a first code point to a list of tuples holding the
    remaining component code points followed by the ligature code point.
    ``entry_format`` formats one such tuple plus the ligature's name, looked
    up in ``names``.  Raises ValueError if ``table`` is empty.
    """
    max_i = max(len(entries) for entries in table.values())
    print()
    print("static const struct %s {" % set_name)
    print(" uint16_t first;")
    print(" struct %s {" % entry_name)
    print(" uint16_t components[%d];" % n_components)
    print(" uint16_t ligature;")
    print(" } ligatures[%d];" % max_i)
    print("} %s[] =" % table_name)
    print("{")
    for first in sorted(table):

        print(" { 0x%04Xu, {" % (first))
        for entry in table[first]:
            print(entry_format % (entry + (names[entry[-1]],)))
        print(" }},")

    print("};")
    print()


def print_shaping_table(f):
    """Print the C shaping table and the three ligature tables to stdout.

    ``f`` must provide ``readlines()`` returning UnicodeData.txt-style
    lines: ';'-separated, field 0 the hexadecimal code point, field 1 the
    name, field 5 the decomposition (``<tag> XXXX ...``).  Three synthetic
    PUA ligature lines are appended to the input.  Only decompositions
    tagged initial, medial, final or isolated are used; lines with fewer
    than six fields are skipped.

    Single-component decompositions feed ``shaping_table``; multi-component
    ones are ligatures and are kept only if listed in ``LIGATURES``.

    Raises Exception for a ligature with an unexpected form or component
    count, KeyError if a ligature component lacks the needed contextual
    form, and ValueError if no shape or no ligature of some kind was found.
    """
    shapes = {}
    ligatures = {}
    names = {}
    lines = f.readlines()
    lines += [
        "F201;PUA ARABIC LIGATURE LELLAH ISOLATED FORM;Lo;0;AL;<isolated> 0644 0644 0647;;;;N;;;;;",
        "F211;PUA ARABIC LIGATURE LAM WITH MEEM WITH JEEM INITIAL FORM;Lo;0;AL;<initial> 0644 0645 062C;;;;N;;;;;",
        "F2EE;PUA ARABIC LIGATURE SHADDA WITH FATHATAN ISOLATED FORM;Lo;0;AL;<isolated> 0020 064B 0651;;;;N;;;;;",
    ]
    for line in lines:

        fields = [x.strip() for x in line.split(';')]
        # Blank or truncated lines have no decomposition field to inspect.
        if len(fields) < 6 or not fields[5].startswith('<'):
            continue

        tag, *components = fields[5].split(' ')
        shape = tag[1:-1]
        if shape not in _SHAPE_ORDER:
            continue

        items = tuple(int(x, 16) for x in components)
        c = int(fields[0], 16)

        if len(items) != 1:
            # Mark ligatures start with space and are in visual order, so we
            # remove the space and reverse the items.
            if items[0] == 0x0020:
                items = items[:0:-1]
                shape = None
            # We only care about a subset of ligatures
            if c not in LIGATURES:
                continue

            # Save ligature
            names[c] = fields[1]
            ligatures.setdefault(items, {})[shape] = c
        else:
            # Save shape; the base letter's name becomes the common prefix
            # of the names of all its contextual forms.
            base = items[0]
            if base not in names:
                names[base] = fields[1]
            else:
                names[base] = os.path.commonprefix([names[base], fields[1]]).strip()
            shapes.setdefault(base, {})[shape] = c

    print()
    print("static const uint16_t shaping_table[][4] =")
    print("{")

    min_u, max_u = min(shapes), max(shapes)
    for u in range(min_u, max_u + 1):
        forms = shapes.get(u, {})
        value = ', '.join("0x%04Xu" % forms.get(shape, 0) for shape in _SHAPE_ORDER)
        print(" {%s}, /* U+%04X %s */" % (value, u, names.get(u, "")))

    print("};")
    print()
    print("#define SHAPING_TABLE_FIRST 0x%04Xu" % min_u)
    print("#define SHAPING_TABLE_LAST 0x%04Xu" % max_u)
    print()

    # Re-key each ligature by the contextual form of its first component.
    ligas_2 = {}
    ligas_3 = {}
    ligas_mark_2 = {}
    for key, by_shape in ligatures.items():
        for shape, c in by_shape.items():
            if len(key) == 3:
                target = ligas_3
            elif len(key) == 2:
                if shape is None:
                    # Mark ligatures are keyed by the raw code points.
                    ligas_mark_2.setdefault(key[0], []).append((key[1], c))
                    continue
                target = ligas_2
            else:
                raise Exception("Unexpected number of ligature components", key)

            component_forms = _COMPONENT_FORMS.get((len(key), shape))
            if component_forms is None:
                raise Exception("Unexpected shape", shape)
            liga = tuple(shapes[u][form] for u, form in zip(key, component_forms))
            target.setdefault(liga[0], []).append(liga[1:] + (c,))

    _print_ligature_table(
        "ligature_set_t", "ligature_pairs_t", "ligature_table", 1,
        " { {0x%04Xu}, 0x%04Xu }, /* %s */", ligas_2, names)
    _print_ligature_table(
        "ligature_mark_set_t", "ligature_pairs_t", "ligature_mark_table", 1,
        " { {0x%04Xu}, 0x%04Xu }, /* %s */", ligas_mark_2, names)
    _print_ligature_table(
        "ligature_3_set_t", "ligature_triplets_t", "ligature_3_table", 2,
        " { {0x%04Xu, 0x%04Xu}, 0x%04Xu}, /* %s */", ligas_3, names)

# Banner: how the table was generated and the header lines captured from the
# input files.
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt")
print(" *")
print(" * on files with these headers:")
print(" *")
for header in headers:
    for header_line in header:
        print(" * %s" % (header_line.strip()))
print(" */")
print()
print("#ifndef HB_OT_SHAPER_ARABIC_TABLE_HH")
print("#define HB_OT_SHAPER_ARABIC_TABLE_HH")
print()

try:
    # Blocks must be loaded first: print_joining_table reads ``blocks``.
    read_blocks(files[2])
    print_joining_table(files[0])
    print_shaping_table(files[1])
finally:
    # The input files are not needed past this point; release them.
    for input_file in files:
        input_file.close()

print()
print("#endif /* HB_OT_SHAPER_ARABIC_TABLE_HH */")
print()
print("/* == End of generated table == */")