#!/usr/bin/env python3

"""Generate the Arabic joining/shaping C tables from Unicode data files.

Takes three positional arguments (ArabicShaping.txt, UnicodeData.txt and
Blocks.txt, in that order) and writes the generated table to stdout.
"""

import io
import os.path
import sys

if len(sys.argv) != 4:
    print("""usage: ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
""", file=sys.stderr)
    sys.exit(1)

# files[0] = ArabicShaping.txt, files[1] = UnicodeData.txt, files[2] = Blocks.txt
files = [io.open(x, encoding='utf-8') for x in sys.argv[1:]]

# The first two lines of ArabicShaping.txt and of Blocks.txt are kept so they
# can be echoed into the generated file's banner comment.
headers = [
    [files[0].readline(), files[0].readline()],
    [files[2].readline(), files[2].readline()],
]
headers.append(["UnicodeData.txt does not have a header."])

# Skip the rest of the ArabicShaping.txt preamble, up to and including the
# '#####...' separator line.  readline() returns '' at end of file, so stop
# there with an error instead of spinning forever when the separator is absent.
for line in iter(files[0].readline, ''):
    if '##################' in line:
        break
else:
    sys.exit("%s: no '##################' separator line found" % sys.argv[1])

# Maps every code point covered by Blocks.txt to the name of its block.
blocks = {}


def read_blocks(f):
    """Fill the module-level ``blocks`` dict from an iterable of text lines.

    Each data line has the form ``START..END; Block Name`` (or a single code
    point instead of a range).  Anything after a ``#`` is treated as a comment;
    lines without a ``;`` separator are skipped.

    Args:
        f: iterable of lines, e.g. an open Blocks.txt file.

    Raises:
        ValueError: if a range bound is not a hexadecimal number.
    """
    for line in f:
        # Drop a trailing comment, if any.
        line = line.partition('#')[0]

        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            # Blank or comment-only line: nothing to record.
            continue

        bounds = fields[0].split('..')
        start = int(bounds[0], 16)
        end = start if len(bounds) == 1 else int(bounds[1], 16)

        name = fields[1]
        for u in range(start, end + 1):
            blocks[u] = name

def _parse_joining_values(f):
    """Return {code point: JOINING_* macro name} for the data lines in *f*."""
    values = {}
    for line in f:
        if line.startswith('#'):
            continue

        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            continue

        # Two joining groups get their own value; everything else is
        # classified by its joining type field.
        if fields[3] in ("ALAPH", "DALATH RISH"):
            value = "JOINING_GROUP_" + fields[3].replace(' ', '_')
        else:
            value = "JOINING_TYPE_" + fields[2]
        values[int(fields[0], 16)] = value
    return values


def _coalesce_ranges(codepoints, max_gap):
    """Merge sorted code points into [start, end] ranges, bridging gaps <= max_gap."""
    ranges = []
    for u in codepoints:
        if ranges and u - ranges[-1][1] <= max_gap:
            ranges[-1][1] = u
        else:
            ranges.append([u, u])
    return ranges


def print_joining_table(f):
    """Print the C ``joining_table`` array and ``joining_type()`` lookup function.

    Reads joining data from *f* (the remainder of ArabicShaping.txt after its
    header), groups the listed code points into ranges, and writes to stdout:
    short ``#define`` aliases for the joining values, the flattened table with
    one ``joining_offset_0x....u`` macro per range, a ``joining_type``
    function that switches on the code point's page, and matching ``#undef``s.

    Relies on the module-level ``blocks`` mapping for the block-name comments.

    Args:
        f: iterable of ArabicShaping.txt data lines.

    Raises:
        KeyError: if a listed code point is missing from ``blocks``.
        AssertionError: if two joining values abbreviate to the same short name.
    """
    values = _parse_joining_values(f)

    # Abbreviate each value to the initials of the words after the
    # JOINING_TYPE_ / JOINING_GROUP_ prefix; X (the filler value) always exists.
    short_value = {}
    for value in sorted(set(values.values()) | {'JOINING_TYPE_X'}):
        short = ''.join(x[0] for x in value.split('_')[2:])
        assert short not in short_value.values()
        short_value[value] = short

    print()
    for value, short in short_value.items():
        print("#define %s %s" % (short, value))

    uu = sorted(values)
    num = len(values)
    all_blocks = {blocks[u] for u in uu}

    # Code points at most 1+16*5 apart share one range; the holes in between
    # are filled with JOINING_TYPE_X entries.
    ranges = _coalesce_ranges(uu, 1 + 16 * 5)

    print()
    print("static const uint8_t joining_table[] =")
    print("{")
    last_block = None
    offset = 0
    for start, end in ranges:

        print()
        print("#define joining_offset_0x%04xu %d" % (start, offset))

        for u in range(start, end + 1):

            block = blocks.get(u, last_block)
            value = values.get(u, "JOINING_TYPE_X")

            if block != last_block or u == start:
                if u != start:
                    print()
                if block in all_blocks:
                    print("\n /* %s */" % block)
                else:
                    print("\n /* FILLER */")
                last_block = block
                if u % 32 != 0:
                    # Section starts mid-row: emit the row label and pad up
                    # to this code point's column.
                    print()
                    print(" /* %04X */" % (u // 32 * 32), " " * (u % 32), end="")

            if u % 32 == 0:
                print()
                print(" /* %04X */ " % u, end="")
            print("%s," % short_value[value], end="")
        print()

        offset += end - start + 1
    print()
    # No data lines means an empty table; avoid dividing by zero.
    occupancy = num * 100. / offset if offset else 0
    print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    print()

    page_bits = 12
    print()
    print("static unsigned int")
    print("joining_type (hb_codepoint_t u)")
    print("{")
    print(" switch (u >> %d)" % page_bits)
    print(" {")
    # Every page a range touches needs a case label -- including pages that
    # lie strictly between the range's first and last page.
    pages = set()
    for start, end in ranges:
        pages.update(range(start >> page_bits, (end >> page_bits) + 1))
    for p in sorted(pages):
        print(" case 0x%0Xu:" % p)
        for start, end in ranges:
            if not (start >> page_bits) <= p <= (end >> page_bits):
                continue
            offset_name = "joining_offset_0x%04xu" % start
            print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return joining_table[u - 0x%04Xu + %s];" % (start, end, start, offset_name))
        print(" break;")
        print("")
    print(" default:")
    print(" break;")
    print(" }")
    print(" return X;")
    print("}")
    print()
    for short in short_value.values():
        print("#undef %s" % (short))
    print()

def print_shaping_table(f):
    """Print the C ``shaping_table`` and ``ligature_table`` arrays.

    Scans UnicodeData.txt-style lines in *f* for characters whose
    decomposition field (field 5) is tagged ``<initial>``, ``<medial>``,
    ``<final>`` or ``<isolated>``.  Single-character decompositions populate
    the per-letter shaping table; two-character decompositions are kept only
    for the lam-alef ligatures and populate the ligature table.  Output goes
    to stdout.

    Args:
        f: iterable of ``;``-separated UnicodeData.txt lines.

    Raises:
        ValueError: if no shaping forms or no ligatures are found (``min`` /
            ``max`` of an empty collection).
        KeyError: if a ligature's component lacks the contextual form needed
            to key it.
        Exception: if a ligature has a shape other than isolated or final.
    """
    shapes = {}
    ligatures = {}
    names = {}
    for line in f:

        fields = [x.strip() for x in line.split(';')]
        # Blank or truncated lines have no decomposition field at all.
        if len(fields) < 6 or not fields[5].startswith('<'):
            continue

        tag, *operands = fields[5].split(' ')
        shape = tag[1:-1]
        if shape not in ('initial', 'medial', 'isolated', 'final'):
            continue
        items = tuple(int(x, 16) for x in operands)

        c = int(fields[0], 16)
        if len(items) != 1:
            # We only care about lam-alef ligatures
            if len(items) != 2 or items[0] != 0x0644 or items[1] not in (0x0622, 0x0623, 0x0625, 0x0627):
                continue

            # Save ligature
            names[c] = fields[1]
            ligatures.setdefault(items, {})[shape] = c
        else:
            # Save shape.  The base letter's display name is the common
            # prefix of the names of all its presentation forms.
            base = items[0]
            if base not in names:
                names[base] = fields[1]
            else:
                names[base] = os.path.commonprefix([names[base], fields[1]]).strip()
            shapes.setdefault(base, {})[shape] = c

    print()
    print("static const uint16_t shaping_table[][4] =")
    print("{")

    # One row per code point between the lowest and highest shaped letter;
    # missing letters or forms are emitted as 0.
    min_u, max_u = min(shapes), max(shapes)
    for u in range(min_u, max_u + 1):
        forms = shapes.get(u, {})
        value = ', '.join("0x%04Xu" % forms.get(shape, 0)
                          for shape in ('initial', 'medial', 'final', 'isolated'))
        print(" {%s}, /* U+%04X %s */" % (value, u, names.get(u, "")))

    print("};")
    print()
    print("#define SHAPING_TABLE_FIRST 0x%04Xu" % min_u)
    print("#define SHAPING_TABLE_LAST 0x%04Xu" % max_u)
    print()

    # Key each ligature by the contextual forms of its two components: an
    # isolated ligature is (initial first, final second), a final ligature
    # is (medial first, final second).
    ligas = {}
    for pair, forms in ligatures.items():
        for shape, c in forms.items():
            if shape == 'isolated':
                liga = (shapes[pair[0]]['initial'], shapes[pair[1]]['final'])
            elif shape == 'final':
                liga = (shapes[pair[0]]['medial'], shapes[pair[1]]['final'])
            else:
                raise Exception("Unexpected shape", shape)
            ligas.setdefault(liga[0], []).append((liga[1], c))
    max_i = max(len(ligas[l]) for l in ligas)
    print()
    print("static const struct ligature_set_t {")
    print(" uint16_t first;")
    print(" struct ligature_pairs_t {")
    print(" uint16_t second;")
    print(" uint16_t ligature;")
    print(" } ligatures[%d];" % max_i)
    print("} ligature_table[] =")
    print("{")
    for first in sorted(ligas.keys()):

        print(" { 0x%04Xu, {" % (first))
        for liga in ligas[first]:
            print(" { 0x%04Xu, 0x%04Xu }, /* %s */" % (liga[0], liga[1], names[liga[1]]))
        print(" }},")

    print("};")
    print()


# Emit the generated file: banner comment (including the captured input-file
# headers), include guard, the tables themselves, and the closing markers.
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt")
print(" *")
print(" * on files with these headers:")
print(" *")
for h in headers:
    for l in h:
        print(" * %s" % (l.strip()))
print(" */")
print()
print("#ifndef HB_OT_SHAPE_COMPLEX_ARABIC_TABLE_HH")
print("#define HB_OT_SHAPE_COMPLEX_ARABIC_TABLE_HH")
print()

try:
    # Blocks must be loaded first: the joining table looks up block names.
    read_blocks(files[2])
    print_joining_table(files[0])
    print_shaping_table(files[1])
finally:
    # The input files are opened at module level; release them once the
    # tables have been produced (or generation failed).
    for input_file in files:
        input_file.close()

print()
print("#endif /* HB_OT_SHAPE_COMPLEX_ARABIC_TABLE_HH */")
print()
print("/* == End of generated table == */")