blob: e65b9814adf6b7222a508ff3cfa486aa36cb66a4 [file] [log] [blame]
Ebrahim Byagowicab2c2c2018-03-29 12:48:47 +04301#!/usr/bin/env python
2
3from __future__ import print_function, division, absolute_import
Behdad Esfahbodb9ddbd52011-06-02 17:43:12 -04004
Ebrahim Byagowi80395f12018-03-29 22:00:41 +04305import io, sys
Behdad Esfahbodb9ddbd52011-06-02 17:43:12 -04006
# The script needs exactly three positional arguments: the three data files
# named in the usage message.  Anything else is a usage error.
if len(sys.argv[1:]) != 3:
    print("usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt", file=sys.stderr)
    sys.exit(1)
10
# Individual code points that are always kept, whatever block they are in.
# They are later moved out of the main table and handled one by one.
ALLOWED_SINGLES = [
    0x00A0,  # NO-BREAK SPACE
    0x25CC,  # DOTTED CIRCLE
]

# Block names (third input file) whose code points are kept in the table.
# Code points in any other block are dropped unless listed above.
ALLOWED_BLOCKS = [
    'Basic Latin', 'Latin-1 Supplement',
    'Devanagari', 'Bengali', 'Gurmukhi', 'Gujarati', 'Oriya',
    'Tamil', 'Telugu', 'Kannada', 'Malayalam', 'Sinhala',
    'Myanmar', 'Khmer',
    'Vedic Extensions',
    'General Punctuation', 'Superscripts and Subscripts',
    'Devanagari Extended',
    'Myanmar Extended-B', 'Myanmar Extended-A',
]
Behdad Esfahbod171f9702014-06-20 15:25:30 -040034
# Open the three input files given on the command line, in argument order.
files = [io.open(x, encoding='utf-8') for x in sys.argv[1:]]

try:
    # The first two lines of each file are kept verbatim; they are echoed
    # into the generated output so it records which data it was built from.
    headers = [[f.readline() for i in range(2)] for f in files]

    # Per input file:
    #   data[i]   maps code point -> property value (second field of the line)
    #   values[i] maps property value -> number of code points carrying it
    data = [{} for f in files]
    values = [{} for f in files]
    for i, f in enumerate(files):
        for line in f:
            # Strip a trailing '#' comment, if any.
            line = line.split('#', 1)[0]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                # No ';' separator: blank or comment-only line.
                continue

            # First field is either a single hex code point or "start..end".
            bounds = fields[0].split('..')
            start = int(bounds[0], 16)
            if len(bounds) == 1:
                end = start
            else:
                end = int(bounds[1], 16)

            t = fields[1]

            for u in range(start, end + 1):
                data[i][u] = t
            values[i][t] = values[i].get(t, 0) + end - start + 1
finally:
    # Everything needed has been read into memory; release the handles
    # instead of leaving them open for the rest of the run.
    for f in files:
        f.close()
Behdad Esfahbodb9ddbd52011-06-02 17:43:12 -040064
# Merge data into one dict:
# each surviving code point maps to a three-element list, one slot per input
# file, pre-filled with that file's default value.
defaults = ('Other', 'Not_Applicable', 'No_Block')
for column, fallback in enumerate(defaults):
    # Make sure every default value is known to the per-file value tables.
    values[column][fallback] = values[column].get(fallback, 0) + 1

combined = {}
for column, table in enumerate(data):
    for cp, value in table.items():
        if cp not in combined:
            # The third file only annotates code points already introduced
            # by the first two; it never adds new entries on its own.
            if column == 2:
                continue
            combined[cp] = list(defaults)
        combined[cp][column] = value

# Keep only the explicitly allowed singles and the allowed blocks.
data = {cp: row for cp, row in combined.items()
        if cp in ALLOWED_SINGLES or row[2] in ALLOWED_BLOCKS}
del combined
num = len(data)

# Force these code points to Vowel_Dependent when the input leaves them at
# the default syllabic category.
for cp in (0x17CD, 0x17CE, 0x17CF, 0x17D0, 0x17D3):
    if data[cp][0] == 'Other':
        data[cp][0] = "Vowel_Dependent"

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
singles = {}
for cp in ALLOWED_SINGLES:
    singles[cp] = data.pop(cp)
91
# Opening banner of the generated file: how it was produced, followed by the
# two header lines captured from each input file.
banner = (
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt",
    " *",
    " * on files with these headers:",
    " *",
)
for text in banner:
    print(text)
for file_header in headers:
    for header_line in file_header:
        print(" * %s" % (header_line.strip()))
print(" */")
print()
print('#include "hb-ot-shape-complex-indic.hh"')
print()
Behdad Esfahbodb9ddbd52011-06-02 17:43:12 -0400107
# Shorten values
# Hand-picked aliases for some property values, to make them more readable
# and to avoid duplicates.  Index 0 is the first input file's property,
# index 1 the second's.  The loop below adds an alias for every other value.
short = [{
    "Bindu": 'Bi',
    "Cantillation_Mark": 'Ca',
    "Joiner": 'ZWJ',
    "Non_Joiner": 'ZWNJ',
    "Number": 'Nd',
    "Visarga": 'Vs',
    "Vowel": 'Vo',
    "Vowel_Dependent": 'M',
    "Consonant_Prefixed": 'CPrf',
    "Other": 'x',
}, {
    "Not_Applicable": 'x',
}]

# Reverse maps (alias -> full value name), used to detect alias clashes.
all_shorts = [{alias: name for name, alias in table.items()} for table in short]

what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
for i in range(2):
    print()
    aliases = short[i]
    taken = all_shorts[i]
    for v in sorted(values[i]):
        if v in aliases:
            s = aliases[v]
        else:
            # Derive an alias from the ASCII capital letters of the name,
            # after collapsing "_And_" so its 'A' does not contribute.
            s = ''.join(c for c in v.replace('_And_', '_') if 'A' <= c <= 'Z')
            if s in taken:
                raise Exception("Duplicate short value alias", v, taken[s])
            taken[s] = v
            aliases[v] = s
        padding = ' ' * ((48-1 - len(what[i]) - 1 - len(v)) // 8)
        print("#define %s_%s %s_%s %s/* %3d chars; %s */" %
              (what_short[i], s, what[i], v.upper(),
               padding,
               values[i][v], v))
print()
print("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print()
print()
Behdad Esfahbodb9ddbd52011-06-02 17:43:12 -0400155
# Running statistics maintained by print_block():
#   total      - number of table slots emitted so far
#   used       - how many of those slots had an entry in the data
#   last_block - name of the most recent named block that was printed
total = 0
used = 0
last_block = None


def print_block(block, start, end, data):
    """Print the table entries for code points start..end (inclusive).

    block -- block name for the heading comment, or None/empty for none;
             the heading is only printed when the name differs from the
             previously printed one.
    start -- first code point; must be a multiple of 8.
    end   -- last code point; end + 1 must be a multiple of 8.
    data  -- mapping from code point to its list of category values; code
             points missing from it are emitted with the module defaults.

    Eight entries go on each output row, each row starting with a comment
    giving its first code point.  Updates the module-level total, used and
    last_block.
    """
    global total, used, last_block

    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)

    assert start % 8 == 0
    assert (end+1) % 8 == 0

    occupied = 0
    for cp in range(start, end + 1):
        if cp % 8 == 0:
            # Start a new output row.
            print()
            print(" /* %04X */" % cp, end="")
        if cp in data:
            occupied += 1
        cats = data.get(cp, defaults)
        entry = "_(%s,%s)," % (short[0][cats[0]], short[1][cats[1]])
        print("%9s" % entry, end="")

    total += end - start + 1
    used += occupied
    if block:
        last_block = block
Behdad Esfahbodb9ddbd52011-06-02 17:43:12 -0400181
# Emit the body of indic_table[]: walk the code points in ascending order and
# print them as 8-aligned runs.  Nearby runs are joined by padding the gap
# with default entries; distant runs start a new range, whose position
# inside the array is published as an indic_offset_0x.... macro.
uu = sorted(data.keys())

last = -100000   # last code point already printed (sentinel: nothing yet)
num = 0
offset = 0       # number of table items emitted by all closed ranges
starts = []      # first code point of each range
ends = []        # one past the last code point of each range
print("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
for u in uu:
    if u <= last:
        # Already covered by a previously printed run.
        continue
    block = data[u][2]

    # Round down to a multiple of 8, then extend while consecutive code
    # points exist and stay in the same block.  Membership is tested on the
    # dict: uu holds exactly data's keys, and scanning the list here made
    # the loop quadratic.
    start = u//8*8
    end = start+1
    while end in data and block == data[end][2]:
        end += 1
    # Round up so the run ends on the last slot of an 8-entry row.
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: fill it with default entries and keep the range going.
            print_block(None, last+1, start-1, data)
            last = start-1
        else:
            # Large gap: close the current range (if any) and open a new one.
            if last >= 0:
                ends.append(last + 1)
                offset += ends[-1] - starts[-1]
            print()
            print()
            print("#define indic_offset_0x%04xu %d" % (start, offset))
            starts.append(start)

    print_block(block, start, end, data)
    last = end
# Close the final range.
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
# Percentage of emitted slots that hold real data; checked at the end of the
# script.
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
# Emit hb_indic_get_categories(): a switch on the code point's page
# (u >> page_bits) whose cases test the single code points and the table
# ranges that touch that page.
print()
for text in ("INDIC_TABLE_ELEMENT_TYPE",
             "hb_indic_get_categories (hb_codepoint_t u)",
             "{"):
    print(text)
print(" switch (u >> %d)" % page_bits)
print(" {")
pages = {cp >> page_bits for cp in starts + ends + list(singles)}
for p in sorted(pages):
    print(" case 0x%0Xu:" % p)
    for cp, cats in singles.items():
        if cp >> page_bits == p:
            print(" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (cp, short[0][cats[0]], short[1][cats[1]]))
    for first, limit in zip(starts, ends):
        # NOTE(review): limit is one past the range's last code point, so a
        # range ending exactly on a page boundary is also listed under the
        # following page; confirm that this is intended.
        if p == first >> page_bits or p == limit >> page_bits:
            offset_macro = "indic_offset_0x%04xu" % first
            print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (first, limit-1, first, offset_macro))
    print(" break;")
    print("")
for text in (" default:",
             " break;",
             " }",
             " return _(x,x);",
             "}"):
    print(text)
print()
print("#undef _")
# Undefine every ISC_* / IMC_* alias that was #define'd earlier in the output.
for i in range(2):
    # Blank line before each group.  This was a bare `print`, which is a
    # no-op expression once print is a function, so no line was emitted.
    print()
    for v in sorted(values[i]):
        print("#undef %s_%s" %
              (what_short[i], short[i][v]))
print()
print("/* == End of generated table == */")

# Maintain at least 30% occupancy in the table
if occupancy < 30:
    raise Exception("Table too sparse, please investigate: ", occupancy)