Ebrahim Byagowi | cab2c2c | 2018-03-29 12:48:47 +0430 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | |
| 3 | from __future__ import print_function, division, absolute_import |
Behdad Esfahbod | b9ddbd5 | 2011-06-02 17:43:12 -0400 | [diff] [blame] | 4 | |
Ebrahim Byagowi | 80395f1 | 2018-03-29 22:00:41 +0430 | [diff] [blame] | 5 | import io, sys |
Behdad Esfahbod | b9ddbd5 | 2011-06-02 17:43:12 -0400 | [diff] [blame] | 6 | |
# The script needs exactly three positional arguments (the data files named
# in the usage text); with any other count, report usage on stderr and exit
# with status 1.
if len(sys.argv) != 4:
    print(
        "usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt",
        file=sys.stderr,
    )
    sys.exit(1)
| 10 | |
# Individual code points that are always kept in the merged data, whatever
# block they belong to.
ALLOWED_SINGLES = [0x00A0, 0x25CC]

# Names of the blocks (as read from the third input file) whose code points
# are kept in the merged data; everything else is filtered out.
ALLOWED_BLOCKS = [
    'Basic Latin', 'Latin-1 Supplement',
    'Devanagari', 'Bengali', 'Gurmukhi', 'Gujarati', 'Oriya',
    'Tamil', 'Telugu', 'Kannada', 'Malayalam', 'Sinhala',
    'Myanmar', 'Khmer',
    'Vedic Extensions',
    'General Punctuation',
    'Superscripts and Subscripts',
    'Devanagari Extended',
    'Myanmar Extended-B',
    'Myanmar Extended-A',
]
Behdad Esfahbod | 171f970 | 2014-06-20 15:25:30 -0400 | [diff] [blame] | 34 | |
# Read the three input files given on the command line.
#
# After this block:
#   headers[i] - the first two lines of file i (echoed later in the banner)
#   data[i]    - dict mapping code point -> property value for file i
#   values[i]  - dict mapping property value -> number of code points that
#                were assigned that value in file i
#
# The files are closed as soon as parsing finishes (or fails), rather than
# being left open for the remainder of the run.
files = []
try:
    for x in sys.argv[1:]:
        files.append(io.open(x, encoding='utf-8'))

    headers = [[f.readline() for i in range(2)] for f in files]

    data = [{} for f in files]
    values = [{} for f in files]
    for i, f in enumerate(files):
        for line in f:
            # Drop a trailing '#' comment, if any.
            j = line.find('#')
            if j >= 0:
                line = line[:j]

            # A data line is "<code point or range>; <value>"; lines without
            # a ';' (blank or comment-only) carry no data.
            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                continue

            # The first field is either "XXXX" or "XXXX..YYYY" in hex.
            uu = fields[0].split('..')
            start = int(uu[0], 16)
            if len(uu) == 1:
                end = start
            else:
                end = int(uu[1], 16)

            t = fields[1]

            for u in range(start, end + 1):
                data[i][u] = t
            values[i][t] = values[i].get(t, 0) + end - start + 1
finally:
    for f in files:
        f.close()
Behdad Esfahbod | b9ddbd5 | 2011-06-02 17:43:12 -0400 | [diff] [blame] | 64 | |
# Merge data into one dict:
#   code point -> [syllabic category, positional category, block name]
# Slots that a code point has no entry for keep the matching default.
defaults = ('Other', 'Not_Applicable', 'No_Block')

# Each default value is counted once in its own tally.
for i, v in enumerate(defaults):
    values[i][v] = values[i].get(v, 0) + 1

combined = {}
for i, d in enumerate(data):
    for u, v in d.items():
        if u not in combined:
            # The block file (index 2) only annotates code points already
            # introduced by one of the first two files.
            if i == 2:
                continue
            combined[u] = list(defaults)
        combined[u][i] = v

# Keep only the explicitly allowed single code points and allowed blocks.
combined = {
    k: v
    for k, v in combined.items()
    if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS
}
data = combined
del combined
num = len(data)
| 81 | |
# Override: these code points are emitted as Vowel_Dependent whenever the
# input data left their syllabic category at the default 'Other'.
for u in (0x17CD, 0x17CE, 0x17CF, 0x17D0, 0x17D3):
    entry = data[u]
    if entry[0] == 'Other':
        entry[0] = "Vowel_Dependent"

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out of the main map;
# they are handled separately from the contiguous table ranges.
singles = {u: data.pop(u) for u in ALLOWED_SINGLES}
| 91 | |
# Emit the banner comment of the generated file: how it was produced,
# followed by the two header lines captured from each input file.
for banner_line in (
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt",
    " *",
    " * on files with these headers:",
    " *",
):
    print(banner_line)
for h in headers:
    for l in h:
        print(" * %s" % (l.strip()))
print(" */")
print()
print('#include "hb-ot-shape-complex-indic.hh"')
print()
Behdad Esfahbod | b9ddbd5 | 2011-06-02 17:43:12 -0400 | [diff] [blame] | 107 | |
# Shorten values
#
# short[0] holds hand-picked aliases for syllabic-category names and
# short[1] for positional-category names.  Names not listed here get an
# alias derived automatically when the #define lines are generated.
short = [
    {
        "Bindu": 'Bi',
        "Cantillation_Mark": 'Ca',
        "Joiner": 'ZWJ',
        "Non_Joiner": 'ZWNJ',
        "Number": 'Nd',
        "Visarga": 'Vs',
        "Vowel": 'Vo',
        "Vowel_Dependent": 'M',
        "Consonant_Prefixed": 'CPrf',
        "Other": 'x',
    },
    {
        "Not_Applicable": 'x',
    },
]

# Reverse maps (alias -> full name), seeded with the hand-picked aliases so
# they stay readable and so a later automatically derived alias that collides
# with one of them can be detected as a duplicate.
all_shorts = [
    {alias: name for name, alias in table.items()}
    for table in short
]
Behdad Esfahbod | b9ddbd5 | 2011-06-02 17:43:12 -0400 | [diff] [blame] | 131 | |
# Emit one "#define <PREFIX>_<alias> <FULL_NAME>" line per category value,
# first for the syllabic categories, then for the positional ones.
what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
for i in range(2):
    print()
    for v in sorted(values[i].keys()):
        if v in short[i]:
            s = short[i][v]
        else:
            # No hand-picked alias: use the capital letters of the name,
            # after folding "_And_" into "_" so the 'A' of "And" is dropped.
            s = ''.join(c for c in v.replace('_And_', '_') if 'A' <= c <= 'Z')
            if s in all_shorts[i]:
                raise Exception("Duplicate short value alias", v, all_shorts[i][s])
            all_shorts[i][s] = v
            short[i][v] = s
        # Padding that lines up the trailing comment column.
        padding = ' ' * ((48-1 - len(what[i]) - 1 - len(v)) // 8)
        print("#define %s_%s %s_%s %s/* %3d chars; %s */" %
              (what_short[i], s, what[i], v.upper(), padding, values[i][v], v))
print()
print("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print()
print()
Behdad Esfahbod | b9ddbd5 | 2011-06-02 17:43:12 -0400 | [diff] [blame] | 155 | |
# Running statistics over everything print_block has emitted so far.
total = 0          # number of table slots written
used = 0           # number of those slots that had an entry in the data
last_block = None  # name of the most recently announced block


def print_block(block, start, end, data):
    """Print the table entries for code points start..end (inclusive).

    block -- block name to announce in a comment, or a false value to print
             entries without a heading (and without changing last_block).
    start -- first code point; must be a multiple of 8.
    end   -- last code point; end + 1 must be a multiple of 8.
    data  -- mapping of code point to its category list; code points that
             are missing are printed using the module-level `defaults`.

    Updates the module-level `total`, `used` and `last_block`.
    """
    global total, used, last_block

    # Announce the block only when it differs from the previous one.
    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)

    # Rows hold eight entries, so the range must cover whole rows.
    assert start % 8 == 0
    assert (end+1) % 8 == 0

    present = 0
    for u in range(start, end+1):
        if u % 8 == 0:
            # Start a new row, labelled with its first code point.
            print()
            print(" /* %04X */" % u, end="")
        if u in data:
            present += 1
        cats = data.get(u, defaults)
        cell = "_(%s,%s)," % (short[0][cats[0]], short[1][cats[1]])
        print("%9s" % cell, end="")

    total += end - start + 1
    used += present
    if block:
        last_block = block
Behdad Esfahbod | b9ddbd5 | 2011-06-02 17:43:12 -0400 | [diff] [blame] | 181 | |
# Emit the body of indic_table[]: walk the code points in ascending order,
# printing whole rows of eight via print_block.  While doing so, record the
# contiguous ranges written (starts[k] .. ends[k]-1) and emit one
# "indic_offset_0x...." macro per range giving its index into the table.
uu = sorted(data.keys())

last = -100000  # last code point already written; sentinel = nothing yet
num = 0
offset = 0      # number of table items contained in the ranges closed so far
starts = []
ends = []
print("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
for u in uu:
    if u <= last:
        # Already covered by a previously printed row.
        continue
    block = data[u][2]

    # Extend from the start of u's row over the consecutive code points that
    # are present and belong to the same block, then round up to a row end.
    # Membership is tested against the dict itself (same keys as `uu`) so
    # each check is a hash lookup rather than a scan of the sorted list.
    start = u//8*8
    end = start+1
    while end in data and block == data[end][2]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: pad it with default entries so the current range
            # stays contiguous.
            print_block(None, last+1, start-1, data)
            last = start-1
        else:
            # Large gap: close the current range (if there is one) and begin
            # a new one, with its own offset macro.
            if last >= 0:
                ends.append(last + 1)
                offset += ends[-1] - starts[-1]
            print()
            print()
            print("#define indic_offset_0x%04xu %d" % (start, offset))
            starts.append(start)

    print_block(block, start, end, data)
    last = end

# Close the final range.
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
# Close the table and emit hb_indic_get_categories(), which switches on the
# code point's page (u >> page_bits) and then checks the individual code
# points and table ranges that touch that page.
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print()
print("INDIC_TABLE_ELEMENT_TYPE")
print("hb_indic_get_categories (hb_codepoint_t u)")
print("{")
print(" switch (u >> %d)" % page_bits)
print(" {")

# Every page touched by a range boundary or by one of the single code points.
pages = {u >> page_bits for u in starts + ends + list(singles.keys())}
for p in sorted(pages):
    print(" case 0x%0Xu:" % p)
    for u, d in singles.items():
        if u >> page_bits == p:
            print(" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
    for start, end in zip(starts, ends):
        # A range is listed under the page of either of its boundaries.
        if p in (start >> page_bits, end >> page_bits):
            offset = "indic_offset_0x%04xu" % start
            print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    print(" break;")
    print("")

print(" default:")
print(" break;")
print(" }")
print(" return _(x,x);")
print("}")
print()
print("#undef _")
# Undefine every alias macro that was defined earlier, one group per
# category kind, then close the generated section.
for i in range(2):
    # With print_function in effect a bare `print` is just a name lookup and
    # outputs nothing; it has to be called to emit the blank line that
    # separates the groups.
    print()
    vv = sorted(values[i].keys())
    for v in vv:
        print("#undef %s_%s" %
              (what_short[i], short[i][v]))
print()
print("/* == End of generated table == */")
Behdad Esfahbod | cdc8b49 | 2012-03-07 12:08:33 -0500 | [diff] [blame] | 256 | |
# Maintain at least 30% occupancy in the table: fail loudly if the generated
# table is too sparse.
if occupancy < 30:
    raise Exception("Table too sparse, please investigate: ", occupancy)