Behdad Esfahbod | e2c9511 | 2015-07-20 11:32:48 +0100 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | |
| 3 | import sys |
| 4 | |
# The generator takes exactly four data files, in the order named in the
# usage message below.
if len (sys.argv) != 5:
    # sys.stderr.write() emits the same bytes as the Python 2-only
    # "print >>sys.stderr, ..." statement and also works on Python 3.
    sys.stderr.write ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt\n")
    sys.exit (1)

# Code points whose block (value taken from the fourth input file) is listed
# here are dropped from the merged data before the table is generated.
BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
| 10 | |
# Read the four input files named on the command line.  After this block:
#   headers   -- the first two lines of every file except the third, followed
#                by a fixed note for the third file (UnicodeData.txt per the
#                usage message), whose lines are all parsed as data
#   data[i]   -- dict: code point -> property value read from the i-th file
#   values[i] -- dict: property value -> number of code points carrying it
#
# open() replaces the Python 2-only file() builtin, and the try/finally makes
# sure every handle is closed once parsing is done.
files = [open (x) for x in sys.argv[1:]]
try:
    headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
    headers.append (["UnicodeData.txt does not have a header."])

    data = [{} for f in files]
    values = [{} for f in files]
    for i, f in enumerate (files):
        for line in f:

            # Drop a trailing '#' comment, if any.
            j = line.find ('#')
            if j >= 0:
                line = line[:j]

            # Lines without any ';' separator carry no data.
            fields = [x.strip () for x in line.split (';')]
            if len (fields) == 1:
                continue

            # First field is either "XXXX" or an inclusive range "XXXX..YYYY".
            uu = fields[0].split ('..')
            start = int (uu[0], 16)
            if len (uu) == 1:
                end = start
            else:
                end = int (uu[1], 16)

            # The property value is field 1, except for the third file
            # where it is field 2.
            t = fields[1 if i != 2 else 2]

            for u in range (start, end + 1):
                data[i][u] = t
            values[i][t] = values[i].get (t, 0) + end - start + 1
finally:
    for f in files:
        f.close ()
| 41 | |
# Per-file default property value, indexed like the input files.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for cp in [0x034F, 0x2060] + list (range (0xFE00, 0xFE0F + 1)):
    data[0][cp] = defaults[0]

# Count each default value once in the per-file value tallies.
for idx, default in enumerate (defaults):
    values[idx][default] = values[idx].get (default, 0) + 1

# Merge data into one dict: code point -> [one value per input file].
# Only the first two files may introduce a code point; the remaining files
# merely fill in their column for code points that are already present.
merged = {}
for idx, table in enumerate (data):
    for cp, prop in table.items ():
        if cp not in merged:
            if idx >= 2:
                continue
            merged[cp] = list (defaults)
        merged[cp][idx] = prop

# Drop every code point that lives in a blacklisted block.
data = dict ((cp, props) for cp, props in merged.items ()
             if props[3] not in BLACKLISTED_BLOCKS)
del merged
num = len (data)
| 65 | |
Behdad Esfahbod | 20e246e | 2015-07-20 15:56:19 +0100 | [diff] [blame] | 66 | |
# Every property value name the generator knows about.  Each name later
# becomes a module-level PropertyValue constant.
property_names = (
    # General_Category
    """
    Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc
    Me Mn Nd Nl No Pc Pd Pe Pf Pi Po
    Ps Sc Sk Sm So Zl Zp Zs
    """.split ()
    # Indic_Syllabic_Category
    + """
    Other
    Bindu
    Visarga
    Avagraha
    Nukta
    Virama
    Pure_Killer
    Invisible_Stacker
    Vowel_Independent
    Vowel_Dependent
    Vowel
    Consonant_Placeholder
    Consonant
    Consonant_Dead
    Consonant_With_Stacker
    Consonant_Prefixed
    Consonant_Preceding_Repha
    Consonant_Succeeding_Repha
    Consonant_Subjoined
    Consonant_Medial
    Consonant_Final
    Consonant_Head_Letter
    Modifying_Letter
    Tone_Letter
    Tone_Mark
    Gemination_Mark
    Cantillation_Mark
    Register_Shifter
    Syllable_Modifier
    Consonant_Killer
    Non_Joiner
    Joiner
    Number_Joiner
    Number
    Brahmi_Joining_Number
    """.split ()
    # Indic_Positional_Category
    + """
    Not_Applicable
    Right
    Left
    Visual_Order_Left
    Left_And_Right
    Top
    Bottom
    Top_And_Bottom
    Top_And_Right
    Top_And_Left
    Top_And_Left_And_Right
    Bottom_And_Right
    Top_And_Bottom_And_Right
    Overstruck
    """.split ()
)
| 124 | |
class PropertyValue(object):
    """A named Unicode property value that compares equal to its own name.

    Instances compare equal to other PropertyValue objects with the same
    name and to the plain string holding that name, so values read from
    the data files (strings) can be tested against the module-level
    constants directly.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        # A PropertyValue contributes its .name; anything else (normally a
        # string) is compared as-is.  This avoids the Python 2-only
        # `basestring` and no longer raises AttributeError for objects
        # that are neither strings nor PropertyValues.
        return self.name == getattr(other, 'name', other)

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Must agree with __eq__: a PropertyValue equals its name, so it
        # has to hash like its name.  Without this the class is unhashable
        # on Python 3 and hashes by identity on Python 2.
        return hash(self.name)
Behdad Esfahbod | 20e246e | 2015-07-20 15:56:19 +0100 | [diff] [blame] | 134 | |
# Build one PropertyValue per known name and publish each of them as a
# module-level constant (Lo, Virama, Top, ...), which is what the is_*()
# predicates and the position tables refer to.
property_values = {}

for name in property_names:
    value = PropertyValue(name)
    # Both dicts are keyed by strings, so the duplicate / name-clash checks
    # have to test the name; testing the PropertyValue instance could never
    # match a string key.
    assert name not in property_values
    assert name not in globals()
    property_values[name] = value
globals().update(property_values)
| 143 | |
| 144 | |
def is_BASE(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'B'.

    U is the code point, UISC its Indic_Syllabic_Category and UGC its
    General_Category.
    """
    if UISC in (Number, Consonant, Consonant_Head_Letter,
                #SPEC-DRAFT Consonant_Placeholder,
                Tone_Letter,
                Vowel_Independent):  #SPEC-DRAFT
        return True
    # These syllabic categories only count as a base when the character
    # has General_Category Lo.
    return UGC == Lo and UISC in (Avagraha, Bindu, Consonant_Final,
                                  Consonant_Medial, Consonant_Subjoined,
                                  Vowel, Vowel_Dependent)
def is_BASE_IND(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'IND'."""
    #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    if UISC in (Consonant_Dead, Modifying_Letter):
        return True
    # Po characters qualify except U+104E and U+2022.
    # SPEC-DRAFT-OUTDATED! U == 0x002D (this clause is disabled)
    return UGC == Po and U not in (0x104E, 0x2022)
def is_BASE_NUM(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'N'."""
    is_joining_number = (UISC == Brahmi_Joining_Number)
    return is_joining_number
def is_BASE_OTHER(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'GB'."""
    #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    return (UISC == Consonant_Placeholder  #SPEC-DRAFT
            or U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UGC):
    """Return True only for U+034F (USE category 'CGJ')."""
    CGJ_CODE_POINT = 0x034F
    return U == CGJ_CODE_POINT
def is_CONS_FINAL(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'F'."""
    if UISC == Consonant_Succeeding_Repha:
        return True
    # Consonant_Final characters with General_Category Lo are excluded.
    return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'FM'."""
    #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    is_modifier = (UISC == Syllable_Modifier)
    return is_modifier
def is_CONS_MED(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'M'."""
    if UISC == Consonant_Medial:
        # Medials with General_Category Lo are excluded.
        return UGC != Lo
    return False
def is_CONS_MOD(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'CM'."""
    modifier_categories = (Nukta, Gemination_Mark, Consonant_Killer)
    return UISC in modifier_categories
def is_CONS_SUB(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'SUB'."""
    #SPEC-DRAFT return UISC == Consonant_Subjoined
    if UISC == Consonant_Subjoined:
        # Subjoined consonants with General_Category Lo are excluded.
        return UGC != Lo
    return False
def is_HALANT(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'H'."""
    halant_categories = (Virama, Invisible_Stacker)
    return UISC in halant_categories
def is_HALANT_NUM(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'HN'."""
    is_number_joiner = (UISC == Number_Joiner)
    return is_number_joiner
def is_ZWNJ(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'ZWNJ'."""
    is_non_joiner = (UISC == Non_Joiner)
    return is_non_joiner
def is_ZWJ(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'ZWJ'."""
    is_joiner = (UISC == Joiner)
    return is_joiner
def is_Word_Joiner(U, UISC, UGC):
    """Return True only for U+2060 (USE category 'WJ')."""
    WORD_JOINER_CODE_POINT = 0x2060
    return U == WORD_JOINER_CODE_POINT
def is_OTHER(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'O'.

    That is every character whose syllabic category is Other and which
    is not claimed by one of the code-point-specific predicates below.
    """
    #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    if UISC != Other:
        return False
    for claimed_elsewhere in (is_SYM_MOD, is_CGJ, is_Word_Joiner,
                              is_VARIATION_SELECTOR):
        if claimed_elsewhere(U, UISC, UGC):
            return False
    return True
def is_Reserved(U, UISC, UGC):
    """Return True if the General_Category is 'Cn' (USE category 'Rsv')."""
    is_unassigned = (UGC == 'Cn')
    return is_unassigned
def is_REPHA(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'R'."""
    #return UISC == Consonant_Preceding_Repha
    #SPEC-OUTDATED hack to categorize Consonant_With_Stacker and Consonant_Prefixed
    repha_categories = (Consonant_Preceding_Repha,
                        Consonant_With_Stacker,
                        Consonant_Prefixed)
    return UISC in repha_categories
def is_SYM(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'S'.

    U+25CC is explicitly excluded even though its General_Category
    would otherwise match.
    """
    #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    return (U != 0x25CC  #SPEC-DRAFT
            and UGC in (So, Sc))
def is_SYM_MOD(U, UISC, UGC):
    """Return True for U+1B6B..U+1B73 (USE category 'SM')."""
    # U is an integer code point, so the explicit list of nine consecutive
    # values is the same as this inclusive range check.
    return 0x1B6B <= U <= 0x1B73
def is_VARIATION_SELECTOR(U, UISC, UGC):
    """Return True for U+FE00..U+FE0F (USE category 'VS')."""
    first, last = 0xFE00, 0xFE0F
    return first <= U <= last
def is_VOWEL(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'V'."""
    if UISC == Pure_Killer:
        return True
    # Vowel signs with General_Category Lo are excluded.
    return UGC != Lo and UISC in (Vowel, Vowel_Dependent)
def is_VOWEL_MOD(U, UISC, UGC):
    """Return True if the code point belongs to USE category 'VM'."""
    if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
        return True
    # A Bindu only qualifies when its General_Category is not Lo.
    return UGC != Lo and UISC == Bindu
| 218 | |
# USE category tag -> predicate(U, UISC, UGC) deciding membership.
# map_to_use() asserts that exactly one predicate accepts each code point.
use_mapping = {
    # Bases
    'B': is_BASE, 'IND': is_BASE_IND, 'N': is_BASE_NUM, 'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    # Consonant-related marks
    'F': is_CONS_FINAL, 'FM': is_CONS_FINAL_MOD, 'M': is_CONS_MED,
    'CM': is_CONS_MOD, 'SUB': is_CONS_SUB,
    # Halants and joiners
    'H': is_HALANT, 'HN': is_HALANT_NUM,
    'ZWNJ': is_ZWNJ, 'ZWJ': is_ZWJ, 'WJ': is_Word_Joiner,
    # Everything else
    'O': is_OTHER, 'Rsv': is_Reserved, 'R': is_REPHA,
    'S': is_SYM, 'SM': is_SYM_MOD, 'VS': is_VARIATION_SELECTOR,
    'V': is_VOWEL, 'VM': is_VOWEL_MOD,
}
| 244 | |
# USE category tag -> {suffix: [Indic_Positional_Category values]}.
# A category listed with a suffix table is emitted as tag + suffix (e.g.
# 'V' + 'Abv').  A category mapped to None may carry a positional category
# but gets no suffix.  Key order inside each table is kept as-is because it
# determines the order of the generated #define / #undef lines.
use_positions = {
    'F': {'Abv': [Top], 'Blw': [Bottom], 'Pst': [Right]},
    'M': {'Abv': [Top], 'Blw': [Bottom], 'Pst': [Right], 'Pre': [Left]},
    'CM': {'Abv': [Top], 'Blw': [Bottom]},
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {'Abv': [Top], 'Blw': [Bottom, Overstruck], 'Pst': [Right], 'Pre': [Left]},
    'SM': {'Abv': [Top], 'Blw': [Bottom]},
    'H': None,
    'B': None,
    'FM': None,
    'SUB': None,
}
| 282 | |
def map_to_use(data):
    """Translate merged Unicode properties into USE categories.

    data maps a code point to a 4-item sequence (UISC, UIPC, UGC, UBlock).
    The result maps the same code points to (USE, UBlock), where USE is the
    tag of the single use_mapping predicate that accepts the character,
    extended by a positional suffix when use_positions defines one for it.

    Raises AssertionError when a character matches zero or several
    categories, carries a positional category its USE category does not
    allow, or matches zero or several positional suffixes.
    """
    classifiers = use_mapping.items()
    result = {}
    for U, (UISC, UIPC, UGC, UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        if U == 0x17DD:
            # TODO: These don't have UISC assigned in Unicode 8.0, but
            # have UIPC
            UISC = Vowel_Dependent
        elif 0x1CE2 <= U <= 0x1CE8:
            UISC = Cantillation_Mark
        elif U == 0x1CED:
            # TODO: U+1CED should only be allowed after some of
            # the nasalization marks, maybe only for U+1CE9..U+1CF1.
            UISC = Tone_Mark

        # Every predicate is evaluated; exactly one must accept.
        matches = [tag for tag, predicate in classifiers
                   if predicate(U, UISC, UGC)]
        assert len(matches) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        if U == 0x1B6C:
            # TODO: Not in Unicode 8.0 yet, but in spec.
            UIPC = Bottom
        elif U in (0x953, 0x954):
            # TODO: These should die, but have UIPC in Unicode 8.0
            UIPC = Not_Applicable
        elif U == 0x103C:
            # TODO: In USE's override list but not in Unicode 8.0
            UIPC = Left
        # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
        elif 0xA926 <= U <= 0xA92A:
            UIPC = Top
        elif U == 0x111CA:
            UIPC = Bottom
        elif U == 0x11300:
            UIPC = Top
        elif U == 0x1133C:
            UIPC = Bottom
        elif U == 0x1171E:
            UIPC = Left # Correct?!
        elif 0x1CF2 <= U <= 0x1CF3:
            UIPC = Right
        elif 0x1CF8 <= U <= 0x1CF9:
            UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

        # Append the positional suffix when this category has a suffix table.
        suffix_table = use_positions.get(USE, None)
        if suffix_table:
            suffixes = [suffix for suffix, positions in suffix_table.items()
                        if positions and UIPC in positions]
            assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
            USE = USE + suffixes[0]

        result[U] = (USE, UBlock)
    return result
| 335 | |
# From here on each entry is a (USE category, block) pair; this is the
# value used for code points that have no entry of their own.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Remove the outliers: these code points are taken out of the table data
# and kept aside so the lookup function can special-case them.
singles = dict((outlier, data.pop (outlier))
               for outlier in (0x034F, 0x25CC, 0x1107F))
| 344 | |
# Emit the banner of the generated file.  A parenthesised single-argument
# print produces the same bytes under Python 2 (print statement) and
# Python 3 (print function); print("") stands in for a bare `print`.
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
print(" *")
print(" * on files with these headers:")
print(" *")
# Echo the header lines captured from the input files.
for h in headers:
    for l in h:
        print(" * %s" % (l.strip()))
print(" */")
print("")
print('#include "hb-ot-shape-complex-use-private.hh"')
print("")
| 360 | |
# Running totals maintained by print_block(): number of table slots
# emitted, number of those slots backed by real data, and the name of the
# last block whose heading was printed.
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
    """Write the table entries for code points start..end (inclusive).

    block -- block name to print as a heading, or None/empty for filler
             rows; the heading is only printed when it differs from the
             previously printed block.
    start -- first code point; must be a multiple of 8.
    end   -- last code point; end + 1 must be a multiple of 8.
    data  -- dict: code point -> (USE category, block); missing code
             points are written with the module-level `defaults` entry.

    Updates the module-level counters `total`, `used` and `last_block`.
    Raises AssertionError if start/end are not aligned as described.

    All output goes through sys.stdout.write(), which yields the same
    bytes the former Python 2 print statements (with trailing commas)
    produced on a real file object and also works on Python 3.
    """
    global total, used, last_block
    write = sys.stdout.write
    # NOTE(review): leading whitespace inside these literals may have been
    # collapsed when the source was extracted -- confirm against upstream.
    if block and block != last_block:
        write ("\n\n /* %s */\n" % block)
    if start % 16:
        # The row starts mid-line: pad up to the first entry's column.
        write (' ' * (20 + (start % 16 * 6)))
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range (start, end+1):
        if u % 16 == 0:
            # Start a new row labelled with its first code point.
            write ("\n /* %04X */" % u)
        if u in data:
            num += 1
            d = data[u]
        else:
            d = defaults
        write ("%6s," % d[0])

    total += end - start + 1
    used += num
    if block:
        last_block = block
| 388 | |
# All code points that go into the table, in ascending order.
# sorted() works on Python 2 and 3; calling .sort() on data.keys() only
# works where keys() returns a list.
uu = sorted (data.keys ())

# State for the table-emission loop that follows.
last = -100000
num = 0
offset = 0
starts = []
ends = []

# Short aliases for the USE_* constants used inside the table: first the
# categories without positional suffixes, then every tag + suffix pair.
for k,v in sorted(use_mapping.items()):
    if k in use_positions and use_positions[k]: continue
    # v.__name__[3:] strips the "is_" prefix of the predicate's name.
    print("#define %s USE_%s /* %s */" % (k, k, v.__name__[3:]))
for k,v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print("#define %s USE_%s" % (tag, tag))
print("")
print("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
# Walk the sorted code points and emit the table as a sequence of
# 8-aligned runs.  `starts` / `ends` record the [start, end) range of each
# contiguous stretch of the table and `offset` the running table index.
for u in uu:
    if u <= last:
        # Already covered by the previously emitted run.
        continue
    block = data[u][1]

    # Extend the run while consecutive code points stay in the same block,
    # then round both ends out to multiples of 8.
    start = u//8*8
    end = start+1
    # `data` holds exactly the code points listed in `uu`; testing the dict
    # is O(1) where scanning the list was O(n) per step.
    while end in data and block == data[end][1]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: pad it with default entries to keep one stretch.
            print_block (None, last+1, start-1, data)
            last = start-1
        else:
            # Large gap: close the current stretch and start a new one,
            # recording the table index at which it begins.
            if last >= 0:
                ends.append (last + 1)
                offset += ends[-1] - starts[-1]
            print("")
            print("")
            print("#define use_offset_0x%04xu %d" % (start, offset))
            starts.append (start)

    print_block (block, start, end, data)
    last = end
# Close the final stretch.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print("")
print("")
# Close the table and emit the C lookup function, which switches on the
# code point's page (u >> page_bits) and then tests the ranges and single
# code points that fall on that page.
# NOTE(review): leading whitespace inside the C-source literals below may
# have been collapsed when the source was extracted -- confirm upstream.
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print("")
print("USE_TABLE_ELEMENT_TYPE")
print("hb_use_get_categories (hb_codepoint_t u)")
print("{")
print(" switch (u >> %d)" % page_bits)
print(" {")
# list(...) keeps the concatenation working where keys() is not a list.
pages = set([u>>page_bits for u in starts+ends+list (singles.keys ())])
for p in sorted(pages):
    print(" case 0x%0Xu:" % p)
    for (start,end) in zip (starts, ends):
        if p not in [start>>page_bits, end>>page_bits]: continue
        offset = "use_offset_0x%04xu" % start
        print(" if (hb_in_range (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    for u,d in singles.items ():
        if p != u>>page_bits: continue
        print(" if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0]))
    print(" break;")
    print("")
print(" default:")
print(" break;")
print(" }")
print(" return USE_O;")
print("}")
print("")
# Undefine every short alias again, in the same order in which the
# #define lines are produced: unsuffixed categories first, then every
# tag + suffix pair.
for k in sorted(use_mapping.keys()):
    if k in use_positions and use_positions[k]: continue
    print("#undef %s" % k)
for k,v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print("#undef %s" % tag)
print("")
print("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
    raise Exception ("Table too sparse, please investigate: ", occupancy)