Behdad Esfahbod | e2c9511 | 2015-07-20 11:32:48 +0100 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | |
| 3 | import sys |
| 4 | |
# Exactly four input files are required, in the order named in the usage text.
if len(sys.argv) != 5:
    # sys.stderr.write() emits the same bytes as the Python 2-only
    # "print >>stream" statement and is also valid on Python 3.
    sys.stderr.write("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt\n")
    sys.exit(1)
| 8 | |
# Block names whose code points are filtered out when the per-file data is
# merged.
BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]

# One handle per input file, in command-line order. open() replaces the
# Python 2-only file() builtin; on Python 2 the two are equivalent.
files = [open(x) for x in sys.argv[1:]]

# Keep the first two lines of every file except the one at index 2
# (UnicodeData.txt), which gets a fixed placeholder instead.
headers = [[f.readline() for i in range(2)] for j, f in enumerate(files) if j != 2]
headers.append(["UnicodeData.txt does not have a header."])
Behdad Esfahbod | e2c9511 | 2015-07-20 11:32:48 +0100 | [diff] [blame] | 15 | |
# data[i] maps code point -> property value read from input file i.
# values[i] counts how many code points carry each value in file i.
data = [{} for f in files]
values = [{} for f in files]
for i, f in enumerate(files):
    for line in f:

        # Drop a trailing "#" comment, if any.
        j = line.find('#')
        if j >= 0:
            line = line[:j]

        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            # No ';' separator: blank or comment-only line.
            continue

        # The first field is either a single code point or "start..end".
        uu = fields[0].split('..')
        start = int(uu[0], 16)
        if len(uu) == 1:
            end = start
        else:
            end = int(uu[1], 16)

        # The file at index 2 (UnicodeData.txt) is read from its third
        # field; every other file from its second.
        t = fields[1 if i != 2 else 2]

        for u in range(start, end + 1):
            data[i][u] = t
        values[i][t] = values[i].get(t, 0) + end - start + 1

# Every file has been read to the end and the handles are not used again in
# this script, so release them instead of leaking them until exit.
for f in files:
    f.close()
| 41 | |
# Fallback value for each input file, in file order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in (0x034F, 0x2060, 0x20F0):
    data[0][u] = defaults[0]
for u in range(0xFE00, 0xFE0F + 1):
    data[0][u] = defaults[0]

# Merge data into one dict:
for i, v in enumerate(defaults):
    values[i][v] = values[i].get(v, 0) + 1
merged = {}
for i, d in enumerate(data):
    for u, v in d.items():
        if u not in merged:
            # Only the first two files may introduce a code point; the
            # remaining files just fill in slots of existing entries.
            if i >= 2:
                continue
            merged[u] = list(defaults)
        merged[u][i] = v
# Slot 3 is the block name; drop code points from blacklisted blocks.
data = {u: props for u, props in merged.items()
        if props[3] not in BLACKLISTED_BLOCKS}
del merged
num = len(data)
| 66 | |
Behdad Esfahbod | 20e246e | 2015-07-20 15:56:19 +0100 | [diff] [blame] | 67 | |
# Names of every property value that gets a module-level constant.
property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs',
    'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn',
    'Nd', 'Nl', 'No',
    'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
    'Sc', 'Sk', 'Sm', 'So',
    'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
    'Pure_Killer', 'Invisible_Stacker',
    'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
    'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
    'Consonant_With_Stacker', 'Consonant_Prefixed',
    'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
    'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
    'Consonant_Head_Letter',
    'Modifying_Letter',
    'Tone_Letter', 'Tone_Mark',
    'Gemination_Mark', 'Cantillation_Mark', 'Register_Shifter',
    'Syllable_Modifier', 'Consonant_Killer',
    'Non_Joiner', 'Joiner', 'Number_Joiner',
    'Number', 'Brahmi_Joining_Number',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right', 'Left', 'Visual_Order_Left', 'Left_And_Right',
    'Top', 'Bottom',
    'Top_And_Bottom', 'Top_And_Right', 'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Left', 'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
]
| 126 | |
class PropertyValue(object):
    """A named property value that compares equal to its own name.

    An instance equals another PropertyValue with the same name and also
    the plain string holding that name, so values read from the data files
    as strings can be compared directly against these constants.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __repr__(self):
        return "PropertyValue(%r)" % (self.name,)

    def __eq__(self, other):
        # A PropertyValue contributes its .name; anything else (a plain
        # string in particular) is compared as-is. This avoids the
        # Python 2-only "basestring" check.
        return self.name == getattr(other, 'name', other)

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Must agree with __eq__: hash like the name, so that set and dict
        # membership finds an instance under its name and vice versa.
        return hash(self.name)
Behdad Esfahbod | 20e246e | 2015-07-20 15:56:19 +0100 | [diff] [blame] | 136 | |
# Registry of PropertyValue objects, keyed by property value name.
property_values = {}

for name in property_names:
    value = PropertyValue(name)
    # Test the name itself. Dict membership is decided by hash, and a fresh
    # PropertyValue object is not guaranteed to hash like its name, so
    # testing the object could never report a duplicate or a clash with an
    # existing module-level name.
    assert name not in property_values
    assert name not in globals()
    property_values[name] = value
# Publish every value as a module-level constant named after the property.
globals().update(property_values)
| 145 | |
| 146 | |
def is_BASE(U, UISC, UGC):
    """Predicate registered under 'B' in use_mapping."""
    if UISC in (Number, Consonant, Consonant_Head_Letter,
                #SPEC-DRAFT Consonant_Placeholder,
                Tone_Letter,
                Vowel_Independent):  #SPEC-DRAFT
        return True
    # These categories count as bases only for General_Category Lo.
    return UGC == Lo and UISC in (Avagraha, Bindu, Consonant_Final,
                                  Consonant_Medial, Consonant_Subjoined,
                                  Vowel, Vowel_Dependent)
def is_BASE_IND(U, UISC, UGC):
    """Predicate registered under 'IND' in use_mapping."""
    #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    if UISC in (Consonant_Dead, Modifying_Letter):
        return True
    # Po punctuation qualifies, except for the listed code points.
    # SPEC-DRAFT-OUTDATED! U == 0x002D
    return UGC == Po and U not in (0x104E, 0x2022, 0x11A3F, 0x11A45)
def is_BASE_NUM(U, UISC, UGC):
    """Predicate registered under 'N' in use_mapping."""
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UGC):
    """Predicate registered under 'GB' in use_mapping."""
    #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    return (UISC == Consonant_Placeholder  #SPEC-DRAFT
            or U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UGC):
    """Predicate registered under 'CGJ' in use_mapping: true only for U+034F."""
    return U == 0x034F
def is_CONS_FINAL(U, UISC, UGC):
    """Predicate registered under 'F' in use_mapping."""
    if UISC == Consonant_Succeeding_Repha:
        return True
    # A Consonant_Final with General_Category Lo is not a final.
    return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UGC):
    """Predicate registered under 'FM' in use_mapping."""
    #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UGC):
    """Predicate registered under 'M' in use_mapping."""
    if UISC == Consonant_Medial:
        return UGC != Lo
    return False
def is_CONS_MOD(U, UISC, UGC):
    """Predicate registered under 'CM' in use_mapping."""
    return UISC in (Nukta, Gemination_Mark, Consonant_Killer)
def is_CONS_SUB(U, UISC, UGC):
    """Predicate registered under 'SUB' in use_mapping."""
    #SPEC-DRAFT return UISC == Consonant_Subjoined
    if UISC == Consonant_Subjoined:
        return UGC != Lo
    return False
def is_CONS_WITH_STACKER(U, UISC, UGC):
    """Predicate registered under 'CS' in use_mapping."""
    return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UGC):
    """Predicate registered under 'H' in use_mapping."""
    return UISC in (Virama, Invisible_Stacker)
def is_HALANT_NUM(U, UISC, UGC):
    """Predicate registered under 'HN' in use_mapping."""
    return UISC == Number_Joiner
def is_ZWNJ(U, UISC, UGC):
    """Predicate registered under 'ZWNJ' in use_mapping."""
    return UISC == Non_Joiner
def is_ZWJ(U, UISC, UGC):
    """Predicate registered under 'ZWJ' in use_mapping."""
    return UISC == Joiner
def is_Word_Joiner(U, UISC, UGC):
    """Predicate registered under 'WJ' in use_mapping: true only for U+2060."""
    return U == 0x2060
def is_OTHER(U, UISC, UGC):
    """Predicate registered under 'O' in use_mapping.

    True for code points whose UISC is Other, unless one of the
    code-point-specific predicates (SM, CGJ, WJ, VS) claims them.
    """
    #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    if UISC != Other:
        return False
    claimed_elsewhere = (is_SYM_MOD(U, UISC, UGC)
                         or is_CGJ(U, UISC, UGC)
                         or is_Word_Joiner(U, UISC, UGC)
                         or is_VARIATION_SELECTOR(U, UISC, UGC))
    return not claimed_elsewhere
def is_Reserved(U, UISC, UGC):
    """Predicate registered under 'Rsv' in use_mapping: General_Category is Cn."""
    return UGC == 'Cn'
def is_REPHA(U, UISC, UGC):
    """Predicate registered under 'R' in use_mapping."""
    return UISC in (Consonant_Preceding_Repha, Consonant_Prefixed)
def is_SYM(U, UISC, UGC):
    """Predicate registered under 'S' in use_mapping."""
    #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    # U+25CC is excluded outright.  #SPEC-DRAFT
    return U != 0x25CC and UGC in (So, Sc)
def is_SYM_MOD(U, UISC, UGC):
    """Predicate registered under 'SM' in use_mapping: a fixed list of code points."""
    return U in (0x1B6B, 0x1B6C, 0x1B6D,
                 0x1B6E, 0x1B6F, 0x1B70,
                 0x1B71, 0x1B72, 0x1B73)
def is_VARIATION_SELECTOR(U, UISC, UGC):
    """Predicate registered under 'VS' in use_mapping: U+FE00 through U+FE0F."""
    return U >= 0xFE00 and U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
    """Predicate registered under 'V' in use_mapping."""
    # https://github.com/roozbehp/unicode-data/issues/6
    if UISC == Pure_Killer:
        return True
    # Vowels with General_Category Lo are excluded, and so is U+AA29.
    return (UGC != Lo
            and UISC in (Vowel, Vowel_Dependent)
            and U not in (0xAA29,))
def is_VOWEL_MOD(U, UISC, UGC):
    """Predicate registered under 'VM' in use_mapping."""
    # https://github.com/roozbehp/unicode-data/issues/6
    if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
        return True
    # Bindu and U+AA29 qualify unless General_Category is Lo.
    return UGC != Lo and (UISC == Bindu or U in (0xAA29,))
Behdad Esfahbod | 20e246e | 2015-07-20 15:56:19 +0100 | [diff] [blame] | 222 | |
# USE category tag -> predicate(U, UISC, UGC) deciding membership.
# map_to_use() asserts that exactly one predicate accepts each code point.
use_mapping = {
    'B':    is_BASE,
    'IND':  is_BASE_IND,
    'N':    is_BASE_NUM,
    'GB':   is_BASE_OTHER,
    'CGJ':  is_CGJ,
    'F':    is_CONS_FINAL,
    'FM':   is_CONS_FINAL_MOD,
    'M':    is_CONS_MED,
    'CM':   is_CONS_MOD,
    'SUB':  is_CONS_SUB,
    'CS':   is_CONS_WITH_STACKER,
    'H':    is_HALANT,
    'HN':   is_HALANT_NUM,
    'ZWNJ': is_ZWNJ,
    'ZWJ':  is_ZWJ,
    'WJ':   is_Word_Joiner,
    'O':    is_OTHER,
    'Rsv':  is_Reserved,
    'R':    is_REPHA,
    'S':    is_SYM,
    'SM':   is_SYM_MOD,
    'VS':   is_VARIATION_SELECTOR,
    'V':    is_VOWEL,
    'VM':   is_VOWEL_MOD,
}
| 249 | |
# USE category tag -> {suffix: [Indic_Positional_Category values]}.
# A category mapped to None may carry a position but gets no suffix.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left],
        'Pst': [Right],
        'Pre': [Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'B': None,
    'FM': None,
    'SUB': None,
}
| 287 | |
def map_to_use(data):
    """Resolve each code point's properties to a single USE category tag.

    Args:
        data: dict mapping a code point to a 4-item sequence
            (UISC, UIPC, UGC, UBlock).

    Returns:
        dict mapping each code point to ``(USE, UBlock)``. USE is a key of
        use_mapping, with a suffix from use_positions appended when that
        category has a position table.

    Raises:
        AssertionError: if a code point is not accepted by exactly one
            predicate, has a position its category cannot carry, or does
            not match exactly one position suffix.
    """
    out = {}
    classifiers = use_mapping.items()
    for U, (UISC, UIPC, UGC, UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 8.0, but
        # have UIPC
        if U == 0x17DD:
            UISC = Vowel_Dependent
        if 0x1CE2 <= U <= 0x1CE8:
            UISC = Cantillation_Mark

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
        if 0x1BF2 <= U <= 0x1BF3:
            UISC = Nukta
            UIPC = Bottom

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED:
            UISC = Tone_Mark

        # TODO: https://github.com/harfbuzz/harfbuzz/issues/525
        if U == 0x1A7F:
            UISC = Consonant_Final
            UIPC = Bottom

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/609
        if U == 0x20F0:
            UISC = Cantillation_Mark
            UIPC = Top

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/626
        if U == 0xA8B4:
            UISC = Consonant_Medial

        # Exactly one predicate must accept the code point.
        matches = [tag for tag, predicate in classifiers
                   if predicate(U, UISC, UGC)]
        assert len(matches) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        # TODO: Not in Unicode 8.0 yet, but in spec.
        if U == 0x1B6C:
            UIPC = Bottom

        # TODO: These should die, but have UIPC in Unicode 8.0
        if U in (0x953, 0x954):
            UIPC = Not_Applicable

        # TODO: In USE's override list but not in Unicode 8.0
        if U == 0x103C:
            UIPC = Left

        # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
        if 0xA926 <= U <= 0xA92A:
            UIPC = Top
        if U == 0x111CA:
            UIPC = Bottom
        if U == 0x11300:
            UIPC = Top
        if U == 0x1133C:
            UIPC = Bottom
        if U == 0x1171E:
            UIPC = Left  # Correct?!
        if 0x1CF2 <= U <= 0x1CF3:
            UIPC = Right
        if 0x1CF8 <= U <= 0x1CF9:
            UIPC = Top

        # A real position is only allowed on categories listed in
        # use_positions.
        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

        pos_mapping = use_positions.get(USE)
        if pos_mapping:
            # Append the one suffix whose position list holds UIPC.
            suffixes = [suffix for suffix, positions in pos_mapping.items()
                        if positions and UIPC in positions]
            assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
            USE = USE + suffixes[0]

        out[U] = (USE, UBlock)
    return out
| 351 | |
# From here on each entry is (USE category, block name); this is the
# fallback for code points that have no entry.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3; print("") stands in for the bare Python 2 "print" statement.
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
print(" *")
print(" * on files with these headers:")
print(" *")
for h in headers:
    for l in h:
        print(" * %s" % (l.strip()))
print(" */")
print("")
print('#include "hb-ot-shape-complex-use-private.hh"')
print("")
| 370 | |
# Running totals over all print_block() calls: table slots emitted, slots
# backed by real data, and the last block name that was announced.
total = 0
used = 0
last_block = None


def print_block(block, start, end, data):
    """Write the table entries for code points start..end to stdout.

    Args:
        block: block name to announce in a comment, or None for filler
            between blocks. Announced only when it differs from the
            previously announced name.
        start: first code point; must be a multiple of 8.
        end: last code point (inclusive); end + 1 must be a multiple of 8.
        data: dict mapping code point -> (USE category, block name). Code
            points without an entry are written using the module-level
            ``defaults``.

    Side effects:
        Updates the module-level ``total``, ``used`` and ``last_block``.

    Raises:
        AssertionError: if start or end is not aligned as described.
    """
    global total, used, last_block
    # All output goes through write() so the exact bytes do not depend on
    # the Python 2 "print x," trailing-comma behaviour.
    # NOTE(review): leading whitespace inside these literals is copied as
    # seen; confirm it against the repository copy of the script.
    write = sys.stdout.write
    if block and block != last_block:
        write("\n")
        write("\n")
        write(" /* %s */\n" % block)
    if start % 16:
        # Starting mid-row: pad so the entries line up with a full row.
        write(' ' * (20 + (start % 16 * 6)))
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range(start, end + 1):
        if u % 16 == 0:
            # New row of sixteen entries, labelled with its first code point.
            write("\n")
            write(" /* %04X */" % u)
        if u in data:
            num += 1
        d = data.get(u, defaults)
        write("%6s," % d[0])

    total += end - start + 1
    used += num
    if block:
        last_block = block
| 398 | |
# Sorted list of the code points that have data. sorted() also works where
# dict.keys() returns a view, which has no sort() method.
uu = sorted(data)

last = -100000
num = 0
offset = 0
starts = []
ends = []
# Categories without a position table get one macro each, annotated with
# the predicate name minus its "is_" prefix.
for k, v in sorted(use_mapping.items()):
    if use_positions.get(k):
        continue
    print("#define %s USE_%s /* %s */" % (k, k, v.__name__[3:]))
# Categories with a position table get one macro per suffix.
for k, v in sorted(use_positions.items()):
    if not v:
        continue
    for suf in v:
        tag = k + suf
        print("#define %s USE_%s" % (tag, tag))
print("")
print("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
for u in uu:
    if u <= last:
        # Already covered by the previously emitted range.
        continue
    block = data[u][1]

    # Round down to a multiple of 8, then extend while consecutive code
    # points have data in the same block.
    start = u//8*8
    end = start+1
    # uu lists exactly the keys of data, so probing the dict gives the same
    # answer as "end in uu" without scanning the list on every step.
    while end in data and block == data[end][1]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: fill it with default entries to keep one range.
            print_block(None, last+1, start-1, data)
            last = start-1
        else:
            # Large gap: close the current range (if any) and start a new
            # one with its own offset macro.
            if last >= 0:
                ends.append(last + 1)
                offset += ends[-1] - starts[-1]
            print("")
            print("")
            print("#define use_offset_0x%04xu %d" % (start, offset))
            starts.append(start)

    print_block(block, start, end, data)
    last = end
# Close the final range.
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print("")
print("")
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print("")
# Emit the C lookup function: a switch on the page number (u >> page_bits)
# with one range check per emitted range that starts or ends on that page.
# NOTE(review): leading whitespace inside these literals is copied as seen;
# confirm it against the repository copy of the script.
print("USE_TABLE_ELEMENT_TYPE")
print("hb_use_get_category (hb_codepoint_t u)")
print("{")
print(" switch (u >> %d)" % page_bits)
print(" {")
pages = {u >> page_bits for u in starts + ends}
for p in sorted(pages):
    print(" case 0x%0Xu:" % p)
    for (start, end) in zip(starts, ends):
        if p not in [start >> page_bits, end >> page_bits]:
            continue
        offset = "use_offset_0x%04xu" % start
        print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    print(" break;")
    print("")
print(" default:")
print(" break;")
print(" }")
print(" return USE_O;")
print("}")
print("")
# Undefine every macro, using the same selection as the #define emission.
for k in sorted(use_mapping.keys()):
    if use_positions.get(k):
        continue
    print("#undef %s" % k)
for k, v in sorted(use_positions.items()):
    if not v:
        continue
    for suf in v:
        tag = k + suf
        print("#undef %s" % tag)
print("")
print("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
    raise Exception("Table too sparse, please investigate: ", occupancy)