Ebrahim Byagowi | a48dd6e | 2018-03-28 19:08:19 +0430 | [diff] [blame] | 1 | #!/usr/bin/env python |
Behdad Esfahbod | e2c9511 | 2015-07-20 11:32:48 +0100 | [diff] [blame] | 2 | |
Ebrahim Byagowi | cab2c2c | 2018-03-29 12:48:47 +0430 | [diff] [blame] | 3 | from __future__ import print_function, division, absolute_import |
| 4 | |
Ebrahim Byagowi | 80395f1 | 2018-03-29 22:00:41 +0430 | [diff] [blame] | 5 | import io, sys |
Behdad Esfahbod | e2c9511 | 2015-07-20 11:32:48 +0100 | [diff] [blame] | 6 | |
# The script needs exactly four positional arguments (the four data files
# named in the usage line); otherwise report usage on stderr and fail.
if len(sys.argv) - 1 != 4:
    print("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt",
          file=sys.stderr)
    sys.exit(1)

# Code points whose block name is listed here are dropped from the merged
# table before any USE category is computed.
BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
| 12 | |
# Open the four input files in command-line order and parse each one into
#   data[i]:   code point -> property value read from file i
#   values[i]: property value -> number of code points carrying it
# The handles are closed in the `finally` below, even when parsing fails.
files = []
try:
    for path in sys.argv[1:]:
        files.append(io.open(path, encoding='utf-8'))

    # Keep the first two lines of every file except the third one
    # (UnicodeData.txt), which gets a fixed placeholder appended instead.
    headers = [[f.readline() for _ in range(2)]
               for j, f in enumerate(files) if j != 2]
    headers.append(["UnicodeData.txt does not have a header."])

    data = [{} for f in files]
    values = [{} for f in files]
    for i, f in enumerate(files):
        for line in f:

            # Drop a trailing '#' comment, if any.
            j = line.find('#')
            if j >= 0:
                line = line[:j]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                # No ';' on the line: blank or comment-only.
                continue

            # First field is either "XXXX" or an inclusive "XXXX..YYYY" range.
            uu = fields[0].split('..')
            start = int(uu[0], 16)
            end = start if len(uu) == 1 else int(uu[1], 16)

            # The third file keeps its value in field 2; the others in field 1.
            t = fields[1 if i != 2 else 2]

            for u in range(start, end + 1):
                data[i][u] = t
            values[i][t] = values[i].get(t, 0) + end - start + 1
finally:
    for f in files:
        f.close()
| 43 | |
# Fallback value for each of the four per-file properties, in file order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in [0x034F, 0x2060, 0x20F0] + list(range(0xFE00, 0xFE0F + 1)):
    data[0][u] = defaults[0]

# Merge data into one dict:
for i, v in enumerate(defaults):
    values[i][v] = values[i].get(v, 0) + 1

combined = {}
for i, d in enumerate(data):
    for u, v in d.items():
        if u not in combined:
            # Only the first two files may introduce a code point; the
            # later ones just fill in slots of entries that already exist.
            if i >= 2:
                continue
            combined[u] = list(defaults)
        combined[u][i] = v

# Discard everything that lives in a blacklisted block (slot 3 is the block).
data = dict((u, props) for u, props in combined.items()
            if props[3] not in BLACKLISTED_BLOCKS)
del combined
num = len(data)
| 68 | |
Behdad Esfahbod | 20e246e | 2015-07-20 15:56:19 +0100 | [diff] [blame] | 69 | |
# Every property value name the script knows about, grouped by the property
# it belongs to.  Each name later becomes a module-level constant.
property_names = (
    # General_Category
    [
        'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
        'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
        'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    ] +
    # Indic_Syllabic_Category
    [
        'Other', 'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
        'Pure_Killer', 'Invisible_Stacker',
        'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
        'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
        'Consonant_With_Stacker', 'Consonant_Prefixed',
        'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
        'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
        'Consonant_Head_Letter', 'Consonant_Initial_Postfixed',
        'Modifying_Letter', 'Tone_Letter', 'Tone_Mark',
        'Gemination_Mark', 'Cantillation_Mark', 'Register_Shifter',
        'Syllable_Modifier', 'Consonant_Killer',
        'Non_Joiner', 'Joiner', 'Number_Joiner', 'Number',
        'Brahmi_Joining_Number',
    ] +
    # Indic_Positional_Category
    [
        'Not_Applicable', 'Right', 'Left', 'Visual_Order_Left',
        'Left_And_Right', 'Top', 'Bottom', 'Top_And_Bottom',
        'Top_And_Right', 'Top_And_Left', 'Top_And_Left_And_Right',
        'Bottom_And_Left', 'Bottom_And_Right',
        'Top_And_Bottom_And_Right', 'Overstruck',
    ]
)
| 129 | |
try:
    basestring
except NameError:
    # Python 3 has no `basestring`; fall back to `str`.
    basestring = str


class PropertyValue(object):
    """A named property value that compares equal to its own name.

    An instance equals any string identical to its name and any object
    whose ``name`` attribute matches, and it hashes like the name string,
    so instances and plain strings are interchangeable in comparisons,
    ``in`` tests and dict lookups.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        if isinstance(other, basestring):
            return self.name == other
        try:
            return self.name == other.name
        except AttributeError:
            # Unrelated operand (None, int, ...): let Python fall back to
            # its default comparison instead of raising.
            return NotImplemented

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Must match hash(name) so lookups by string and by instance agree.
        return hash(str(self))
Behdad Esfahbod | 20e246e | 2015-07-20 15:56:19 +0100 | [diff] [blame] | 146 | |
# Registry of PropertyValue constants, keyed by property value name.
property_values = {}

for name in property_names:
    value = PropertyValue(name)
    # A name may neither repeat nor collide with an existing module global.
    assert value not in property_values and value not in globals()
    property_values[name] = value

# Expose every constant as a module-level name (Lo, Top, Virama, ...).
globals().update(property_values)
| 155 | |
| 156 | |
def is_BASE(U, UISC, UGC):
    """Predicate for USE category 'B'.

    U is the code point, UISC its Indic_Syllabic_Category and UGC its
    General_Category.
    """
    #SPEC-DRAFT Consonant_Placeholder is not part of this list;
    # Vowel_Independent is (marked #SPEC-DRAFT).
    if UISC in (Number, Consonant, Consonant_Head_Letter,
                Tone_Letter, Vowel_Independent):
        return True
    # The remaining syllabic categories qualify only for Lo characters.
    return UGC == Lo and UISC in (Avagraha, Bindu, Consonant_Final,
                                  Consonant_Medial, Consonant_Subjoined,
                                  Vowel, Vowel_Dependent)
def is_BASE_IND(U, UISC, UGC):
    """Predicate for USE category 'IND'."""
    #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    if UISC in (Consonant_Dead, Modifying_Letter):
        return True
    # Po characters count too, except for a few listed code points.
    # SPEC-DRAFT-OUTDATED! U == 0x002D (not accepted here)
    return UGC == Po and U not in (0x104B, 0x104E, 0x2022, 0x11A3F, 0x11A45)
def is_BASE_NUM(U, UISC, UGC):
    """Predicate for USE category 'N': Brahmi joining numbers."""
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UGC):
    """Predicate for USE category 'GB'."""
    #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    return (UISC == Consonant_Placeholder  #SPEC-DRAFT
            or U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UGC):
    """Predicate for USE category 'CGJ': matches only code point U+034F."""
    return U == 0x034F
def is_CONS_FINAL(U, UISC, UGC):
    """Predicate for USE category 'F'."""
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    if UISC in (Consonant_Initial_Postfixed, Consonant_Succeeding_Repha):
        return True
    # Consonant_Final counts unless the character is Lo.
    return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UGC):
    """Predicate for USE category 'FM'."""
    #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UGC):
    """Predicate for USE category 'M': Consonant_Medial that is not Lo."""
    return UISC == Consonant_Medial and UGC != Lo
def is_CONS_MOD(U, UISC, UGC):
    """Predicate for USE category 'CM'."""
    return UISC in (Nukta, Gemination_Mark, Consonant_Killer)
def is_CONS_SUB(U, UISC, UGC):
    """Predicate for USE category 'SUB': Consonant_Subjoined that is not Lo."""
    #SPEC-DRAFT return UISC == Consonant_Subjoined
    return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UGC):
    """Predicate for USE category 'CS'."""
    return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UGC):
    """Predicate for USE category 'H': Virama or Invisible_Stacker."""
    return UISC in (Virama, Invisible_Stacker)
def is_HALANT_NUM(U, UISC, UGC):
    """Predicate for USE category 'HN'."""
    return UISC == Number_Joiner
def is_ZWNJ(U, UISC, UGC):
    """Predicate for USE category 'ZWNJ'."""
    return UISC == Non_Joiner
def is_ZWJ(U, UISC, UGC):
    """Predicate for USE category 'ZWJ'."""
    return UISC == Joiner
def is_Word_Joiner(U, UISC, UGC):
    """Predicate for USE category 'WJ': matches only code point U+2060."""
    return U == 0x2060
def is_OTHER(U, UISC, UGC):
    """Predicate for USE category 'O'.

    True for syllabic category Other, unless one of the code-point based
    categories (SM, CGJ, WJ, VS) already claims the character.
    """
    #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    if UISC != Other:
        return False
    claimed_by = (is_SYM_MOD, is_CGJ, is_Word_Joiner, is_VARIATION_SELECTOR)
    return not any(pred(U, UISC, UGC) for pred in claimed_by)
def is_Reserved(U, UISC, UGC):
    """Predicate for USE category 'Rsv': general category is 'Cn'."""
    return UGC == 'Cn'
def is_REPHA(U, UISC, UGC):
    """Predicate for USE category 'R'."""
    return UISC in (Consonant_Preceding_Repha, Consonant_Prefixed)
def is_SYM(U, UISC, UGC):
    """Predicate for USE category 'S': So/Sc characters other than U+25CC."""
    #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    # U+25CC is excluded up front (#SPEC-DRAFT).
    return U != 0x25CC and UGC in (So, Sc)
def is_SYM_MOD(U, UISC, UGC):
    """Predicate for USE category 'SM': code points U+1B6B through U+1B73."""
    return U in range(0x1B6B, 0x1B73 + 1)
def is_VARIATION_SELECTOR(U, UISC, UGC):
    """Predicate for USE category 'VS': code points U+FE00 through U+FE0F."""
    return U >= 0xFE00 and U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
    """Predicate for USE category 'V'."""
    if UISC == Pure_Killer:
        return True
    # U+AA29 is carved out, see
    # https://github.com/roozbehp/unicode-data/issues/6
    return (UGC != Lo
            and UISC in (Vowel, Vowel_Dependent)
            and U not in (0xAA29,))
def is_VOWEL_MOD(U, UISC, UGC):
    """Predicate for USE category 'VM'."""
    if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
        return True
    # Non-Lo Bindu, plus U+AA29, see
    # https://github.com/roozbehp/unicode-data/issues/6
    return UGC != Lo and (UISC == Bindu or U in (0xAA29,))
Behdad Esfahbod | 20e246e | 2015-07-20 15:56:19 +0100 | [diff] [blame] | 234 | |
# USE category tag -> predicate(U, UISC, UGC) deciding membership.
# map_to_use() asserts that exactly one predicate accepts each character.
use_mapping = {
    'B':    is_BASE,
    'IND':  is_BASE_IND,
    'N':    is_BASE_NUM,
    'GB':   is_BASE_OTHER,
    'CGJ':  is_CGJ,
    'F':    is_CONS_FINAL,
    'FM':   is_CONS_FINAL_MOD,
    'M':    is_CONS_MED,
    'CM':   is_CONS_MOD,
    'SUB':  is_CONS_SUB,
    'CS':   is_CONS_WITH_STACKER,
    'H':    is_HALANT,
    'HN':   is_HALANT_NUM,
    'ZWNJ': is_ZWNJ,
    'ZWJ':  is_ZWJ,
    'WJ':   is_Word_Joiner,
    'O':    is_OTHER,
    'Rsv':  is_Reserved,
    'R':    is_REPHA,
    'S':    is_SYM,
    'SM':   is_SYM_MOD,
    'VS':   is_VARIATION_SELECTOR,
    'V':    is_VOWEL,
    'VM':   is_VOWEL_MOD,
}
| 261 | |
# USE category tag -> {suffix: [Indic_Positional_Category values]}.
# A category mapped to a dict is emitted as tag + suffix (e.g. 'VAbv').
# A category mapped to None may carry a positional category but gets no
# suffix.  Suffix order within each dict is the order the #define/#undef
# lines are generated in.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left],
        'Pst': [Right],
        'Pre': [Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'B': None,
    'FM': None,
    'SUB': None,
}
| 299 | |
def map_to_use(data):
    """Translate merged Unicode properties into USE categories.

    data maps each code point to a (UISC, UIPC, UGC, UBlock) sequence.
    Returns a dict mapping each code point to (USE tag, UBlock), where
    the tag carries a positional suffix for categories that have one in
    use_positions.

    Raises AssertionError when a character matches zero or several
    categories, or when its positional category cannot be expressed for
    the category it landed in.
    """
    out = {}
    items = use_mapping.items()
    for U, (UISC, UIPC, UGC, UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 8.0, but
        # have UIPC
        if U == 0x17DD:
            UISC = Vowel_Dependent
        if 0x1CE2 <= U <= 0x1CE8:
            UISC = Cantillation_Mark

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
        if 0x1BF2 <= U <= 0x1BF3:
            UISC = Nukta
            UIPC = Bottom

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED:
            UISC = Tone_Mark

        # TODO: https://github.com/harfbuzz/harfbuzz/issues/525
        if U == 0x1A7F:
            UISC = Consonant_Final
            UIPC = Bottom

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/609
        if U == 0x20F0:
            UISC = Cantillation_Mark
            UIPC = Top

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/626
        if U == 0xA8B4:
            UISC = Consonant_Medial

        # Exactly one predicate must accept the character.
        matches = [tag for tag, pred in items if pred(U, UISC, UGC)]
        assert len(matches) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        # TODO: Not in Unicode 8.0 yet, but in spec.
        if U == 0x1B6C:
            UIPC = Bottom

        # TODO: These should die, but have UIPC in Unicode 8.0
        if U in [0x953, 0x954]:
            UIPC = Not_Applicable

        # TODO: In USE's override list but not in Unicode 11.0
        if U == 0x103C:
            UIPC = Left

        # TODO: These are not in USE's override list that we have, nor are they in Unicode 11.0
        if 0xA926 <= U <= 0xA92A:
            UIPC = Top
        if U == 0x111CA:
            UIPC = Bottom
        if U == 0x11300:
            UIPC = Top
        if U == 0x1171E:
            UIPC = Left  # Correct?!
        if 0x1CF2 <= U <= 0x1CF3:
            UIPC = Right
        if 0x1CF8 <= U <= 0x1CF9:
            UIPC = Top
        # https://github.com/roozbehp/unicode-data/issues/8
        if U == 0x0A51:
            UIPC = Bottom

        # A real position is only acceptable on categories that
        # use_positions knows about.
        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            # Exactly one suffix must list this position.
            suffixes = [suffix for suffix, positions in pos_mapping.items()
                        if positions and UIPC in positions]
            assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
            USE = USE + suffixes[0]

        out[U] = (USE, UBlock)
    return out
| 364 | |
# From here on `defaults` is the (USE tag, block) pair print_block() falls
# back to for code points missing from the table.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# NOTE(review): runs of whitespace inside the emitted strings may have been
# collapsed in the copy this was restored from -- confirm against upstream.
for banner_line in (
        "/* == Start of generated table == */",
        "/*",
        " * The following table is generated by running:",
        " *",
        " * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt",
        " *",
        " * on files with these headers:",
        " *",
):
    print(banner_line)

# Echo the header lines captured from the input files.
for header_lines in headers:
    for header_line in header_lines:
        print(" * %s" % (header_line.strip()))
print(" */")
print()
print('#include "hb-ot-shape-complex-use-private.hh"')
print()
Behdad Esfahbod | e2c9511 | 2015-07-20 11:32:48 +0100 | [diff] [blame] | 383 | |
# Running totals maintained by print_block():
total = 0          # table slots emitted so far
used = 0           # slots that hold a code point present in the data
last_block = None  # name of the last block a heading was printed for


def print_block(block, start, end, data):
    """Print the table cells for code points start..end (inclusive).

    block is the block name used for the heading comment, or None/empty
    for filler between blocks.  start must be a multiple of 8 and end + 1
    as well.  Code points missing from data are printed with the module
    level `defaults` entry.  Updates the global total/used/last_block.
    """
    # NOTE(review): leading whitespace inside the emitted strings may have
    # been collapsed in the copy this was restored from -- confirm upstream.
    global total, used, last_block
    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)
    if start % 16:
        # Starting mid-row: pad so the cells line up under the row above.
        print(' ' * (20 + (start % 16 * 6)), end='')
    assert start % 8 == 0
    assert (end+1) % 8 == 0

    present = 0
    for u in range(start, end + 1):
        if u % 16 == 0:
            # Start a new row of 16 cells, labelled with its first code point.
            print()
            print(" /* %04X */" % u, end='')
        if u in data:
            present += 1
        entry = data.get(u, defaults)
        print("%6s," % entry[0], end='')

    total += end - start + 1
    used += present
    if block:
        last_block = block
| 411 | |
# Emit the C table, the lookup function and the #define/#undef helpers.
# NOTE(review): runs of whitespace inside the emitted strings may have been
# collapsed in the copy this was restored from -- confirm against upstream.
uu = sorted(data.keys())

last = -100000
num = 0
offset = 0
starts = []
ends = []

# Short aliases for the USE_* names, so the table rows stay compact.
for k, v in sorted(use_mapping.items()):
    if k in use_positions and use_positions[k]: continue
    print("#define %s USE_%s /* %s */" % (k, k, v.__name__[3:]))
for k, v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print("#define %s USE_%s" % (tag, tag))
print("")
print("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
for u in uu:
    if u <= last:
        # Already covered by the previously emitted range.
        continue
    block = data[u][1]

    # Grow the range over consecutive code points of the same block, then
    # round both ends out to multiples of 8.
    start = u//8*8
    end = start+1
    # Test membership against the dict: `uu` holds the same keys, but
    # searching the list is linear and made this loop quadratic.
    while end in data and block == data[end][1]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: fill it with default cells and keep the range going.
            print_block(None, last+1, start-1, data)
            last = start-1
        else:
            # Large gap: close the current range and open a new one with
            # its own offset into the table.
            if last >= 0:
                ends.append(last + 1)
                offset += ends[-1] - starts[-1]
            print()
            print()
            print("#define use_offset_0x%04xu %d" % (start, offset))
            starts.append(start)

    print_block(block, start, end, data)
    last = end
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print()
print("USE_TABLE_ELEMENT_TYPE")
print("hb_use_get_category (hb_codepoint_t u)")
print("{")
print(" switch (u >> %d)" % page_bits)
print(" {")
# One `case` per page that contains the first or one-past-last code point
# of a range.
pages = set([u>>page_bits for u in starts+ends])
for p in sorted(pages):
    print(" case 0x%0Xu:" % p)
    for (start, end) in zip(starts, ends):
        if p not in [start>>page_bits, end>>page_bits]: continue
        offset = "use_offset_0x%04xu" % start
        print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    print(" break;")
    print("")
print(" default:")
print(" break;")
print(" }")
print(" return USE_O;")
print("}")
print()
for k in sorted(use_mapping.keys()):
    if k in use_positions and use_positions[k]: continue
    print("#undef %s" % k)
for k, v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print("#undef %s" % tag)
print()
print("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
    raise Exception("Table too sparse, please investigate: ", occupancy)