#!/usr/bin/env python3
# flake8: noqa: F821

"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
* ms-use/IndicSyllabicCategory-Additional.txt
* ms-use/IndicPositionalCategory-Additional.txt
"""

import logging
import sys

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# The usage text must remain the very first statement of the file: only then
# is it bound to __doc__.  If any statement (even an import) precedes it,
# __doc__ is None and sys.exit(None) terminates silently with status 0.
if len(sys.argv) != 10:
    sys.exit(__doc__)

# Scripts excluded from the generated table: any code point whose Scripts.txt
# value (slot 6 of its combined record) is listed here is dropped.
DISABLED_SCRIPTS = {'Arabic', 'Lao', 'Samaritan', 'Syriac', 'Thai'}

import contextlib

# The input files are only needed while they are being parsed, so they are
# all opened through one ExitStack and closed as soon as parsing is finished
# (or as soon as anything in here raises).
with contextlib.ExitStack() as stack:
    files = [stack.enter_context(open(x, encoding='utf-8')) for x in sys.argv[1:]]

    # The first two lines of every file are kept as its header, except for
    # file 4 (UnicodeData.txt), which gets a fixed placeholder entry below.
    headers = [[f.readline() for i in range(2)] for j, f in enumerate(files) if j != 4]
    # For the two "-Additional" files (7 and 8) the header continues up to
    # the first blank line.  File 4 has no slot in `headers`, hence `j - 1`.
    for j in range(7, 9):
        for line in files[j]:
            line = line.rstrip()
            if not line:
                break
            headers[j - 1].append(line)
    headers.append(["UnicodeData.txt does not have a header."])

    # unicode_data[i][code point] -> property value read from file i
    # values[i][property value]   -> number of code points carrying that value
    unicode_data = [{} for _ in files]
    values = [{} for _ in files]
    for i, f in enumerate(files):
        for line in f:

            # Strip the trailing comment, if any.
            j = line.find('#')
            if j >= 0:
                line = line[:j]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                continue

            # First field is either a single code point or a "start..end" range.
            uu = fields[0].split('..')
            start = int(uu[0], 16)
            if len(uu) == 1:
                end = start
            else:
                end = int(uu[1], 16)

            # Files 2 and 4 carry the wanted value in the third field,
            # every other file in the second one.
            t = fields[1 if i not in [2, 4] else 2]

            if i == 2:
                t = 'jt_' + t
            elif i == 3 and t != 'Default_Ignorable_Code_Point':
                continue
            elif i == 7 and t == 'Consonant_Final_Modifier':
                # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
                t = 'Syllable_Modifier'
            elif i == 8 and t == 'NA':
                t = 'Not_Applicable'

            # Files 7 and 8 write into the tables of files 0 and 1 respectively.
            i0 = i if i < 7 else i - 7
            for u in range(start, end + 1):
                unicode_data[i0][u] = t
            values[i0][t] = values[i0].get(t, 0) + end - start + 1

# Fallback value for each of the seven property slots of a combined record.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Count each fallback once in the per-file value statistics.
for slot, fallback in enumerate(defaults):
    values[slot][fallback] = values[slot].get(fallback, 0) + 1

# Merge the per-file tables into one dict: code point -> list of seven values.
combined = {}
for slot, table in enumerate(unicode_data):
    for cp, prop in table.items():
        record = combined.get(cp)
        if record is None:
            # Tables from slot 4 onwards only refine records created by
            # slots 0..3; on their own they never introduce a code point.
            if slot >= 4:
                continue
            record = combined[cp] = list(defaults)
        record[slot] = prop

# Finally drop every code point that belongs to a disabled script.
combined = {cp: record for cp, record in combined.items()
            if record[6] not in DISABLED_SCRIPTS}

# Every property value that gets turned into a module-level PropertyValue
# constant, grouped by the Unicode property it belongs to.
property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
    'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other', 'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
    'Pure_Killer', 'Invisible_Stacker',
    'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
    'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
    'Consonant_With_Stacker', 'Consonant_Prefixed',
    'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
    'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
    'Consonant_Head_Letter', 'Consonant_Initial_Postfixed',
    'Modifying_Letter', 'Tone_Letter', 'Tone_Mark',
    'Gemination_Mark', 'Cantillation_Mark', 'Register_Shifter',
    'Syllable_Modifier', 'Consonant_Killer',
    'Non_Joiner', 'Joiner', 'Number_Joiner', 'Number',
    'Brahmi_Joining_Number', 'Symbol_Modifier',
    'Hieroglyph', 'Hieroglyph_Joiner',
    'Hieroglyph_Segment_Begin', 'Hieroglyph_Segment_End',
    # Indic_Positional_Category
    'Not_Applicable', 'Right', 'Left', 'Visual_Order_Left',
    'Left_And_Right', 'Top', 'Bottom', 'Top_And_Bottom',
    'Top_And_Bottom_And_Left', 'Top_And_Right', 'Top_And_Left',
    'Top_And_Left_And_Right', 'Bottom_And_Left', 'Bottom_And_Right',
    'Top_And_Bottom_And_Right', 'Overstruck',
    # Joining_Type
    'jt_C', 'jt_D', 'jt_L', 'jt_R', 'jt_T', 'jt_U', 'jt_X',
]

class PropertyValue:
    """A named property value that compares and hashes like its own name.

    Instances are equal to the plain string holding their name and to any
    other object exposing the same ``name``; they hash like that string, so
    they can be looked up in containers keyed by either form.
    """

    def __init__(self, name_):
        self.name = name_

    def __repr__(self):
        return "%s(%r)" % (type(self).__name__, self.name)

    def __str__(self):
        return self.name

    def __eq__(self, other):
        if isinstance(other, str):
            return self.name == other
        try:
            return self.name == other.name
        except AttributeError:
            # Unrelated operand (None, numbers, ...): let Python fall back to
            # its default handling instead of failing with AttributeError.
            return NotImplemented

    def __ne__(self, other):
        outcome = self.__eq__(other)
        return outcome if outcome is NotImplemented else not outcome

    def __hash__(self):
        # Must match hash(name) so that string lookups find the instance.
        return hash(str(self))

# Build one PropertyValue per known name and publish each of them as a
# module-level constant, so that the code below can write e.g. `Virama`.
property_values = {}

for prop_name in property_names:
    prop = PropertyValue(prop_name)
    # A name may neither repeat nor clobber something already defined here.
    assert prop not in property_values
    assert prop not in globals()
    property_values[prop_name] = prop

globals().update(property_values)

def is_BASE(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'B' entry of use_mapping.

    U is the code point; UISC, UDI, UGC and AJT are its syllabic category,
    default-ignorable marker, general category and ``jt_``-prefixed joining
    type as stored in the combined record.
    """
    if UISC in [Number, Consonant, Consonant_Head_Letter, Tone_Letter,
                Vowel_Independent]:
        return True
    # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
    if AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner:
        return True
    # Letters (Lo) of these categories are bases rather than marks.
    return UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final,
                                  Consonant_Medial, Consonant_Subjoined,
                                  Vowel, Vowel_Dependent]
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'N' entry of use_mapping: Brahmi joining numbers."""
    matches = UISC == Brahmi_Joining_Number
    return matches
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'GB' entry of use_mapping.

    True for consonant placeholders and for a fixed list of code points.
    """
    return (UISC == Consonant_Placeholder
            or U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE])
def is_CGJ(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'CGJ' entry of use_mapping.

    Also includes VARIATION_SELECTOR and ZWJ.  Note that the result is the
    (falsy) UDI value itself when the code point is not default-ignorable.
    """
    if UISC == Joiner:
        return True
    return UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'F' entry of use_mapping.

    Succeeding rephas always qualify; final consonants only when they are
    not letters (Lo).
    """
    if UISC == Consonant_Succeeding_Repha:
        return True
    return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'FM' entry of use_mapping: syllable modifiers."""
    matches = UISC == Syllable_Modifier
    return matches
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'M' entry of use_mapping.

    Medial consonants qualify unless they are letters (Lo).
    """
    if UISC == Consonant_Medial and UGC != Lo:
        return True
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return UISC == Consonant_Initial_Postfixed
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'CM' entry of use_mapping."""
    modifier_categories = [Nukta, Gemination_Mark, Consonant_Killer]
    return UISC in modifier_categories
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'SUB' entry of use_mapping.

    Subjoined consonants qualify unless they are letters (Lo).
    """
    if UISC == Consonant_Subjoined:
        return UGC != Lo
    return False
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'CS' entry of use_mapping."""
    matches = UISC == Consonant_With_Stacker
    return matches
def is_HALANT(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'H' entry of use_mapping.

    Viramas qualify, except the code point claimed by
    is_HALANT_OR_VOWEL_MODIFIER.
    """
    if is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
        return False
    return UISC == Virama
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'HVM' entry of use_mapping.

    Split off of HALANT; matches the single code point U+0DCA.
    """
    hvm_code_point = 0x0DCA
    return U == hvm_code_point
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'HN' entry of use_mapping: number joiners."""
    matches = UISC == Number_Joiner
    return matches
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'G' entry of use_mapping."""
    matches = UISC == Hieroglyph
    return matches
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'J' entry of use_mapping."""
    matches = UISC == Hieroglyph_Joiner
    return matches
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'SB' entry of use_mapping."""
    matches = UISC == Hieroglyph_Segment_Begin
    return matches
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'SE' entry of use_mapping."""
    matches = UISC == Hieroglyph_Segment_End
    return matches
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'IS' entry of use_mapping.

    Split off of HALANT; invisible stackers qualify, except the code point
    claimed by is_SAKOT.
    """
    if is_SAKOT(U, UISC, UDI, UGC, AJT):
        return False
    return UISC == Invisible_Stacker
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'ZWNJ' entry of use_mapping: non-joiners."""
    matches = UISC == Non_Joiner
    return matches
def is_OTHER(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'O' entry of use_mapping.

    Also includes BASE_IND and SYM.  A candidate only counts as "other" when
    none of the more specific predicates listed below claims it.
    """
    candidate = UGC == Po or UISC in [Consonant_Dead, Joiner,
                                      Modifying_Letter, Other]
    if not candidate:
        return False
    more_specific = (is_BASE, is_BASE_OTHER, is_CGJ, is_SYM_MOD, is_Word_Joiner)
    return not any(claims(U, UISC, UDI, UGC, AJT) for claims in more_specific)
def is_REPHA(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'R' entry of use_mapping."""
    repha_categories = [Consonant_Preceding_Repha, Consonant_Prefixed]
    return UISC in repha_categories
def is_SAKOT(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'Sk' entry of use_mapping.

    Split off of HALANT; matches the single code point U+1A60.
    """
    sakot_code_point = 0x1A60
    return U == sakot_code_point
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'SM' entry of use_mapping: symbol modifiers."""
    matches = UISC == Symbol_Modifier
    return matches
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'V' entry of use_mapping.

    Pure killers always qualify; vowels and dependent vowels only when they
    are not letters (Lo).
    """
    if UISC == Pure_Killer:
        return True
    return UGC != Lo and UISC in [Vowel, Vowel_Dependent]
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'VM' entry of use_mapping.

    Bindus only qualify when they are not letters (Lo).
    """
    if UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga]:
        return True
    return UGC != Lo and UISC == Bindu
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
    """Predicate behind the 'WJ' entry of use_mapping.

    Also includes Rsv: every unassigned (Cn) code point qualifies.  Otherwise
    the code point must be default-ignorable, have syllabic category Other,
    not be claimed by is_CGJ and not be one of the listed exceptions.
    """
    excluded = [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
    if (UDI and U not in excluded
            and UISC == Other
            and not is_CGJ(U, UISC, UDI, UGC, AJT)):
        return True
    return UGC == Cn

# Category tag -> predicate deciding whether a code point belongs to it.
# map_to_use() requires exactly one predicate to accept any given code point.
use_mapping = dict(
    B=is_BASE,
    N=is_BASE_NUM,
    GB=is_BASE_OTHER,
    CGJ=is_CGJ,
    F=is_CONS_FINAL,
    FM=is_CONS_FINAL_MOD,
    M=is_CONS_MED,
    CM=is_CONS_MOD,
    SUB=is_CONS_SUB,
    CS=is_CONS_WITH_STACKER,
    H=is_HALANT,
    HVM=is_HALANT_OR_VOWEL_MODIFIER,
    HN=is_HALANT_NUM,
    IS=is_INVISIBLE_STACKER,
    G=is_HIEROGLYPH,
    J=is_HIEROGLYPH_JOINER,
    SB=is_HIEROGLYPH_SEGMENT_BEGIN,
    SE=is_HIEROGLYPH_SEGMENT_END,
    ZWNJ=is_ZWNJ,
    O=is_OTHER,
    R=is_REPHA,
    Sk=is_SAKOT,
    SM=is_SYM_MOD,
    V=is_VOWEL,
    VM=is_VOWEL_MOD,
    WJ=is_Word_Joiner,
)

# Category tag -> {suffix: positional categories selecting that suffix}.
# A tag mapped to None may carry a positional category but gets no suffix.
use_positions = {
    'F': {'Abv': [Top], 'Blw': [Bottom], 'Pst': [Right]},
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {'Abv': [Top], 'Blw': [Bottom, Overstruck]},
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {'Abv': [Top], 'Blw': [Bottom]},
    'H': None,
    'HVM': None,
    'IS': None,
    'B': None,
    'FM': {'Abv': [Top], 'Blw': [Bottom], 'Pst': [Not_Applicable]},
    'R': None,
    'SUB': None,
}

def map_to_use(data):
    """Classify every code point of *data*.

    *data* maps a code point to its seven-slot combined record.  The result
    maps the same code points to ``(tag, block)``, where the tag is the
    single matching use_mapping key, extended by a use_positions suffix when
    the tag has positional variants.
    """
    out = {}
    predicates = use_mapping.items()
    for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x1CE2 <= U <= 0x1CE8:
            UISC = Cantillation_Mark

        # Tibetan:
        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if U in (0x0F18, 0x0F19, 0x0F3E, 0x0F3F):
            UISC = Vowel_Dependent

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED:
            UISC = Tone_Mark

        # Exactly one predicate has to accept the code point.
        matches = [tag for tag, accepts in predicates
                   if accepts(U, UISC, UDI, UGC, AJT)]
        assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
        # and https://github.com/harfbuzz/harfbuzz/issues/1631
        if U in [0x11302, 0x11303, 0x114C1]:
            UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
                USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

        pos_mapping = use_positions.get(USE)
        if pos_mapping:
            # Exactly one suffix has to list the positional category.
            suffixes = [suffix for suffix, positions in pos_mapping.items()
                        if positions and UIPC in positions]
            assert len(suffixes) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, suffixes)
            USE += suffixes[0]

        out[U] = (USE, UBlock)
    return out

use_data = map_to_use(combined)

# Banner comment: how the table was generated and from which inputs.
command_line = " * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format(sys.argv[0])
for text in (
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    command_line,
    " *",
    " * on files with these headers:",
    " *",
):
    print(text)
for header_lines in headers:
    for header_line in header_lines:
        print(" * %s" % header_line.strip())

# End of the banner, then the include guard and the includes.
for text in (
    " */",
    "",
    "#ifndef HB_OT_SHAPER_USE_TABLE_HH",
    "#define HB_OT_SHAPER_USE_TABLE_HH",
    "",
    '#include "hb.hh"',
    "",
    '#include "hb-ot-shaper-use-machine.hh"',
    "",
):
    print(text)

# Running statistics maintained by print_block().
total = 0
used = 0
last_block = None


def print_block(block, start, end, use_data):
    """Print the table cells for code points *start*..*end* (inclusive).

    A block heading is printed whenever *block* is set and differs from the
    previously printed one.  Code points missing from *use_data* print as
    'O' when UnicodeData.txt lists them and as 'WJ' otherwise.  Updates the
    module-level counters `total`, `used` and `last_block`.

    NOTE(review): nothing in the visible part of this script calls this
    function - confirm whether it is still needed.
    """
    global total, used, last_block
    if block and block != last_block:
        print("\n\n /* %s */" % block)
    column = start % 16
    if column:
        # Pad so that a range starting mid-row lines up with its column.
        print(' ' * (20 + column * 6), end='')
    assert start % 8 == 0
    assert (end + 1) % 8 == 0
    mapped = 0
    for cp in range(start, end + 1):
        if cp % 16 == 0:
            # Start a new row, labelled with its first code point.
            print("\n /* %04X */" % cp, end='')
        if cp in use_data:
            mapped += 1
        entry = use_data.get(cp)
        if entry is not None:
            cell = entry[0]
        elif cp in unicode_data[4]:
            cell = 'O'
        else:
            cell = 'WJ'
        print("%6s," % cell, end='')

    total += end - start + 1
    used += mapped
    if block:
        last_block = block

# NOTE(review): the six names below are not read anywhere in the visible part
# of this script; they are kept as they were.
uu = sorted(use_data.keys())

last = -100000
num = 0
offset = 0
starts = []
ends = []

# One short macro per category tag (or per tag+suffix for tags that have
# positional variants), wrapped so that unused macros do not warn.
print('#pragma GCC diagnostic push')
print('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k in sorted(use_mapping):
    if use_positions.get(k):
        continue
    v = use_mapping[k]
    # The comment is the predicate's name without its "is_" prefix.
    print("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
for k in sorted(use_positions):
    v = use_positions[k]
    if not v:
        continue
    for suf in v:
        tag = k + suf
        print("#define %s USE(%s)" % (tag, tag))
print('#pragma GCC diagnostic pop')
print("")

import packTab

# Only the category tag of each code point goes into the packed table.
data = {cp: entry[0] for cp, entry in use_data.items()}

DEFAULT = 5
COMPACT = 9
# The table is emitted twice, once per compression level; the preprocessor
# line printed before each variant makes the C compiler pick one of them.
guards = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#else',
}
for compression, guard in guards.items():

    logging.info(' Compression=%d:' % compression)
    print()
    print(guard)
    print()

    code = packTab.Code('hb_use')
    sol = packTab.pack_table(data, compression=compression, default='O')
    logging.info(' FullCost=%d' % (sol.fullCost))
    sol.genCode(code, 'get_category')
    code.print_c(linkage='static inline')
    print()

print('#endif')

# Undefine every short macro that was defined for the table, then close the
# include guard.
print()
for k in sorted(use_mapping):
    if use_positions.get(k):
        continue
    print("#undef %s" % k)
for k in sorted(use_positions):
    v = use_positions[k]
    if not v:
        continue
    for suf in v:
        tag = k + suf
        print("#undef %s" % tag)
print("\n")
print("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
print("/* == End of generated table == */")