#!/usr/bin/python

"""Generator of the function to prohibit certain vowel sequences.

It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
circles into sequences prohibited by the USE script development spec.
This function should be used as the ``preprocess_text`` of an
``hb_ot_complex_shaper_t``.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import io
import itertools
import sys

# The ``HTMLParser`` module name only resolves on Python 2 (Python 3 moved it
# to ``html.parser``), so which import succeeds also selects the ``write``
# implementation that suits the running interpreter.
try:
    from HTMLParser import HTMLParser

    def write (s):
        """Write the text ``s`` to standard output as UTF-8, with no newline added."""
        print (s.encode ('utf-8'), end='')
except ImportError:
    from html.parser import HTMLParser

    def write (s):
        """Write the text ``s`` to standard output as UTF-8, with no newline added."""
        # Flush whatever print () has queued in the text layer first, so the
        # raw bytes written to the underlying buffer come out after it.
        sys.stdout.flush ()
        sys.stdout.buffer.write (s.encode ('utf-8'))

# Exactly two positional arguments are required: the prohibited-sequence list
# followed by the Unicode Scripts.txt file. Anything else gets the usage line
# on stderr and a non-zero exit status.
if len (sys.argv) != 3:
    print ('usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt', file=sys.stderr)
    sys.exit (1)

# Parse the script data file named by the second argument.
#
# Results (module-level names used further down):
#   scripts_header -- the file's first two lines, kept verbatim so they can be
#                     echoed into the header comment of the generated source.
#   scripts        -- maps every listed code point (int) to its script name.
#   script_order   -- maps each script name to the first code point of the
#                     first range seen for it; used to order the output.
with io.open (sys.argv[2], encoding='utf-8') as f:
    scripts_header = [f.readline () for _ in range (2)]
    scripts = {}
    script_order = {}
    for line in f:
        # Discard a trailing ``#`` comment, if any.
        line = line.partition ('#')[0]
        fields = [x.strip () for x in line.split (';')]
        if len (fields) == 1:
            # No ``;`` separator: blank or comment-only line.
            continue
        # The first field is either a single code point or ``first..last``.
        first, _, last = fields[0].partition ('..')
        start = int (first, 16)
        end = int (last, 16) if last else start
        script = fields[1]
        for u in range (start, end + 1):
            scripts[u] = script
        # Only the first range of each script determines its ordering key.
        script_order.setdefault (script, start)

class ConstraintSet (object):
    """A set of prohibited code point sequences.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        A sequence that is a prefix of one already present replaces it, and a
        sequence that merely extends one already present is ignored. Adding
        an empty sequence is a no-op.

        Args:
            constraint (List[int]): A prohibited code point sequence.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                # The new sequence is a prefix of the stored one: keep the
                # shorter sequence only.
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                # The two sequences diverge: switch to the dictionary form
                # and let the code below file the new sequence.
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
        if isinstance (self._c, dict):
            if rest and first in self._c:
                self._c[first].add (rest)
            else:
                # Either a new branch, or the new sequence ends exactly at an
                # existing branch. In the latter case it is a prefix of
                # everything stored below that branch and supersedes it, the
                # same way the list form handles prefixes. (Delegating an
                # empty remainder to the child would silently drop it.)
                self._c[first] = ConstraintSet (rest)

    @staticmethod
    def _indent (depth):
        """Return the leading whitespace for generated code at nesting ``depth``."""
        # NOTE(review): both string literals are reproduced exactly as they
        # appear in the reviewed copy of this file, which shows signs of
        # collapsed whitespace; confirm them against the canonical source
        # before regenerating any output.
        return (' ' * depth).replace (' ', '\t')

    def __str__ (self, index=0, depth=4):
        """Return the generated C code that tests for this set's sequences.

        Args:
            index (int): Offset from the buffer's current position of the
                code point this set starts matching at.
            depth (int): Nesting depth, passed to ``_indent``.

        Raises:
            AssertionError: If the set would need the ``matched`` flag at an
                offset for which no code generation has been implemented.
        """
        if isinstance (self._c, list):
            return self._sequence_code (index, depth)
        return self._switch_code (index, depth)

    def _sequence_code (self, index, depth):
        """Generate the test for the list form (a single sequence)."""
        unimplemented = 'Cannot use `matched` for this constraint; the general case has not been implemented'
        indent = self._indent (depth)
        if not self._c:
            # Nothing left to compare: reaching this point is the match.
            assert index == 2, unimplemented
            return '{}matched = true;\n'.format (indent)
        if len (self._c) == 1:
            assert index == 1, unimplemented
            return '{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, self._c[0], index or '')
        condition_indent = self._indent (depth + 2)
        body_indent = self._indent (depth + 1)
        last = len (self._c) - 1
        s = ['{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or '')]
        if index:
            s.append ('{}buffer->idx + {} < count &&\n'.format (condition_indent, index + 1))
        for i, cp in enumerate (self._c[1:], start=1):
            s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                condition_indent, cp, index + i, ')' if i == last else ' &&'))
        s.append ('{}{{\n'.format (indent))
        # One next_glyph () call per position up to and including ``index``,
        # then the dotted circle.
        s.extend (['{}buffer->next_glyph ();\n'.format (body_indent)] * (index + 1))
        s.append ('{}_output_dotted_circle (buffer);\n'.format (body_indent))
        s.append ('{}}}\n'.format (indent))
        return ''.join (s)

    def _switch_code (self, index, depth):
        """Generate the ``switch`` for the dictionary form (several sequences)."""
        indent = self._indent (depth)
        s = [
            '{}switch (buffer->cur ({}).codepoint)\n'.format (indent, index or ''),
            '{}{{\n'.format (indent),
        ]
        # Code points whose continuations generate identical code share one
        # group of case labels.
        cases = collections.defaultdict (set)
        for first in sorted (self._c):
            cases[self._c[first].__str__ (index + 1, depth + 2)].add (first)
        for body, labels in sorted (cases.items (), key=lambda b_ls: min (b_ls[1])):
            # Case labels are laid out four to a line.
            for i, cp in enumerate (sorted (labels)):
                s.append (self._indent (depth + 1) if i % 4 == 0 else ' ')
                s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
            if len (labels) % 4 != 0:
                s.append ('\n')
            s.append (body)
            s.append ('{}break;\n'.format (self._indent (depth + 2)))
        s.append ('{}}}\n'.format (indent))
        return ''.join (s)

# Parse the prohibited-sequence file named by the first argument.
#
# Results (module-level names used further down):
#   constraints_header -- the stripped lines that precede the first line
#                         consisting of a lone ``#``; echoed into the header
#                         comment of the generated source.
#   constraints        -- maps a script name to the ConstraintSet holding
#                         every prohibited sequence whose first code point
#                         belongs to that script.
constraints = {}
with io.open (sys.argv[1], encoding='utf-8') as f:
    constraints_header = []
    # readline () returns '' only at end of file, so a file without a lone
    # ``#`` line ends the header there instead of looping forever.
    for raw in iter (f.readline, ''):
        line = raw.strip ()
        if line == '#':
            break
        constraints_header.append (line)
    for line in f:
        # Discard a trailing ``#`` comment, then read the space-separated
        # hexadecimal code points in front of the first ``;``.
        line = line.partition ('#')[0]
        constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
        if not constraint:
            continue
        assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
        # A sequence is filed under the script of its first code point.
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add (constraint)
        else:
            constraints[script] = ConstraintSet (constraint)
    assert constraints, 'No constraints found'

# Emit the generated C++ source on standard output.
#
# NOTE(review): every string literal below is reproduced exactly as it appears
# in the reviewed copy of this file. That copy shows signs of collapsed
# whitespace (for example, single leading spaces where nested C code would
# normally be indented further); confirm the indentation against the canonical
# source before committing regenerated output.

# Banner comment recording how the output was produced and from which inputs.
print ('/* == Start of generated functions == */')
print ('/*')
print (' * The following functions are generated by running:')
print (' *')
print (' * %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
for line in constraints_header:
    print (' * %s' % line.strip ())
print (' *')
for line in scripts_header:
    print (' * %s' % line.strip ())
print (' */')

# Includes and the two small helpers the generated matching code calls.
print ()
print ('#include "hb.hh"')
print ()
print ('#ifndef HB_NO_OT_SHAPE')
print ()
print ('#include "hb-ot-shape-complex-vowel-constraints.hh"')
print ()
print ('static void')
print ('_output_dotted_circle (hb_buffer_t *buffer)')
print ('{')
print (' hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);')
print (' _hb_glyph_info_reset_continuation (&dottedcircle);')
print ('}')
print ()
print ('static void')
print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
print ('{')
print (' _output_dotted_circle (buffer);')
print (' buffer->next_glyph ();')
print ('}')
print ()

# Opening of the public entry point, up to the per-script switch.
print ('void')
print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,')
print ('\t\t\t\t hb_buffer_t *buffer,')
print ('\t\t\t\t hb_font_t *font HB_UNUSED)')
print ('{')
print ('#ifdef HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS')
print (' return;')
print ('#endif')
print (' if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)')
print (' return;')
print ()
print (' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
print (' * vowel-sequences that look like another vowel. Data for each script')
print (' * collected from the USE script development spec.')
print (' *')
print (' * https://github.com/harfbuzz/harfbuzz/issues/1019')
print (' */')
print (' bool processed = false;')
print (' buffer->clear_output ();')
print (' unsigned int count = buffer->len;')
print (' switch ((unsigned) buffer->props.script)')
print (' {')

# One ``case`` per script, ordered by the script's first code point in the
# script data file. The loop variable has its own name so that the
# module-level ``constraints`` dictionary is not rebound while iterating.
for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
    print (' case HB_SCRIPT_{}:'.format (script.upper ()))
    print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
    print (' {')
    print ('\tbool matched = false;')
    # The matching code goes through write () rather than print () so that
    # no newline is appended to it.
    write (str (constraint_set))
    print ('\tbuffer->next_glyph ();')
    print ('\tif (matched) _output_with_dotted_circle (buffer);')
    print (' }')
    print (' processed = true;')
    print (' break;')
    print ()

# Closing of the switch and of the function.
print (' default:')
print (' break;')
print (' }')
print (' if (processed)')
print (' {')
print (' if (buffer->idx < count)')
print (' buffer->next_glyph ();')
print (' buffer->swap_buffers ();')
print (' }')
print ('}')

print ()
print ()
print ('#endif')
print ('/* == End of generated functions == */')