#!/usr/bin/python

"""Generator of the function to prohibit certain vowel sequences.

It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
circles into sequences prohibited by the USE script development spec.
This function should be used as the ``preprocess_text`` of an
``hb_ot_complex_shaper_t``.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import io
import itertools
import sys

# The HTMLParser import doubles as the Python 2 / Python 3 probe: the module
# name that resolves decides which ``write`` implementation is defined.
try:
    from HTMLParser import HTMLParser

    def write (s):
        """Write the text ``s`` to standard output as UTF-8, without a newline."""
        print (s.encode ('utf-8'), end='')
except ImportError:
    from html.parser import HTMLParser

    def write (s):
        """Write the text ``s`` to standard output as UTF-8, without a newline."""
        # Flush text already queued by print () so that the bytes written
        # straight to the underlying binary buffer stay in order.
        sys.stdout.flush ()
        sys.stdout.buffer.write (s.encode ('utf-8'))

# Exactly two arguments are required: the invalid-cluster list (argv[1]) and
# Scripts.txt (argv[2]).  Anything else prints the usage line and exits 1.
if len (sys.argv) != 3:
    print ('usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt', file=sys.stderr)
    sys.exit (1)

# Parse the file named by argv[2] into:
#   scripts_header -- its first two lines, echoed later in the generated banner
#   scripts        -- code point (int) -> script name
#   script_order   -- script name -> start of the first range listed for it,
#                     used later as the sort key for the generated cases
scripts = {}
script_order = {}
with io.open (sys.argv[2], encoding='utf-8') as f:
    scripts_header = [f.readline () for _ in range (2)]
    for line in f:
        # Everything from '#' onwards is a comment.
        fields = [field.strip () for field in line.split ('#', 1)[0].split (';')]
        if len (fields) == 1:
            # No ';' separator: blank or comment-only line.
            continue
        # The first field is either a single code point or "START..END".
        bounds = fields[0].split ('..')
        start = int (bounds[0], 16)
        end = int (bounds[1], 16) if len (bounds) > 1 else start
        script = fields[1]
        for u in range (start, end + 1):
            scripts[u] = script
        script_order.setdefault (script, start)

class ConstraintSet (object):
    """A set of prohibited code point sequences.

    ``str ()`` of an instance yields the C statements that test the buffer
    against every sequence in the set.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        An empty ``constraint`` is ignored.  When one sequence is a prefix
        of another, only the shorter one is kept.

        Args:
            constraint (List[int]): A prohibited code point sequence.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                # The new sequence is a prefix of the stored one: keep the
                # shorter sequence.
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                # Neither is a prefix of the other: switch to the dictionary
                # form and let the branch below insert the new sequence.
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
            # Otherwise the stored sequence is a prefix of the new one and
            # nothing needs to change.
        if isinstance (self._c, dict):
            if first not in self._c or not rest:
                # Either a new first code point, or the sequence ends at an
                # existing key.  In the latter case it is a prefix of
                # everything stored under that key and replaces it;
                # recursing with an empty remainder would silently drop it.
                self._c[first] = ConstraintSet (rest)
            else:
                self._c[first].add (rest)

    @staticmethod
    def _indent (depth):
        """Return the leading whitespace for generated code at ``depth``."""
        # Two columns per level, every 8 columns folded into a tab.
        # NOTE(review): the whitespace inside these two literals had been
        # collapsed to a single space each, giving one tab per level. That
        # contradicts the hard-coded '\t' the script prints around the
        # depth-4 body. Confirm against the upstream file.
        return ('  ' * depth).replace ('        ', '\t')

    def __str__ (self, index=0, depth=4):
        """Return the C code that checks this set.

        Args:
            index (int): Offset from the buffer's current position of the
                code point this node tests.
            depth (int): Indentation level of the emitted code.
        """
        if isinstance (self._c, list):
            return self._sequence_code (index, depth)
        return self._switch_code (index, depth)

    def _sequence_code (self, index, depth):
        """Emit the check for the list form (one remaining sequence)."""
        s = []
        indent = self._indent (depth)
        if len (self._c) == 0:
            assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
            s.append ('{}matched = true;\n'.format (indent))
        elif len (self._c) == 1:
            assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
            s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, self._c[0], index or ''))
        else:
            # `index or ''` prints `cur ()` instead of `cur (0)`.
            s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
            if index:
                s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
            last = len (self._c) - 1
            for i, cp in enumerate (self._c[1:], start=1):
                s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                    self._indent (depth + 2), cp, index + i, ')' if i == last else ' &&'))
            s.append ('{}{{\n'.format (indent))
            # One next_glyph () per position up to and including `index`,
            # then the dotted circle.
            for i in range (index + 1):
                s.append ('{}buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
            s.append ('{}_output_dotted_circle (buffer);\n'.format (self._indent (depth + 1)))
            s.append ('{}}}\n'.format (indent))
        return ''.join (s)

    def _switch_code (self, index, depth):
        """Emit the `switch` for the dictionary form."""
        s = []
        indent = self._indent (depth)
        s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format (indent, index or ''))
        s.append ('{}{{\n'.format (indent))
        # Group code points whose sub-sets generate identical code so that
        # they share one body under several `case` labels.
        cases = collections.defaultdict (set)
        for first, rest in sorted (self._c.items ()):
            cases[rest.__str__ (index + 1, depth + 2)].add (first)
        # Order the groups by their smallest label; print four labels per line.
        for body, labels in sorted (cases.items (), key=lambda b_ls: min (b_ls[1])):
            for i, cp in enumerate (sorted (labels)):
                if i % 4 == 0:
                    s.append (self._indent (depth + 1))
                else:
                    s.append (' ')
                s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
            if len (labels) % 4 != 0:
                s.append ('\n')
            s.append (body)
            s.append ('{}break;\n'.format (self._indent (depth + 2)))
        s.append ('{}}}\n'.format (indent))
        return ''.join (s)

# Parse the file named by argv[1] into:
#   constraints_header -- the stripped lines before the first bare '#' line,
#                         echoed later in the generated banner
#   constraints        -- script name -> ConstraintSet of the prohibited
#                         sequences whose first code point is in that script
constraints = {}
with io.open (sys.argv[1], encoding='utf-8') as f:
    constraints_header = []
    for line in f:
        line = line.strip ()
        if line == '#':
            break
        constraints_header.append (line)
    else:
        # End of file reached without the terminator.  Fail here rather than
        # loop forever on the empty strings returned at EOF.
        raise ValueError ('{}: no bare "#" line terminating the header'.format (sys.argv[1]))
    # The same iterator continues with the line after the terminator.
    for line in f:
        # Drop the comment, keep the first ';'-separated field, and read it
        # as whitespace-separated hexadecimal code points.
        sequence = line.split ('#', 1)[0].split (';')[0].split ()
        constraint = [int (cp, 16) for cp in sequence]
        if not constraint:
            continue
        # Explicit raise rather than assert, so the check survives `python -O`.
        if len (constraint) < 2:
            raise ValueError ('Prohibited sequence is too short: {}'.format (constraint))
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add (constraint)
        else:
            constraints[script] = ConstraintSet (constraint)
if not constraints:
    raise ValueError ('No constraints found')

# Banner of the generated file: how it was produced, followed by the headers
# of the two input files.
# NOTE(review): runs of spaces inside the literals below may have been
# collapsed when this file was extracted; they are kept exactly as found.
print ('/* == Start of generated functions == */')
print ('/*')
print (' * The following functions are generated by running:')
print (' *')
print (' * %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
for line in constraints_header:
    print (' * %s' % line.strip ())
print (' *')
for line in scripts_header:
    print (' * %s' % line.strip ())
print (' */')

# Fixed C text that precedes the per-script cases: includes, the two helper
# functions, and the opening of _hb_preprocess_text_vowel_constraints up to
# the `switch` on the buffer's script.
# NOTE(review): leading whitespace inside the literals below looks collapsed
# by the extraction of this file (every indented C line starts with a single
# space); it is kept exactly as found.  Confirm against the upstream file
# before regenerating checked-in output.
print ()
print ('#include "hb.hh"')
print ()
print ('#ifndef HB_NO_OT_SHAPE')
print ()
print ('#include "hb-ot-shape-complex-vowel-constraints.hh"')
print ()
print ('static void')
print ('_output_dotted_circle (hb_buffer_t *buffer)')
print ('{')
print (' hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);')
print (' _hb_glyph_info_reset_continuation (&dottedcircle);')
print ('}')
print ()
print ('static void')
print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
print ('{')
print (' _output_dotted_circle (buffer);')
print (' buffer->next_glyph ();')
print ('}')
print ()

print ('void')
print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,')
print ('\t\t\t\t hb_buffer_t *buffer,')
print ('\t\t\t\t hb_font_t *font HB_UNUSED)')
print ('{')
print ('#ifdef HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS')
print (' return;')
print ('#endif')
print (' if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)')
print (' return;')
print ()
print (' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
print (' * vowel-sequences that look like another vowel. Data for each script')
print (' * collected from the USE script development spec.')
print (' *')
print (' * https://github.com/harfbuzz/harfbuzz/issues/1019')
print (' */')
print (' bool processed = false;')
print (' buffer->clear_output ();')
print (' unsigned int count = buffer->len;')
print (' switch ((unsigned) buffer->props.script)')
print (' {')

# One `case` per script, ordered by script_order (the start of the first
# range listed for the script in Scripts.txt).  The loop variable has its own
# name so that it does not rebind `constraints`, the dictionary being
# iterated.
# NOTE(review): leading whitespace inside the literals below looks collapsed
# by the extraction of this file; it is kept exactly as found.
for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
    print (' case HB_SCRIPT_{}:'.format (script.upper ()))
    print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
    print (' {')
    print ('\tbool matched = false;')
    # The generated checks go through write (); the surrounding text through print ().
    write (str (constraint_set))
    print ('\tbuffer->next_glyph ();')
    print ('\tif (matched) _output_with_dotted_circle (buffer);')
    print (' }')
    print (' processed = true;')
    print (' break;')
    print ()

# Close the switch and the function, then the HB_NO_OT_SHAPE guard.
print (' default:')
print (' break;')
print (' }')
print (' if (processed)')
print (' {')
print (' if (buffer->idx < count)')
print (' buffer->next_glyph ();')
print (' buffer->swap_buffers ();')
print (' }')
print ('}')

print ()
print ()
print ('#endif')
print ('/* == End of generated functions == */')