David Corbett | 205737a | 2018-10-12 16:54:54 -0400 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | |
| 3 | """Generator of the function to prohibit certain vowel sequences. |
| 4 | |
Behdad Esfahbod | 6d40eb8 | 2018-10-23 02:51:42 -0700 | [diff] [blame] | 5 | It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted |
David Corbett | 205737a | 2018-10-12 16:54:54 -0400 | [diff] [blame] | 6 | circles into sequences prohibited by the USE script development spec. |
| 7 | This function should be used as the ``preprocess_text`` of an |
| 8 | ``hb_ot_complex_shaper_t``. |
David Corbett | 205737a | 2018-10-12 16:54:54 -0400 | [diff] [blame] | 9 | """ |
| 10 | |
| 11 | from __future__ import absolute_import, division, print_function, unicode_literals |
| 12 | |
| 13 | import collections |
try:
    # Legacy top-level module name; absent on interpreters that only ship
    # ``html.parser``.
    from HTMLParser import HTMLParser
except ImportError:
    from html.parser import HTMLParser
    def write (s):
        """Write the text ``s`` to standard output as UTF-8, adding no newline."""
        # Flush the text layer first so output already passed to print ()
        # is not emitted after the bytes written straight to the buffer.
        sys.stdout.flush ()
        encoded = s.encode ('utf-8')
        sys.stdout.buffer.write (encoded)
else:
    def write (s):
        """Write the text ``s`` to standard output as UTF-8, adding no newline."""
        print (s.encode ('utf-8'), end='')
| 23 | import itertools |
| 24 | import io |
| 25 | import sys |
| 26 | |
# Exactly two arguments are required: the constraints file, then Scripts.txt.
if len (sys.argv) != 3:
    print ('usage: ./gen-vowel-constraints.py HBIndicVowelConstraints.txt Scripts.txt', file=sys.stderr)
    sys.exit (1)

# Read the second input file.
#   scripts_header: its first two lines, kept verbatim for the generated header.
#   scripts:        code point -> script name.
#   script_order:   script name -> first code point of the first range seen
#                   for that script.
with io.open (sys.argv[2], encoding='utf-8') as f:
    scripts_header = [f.readline () for _ in range (2)]
    scripts = {}
    script_order = {}
    for raw_line in f:
        # Everything from the first '#' onwards is a comment.
        data = raw_line.partition ('#')[0]
        fields = [field.strip () for field in data.split (';')]
        if len (fields) == 1:
            # No ';' separator on this line: nothing to record.
            continue
        # The first field is either a single code point or "START..END".
        bounds = fields[0].split ('..')
        start = int (bounds[0], 16)
        end = int (bounds[1], 16) if len (bounds) > 1 else start
        script = fields[1]
        scripts.update (dict.fromkeys (range (start, end + 1), script))
        script_order.setdefault (script, start)
| 53 | |
class ConstraintSet (object):
    """A set of prohibited code point sequences.

    A prohibited sequence also covers every longer sequence that starts
    with it, so only the shortest of such a family is kept.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        Args:
            constraint (List[int]): A prohibited code point sequence. An
                empty sequence is ignored.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                # The new sequence is a prefix of the stored one: keep the
                # shorter sequence, which covers the longer one.
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                # The sequences diverge: switch to the dictionary form so
                # that both can be stored.
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
            # Otherwise the stored sequence is a prefix of the new one and
            # already covers it.
        if isinstance (self._c, dict):
            if first not in self._c or not rest:
                # Either a new branch, or the new sequence ends here. In the
                # latter case it covers every longer sequence stored under
                # ``first``, so the branch becomes the empty (always
                # matching) set, mirroring the prefix rule of the list form.
                self._c[first] = ConstraintSet (rest)
            else:
                self._c[first].add (rest)

    def _indent (self, depth):
        """Return the leading whitespace for generated code at ``depth``."""
        # NOTE(review): the whitespace inside these two literals may have
        # been collapsed in this copy of the file -- verify against the
        # canonical source before regenerating.
        return (' ' * depth).replace (' ', '\t')

    def __str__ (self, index=0, depth=4):
        """Return the C code that tests the buffer against this set.

        Args:
            index (int): Offset passed to ``buffer->cur`` for the first code
                point tested by this set.
            depth (int): Indentation depth of the generated code.
        """
        s = []
        indent = self._indent (depth)
        if isinstance (self._c, list):
            if len (self._c) == 0:
                # Nothing left to test: the sequence has matched.
                s.append ('{}matched = true;\n'.format (indent))
            elif len (self._c) == 1:
                # ``index or ''`` emits ``cur ()`` rather than ``cur (0)``.
                s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, next (iter (self._c)), index or ''))
            else:
                s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index))
                s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), len (self._c)))
                last = len (self._c) - 1
                for i, cp in enumerate (self._c[1:], start=1):
                    # The final comparison closes the ``if`` condition.
                    s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                        self._indent (depth + 2), cp, index + i, ')' if i == last else ' &&'))
                s.append ('{}{{\n'.format (indent))
                # One ``next_glyph`` call per code point of the sequence.
                s.extend (['{}buffer->next_glyph ();\n'.format (self._indent (depth + 1))] * len (self._c))
                s.append ('{}buffer->output_glyph (0x25CCu);\n'.format (self._indent (depth + 1)))
                s.append ('{}}}\n'.format (indent))
        else:
            s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
            s.append ('{}{{\n'.format (indent))
            # Group the code points whose generated bodies are identical so
            # that they share one run of ``case`` labels.
            cases = collections.defaultdict (set)
            for first, rest in sorted (self._c.items ()):
                cases[rest.__str__ (index + 1, depth + 2)].add (first)
            # Order the groups by their smallest label.
            for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
                # Emit at most four ``case`` labels per line.
                for i, cp in enumerate (sorted (labels)):
                    if i % 4 == 0:
                        s.append (self._indent (depth + 1))
                    else:
                        s.append (' ')
                    s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
                if len (labels) % 4 != 0:
                    s.append ('\n')
                s.append (body)
                s.append ('{}break;\n'.format (self._indent (depth + 2)))
            s.append ('{}}}\n'.format (indent))
        return ''.join (s)
| 128 | |
# Read the first input file.
#   constraints_header: its first two lines, stripped, for the generated header.
#   constraints:        script name -> ConstraintSet of the prohibited
#                       sequences whose first code point is in that script.
constraints = {}
with io.open (sys.argv[1], encoding='utf-8') as f:
    constraints_header = [f.readline ().strip () for i in range (2)]
    for line in f:
        # Everything from the first '#' onwards is a comment.
        j = line.find ('#')
        if j >= 0:
            line = line[:j]
        # The first ';'-separated field holds the hexadecimal code points.
        constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
        if not constraint: continue
        # Validate with explicit exceptions: ``assert`` statements are
        # removed when Python runs with -O, which would let bad input through.
        if len (constraint) < 2:
            raise ValueError ('Prohibited sequence is too short: {}'.format (constraint))
        try:
            script = scripts[constraint[0]]
        except KeyError:
            raise ValueError ('No script found for U+{:04X} in sequence: {}'.format (constraint[0], constraint))
        if script in constraints:
            constraints[script].add (constraint)
        else:
            constraints[script] = ConstraintSet (constraint)
if not constraints:
    raise ValueError ('No constraints found')
David Corbett | 205737a | 2018-10-12 16:54:54 -0400 | [diff] [blame] | 145 | |
# Emit the generated C source on standard output.
#
# NOTE(review): runs of spaces inside the string literals below may have been
# collapsed in this copy of the file -- verify the indentation of the emitted
# C code against the canonical source before regenerating.

# Header comment recording how the output was produced.
print ('/* == Start of generated functions == */')
print ('/*')
print (' * The following functions are generated by running:')
print (' *')
# Name the same input files as the usage message does.
print (' * %s HBIndicVowelConstraints.txt Scripts.txt' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
for line in constraints_header:
    print (' * %s' % line.strip ())
print (' *')
for line in scripts_header:
    print (' * %s' % line.strip ())
print (' */')
print ()
print ('#include "hb-ot-shape-complex-vowel-constraints.hh"')
print ()

# Helper that outputs a dotted circle followed by the current glyph.
print ('static void')
print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
print ('{')
print (' hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);')
print (' _hb_glyph_info_reset_continuation (&dottedcircle);')
print ()
print (' buffer->next_glyph ();')
print ('}')
print ()

# Opening of the preprocessing function and its per-script switch.
print ('void')
print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan,')
print ('\t\t\t\t hb_buffer_t *buffer,')
print ('\t\t\t\t hb_font_t *font)')
print ('{')
print (' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
print (' * vowel-sequences that look like another vowel. Data for each script')
print (' * collected from the USE script development spec.')
print (' *')
print (' * https://github.com/harfbuzz/harfbuzz/issues/1019')
print (' */')
print (' bool processed = false;')
print (' buffer->clear_output ();')
print (' unsigned int count = buffer->len;')
print (' switch ((unsigned) buffer->props.script)')
print (' {')

# One ``case`` per script, ordered by ``script_order``. The loop variable has
# its own name so that the ``constraints`` dictionary is not rebound.
for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
    print (' case HB_SCRIPT_{}:'.format (script.upper ()))
    print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
    print (' {')
    print ('\tbool matched = false;')
    write (str (constraint_set))
    print ('\tbuffer->next_glyph ();')
    print ('\tif (matched) _output_with_dotted_circle (buffer);')
    print (' }')
    print (' processed = true;')
    print (' break;')
    print ()

# Default case and the function epilogue.
print (' default:')
print (' break;')
print (' }')
print (' if (processed)')
print (' {')
print (' if (buffer->idx < count)')
print (' buffer->next_glyph ();')
print (' if (likely (buffer->successful))')
print (' buffer->swap_buffers ();')
print (' }')
print ('}')

print ()
print ('/* == End of generated functions == */')