[use] Prohibit visually ambiguous vowel sequences

commit: 205737acdc268b1c90cf00bde2d2038519a8bf48 [log] [tgz]
author: David Corbett <corbett.dav@husky.neu.edu> Fri Oct 12 16:54:54 2018 -0400
committer: Behdad Esfahbod <behdad@behdad.org> Tue Oct 23 02:25:08 2018 -0700
tree: 81dd69ffebde25e9b6d08726f985746fdaf3b452
parent: 48ed598a356983f4623029dd5e87254fb59e3691 [diff]
diff --git a/src/Makefile.am b/src/Makefile.am
index e0ea1c5..782992d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am

@@ -295,6 +295,7 @@
 	gen-os2-unicode-ranges.py \
 	gen-tag-table.py \
 	gen-use-table.py \
+	gen-vowel-constraints.py \
 	$(NULL)
 EXTRA_DIST += $(GENERATORS)
 
@@ -316,13 +317,17 @@
 	$(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-ot-shape-complex-use-table.cc \
 	|| ($(RM) $(srcdir)/hb-ot-shape-complex-use-table.cc; false)
 
+vowel-constraints: gen-vowel-constraints.py use Scripts.txt
+	$(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-ot-shape-complex-vowel-constraints.hh \
+	|| ($(RM) $(srcdir)/hb-ot-shape-complex-vowel-constraints.hh; false)
+
 emoji-table: gen-emoji-table.py emoji-data.txt
 	$(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-unicode-emoji-table.hh \
 	|| ($(RM) $(srcdir)/hb-unicode-emoji-table.hh; false)
 
 built-sources: $(BUILT_SOURCES)
 
-.PHONY: unicode-tables arabic-table indic-table tag-table use-table emoji-table built-sources
+.PHONY: unicode-tables arabic-table indic-table tag-table use-table vowel-constraints emoji-table built-sources
 
 RAGEL_GENERATED = \
 	$(patsubst %,$(srcdir)/%,$(HB_BASE_RAGEL_GENERATED_sources)) \

diff --git a/src/Makefile.sources b/src/Makefile.sources
index eed245b..b302910 100644
--- a/src/Makefile.sources
+++ b/src/Makefile.sources

@@ -142,6 +142,7 @@
 	hb-ot-shape-complex-use.cc \
 	hb-ot-shape-complex-use.hh \
 	hb-ot-shape-complex-use-table.cc \
+	hb-ot-shape-complex-vowel-constraints.hh \
 	hb-ot-shape-complex.hh \
 	hb-ot-shape-normalize.hh \
 	hb-ot-shape-normalize.cc \

diff --git a/src/gen-vowel-constraints.py b/src/gen-vowel-constraints.py
new file mode 100755
index 0000000..bcb5d27
--- /dev/null
+++ b/src/gen-vowel-constraints.py

@@ -0,0 +1,286 @@
+#!/usr/bin/python
+
+"""Generator of the function to prohibit certain vowel sequences.
+
+It creates ``preprocess_text_vowel_constraints``, which inserts dotted
+circles into sequences prohibited by the USE script development spec.
+This function should be used as the ``preprocess_text`` of an
+``hb_ot_complex_shaper_t``.
+
+It also creates the helper function ``_output_with_dotted_circle``.
+"""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+try:
+	from HTMLParser import HTMLParser
+	def write (s):
+		print (s.encode ('utf-8'), end='')
+except ImportError:
+	from html.parser import HTMLParser
+	def write (s):
+		sys.stdout.flush ()
+		sys.stdout.buffer.write (s.encode ('utf-8'))
+import itertools
+import io
+import sys
+
+if len (sys.argv) != 3:
+	print ('usage: ./gen-vowel-constraints.py use Scripts.txt', file=sys.stderr)
+	sys.exit (1)
+
+try:
+	from html import unescape
+	def html_unescape (parser, entity):
+		return unescape (entity)
+except ImportError:
+	def html_unescape (parser, entity):
+		return parser.unescape (entity)
+
+def expect (condition, message=None):
+	"""Raise AssertionError (carrying message, when given) unless condition holds."""
+	if condition:
+		return
+	raise AssertionError if message is None else AssertionError (message)
+
+with io.open (sys.argv[2], encoding='utf-8') as f:
+	scripts_header = [f.readline () for i in range (2)]
+	scripts = {}
+	script_order = {}
+	for line in f:
+		j = line.find ('#')
+		if j >= 0:
+			line = line[:j]
+		fields = [x.strip () for x in line.split (';')]
+		if len (fields) == 1:
+			continue
+		uu = fields[0].split ('..')
+		start = int (uu[0], 16)
+		if len (uu) == 1:
+			end = start
+		else:
+			end = int (uu[1], 16)
+		script = fields[1]
+		for u in range (start, end + 1):
+			scripts[u] = script
+		if script not in script_order:
+			script_order[script] = start
+
+class ConstraintSet (object):
+	"""A set of prohibited code point sequences.
+
+	Args:
+		constraint (List[int]): A prohibited code point sequence.
+
+	"""
+	def __init__ (self, constraint):
+		# Either a list or a dictionary. As a list of code points, it
+		# represents a prohibited code point sequence. As a dictionary,
+		# it represents a set of prohibited sequences, where each item
+		# represents the set of prohibited sequences starting with the
+		# key (a code point) concatenated with any of the values
+		# (ConstraintSets).
+		self._c = constraint
+
+	def add (self, constraint):
+		"""Add a constraint to this set."""
+		if not constraint:
+			return
+		first = constraint[0]
+		rest = constraint[1:]
+		if isinstance (self._c, list):
+			if constraint == self._c[:len (constraint)]:
+				self._c = constraint
+			elif self._c != constraint[:len (self._c)]:
+				self._c = {self._c[0]: ConstraintSet (self._c[1:])}
+		if isinstance (self._c, dict):
+			if first in self._c:
+				self._c[first].add (rest)
+			else:
+				self._c[first] = ConstraintSet (rest)
+
+	def _indent (self, depth):
+		return ('  ' * depth).replace ('        ', '\t')
+
+	def __str__ (self, index=0, depth=4):
+		"""Return C++ code testing this set, whose first code point is at ``buffer->cur (index)``."""
+		s = []
+		indent = self._indent (depth)
+		if isinstance (self._c, list):
+			if len (self._c) == 0:
+				s.append ('{}matched = true;\n'.format (indent))
+			elif len (self._c) == 1:
+				s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, next (iter (self._c)), index or ''))
+			else:
+				# Offset from buffer->idx of the last code point: bounds the look-ahead and counts the glyphs copied before the dotted circle.
+				last = index + len (self._c) - 1
+				s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index))
+				s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), last))
+				for i, cp in enumerate (self._c[1:], start=1):
+					s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
+						self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
+				s.append ('{}{{\n'.format (indent))
+				for i in range (last):
+					s.append ('{}buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
+				s.append ('{}buffer->output_glyph (0x25CCu);\n'.format (self._indent (depth + 1)))
+				s.append ('{}}}\n'.format (indent))
+		else:
+			s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
+			s.append ('{}{{\n'.format (indent))
+			cases = collections.defaultdict (set)
+			for first, rest in sorted (self._c.items ()):
+				cases[rest.__str__ (index + 1, depth + 2)].add (first)
+			for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
+				for i, cp in enumerate (sorted (labels)):
+					s.append (self._indent (depth + 1) if i % 4 == 0 else ' ')
+					s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
+				if len (labels) % 4 != 0:
+					s.append ('\n')
+				s.append (body)
+				s.append ('{}break;\n'.format (self._indent (depth + 2)))
+			s.append ('{}}}\n'.format (indent))
+		return ''.join (s)
+
+class USESpecParser (HTMLParser):
+	"""A parser for the USE script development spec.
+
+	Attributes:
+		header (str): The start tag of the spec's ``updated_at`` <meta> element.
+		constraints (Mapping[str, ConstraintSet]): A map of script names
+			to the scripts' prohibited sequences.
+	"""
+	def __init__ (self):
+		HTMLParser.__init__ (self)
+		self.header = ''
+		self.constraints = {}
+		# Whether the next <code> contains the vowel constraints.
+		self._primed = False
+		# Whether the parser is in the <code> element with the constraints.
+		self._in_constraints = False
+		# The text of the constraints.
+		self._constraints = ''
+
+	def handle_starttag (self, tag, attrs):
+		"""Record the <meta> timestamp tag and spot the <code> holding the constraints."""
+		if tag == 'meta':
+			# Keep the raw tag text; it is echoed into the generated file's header comment.
+			if ('name', 'updated_at') in attrs:
+				self.header = self.get_starttag_text ()
+		elif tag == 'a':
+			# This anchor marks that the next <code> element holds the constraints.
+			if ('id', 'ivdvconstraints') in attrs:
+				self._primed = True
+		elif self._primed and tag == 'code':
+			# First <code> after the anchor: start collecting its text.
+			self._primed = False
+			self._in_constraints = True
+
+	def handle_endtag (self, tag):
+		self._in_constraints = False
+
+	def handle_data (self, data):
+		if self._in_constraints:
+			self._constraints += data
+
+	def handle_charref (self, name):
+		self.handle_data (html_unescape (self, '&#%s;' % name))
+
+	def handle_entityref (self, name):
+		self.handle_data (html_unescape (self, '&%s;' % name))
+
+	def parse (self, filename):
+		"""Feed the spec to the parser and group its constraints by script.
+
+		Args:
+			filename (str): Path to the USE script development spec.
+		"""
+		with io.open (filename, encoding='utf-8') as spec:
+			self.feed (spec.read ())
+		expect (self.header, 'No header found')
+		for entry in self._constraints.splitlines ():
+			sequence = [int (cp, 16) for cp in entry.partition (';')[0].strip ().split (' ')]
+			expect (len (sequence) >= 2, 'Prohibited sequence is too short: {}'.format (sequence))
+			script = scripts[sequence[0]]
+			if script not in self.constraints:
+				self.constraints[script] = ConstraintSet (sequence)
+			else:
+				self.constraints[script].add (sequence)
+		expect (self.constraints, 'No constraints found')
+
+use_parser = USESpecParser ()
+use_parser.parse (sys.argv[1])
+
+print ('/* == Start of generated functions == */')
+print ('/*')
+print (' * The following functions are generated by running:')
+print (' *')
+print (' *   %s use Scripts.txt' % sys.argv[0])
+print (' *')
+print (' * on files with these headers:')
+print (' *')
+print (' * %s' % use_parser.header.strip ())
+for line in scripts_header:
+	print (' * %s' % line.strip ())
+print (' */')
+print ()
+print ('#ifndef HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH')
+print ('#define HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH')
+print ()
+
+print ('static void')
+print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
+print ('{')
+print ('  hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);')
+print ('  _hb_glyph_info_reset_continuation (&dottedcircle);')
+print ()
+print ('  buffer->next_glyph ();')
+print ('}')
+print ()
+
+print ('static void')
+print ('preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan,')
+print ('\t\t\t\t   hb_buffer_t              *buffer,')
+print ('\t\t\t\t   hb_font_t                *font)')
+print ('{')
+print ('  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
+print ('   * vowel-sequences that look like another vowel.  Data for each script')
+print ('   * collected from the USE script development spec.')
+print ('   *')
+print ('   * https://github.com/harfbuzz/harfbuzz/issues/1019')
+print ('   */')
+print ('  bool processed = false;')
+print ('  buffer->clear_output ();')
+print ('  unsigned int count = buffer->len;')
+print ('  switch ((unsigned) buffer->props.script)')
+print ('  {')
+
+for script, constraints in sorted (use_parser.constraints.items (), key=lambda s_c: script_order[s_c[0]]):
+	print ('    case HB_SCRIPT_{}:'.format (script.upper ()))
+	print ('      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
+	print ('      {')
+	print ('\tbool matched = false;')
+	write (str (constraints))
+	print ('\tbuffer->next_glyph ();')
+	print ('\tif (matched) _output_with_dotted_circle (buffer);')
+	print ('      }')
+	print ('      processed = true;')
+	print ('      break;')
+	print ()
+
+print ('    default:')
+print ('      break;')
+print ('  }')
+print ('  if (processed)')
+print ('  {')
+print ('    if (buffer->idx < count)')
+print ('     buffer->next_glyph ();')
+print ('    if (likely (buffer->successful))')
+print ('      buffer->swap_buffers ();')
+print ('  }')
+print ('}')
+
+print ()
+print ('#endif /* HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH */')
+print ()
+print ('/* == End of generated functions == */')

diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc
index f1ae303..092ac68 100644
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc

@@ -25,6 +25,7 @@
  */
 
 #include "hb-ot-shape-complex-indic.hh"
+#include "hb-ot-shape-complex-vowel-constraints.hh"
 #include "hb-ot-layout.hh"
 
 
@@ -331,275 +332,6 @@
   free (data);
 }
 
-static void
-_output_with_dotted_circle (hb_buffer_t *buffer)
-{
-  hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);
-  _hb_glyph_info_reset_continuation (&dottedcircle);
-
-  buffer->next_glyph ();
-}
-
-static void
-preprocess_text_indic (const hb_ot_shape_plan_t *plan,
-		       hb_buffer_t              *buffer,
-		       hb_font_t                *font)
-{
-  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of
-   * vowel-sequences that look like another vowel.  Data for each script
-   * collected from Unicode 11 book, tables named "Vowel Letters" with
-   * "Use" and "Do Not Use" columns.
-   *
-   * https://github.com/harfbuzz/harfbuzz/issues/1019
-   */
-  bool processed = false;
-  buffer->clear_output ();
-  unsigned int count = buffer->len;
-  switch ((unsigned) buffer->props.script)
-  {
-    case HB_SCRIPT_DEVANAGARI:
-      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
-      {
-	bool matched = false;
-	switch (buffer->cur().codepoint)
-	{
-	  case 0x0905u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x093Au: case 0x093Bu: case 0x093Eu: case 0x0945u:
-	      case 0x0946u: case 0x0949u: case 0x094Au: case 0x094Bu:
-	      case 0x094Cu: case 0x094Fu: case 0x0956u: case 0x0957u:
-		matched = true;
-		break;
-	    }
-	    break;
-	  case 0x0906u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x093Au: case 0x0945u: case 0x0946u: case 0x0947u:
-	      case 0x0948u:
-		matched = true;
-		break;
-	    }
-	    break;
-	  case 0x0909u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x0941u:
-		matched = true;
-		break;
-	    }
-	    break;
-	  case 0x090Fu:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x0945u: case 0x0946u: case 0x0947u:
-		matched = true;
-		break;
-	    }
-	    break;
-	  case 0x0930u:
-	    if (0x094Du == buffer->cur(1).codepoint &&
-		buffer->idx + 2 < count &&
-	        0x0907u == buffer->cur(2).codepoint)
-	    {
-	      buffer->next_glyph ();
-	      buffer->next_glyph ();
-	      buffer->output_glyph (0x25CCu);
-	    }
-	    break;
-	}
-	buffer->next_glyph ();
-	if (matched) _output_with_dotted_circle (buffer);
-      }
-      processed = true;
-      break;
-
-    case HB_SCRIPT_BENGALI:
-      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
-      {
-	bool matched = false;
-	switch (buffer->cur().codepoint)
-	{
-	  case 0x0985u:
-	    matched = 0x09BE == buffer->cur(1).codepoint;
-	    break;
-	  case 0x098Bu:
-	    matched = 0x09C3 == buffer->cur(1).codepoint;
-	    break;
-	  case 0x098Cu:
-	    matched = 0x09E2 == buffer->cur(1).codepoint;
-	    break;
-	}
-	buffer->next_glyph ();
-	if (matched) _output_with_dotted_circle (buffer);
-      }
-      processed = true;
-      break;
-
-    case HB_SCRIPT_GURMUKHI:
-      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
-      {
-	bool matched = false;
-	switch (buffer->cur().codepoint)
-	{
-	  case 0x0A05u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x0A3Eu: case 0x0A48u: case 0x0A4Cu:
-		matched = true;
-		break;
-	    }
-	    break;
-	  case 0x0A72u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x0A3Fu: case 0x0A40u: case 0x0A47u:
-		matched = true;
-		break;
-	    }
-	    break;
-	  case 0x0A73u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x0A41u: case 0x0A42u: case 0x0A4Bu:
-		matched = true;
-		break;
-	    }
-	    break;
-	}
-	buffer->next_glyph ();
-	if (matched) _output_with_dotted_circle (buffer);
-      }
-      processed = true;
-      break;
-
-    case HB_SCRIPT_GUJARATI:
-      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
-      {
-	bool matched = false;
-	switch (buffer->cur().codepoint)
-	{
-	  case 0x0A85u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x0ABEu: case 0x0AC5u: case 0x0AC7u: case 0x0AC8u:
-	      case 0x0AC9u: case 0x0ACBu: case 0x0ACCu:
-		matched = true;
-		break;
-	    }
-	    break;
-	  case 0x0AC5u:
-	    matched = 0x0ABE == buffer->cur(1).codepoint;
-	    break;
-	}
-	buffer->next_glyph ();
-	if (matched) _output_with_dotted_circle (buffer);
-      }
-      processed = true;
-      break;
-
-    case HB_SCRIPT_ORIYA:
-      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
-      {
-	bool matched = false;
-	switch (buffer->cur().codepoint)
-	{
-	  case 0x0B05u:
-	    matched = 0x0B3E == buffer->cur(1).codepoint;
-	    break;
-	  case 0x0B0Fu: case 0x0B13u:
-	    matched = 0x0B57 == buffer->cur(1).codepoint;
-	    break;
-	}
-	buffer->next_glyph ();
-	if (matched) _output_with_dotted_circle (buffer);
-      }
-      processed = true;
-      break;
-
-    case HB_SCRIPT_TELUGU:
-      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
-      {
-	bool matched = false;
-	switch (buffer->cur().codepoint)
-	{
-	  case 0x0C12u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x0C4Cu: case 0x0C55u:
-		matched = true;
-		break;
-	    }
-	    break;
-	  case 0x0C3Fu: case 0x0C46u: case 0xC4Au:
-	    matched = 0x0C55 == buffer->cur(1).codepoint;
-	    break;
-	}
-	buffer->next_glyph ();
-	if (matched) _output_with_dotted_circle (buffer);
-      }
-      processed = true;
-      break;
-
-    case HB_SCRIPT_KANNADA:
-      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
-      {
-	bool matched = false;
-	switch (buffer->cur().codepoint)
-	{
-	  case 0x0C89u: case 0x0C8Bu:
-	    matched = 0x0CBE == buffer->cur(1).codepoint;
-	    break;
-	  case 0x0C92u:
-	    matched = 0x0CCC == buffer->cur(1).codepoint;
-	    break;
-	}
-	buffer->next_glyph ();
-	if (matched) _output_with_dotted_circle (buffer);
-      }
-      processed = true;
-      break;
-
-    case HB_SCRIPT_MALAYALAM:
-      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
-      {
-	bool matched = false;
-	switch (buffer->cur().codepoint)
-	{
-	  case 0x0D07u: case 0x0D09u:
-	    matched = 0x0D57 == buffer->cur(1).codepoint;
-	    break;
-	  case 0x0D0Eu:
-	    matched = 0x0D46 == buffer->cur(1).codepoint;
-	    break;
-	  case 0x0D12u:
-	    switch (buffer->cur(1).codepoint)
-	    {
-	      case 0x0D3Eu: case 0x0D57u:
-		matched = true;
-		break;
-	    }
-	    break;
-	}
-	buffer->next_glyph ();
-	if (matched) _output_with_dotted_circle (buffer);
-      }
-      processed = true;
-      break;
-
-    default:
-      break;
-  }
-  if (processed)
-  {
-    if (buffer->idx < count)
-      buffer->next_glyph ();
-    if (likely (buffer->successful))
-      buffer->swap_buffers ();
-  }
-}
-
 static indic_position_t
 consonant_position_from_face (const indic_shape_plan_t *indic_plan,
 			      const hb_codepoint_t consonant,
@@ -1884,7 +1616,7 @@
   override_features_indic,
   data_create_indic,
   data_destroy_indic,
-  preprocess_text_indic,
+  preprocess_text_vowel_constraints,
   nullptr, /* postprocess_glyphs */
   HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT,
   decompose_indic,

diff --git a/src/hb-ot-shape-complex-use.cc b/src/hb-ot-shape-complex-use.cc
index f9a580c..8c44fe0 100644
--- a/src/hb-ot-shape-complex-use.cc
+++ b/src/hb-ot-shape-complex-use.cc

@@ -28,6 +28,7 @@
 
 #include "hb-ot-shape-complex-use.hh"
 #include "hb-ot-shape-complex-arabic.hh"
+#include "hb-ot-shape-complex-vowel-constraints.hh"
 
 /* buffer var allocations */
 #define use_category() complex_var_u8_0()
@@ -591,7 +592,7 @@
   nullptr, /* override_features */
   data_create_use,
   data_destroy_use,
-  nullptr, /* preprocess_text */
+  preprocess_text_vowel_constraints,
   nullptr, /* postprocess_glyphs */
   HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT,
   nullptr, /* decompose */

diff --git a/src/hb-ot-shape-complex-vowel-constraints.hh b/src/hb-ot-shape-complex-vowel-constraints.hh
new file mode 100644
index 0000000..1b07c2f
--- /dev/null
+++ b/src/hb-ot-shape-complex-vowel-constraints.hh

@@ -0,0 +1,434 @@
+/* == Start of generated functions == */
+/*
+ * The following functions are generated by running:
+ *
+ *   ./gen-vowel-constraints.py use Scripts.txt
+ *
+ * on files with these headers:
+ *
+ * <meta name="updated_at" content="2018-03-27 12:21 AM" />
+ * # Scripts-11.0.0.txt
+ * # Date: 2018-02-21, 05:34:31 GMT
+ */
+
+#ifndef HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH
+#define HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH
+
+static void
+_output_with_dotted_circle (hb_buffer_t *buffer)
+{
+  hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);
+  _hb_glyph_info_reset_continuation (&dottedcircle);
+
+  buffer->next_glyph ();
+}
+
+static void
+preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan,
+				   hb_buffer_t              *buffer,
+				   hb_font_t                *font)
+{
+  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of
+   * vowel-sequences that look like another vowel.  Data for each script
+   * collected from the USE script development spec.
+   *
+   * https://github.com/harfbuzz/harfbuzz/issues/1019
+   */
+  bool processed = false;
+  buffer->clear_output ();
+  unsigned int count = buffer->len;
+  switch ((unsigned) buffer->props.script)
+  {
+    case HB_SCRIPT_DEVANAGARI:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0905u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x093Au: case 0x093Bu: case 0x093Eu: case 0x0945u:
+	      case 0x0946u: case 0x0949u: case 0x094Au: case 0x094Bu:
+	      case 0x094Cu: case 0x094Fu: case 0x0956u: case 0x0957u:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x0906u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x093Au: case 0x0945u: case 0x0946u: case 0x0947u:
+	      case 0x0948u:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x0909u:
+	    matched = 0x0941u == buffer->cur (1).codepoint;
+	    break;
+	  case 0x090Fu:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0945u: case 0x0946u: case 0x0947u:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x0930u:
+	    if (0x094Du == buffer->cur (1).codepoint &&
+		buffer->idx + 2 < count &&
+		0x0907u == buffer->cur (2).codepoint)
+	    {
+	      buffer->next_glyph ();
+	      buffer->next_glyph ();
+	      buffer->output_glyph (0x25CCu);
+	    }
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_BENGALI:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0985u:
+	    matched = 0x09BEu == buffer->cur (1).codepoint;
+	    break;
+	  case 0x098Bu:
+	    matched = 0x09C3u == buffer->cur (1).codepoint;
+	    break;
+	  case 0x098Cu:
+	    matched = 0x09E2u == buffer->cur (1).codepoint;
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_GURMUKHI:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0A05u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0A3Eu: case 0x0A48u: case 0x0A4Cu:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x0A72u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0A3Fu: case 0x0A40u: case 0x0A47u:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x0A73u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0A41u: case 0x0A42u: case 0x0A4Bu:
+		matched = true;
+		break;
+	    }
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_GUJARATI:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0A85u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0ABEu: case 0x0AC5u: case 0x0AC7u: case 0x0AC8u:
+	      case 0x0AC9u: case 0x0ACBu: case 0x0ACCu:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x0AC5u:
+	    matched = 0x0ABEu == buffer->cur (1).codepoint;
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_ORIYA:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0B05u:
+	    matched = 0x0B3Eu == buffer->cur (1).codepoint;
+	    break;
+	  case 0x0B0Fu: case 0x0B13u:
+	    matched = 0x0B57u == buffer->cur (1).codepoint;
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_TELUGU:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0C12u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0C4Cu: case 0x0C55u:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x0C3Fu: case 0x0C46u: case 0x0C4Au:
+	    matched = 0x0C55u == buffer->cur (1).codepoint;
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_KANNADA:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0C89u: case 0x0C8Bu:
+	    matched = 0x0CBEu == buffer->cur (1).codepoint;
+	    break;
+	  case 0x0C92u:
+	    matched = 0x0CCCu == buffer->cur (1).codepoint;
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_MALAYALAM:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0D07u: case 0x0D09u:
+	    matched = 0x0D57u == buffer->cur (1).codepoint;
+	    break;
+	  case 0x0D0Eu:
+	    matched = 0x0D46u == buffer->cur (1).codepoint;
+	    break;
+	  case 0x0D12u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0D3Eu: case 0x0D57u:
+		matched = true;
+		break;
+	    }
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_SINHALA:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x0D85u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0DCFu: case 0x0DD0u: case 0x0DD1u:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x0D8Bu: case 0x0D8Fu: case 0x0D94u:
+	    matched = 0x0DDFu == buffer->cur (1).codepoint;
+	    break;
+	  case 0x0D8Du:
+	    matched = 0x0DD8u == buffer->cur (1).codepoint;
+	    break;
+	  case 0x0D91u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x0DCAu: case 0x0DD9u: case 0x0DDAu: case 0x0DDCu:
+	      case 0x0DDDu:
+		matched = true;
+		break;
+	    }
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_BRAHMI:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x11005u:
+	    matched = 0x11038u == buffer->cur (1).codepoint;
+	    break;
+	  case 0x1100Bu:
+	    matched = 0x1103Eu == buffer->cur (1).codepoint;
+	    break;
+	  case 0x1100Fu:
+	    matched = 0x11042u == buffer->cur (1).codepoint;
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_KHUDAWADI:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x112B0u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x112E0u: case 0x112E5u: case 0x112E6u: case 0x112E7u:
+	      case 0x112E8u:
+		matched = true;
+		break;
+	    }
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_TIRHUTA:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x11481u:
+	    matched = 0x114B0u == buffer->cur (1).codepoint;
+	    break;
+	  case 0x1148Bu: case 0x1148Du:
+	    matched = 0x114BAu == buffer->cur (1).codepoint;
+	    break;
+	  case 0x114AAu:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x114B5u: case 0x114B6u:
+		matched = true;
+		break;
+	    }
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_MODI:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x11600u: case 0x11601u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x11639u: case 0x1163Au:
+		matched = true;
+		break;
+	    }
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    case HB_SCRIPT_TAKRI:
+      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)
+      {
+	bool matched = false;
+	switch (buffer->cur ().codepoint)
+	{
+	  case 0x11680u:
+	    switch (buffer->cur (1).codepoint)
+	    {
+	      case 0x116ADu: case 0x116B4u: case 0x116B5u:
+		matched = true;
+		break;
+	    }
+	    break;
+	  case 0x11686u:
+	    matched = 0x116B2u == buffer->cur (1).codepoint;
+	    break;
+	}
+	buffer->next_glyph ();
+	if (matched) _output_with_dotted_circle (buffer);
+      }
+      processed = true;
+      break;
+
+    default:
+      break;
+  }
+  if (processed)
+  {
+    if (buffer->idx < count)
+     buffer->next_glyph ();
+    if (likely (buffer->successful))
+      buffer->swap_buffers ();
+  }
+}
+
+#endif /* HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH */
+
+/* == End of generated functions == */
commit	205737acdc268b1c90cf00bde2d2038519a8bf48	[log] [tgz]
author	David Corbett <corbett.dav@husky.neu.edu>	Fri Oct 12 16:54:54 2018 -0400
committer	Behdad Esfahbod <behdad@behdad.org>	Tue Oct 23 02:25:08 2018 -0700
tree	81dd69ffebde25e9b6d08726f985746fdaf3b452
parent	48ed598a356983f4623029dd5e87254fb59e3691 [diff]