| #!/usr/bin/env python3 |
| |
| """Generator of the mapping from OpenType tags to BCP 47 tags and vice |
| versa. |
| |
| It creates a ``const LangTag[]``, matching the tags from the OpenType |
| languages system tag list to the language subtags of the BCP 47 language |
| subtag registry, with some manual adjustments. The mappings are |
| supplemented with macrolanguages' sublanguages and retired codes' |
| replacements, according to BCP 47 and some manual additions where BCP 47 |
| omits a retired code entirely. |
| |
| Also generated is a function, ``hb_ot_ambiguous_tag_to_language``, |
| intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags |
| back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to |
| multiple BCP 47 tags) are listed here, except when the alphabetically |
| first BCP 47 tag happens to be the chosen disambiguated tag. In that |
| case, the fallback behavior will choose the right tag anyway. |
| |
| usage: ./gen-tag-table.py languagetags language-subtag-registry |
| |
| Input files: |
| * https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags |
| * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |
| """ |
| |
| import collections |
| import html |
| from html.parser import HTMLParser |
| import itertools |
| import re |
| import sys |
| import unicodedata |
| |
# Exactly two command-line arguments (the two input files) are required;
# otherwise exit, using the module docstring as the exit message.
if len (sys.argv) != 3:
    sys.exit (__doc__)
| |
def expect (condition, message=None):
    """Raise ``AssertionError`` unless ``condition`` is truthy.

    Unlike the ``assert`` statement, this check is an ordinary function
    call, so it is not removed when Python runs with ``-O``.

    Args:
        condition: The value to test for truthiness.
        message: Optional argument for the raised ``AssertionError``.
            When ``None``, the exception is raised without arguments.

    Raises:
        AssertionError: If ``condition`` is falsy.
    """
    if condition:
        return
    if message is not None:
        raise AssertionError (message)
    raise AssertionError
| |
def write (s):
    """Write ``s`` to standard output as UTF-8 bytes.

    The text layer of ``sys.stdout`` is flushed first so that anything
    already emitted through ``print`` reaches the underlying binary
    buffer before the encoded bytes of ``s`` are appended to it.

    Args:
        s (str): The text to write.
    """
    stream = sys.stdout
    stream.flush ()
    stream.buffer.write (s.encode ('utf-8'))
| |
# Placeholder OpenType tag standing for "no specific language system".
# ``hb_tag`` renders it as ``HB_TAG_NONE``.
DEFAULT_LANGUAGE_SYSTEM = ''

# from https://www-01.sil.org/iso639-3/iso-639-3.tab
# Maps three-letter ISO 639-3 codes to their two-letter equivalents.
# ``OpenTypeRegistryParser.handle_endtag`` uses it to normalize the ISO
# codes listed in the OpenType registry; codes missing here are kept
# unchanged.
ISO_639_3_TO_1 = {
    'aar': 'aa',
    'abk': 'ab',
    'afr': 'af',
    'aka': 'ak',
    'amh': 'am',
    'ara': 'ar',
    'arg': 'an',
    'asm': 'as',
    'ava': 'av',
    'ave': 'ae',
    'aym': 'ay',
    'aze': 'az',
    'bak': 'ba',
    'bam': 'bm',
    'bel': 'be',
    'ben': 'bn',
    'bis': 'bi',
    'bod': 'bo',
    'bos': 'bs',
    'bre': 'br',
    'bul': 'bg',
    'cat': 'ca',
    'ces': 'cs',
    'cha': 'ch',
    'che': 'ce',
    'chu': 'cu',
    'chv': 'cv',
    'cor': 'kw',
    'cos': 'co',
    'cre': 'cr',
    'cym': 'cy',
    'dan': 'da',
    'deu': 'de',
    'div': 'dv',
    'dzo': 'dz',
    'ell': 'el',
    'eng': 'en',
    'epo': 'eo',
    'est': 'et',
    'eus': 'eu',
    'ewe': 'ee',
    'fao': 'fo',
    'fas': 'fa',
    'fij': 'fj',
    'fin': 'fi',
    'fra': 'fr',
    'fry': 'fy',
    'ful': 'ff',
    'gla': 'gd',
    'gle': 'ga',
    'glg': 'gl',
    'glv': 'gv',
    'grn': 'gn',
    'guj': 'gu',
    'hat': 'ht',
    'hau': 'ha',
    'hbs': 'sh',
    'heb': 'he',
    'her': 'hz',
    'hin': 'hi',
    'hmo': 'ho',
    'hrv': 'hr',
    'hun': 'hu',
    'hye': 'hy',
    'ibo': 'ig',
    'ido': 'io',
    'iii': 'ii',
    'iku': 'iu',
    'ile': 'ie',
    'ina': 'ia',
    'ind': 'id',
    'ipk': 'ik',
    'isl': 'is',
    'ita': 'it',
    'jav': 'jv',
    'jpn': 'ja',
    'kal': 'kl',
    'kan': 'kn',
    'kas': 'ks',
    'kat': 'ka',
    'kau': 'kr',
    'kaz': 'kk',
    'khm': 'km',
    'kik': 'ki',
    'kin': 'rw',
    'kir': 'ky',
    'kom': 'kv',
    'kon': 'kg',
    'kor': 'ko',
    'kua': 'kj',
    'kur': 'ku',
    'lao': 'lo',
    'lat': 'la',
    'lav': 'lv',
    'lim': 'li',
    'lin': 'ln',
    'lit': 'lt',
    'ltz': 'lb',
    'lub': 'lu',
    'lug': 'lg',
    'mah': 'mh',
    'mal': 'ml',
    'mar': 'mr',
    'mkd': 'mk',
    'mlg': 'mg',
    'mlt': 'mt',
    'mol': 'mo',
    'mon': 'mn',
    'mri': 'mi',
    'msa': 'ms',
    'mya': 'my',
    'nau': 'na',
    'nav': 'nv',
    'nbl': 'nr',
    'nde': 'nd',
    'ndo': 'ng',
    'nep': 'ne',
    'nld': 'nl',
    'nno': 'nn',
    'nob': 'nb',
    'nor': 'no',
    'nya': 'ny',
    'oci': 'oc',
    'oji': 'oj',
    'ori': 'or',
    'orm': 'om',
    'oss': 'os',
    'pan': 'pa',
    'pli': 'pi',
    'pol': 'pl',
    'por': 'pt',
    'pus': 'ps',
    'que': 'qu',
    'roh': 'rm',
    'ron': 'ro',
    'run': 'rn',
    'rus': 'ru',
    'sag': 'sg',
    'san': 'sa',
    'sin': 'si',
    'slk': 'sk',
    'slv': 'sl',
    'sme': 'se',
    'smo': 'sm',
    'sna': 'sn',
    'snd': 'sd',
    'som': 'so',
    'sot': 'st',
    'spa': 'es',
    'sqi': 'sq',
    'srd': 'sc',
    'srp': 'sr',
    'ssw': 'ss',
    'sun': 'su',
    'swa': 'sw',
    'swe': 'sv',
    'tah': 'ty',
    'tam': 'ta',
    'tat': 'tt',
    'tel': 'te',
    'tgk': 'tg',
    'tgl': 'tl',
    'tha': 'th',
    'tir': 'ti',
    'ton': 'to',
    'tsn': 'tn',
    'tso': 'ts',
    'tuk': 'tk',
    'tur': 'tr',
    'twi': 'tw',
    'uig': 'ug',
    'ukr': 'uk',
    'urd': 'ur',
    'uzb': 'uz',
    'ven': 've',
    'vie': 'vi',
    'vol': 'vo',
    'wln': 'wa',
    'wol': 'wo',
    'xho': 'xh',
    'yid': 'yi',
    'yor': 'yo',
    'zha': 'za',
    'zho': 'zh',
    'zul': 'zu',
}
| |
class LanguageTag (object):
    """A parsed BCP 47 language tag.

    The tag is lowercased and split on ``'-'``. Grandfathered tags
    (those listed in the module-level ``bcp_47.grandfathered``) are kept
    whole; all other tags are broken down by the shape of each subtag.

    Attributes:
        subtags (List[str]): The lowercased subtags of this tag.
        grandfathered (bool): Whether this tag is grandfathered. If
            ``True``, the entire lowercased tag is the ``language``
            and the other subtag fields are empty strings.
        language (str): The language subtag.
        script (Optional[str]): The script subtag, or ``None`` if the
            tag has none.
        region (Optional[str]): The region subtag, or ``None`` if the
            tag has none.
        variant (Optional[str]): The variant subtag, or ``None`` if the
            tag has none.

    Args:
        tag (str): A BCP 47 language tag.

    """
    def __init__ (self, tag):
        lowered = tag.lower ()
        self.subtags = lowered.split ('-')
        self.grandfathered = lowered in bcp_47.grandfathered
        if self.grandfathered:
            self.language = lowered
            self.script = self.region = self.variant = ''
            return

        def is_script (s):
            # Four characters whose first character sorts after '9'.
            return len (s) == 4 and s[0] > '9'

        def is_region (s):
            # Two characters starting after '9', or three characters
            # starting at or before '9'.
            return len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9'

        def is_variant (s):
            # More than four characters, or four characters starting
            # at or before '9'.
            return len (s) > 4 or len (s) == 4 and s[0] <= '9'

        self.language = self.subtags[0]
        self.script = self._find_first (is_script, self.subtags)
        # The region search skips the leading (language) subtag.
        self.region = self._find_first (is_region, self.subtags[1:])
        self.variant = self._find_first (is_variant, self.subtags)

    def __str__(self):
        return '-'.join (self.subtags)

    def __repr__ (self):
        return 'LanguageTag(%r)' % str (self)

    @staticmethod
    def _find_first (function, sequence):
        """Return the first item of ``sequence`` accepted by
        ``function``, or ``None`` if there is no such item.
        """
        return next (filter (function, sequence), None)

    def is_complex (self):
        """Tell whether this tag cannot be expressed as a plain
        ``LangTag`` entry in the generated code.

        Complex tags need to be handled in
        ``hb_ot_tags_from_complex_language``.

        Returns:
            Whether this tag is complex.
        """
        if len (self.subtags) == 1:
            return False
        if not self.grandfathered:
            return True
        if len (self.subtags[1]) == 3:
            return True
        # A grandfathered tag is simple only when its first subtag maps
        # to the same OpenType tags as the whole tag does.
        return ot.from_bcp_47[self.subtags[0]] != ot.from_bcp_47[self.language]

    def get_group (self):
        """Return the bucket this tag belongs to in
        ``hb_ot_tags_from_complex_language``.

        The bucket is the first letter of the language, or ``'und'``
        when the tag should not be matched in a ``switch`` statement in
        the generated code: that is, when the language is ``und`` or
        when the variant has exactly one registered prefix.

        Returns:
            This tag's group.
        """
        if self.language == 'und':
            return 'und'
        if self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1:
            return 'und'
        return self.language[0]
| |
class OpenTypeRegistryParser (HTMLParser):
    """A parser for the OpenType language system tag registry.

    Attributes:
        header (str): The "last updated" line of the registry.
        names (Mapping[str, str]): A map of language system tags to the
            names they are given in the registry.
        ranks (DefaultDict[str, int]): A map of language system tags to
            numbers. If a single BCP 47 tag corresponds to multiple
            OpenType tags, the tags are ordered in increasing order by
            rank. The rank is based on the number of BCP 47 tags
            associated with a tag, though it may be manually modified.
        to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
            OpenType language system tags to sets of BCP 47 tags.
        from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
            inverted. Its values start as unsorted sets;
            ``sort_languages`` converts them to sorted lists.
        from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
            A copy of ``from_bcp_47``. It starts as ``None`` and is
            populated at the beginning of the first call to
            ``inherit_from_macrolanguages``.

    """
    def __init__ (self):
        HTMLParser.__init__ (self)
        self.header = ''
        self.names = {}
        self.ranks = collections.defaultdict (int)
        self.to_bcp_47 = collections.defaultdict (set)
        self.from_bcp_47 = collections.defaultdict (set)
        self.from_bcp_47_uninherited = None
        # Whether the parser is in a <td> element
        self._td = False
        # Whether the parser is after a <br> element within the current <tr> element
        self._br = False
        # The text of the <td> elements of the current <tr> element.
        self._current_tr = []

    def handle_starttag (self, tag, attrs):
        """Track table structure and capture the ``updated_at`` header.

        Args:
            tag (str): The name of the opened element.
            attrs (List[Tuple[str, str]]): The element's attributes.
        """
        if tag == 'br':
            self._br = True
        elif tag == 'meta':
            for attr, value in attrs:
                if attr == 'name' and value == 'updated_at':
                    self.header = self.get_starttag_text ()
                    break
        elif tag == 'td':
            self._td = True
            self._current_tr.append ('')
        elif tag == 'tr':
            # A new row resets both the <br> flag and the collected cells.
            self._br = False
            self._current_tr = []

    def handle_endtag (self, tag):
        """Record the row that just ended, if it contained any cells.

        A row is expected to have two or three ``<td>`` cells: the
        language name, the OpenType tag, and optionally a comma-separated
        list of ISO codes. Rows containing no ``<td>`` cells are ignored.
        The name is always recorded; the BCP 47 mapping and the rank are
        recorded only when the ISO cell exists and is non-empty.

        Args:
            tag (str): The name of the closed element.

        Raises:
            AssertionError: If the row has an unexpected number of cells
                or the OpenType tag is ill-formed.
        """
        if tag == 'td':
            self._td = False
        elif tag == 'tr' and self._current_tr:
            cells = self._current_tr
            expect (2 <= len (cells) <= 3)
            name = cells[0].strip ()
            tag = cells[1].strip ("\t\n\v\f\r '")
            rank = 0
            if len (tag) > 4:
                # The only accepted over-long form is "TAG (deprecated)".
                expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
                name += ' (deprecated)'
                tag = tag.split (' ')[0]
                rank = 1
            self.names[tag] = re.sub (' languages$', '', name)
            # Two-cell rows are accepted above, so the ISO cell may be
            # missing entirely; treat that the same as an empty cell
            # instead of indexing past the end of the row.
            if len (cells) < 3 or not cells[2]:
                return
            iso_codes = cells[2].strip ()
            self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
            rank += 2 * len (self.to_bcp_47[tag])
            self.ranks[tag] = rank

    def handle_data (self, data):
        """Append text to the current cell.

        Text outside a ``<td>``, or after a ``<br>`` within the current
        row, is discarded.

        Args:
            data (str): The text to handle.
        """
        if self._td and not self._br:
            self._current_tr[-1] += data

    def handle_charref (self, name):
        """Treat a numeric character reference as its decoded text."""
        self.handle_data (html.unescape ('&#%s;' % name))

    def handle_entityref (self, name):
        """Treat a named character reference as its decoded text."""
        self.handle_data (html.unescape ('&%s;' % name))

    def parse (self, filename):
        """Parse the OpenType language system tag registry.

        After parsing, ``from_bcp_47`` is filled in as the inverse of
        ``to_bcp_47``.

        Args:
            filename (str): The file name of the registry.

        Raises:
            AssertionError: If no ``updated_at`` header was found.
        """
        with open (filename, encoding='utf-8') as f:
            self.feed (f.read ())
        expect (self.header)
        for tag, iso_codes in self.to_bcp_47.items ():
            for iso_code in iso_codes:
                self.from_bcp_47[iso_code].add (tag)

    def add_language (self, bcp_47_tag, ot_tag):
        """Add a language as if it were in the registry.

        Reads and may update the module-level ``bcp_47`` parser.

        Args:
            bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
                a language subtag, and if the language subtag is a
                macrolanguage, then new languages are added corresponding
                to the macrolanguages' individual languages with the
                remainder of the tag appended.
            ot_tag (str): An OpenType language system tag.
        """
        self.to_bcp_47[ot_tag].add (bcp_47_tag)
        self.from_bcp_47[bcp_47_tag].add (ot_tag)
        if bcp_47_tag.lower () in bcp_47.grandfathered:
            return
        try:
            macrolanguage, suffix = bcp_47_tag.split ('-', 1)
        except ValueError:
            # The tag is a bare language subtag: nothing to expand.
            return
        if macrolanguage in bcp_47.macrolanguages:
            # Register "<macrolanguage>-<suffix>" as a macrolanguage of
            # every "<individual language>-<suffix>".
            bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = {
                '%s-%s' % (language, suffix)
                for language in bcp_47.macrolanguages[macrolanguage]
                if language.lower () not in bcp_47.grandfathered
            }

    @staticmethod
    def _remove_language (tag_1, dict_1, dict_2):
        """Remove ``tag_1`` from ``dict_1`` and from every set in
        ``dict_2`` that ``dict_1`` said it was paired with, deleting
        sets in ``dict_2`` that become empty.

        Raises:
            KeyError: If ``tag_1`` is not a key of ``dict_1``.
        """
        for tag_2 in dict_1.pop (tag_1):
            dict_2[tag_2].remove (tag_1)
            if not dict_2[tag_2]:
                del dict_2[tag_2]

    def remove_language_ot (self, ot_tag):
        """Remove an OpenType tag from the registry.

        Args:
            ot_tag (str): An OpenType tag.
        """
        self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

    def remove_language_bcp_47 (self, bcp_47_tag):
        """Remove a BCP 47 tag from the registry.

        Args:
            bcp_47_tag (str): A BCP 47 tag.
        """
        self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

    def inherit_from_macrolanguages (self):
        """Copy mappings from macrolanguages to individual languages.

        If a BCP 47 tag for an individual mapping has no OpenType
        mapping but its macrolanguage does, the mapping is copied to
        the individual language. For example, als (Tosk Albanian) has no
        explicit mapping, so it inherits from sq (Albanian) the mapping
        to SQI.

        However, if an OpenType tag maps to a BCP 47 macrolanguage and
        some but not all of its individual languages, the mapping is not
        inherited from the macrolanguage to the missing individual
        languages. For example, INUK (Nunavik Inuktitut) is mapped to
        ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
        ikt (Inuinnaqtun, which is an individual language of iu), so
        this method does not add a mapping from ikt to INUK.

        If a BCP 47 tag for a macrolanguage has no OpenType mapping but
        some of its individual languages do, their mappings are copied
        to the macrolanguage.
        """
        first_time = self.from_bcp_47_uninherited is None
        if first_time:
            self.from_bcp_47_uninherited = dict (self.from_bcp_47)
        # Iterate over a copy: ``add_language`` may add new entries to
        # ``bcp_47.macrolanguages`` while this loop is running.
        for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
            ot_macrolanguages = {
                ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
            }
            # OpenType tags that already map to some, but not all, of
            # the non-retired individual languages are not inherited.
            blocked_ot_macrolanguages = set ()
            if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
                for ot_macrolanguage in ot_macrolanguages:
                    round_trip_macrolanguages = {
                        l for l in self.to_bcp_47[ot_macrolanguage]
                        if 'retired code' not in bcp_47.scopes.get (l, '')
                    }
                    round_trip_languages = {
                        l for l in languages
                        if 'retired code' not in bcp_47.scopes.get (l, '')
                    }
                    intersection = round_trip_macrolanguages & round_trip_languages
                    if intersection and intersection != round_trip_languages:
                        blocked_ot_macrolanguages.add (ot_macrolanguage)
            if ot_macrolanguages:
                # Macrolanguage -> individual languages.
                for ot_macrolanguage in ot_macrolanguages:
                    if ot_macrolanguage not in blocked_ot_macrolanguages:
                        for language in languages:
                            self.add_language (language, ot_macrolanguage)
                            if not blocked_ot_macrolanguages:
                                self.ranks[ot_macrolanguage] += 1
            elif first_time:
                # Individual languages -> macrolanguage. The accumulated
                # set is discarded as soon as one individual language
                # has no mapping of its own.
                for language in languages:
                    if language in self.from_bcp_47_uninherited:
                        ot_macrolanguages |= self.from_bcp_47_uninherited[language]
                    else:
                        ot_macrolanguages.clear ()
                    if not ot_macrolanguages:
                        break
                for ot_macrolanguage in ot_macrolanguages:
                    self.add_language (macrolanguage, ot_macrolanguage)

    def sort_languages (self):
        """Sort the values of ``from_bcp_47`` in ascending rank order.

        Each value is replaced by a list ordered by rank (adjusted by
        the module-level ``rank_delta``), with ties broken by tag.
        """
        for language, tags in self.from_bcp_47.items ():
            self.from_bcp_47[language] = sorted (tags,
                    key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
| |
# Module-level OpenType registry parser. ``LanguageTag.is_complex`` and
# ``BCP47Parser._add_macrolanguage`` read this global by name.
ot = OpenTypeRegistryParser ()
| |
class BCP47Parser (object):
    """A parser for the BCP 47 subtag registry.

    Attributes:
        header (str): The "File-Date" line of the registry.
        names (Mapping[str, str]): A map of subtags to the names they
            are given in the registry. Each value is a
            ``'\\n'``-separated list of names.
        scopes (Mapping[str, str]): A map of language subtags to strings
            suffixed to language names, including suffixes to explain
            language scopes.
        macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
            language subtags to the sets of language subtags which
            inherit from them. See
            ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
        prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
            subtags to their prefixes.
        grandfathered (AbstractSet[str]): The set of grandfathered tags,
            normalized to lowercase.

    """
    def __init__ (self):
        self.header = ''
        self.names = {}
        self.scopes = {}
        self.macrolanguages = collections.defaultdict (set)
        self.prefixes = collections.defaultdict (set)
        self.grandfathered = set ()

    def parse (self, filename):
        """Parse the BCP 47 subtag registry.

        Args:
            filename (str): The file name of the registry.

        Raises:
            AssertionError: If no "File-Date" line was found.
        """
        with open (filename, encoding='utf-8') as f:
            subtag_type = None
            subtag = None
            deprecated = False
            has_preferred_value = False
            line_buffer = ''
            # The extra empty line at the end flushes the last buffered
            # field through the loop body.
            for line in itertools.chain (f, ['']):
                line = line.rstrip ()
                if line.startswith (' '):
                    # Continuation of the previous field: drop one
                    # leading space and keep accumulating.
                    line_buffer += line[1:]
                    continue
                # Process the completed previous field and start
                # buffering the one that just began.
                line, line_buffer = line_buffer, line
                if line.startswith ('Type: '):
                    subtag_type = line.split (' ')[1]
                    deprecated = False
                    has_preferred_value = False
                elif line.startswith (('Subtag: ', 'Tag: ')):
                    subtag = line.split (' ')[1]
                    if subtag_type == 'grandfathered':
                        self.grandfathered.add (subtag.lower ())
                elif line.startswith ('Description: '):
                    description = line.split (' ', 1)[1].replace (' (individual language)', '')
                    # Raw string: the pattern contains backslash-escaped
                    # parentheses, which are invalid escape sequences in
                    # an ordinary string literal.
                    description = re.sub (r' (\(family\)|\((individual |macro)language\)|languages)$', '',
                            description)
                    if subtag in self.names:
                        self.names[subtag] += '\n' + description
                    else:
                        self.names[subtag] = description
                elif subtag_type in ('language', 'grandfathered'):
                    if line.startswith ('Scope: '):
                        scope = line.split (' ')[1]
                        if scope == 'macrolanguage':
                            scope = ' [macrolanguage]'
                        elif scope == 'collection':
                            scope = ' [collection]'
                        else:
                            continue
                        self.scopes[subtag] = scope
                    elif line.startswith ('Deprecated: '):
                        self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
                        deprecated = True
                    elif deprecated and line.startswith ('Comments: see '):
                        # If a subtag is split into multiple replacement subtags,
                        # it essentially represents a macrolanguage.
                        for language in line.replace (',', '').split (' ')[2:]:
                            self._add_macrolanguage (subtag, language)
                    elif line.startswith ('Preferred-Value: '):
                        # If a subtag is deprecated in favor of a single replacement subtag,
                        # it is either a dialect or synonym of the preferred subtag. Either
                        # way, it is close enough to the truth to consider the replacement
                        # the macrolanguage of the deprecated language.
                        has_preferred_value = True
                        macrolanguage = line.split (' ')[1]
                        self._add_macrolanguage (macrolanguage, subtag)
                    elif not has_preferred_value and line.startswith ('Macrolanguage: '):
                        self._add_macrolanguage (line.split (' ')[1], subtag)
                elif subtag_type == 'variant':
                    if line.startswith ('Deprecated: '):
                        self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
                    elif line.startswith ('Prefix: '):
                        self.prefixes[subtag].add (line.split (' ')[1])
                elif line.startswith ('File-Date: '):
                    self.header = line
        expect (self.header)

    def _add_macrolanguage (self, macrolanguage, language):
        """Record ``language`` as inheriting from ``macrolanguage``.

        Reads the module-level ``ot`` parser to decide where the
        relationship is stored:

        * If ``language`` has no OpenType mapping, the languages that
          inherit from it are first attached to ``macrolanguage`` too.
        * If ``macrolanguage`` has no OpenType mapping and already
          belongs to some other macrolanguage's set, ``language`` is
          added to that set (the first one found) instead.
        * Otherwise ``language`` is added to ``macrolanguage``'s own set.

        Args:
            macrolanguage (str): The subtag to inherit from.
            language (str): The subtag that inherits.
        """
        if language not in ot.from_bcp_47:
            for l in self.macrolanguages.get (language, set ()):
                self._add_macrolanguage (macrolanguage, l)
        if macrolanguage not in ot.from_bcp_47:
            for ls in list (self.macrolanguages.values ()):
                if macrolanguage in ls:
                    ls.add (language)
                    return
        self.macrolanguages[macrolanguage].add (language)

    def remove_extra_macrolanguages (self):
        """Make every language have at most one macrolanguage.

        For a language listed under several macrolanguages, the one
        with the most languages is chosen and the others are registered
        as inheriting from it.
        """
        inverted = collections.defaultdict (list)
        for macrolanguage, languages in self.macrolanguages.items ():
            for language in languages:
                inverted[language].append (macrolanguage)
        for language, macrolanguages in inverted.items ():
            if len (macrolanguages) > 1:
                # Sort ascending by size so that pop () yields the biggest.
                macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
                biggest_macrolanguage = macrolanguages.pop ()
                for macrolanguage in macrolanguages:
                    self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

    def _get_name_piece (self, subtag):
        """Return the first name of a subtag plus its scope suffix.

        Args:
            subtag (str): A BCP 47 subtag.

        Returns:
            The name form of ``subtag``.

        Raises:
            KeyError: If ``subtag`` has no recorded name.
        """
        return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')

    def get_name (self, lt):
        """Return the names of the subtags in a language tag.

        The language, script, region and variant names are joined with
        ``'; '``; absent subtags are skipped. The script is looked up in
        title case and the region in upper case.

        Args:
            lt (LanguageTag): A BCP 47 language tag.

        Returns:
            The name form of ``lt``.
        """
        name = self._get_name_piece (lt.language)
        if lt.script:
            name += '; ' + self._get_name_piece (lt.script.title ())
        if lt.region:
            name += '; ' + self._get_name_piece (lt.region.upper ())
        if lt.variant:
            name += '; ' + self._get_name_piece (lt.variant)
        return name
| |
# Module-level BCP 47 registry parser. ``LanguageTag`` and
# ``OpenTypeRegistryParser`` read this global by name.
bcp_47 = BCP47Parser ()

# Parse the two input files named on the command line: first the
# OpenType registry, then the BCP 47 subtag registry.
ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Everything from here down to the ``rank_delta`` definition is a manual
# adjustment applied on top of the parsed registries: extra mappings,
# replaced mappings, rank tweaks, and names/scopes/macrolanguage
# relations set by hand. The statements are order-dependent.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Replace the registry's mapping for PGR with the variant tag el-polyton.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

# Replace the registry's mapping for IRT with the script-qualified ga-Latg.
ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

ot.add_language ('hy-arevmda', 'HYE')

# Replace the registry's mapping for KGE with the script-only und-Geok.
ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

# KHN reuses KHT's name with a suffix and is ranked just after KHT.
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('mnw-TH', 'MONT')

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# Replace the registry's mapping for QUZ with qu, then map the listed
# q?? codes explicitly to QWH, QVI or QUH.
ot.remove_language_ot ('QUZ')
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qul', 'QUH')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# ``macrolanguages`` is a defaultdict, so this creates the 'ro-MD' entry
# if it does not exist yet.
bcp_47.macrolanguages['ro-MD'].add ('mo')

# Replace the registry's mappings for the three SYR* tags with
# script-only und-* tags.
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# Drop the registry's mappings for ZHH, ZHP, ZHT and ZHTM, detach lzh
# and yue from zh, and rebuild the mappings from script- and
# region-qualified tags.
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
ot.remove_language_ot ('ZHTM')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-MO', 'ZHTM')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-MO', 'ZHTM')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}
| |
def rank_delta (bcp_47, ot):
    """Return an adjustment for an OpenType tag's rank that depends on
    the BCP 47 tag being sorted.

    Most OpenType tags have a constant rank; only the pairs ``ak``/``AKA``
    and ``tw``/``TWI`` are adjusted, each by -1.

    Args:
        bcp_47 (str): A BCP 47 tag.
        ot (str): An OpenType tag.

    Returns:
        A number to add to ``ot``'s rank when sorting ``bcp_47``'s
        OpenType equivalents.
    """
    promoted_pairs = (('ak', 'AKA'), ('tw', 'TWI'))
    return -1 if (bcp_47, ot) in promoted_pairs else 0
| |
# Maps an OpenType tag to one hand-picked BCP 47 tag.
# NOTE(review): presumably these are the "chosen disambiguated tags" the
# module docstring describes for ``hb_ot_ambiguous_tag_to_language``; the
# code that consumes this table is outside this part of the file, so
# confirm there.
disambiguation = {
    'ALT': 'alt',
    'ARK': 'rki',
    'ATH': 'ath',
    'BHI': 'bhb',
    'BLN': 'bjt',
    'BTI': 'beb',
    'CCHN': 'cco',
    'CMR': 'swb',
    'CPP': 'crp',
    'CRR': 'crx',
    'DUJ': 'dwu',
    'ECR': 'crj',
    'HAL': 'cfm',
    'HND': 'hnd',
    'HYE': 'hyw',
    'KIS': 'kqs',
    'KUI': 'uki',
    'LRC': 'bqi',
    'NDB': 'nd',
    'NIS': 'njz',
    'PLG': 'pce',
    'PRO': 'pro',
    'QIN': 'bgr',
    'QUH': 'quh',
    'QVI': 'qvi',
    'QWH': 'qwh',
    'SIG': 'stv',
    'SRB': 'sr',
    'SXT': 'xnj',
    'ZHH': 'zh-HK',
    'ZHS': 'zh-Hans',
    'ZHT': 'zh-Hant',
    'ZHTM': 'zh-MO',
}
| |
# Propagate mappings between macrolanguages and their individual
# languages, reduce every language to a single macrolanguage, then
# propagate once more with the reduced relations.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
# The default language system sorts after every real OpenType tag.
# Its "name" is '*/', which ends up inside the C comment written for
# table rows that use it.
ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
# A three-letter OpenType tag whose lowercase form is a named BCP 47
# subtag with no OpenType mapping would otherwise look like that
# subtag's own tag; map such subtags explicitly to the default language
# system and give them no sublanguages.
for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
    possible_bcp_47_tag = tricky_ot_tag.lower ()
    if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
        ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
        bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
# From here on the values of ``ot.from_bcp_47`` are rank-sorted lists.
ot.sort_languages ()

# Emit the banner comment and the include guard of the generated header.
print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' * %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
| |
def hb_tag (tag):
    """Render a tag as a C++ ``HB_TAG`` expression.

    The default language system is rendered as ``HB_TAG_NONE``. Any
    other tag is padded on the right with spaces to four characters
    (and cut to four if longer) before being spelled out character by
    character.

    Args:
        tag (str): An OpenType tag.

    Returns:
        A snippet of C++ representing ``tag``.
    """
    if tag == DEFAULT_LANGUAGE_SYSTEM:
        return 'HB_TAG_NONE\t '
    padded = '%-4s' % tag
    return "HB_TAG('%s','%s','%s','%s')" % tuple (padded[:4])
| |
def get_variant_set (name):
    """Break a name into its set of normalized variant names.

    The name is split on newlines, parentheses and commas. Every
    non-empty piece has U+2019 replaced by an ASCII apostrophe, is
    decomposed (NFD), stripped of all non-ASCII characters, and has its
    surrounding whitespace removed.

    Args:
        name (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.

    Returns:
        A set of normalized language names, as ``bytes``.
    """
    variants = set ()
    for piece in re.split ('[\n(),]', name):
        if not piece:
            continue
        decomposed = unicodedata.normalize ('NFD', piece.replace ('\u2019', "'"))
        variants.add (decomposed.encode ('ASCII', 'ignore').strip ())
    return variants
| |
def language_name_intersection (a, b):
    """Find the normalized names that two language names share.

    Args:
        a (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.
        b (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.

    Returns:
        The normalized language names shared by ``a`` and ``b``.
    """
    variants_of_a = get_variant_set (a)
    variants_of_b = get_variant_set (b)
    return variants_of_a & variants_of_b
| |
def get_matching_language_name (intersection, candidates):
    """Return the first candidate sharing a normalized name with
    ``intersection``.

    Args:
        intersection: A set of normalized names, as produced by
            ``language_name_intersection``.
        candidates: An iterable of language names to search, in order.

    Returns:
        The first element of ``candidates`` whose variant set is not
        disjoint from ``intersection``.

    Raises:
        StopIteration: If no candidate matches.
    """
    def shares_a_name (candidate):
        return not intersection.isdisjoint (get_variant_set (candidate))

    return next (filter (shares_a_name, candidates))
| |
def same_tag (bcp_47_tag, ot_tags):
    """Tell whether a three-letter BCP 47 tag maps to exactly one
    OpenType tag that is merely its own uppercase form.

    Args:
        bcp_47_tag (str): A BCP 47 tag.
        ot_tags (Sequence[str]): The OpenType tags it maps to.

    Returns:
        ``True`` if ``bcp_47_tag`` has length 3, ``ot_tags`` has exactly
        one element, and that element lowercased equals ``bcp_47_tag``.
    """
    if len (bcp_47_tag) != 3 or len (ot_tags) != 1:
        return False
    return bcp_47_tag == ot_tags[0].lower ()
| |
# Emit one ``LangTag`` table per language-subtag length (2, then 3).
# The three-letter table is wrapped in ``#ifndef HB_NO_LANGUAGE_LONG``.
for language_len in (2, 3):
    if language_len == 3:
        print ('#ifndef HB_NO_LANGUAGE_LONG')
    print ('static const LangTag ot_languages%d[] = {' % language_len)
    for language, tags in sorted (ot.from_bcp_47.items ()):
        # Only plain, single-subtag languages belong in these tables.
        if language == '' or '-' in language:
            continue
        if len(language) != language_len: continue
        # A row that only says "abc -> ABC" is written commented out.
        commented_out = same_tag (language, tags)
        for i, tag in enumerate (tags, start=1):
            print ('%s{%s,\t%s},' % ('/*' if commented_out else ' ', hb_tag (language), hb_tag (tag)), end='')
            if commented_out:
                print ('*/', end='')
            print ('\t/* ', end='')
            bcp_47_name = bcp_47.names.get (language, '')
            bcp_47_name_candidates = bcp_47_name.split ('\n')
            ot_name = ot.names[tag]
            scope = bcp_47.scopes.get (language, '')
            if tag == DEFAULT_LANGUAGE_SYSTEM:
                # Explain which same-spelled OpenType tag this language
                # is NOT: "<BCP 47 name> != <OpenType name>".
                write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
            else:
                intersection = language_name_intersection (bcp_47_name, ot_name)
                if not intersection:
                    # The two registries share no name: show both.
                    write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
                else:
                    # They share a name: remember the matching BCP 47
                    # name and show the longer of the two spellings.
                    name = get_matching_language_name (intersection, bcp_47_name_candidates)
                    bcp_47.names[language] = name
                    write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
            print (' */')
    print ('};')
    if language_len == 3:
        print ('#endif')
    print ()

# Emit the documentation comment and the signature of the generated C
# function ``hb_ot_tags_from_complex_language``.
print ('/**')
print (' * hb_ot_tags_from_complex_language:')
print (' * @lang_str: a BCP 47 language tag to convert.')
print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
print (' * conversion.')
print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
print (' * @tags: array of size at least @language_count to store the language tag')
print (' * results')
print (' *')
print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
print (' *')
print (' * Return value: Whether any language systems were retrieved.')
print (' **/')
print ('static inline bool')
print ('hb_ot_tags_from_complex_language (const char *lang_str,')
print ('\t\t\t\t const char *limit,')
print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
print ('{')
| |
def print_subtag_matches (subtag, string, new_line):
    """Print a C ``subtag_matches`` call for ``subtag``, if there is one.

    Nothing is printed when ``subtag`` is falsy. No trailing newline is
    printed.

    Args:
        subtag (Optional[str]): The subtag to match, without its
            leading ``'-'``.
        string (str): The C expression for the string to search.
        new_line (bool): Whether to first start a new line and print
            ``'\\t&& '``, chaining this call onto a previous condition.
    """
    if not subtag:
        return
    if new_line:
        print ('\n\t&& ', end='')
    call = 'subtag_matches (%s, limit, "-%s", %i)' % (string, subtag, 1 + len (subtag))
    print (call, end='')
| |
| complex_tags = collections.defaultdict (list) |
| for initial, group in itertools.groupby ((lt_tags for lt_tags in [ |
| (LanguageTag (language), tags) |
| for language, tags in sorted (ot.from_bcp_47.items (), |
| key=lambda i: (-len (i[0]), i[0])) |
| ] if lt_tags[0].is_complex ()), |
| key=lambda lt_tags: lt_tags[0].get_group ()): |
| complex_tags[initial] += group |
| |
# Find the shortest combined length of the script, region and variant
# subtags (each counted with one extra character) among the 'und' tags
# that map to at least one OpenType tag. These tags are handled outside
# the switch. The value stays at 100 when there are no such tags.
min_subtag_len = 100
# ``get`` is used so that a missing 'und' group is not created as a side
# effect of looking it up.
for lt, tags in complex_tags.get ('und', ()):
    if not tags:
        continue
    subtag_len = sum (1 + len (subtag)
                      for subtag in (lt.script, lt.region, lt.variant)
                      if subtag is not None)
    min_subtag_len = min (subtag_len, min_subtag_len)
| |
# Emit the part of the generated C function that handles the tags in the
# 'und' group. These are matched by their subtags alone, starting from
# the first hyphen in the string, and are checked before the switch on
# the first character. The generated code jumps to ``out`` when the
# string is too short to contain any of them.
print (' if (limit - lang_str >= %d)' % (min_subtag_len + 2))
print (' {')
print (" const char *p = strchr (lang_str, '-');")
print (" if (!p || p >= limit || limit - p < %i) goto out;" % min_subtag_len)
# ``get`` is used so that a missing 'und' group is not created as a side
# effect of looking it up.
for lt, tags in complex_tags.get ('und', ()):
    if not tags:
        continue
    if lt.variant in bcp_47.prefixes:
        expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
                '%s is not a valid prefix of %s' % (lt.language, lt.variant))
    print (' if (', end='')
    print_subtag_matches (lt.script, 'p', False)
    print_subtag_matches (lt.region, 'p', False)
    print_subtag_matches (lt.variant, 'p', False)
    print (')')
    print (' {')
    write (' /* %s */' % bcp_47.get_name (lt))
    print ()
    if len (tags) == 1:
        write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
        print ()
        print (' *count = 1;')
    else:
        # The copy loop emitted below uses ``i``, so the generated block
        # has to declare it, as the equivalent branch of the switch
        # emission does. Without the declaration the generated C refers
        # to an undeclared identifier.
        print (' unsigned int i;')
        print (' hb_tag_t possible_tags[] = {')
        for tag in tags:
            write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
            print ()
        print (' };')
        print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
        print ('\ttags[i] = possible_tags[i];')
        print (' *count = i;')
    print (' return true;')
    print (' }')
print (' }')
print ('out:')
| |
# Emit the ``switch`` on the first character of the language string.
# Each group other than 'und' becomes one ``case`` containing an ``if``
# per tag that maps to at least one OpenType tag.
print (' switch (lang_str[0])')
print (' {')
for first_char, entries in sorted (complex_tags.items ()):
    if first_char == 'und':
        continue
    print (" case '%s':" % first_char)
    for lang_tag, ot_tags in entries:
        if not ot_tags:
            continue
        print (' if (', end='')
        # Subtags that get folded into the initial string comparison are
        # cleared so that no separate ``subtag_matches`` call is emitted
        # for them further down.
        pending_script = lang_tag.script
        pending_region = lang_tag.region
        if lang_tag.grandfathered:
            print ('0 == strcmp (&lang_str[1], "%s")' % lang_tag.language[1:], end='')
        else:
            # The first character was already matched by the ``case``.
            prefix = lang_tag.language[1:] + '-'
            if pending_script:
                prefix += pending_script
                pending_script = None
                # The region is only folded in when it follows a script.
                if pending_region:
                    prefix += '-' + pending_region
                    pending_region = None
            if prefix.endswith ('-'):
                comparison = '0 == strncmp (&lang_str[1], "%s", %i)'
            else:
                comparison = 'lang_matches (&lang_str[1], limit, "%s", %i)'
            print (comparison % (prefix, len (prefix)), end='')
        for remaining_subtag in (pending_script, pending_region, lang_tag.variant):
            print_subtag_matches (remaining_subtag, 'lang_str', True)
        print (')')
        print (' {')
        write (' /* %s */' % bcp_47.get_name (lang_tag))
        print ()
        if len (ot_tags) == 1:
            only_tag = ot_tags[0]
            write (' tags[0] = %s; /* %s */' % (hb_tag (only_tag), ot.names[only_tag]))
            print ()
            print (' *count = 1;')
        else:
            print (' unsigned int i;')
            print (' hb_tag_t possible_tags[] = {')
            for ot_tag in ot_tags:
                write ('\t%s, /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
                print ()
            print (' };')
            print (' for (i = 0; i < %s && i < *count; i++)' % len (ot_tags))
            print ('\ttags[i] = possible_tags[i];')
            print (' *count = i;')
        print (' return true;')
        print (' }')
    print (' break;')

# Close the switch and the generated function.
for closing_line in (' }', ' return false;', '}', ''):
    print (closing_line)
# Emit the documentation comment and the signature of the generated C
# function ``hb_ot_ambiguous_tag_to_language``, followed by the opening
# of its ``switch`` statement. The cases are emitted afterwards.
for header_line in (
    '/**',
    ' * hb_ot_ambiguous_tag_to_language',
    ' * @tag: A language tag.',
    ' *',
    ' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
    ' * many language tags) and the best tag is not the alphabetically first, or if',
    ' * the best tag consists of multiple subtags, or if the best tag does not appear',
    ' * in #ot_languages.',
    ' *',
    ' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
    ' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
    ' **/',
    'static inline hb_language_t',
    'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
    '{',
    ' switch (tag)',
    ' {',
):
    print (header_line)
| |
def verify_disambiguation_dict ():
    """Check ``disambiguation`` for consistency and normalize it.

    ``disambiguation`` maps ambiguous OpenType language system tags to
    the BCP 47 tag each one should be converted back to. Every OpenType
    tag in ``ot.to_bcp_47`` is examined:

    * A tag with exactly one primary BCP 47 tag must not be listed. An
      entry is added for it when that BCP 47 tag contains a hyphen or is
      not the alphabetically first candidate.
    * A tag with no primary BCP 47 tag must not be listed.
    * A tag with several primary BCP 47 tags is resolved automatically
      when exactly one candidate stands out; otherwise it must be listed
      with a value that is one of its BCP 47 tags. Its entry is removed
      again when the value is the tag the fallback would pick anyway.

    Finally, every key of ``disambiguation`` must be a known OpenType
    tag.

    Raises:
        AssertionError: Verification failed.
    """
    for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
        if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
            primary_tags = []
        else:
            # Primary tags are the non-grandfathered BCP 47 tags whose
            # first OpenType tag is this one.
            primary_tags = [
                t for t in bcp_47_tags
                if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag
            ]

        if not primary_tags:
            expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
            continue

        if len (primary_tags) == 1:
            sole_tag = primary_tags[0]
            expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
            if '-' in sole_tag:
                disambiguation[ot_tag] = sole_tag
            else:
                candidates = sorted (
                    t for t in bcp_47_tags
                    if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))
                if sole_tag != candidates[0]:
                    disambiguation[ot_tag] = sole_tag
            continue

        # Several primary tags: try successively weaker criteria until
        # one of them singles out exactly one tag.
        macrolanguages = [
            t for t in primary_tags
            if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')
        ]
        if len (macrolanguages) != 1:
            macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
        if len (macrolanguages) != 1:
            macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]']
        if len (macrolanguages) != 1:
            macrolanguages = [t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')]

        if len (macrolanguages) != 1:
            expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
            expect (disambiguation[ot_tag] in bcp_47_tags,
                    '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
        elif ot_tag not in disambiguation:
            disambiguation[ot_tag] = macrolanguages[0]

        # Drop the entry again when it names the alphabetically first of
        # the differing tags and has no hyphen.
        different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
        chosen = disambiguation[ot_tag]
        if different_bcp_47_tags and chosen == different_bcp_47_tags[0] and '-' not in chosen:
            del disambiguation[ot_tag]

    for ot_tag in disambiguation:
        expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
| |
# Validate and normalize ``disambiguation``, then emit one ``case`` per
# remaining entry, in sorted order.
verify_disambiguation_dict ()
for ambiguous_ot_tag, chosen_bcp_47_tag in sorted (disambiguation.items ()):
    case_label = hb_tag (ambiguous_ot_tag)
    write (' case %s: /* %s */' % (case_label, ot.names[ambiguous_ot_tag]))
    print ()
    chosen_name = bcp_47.get_name (LanguageTag (chosen_bcp_47_tag))
    write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (chosen_bcp_47_tag, chosen_name))
    print ()

# Emit the default case, close the switch and the function, then finish
# the generated header.
for trailer_line in (
    ' default:',
    ' return HB_LANGUAGE_INVALID;',
    ' }',
    '}',
    '',
    '#endif /* HB_OT_TAG_TABLE_HH */',
    '',
    '/* == End of generated table == */',
):
    print (trailer_line)
| |