| #!/usr/bin/env python3 |
| |
| """usage: ./gen-ucd-table [--rust] ucd.nounihan.grouped.xml [/path/to/hb-script-list.h] |
| |
| Input file: |
| * https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip |
| """ |
| |
| # https://github.com/harfbuzz/packtab |
| import packTab |
| import packTab.ucdxml |
| |
| import sys, re |
| import logging |
| |
# Diagnostics are prefixed with their level name; INFO and above are shown.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# An optional leading "--rust" selects Rust output.  It is removed from
# sys.argv so the positional arguments keep the same indices either way.
if sys.argv[1:2] == ["--rust"]:
    sys.argv.pop(1)
    logging.info("Generating Rust code...")
    language = packTab.languages["rust"]
else:
    logging.info("Generating C code...")
    language = packTab.languages["c"]

# One or two positional arguments are accepted; anything else prints the
# module docstring as the usage message and exits.
if not 2 <= len(sys.argv) <= 3:
    sys.exit(__doc__)

logging.info("Loading UCDXML...")
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# The script-list header path is optional; by default it is looked up by
# its bare file name.
hb_script_list_h = sys.argv[2] if len(sys.argv) >= 3 else "hb-script-list.h"

logging.info("Preparing data tables...")
| |
| |
| # This is how the data is encoded: |
| # |
| # General_Category (gc), Canonical_Combining_Class (ccc), |
| # and Script (sc) are encoded as integers. |
| # |
| # Mirroring character (bmg) is encoded as difference from |
| # the original character. |
| # |
| # Composition & Decomposition (dm) are encoded elaborately, |
| # as discussed below. |
| |
# Each list below has one entry per element of `ucd`, in the same order.
gc = [char["gc"] for char in ucd]
ccc = [int(char["ccc"]) for char in ucd]
# The hexadecimal "bmg" value is stored as an offset from the entry's own
# index; entries with an empty "bmg" value get 0.
bmg = [
    int(mirror, 16) - index if mirror else 0
    for index, mirror in enumerate(char["bmg"] for char in ucd)
]
sc = [char["sc"] for char in ucd]
| |
| |
| # Prepare Compose / Decompose data |
| # |
| # This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic. |
| |
# Decomposition mappings of type "can", keyed by index into `ucd`; each value
# is the tuple of code points parsed from the hexadecimal "dm" string.
# Indices in 0xAC00 .. 0xAC00 + 11172 (the Hangul syllable range) are left
# out -- presumably because they are handled algorithmically; confirm
# against hb_ucd_decompose().
dm = {
    i: tuple(int(v, 16) for v in u["dm"].split())
    for i, u in enumerate(ucd)
    if u["dm"] != "#" and u["dt"] == "can" and not (0xAC00 <= i < 0xAC00 + 11172)
}
# Indices whose "Comp_Ex" property is "Y".
ce = {i for i, u in enumerate(ucd) if u["Comp_Ex"] == "Y"}

# Every decomposition must consist of exactly one or two code points.  The
# lengths are tested directly: an empty tuple is falsy, so filtering the
# offenders and passing them to any() would silently accept a zero-length
# decomposition.
assert all(len(v) in (1, 2) for v in dm.values())

# Distinct single-code-point decompositions, in sorted order.
dm1 = sorted({v for v in dm.values() if len(v) == 1})
# Only planes 0 and 2 are supported; each plane gets its own array holding
# the low 16 bits of the code point.
assert all((v[0] >> 16) in (0, 2) for v in dm1)
dm1_p0_array = ["0x%04X" % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ["0x%04X" % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
# Indices start at 1, leaving 0 free.
dm1_order = {v: i + 1 for i, v in enumerate(dm1)}
| |
# Two-code-point decompositions.  Each element is (sort_key, pair): the sort
# key is the pair extended with a third value, which is the decomposing
# index itself, or 0 when that index is in `ce` or has a non-zero ccc.
dm2 = sorted(
    (v + (i if i not in ce and not ccc[i] else 0,), v)
    for i, v in dm.items()
    if len(v) == 2
)


def _fits_u32(triple):
    """Return True if `triple` qualifies for the compact 32-bit encoding.

    The first value must have no bits set above the low 11, the second must
    lie in 0x0300..0x037F, and the third must have no bits set in
    0xFFF0C000.
    """
    first, second, third = triple
    return (
        (first & 0xFFFFF800) == 0x0000
        and (second & 0xFFFFFF80) == 0x0300
        and (third & 0xFFF0C000) == 0x0000
    )


dm2_u32_array = [v for v in dm2 if _fits_u32(v[0])]
dm2_u64_array = [v for v in dm2 if not _fits_u32(v[0])]
# dm2_order below numbers the entries by their position in `dm2`, so the
# sorted order must already place every u32 entry before any u64 entry.
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = [
    "HB_CODEPOINT_ENCODE3_11_7_14 (0x%04X, 0x%04X, 0x%04X)" % v[0]
    for v in dm2_u32_array
]
dm2_u64_array = [
    "HB_CODEPOINT_ENCODE3 (0x%04X, 0x%04X, 0x%04X)" % v[0] for v in dm2_u64_array
]

# Two-code-point indices continue after index 0 and all dm1 entries.
dm2_first_index = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {v[1]: i + dm2_first_index for i, v in enumerate(dm2)}

# Combined mapping from decomposition tuple to index; None maps to 0.
dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)
| |
| |
| # Prepare General_Category / Script mapping arrays |
| |
# Two-way mapping between General_Category abbreviations and their integer
# codes: gc_order[code] gives the abbreviation and gc_order[abbreviation]
# gives the code.  Codes follow the order in which the names are listed.
gc_order = {}
for code, category in enumerate(
    "Cc Cf Cn Co Cs "
    "Ll Lm Lo Lt Lu "
    "Mc Me Mn "
    "Nd Nl No "
    "Pc Pd Pe Pf Pi Po Ps "
    "Sc Sk Sm So "
    "Zl Zp Zs".split()
):
    gc_order.update({code: category, category: code})
| |
# Build the script tables from the HB_SCRIPT_* lines of the script-list
# header:
#   sc_order  two-way mapping between four-character tag and array index
#   sc_array  script names in header order (index -> name)
sc_order = {}
sc_array = []
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# The context manager closes the header deterministically instead of leaving
# the handle to the garbage collector; the explicit encoding keeps parsing
# independent of the platform's locale.
with open(hb_script_list_h, encoding="utf-8") as script_list:
    for line in script_list:
        m = sc_re.search(line)
        if not m:
            # Not a script definition line.
            continue
        name = m.group(1)
        tag = "".join(m.group(i) for i in range(2, 6))
        i = len(sc_array)
        sc_order[tag] = i
        sc_order[i] = tag
        if language.name == "rust":
            # Rust output uses `script::NAME` in place of `HB_SCRIPT_NAME`.
            name = name.replace("HB_SCRIPT_", "script::")
        sc_array.append(name)
| |
| |
| # Write out main data |
| |
# Names of the table variants that can be generated.
DEFAULT, COMPACT, SLOPPY = "DEFAULT", "COMPACT", "SLOPPY"

# Value passed as the `compression` argument of packTab.pack_table for each
# variant.
compression_level = dict(
    [
        (DEFAULT, 5),
        (COMPACT, 9),
        (SLOPPY, 9),
    ]
)
| |
logging.info("Generating output...")

# The banner shows the command line; only non-C output includes a language
# flag in it.
language_flag = "" if language.name == "c" else "--%s " % language.name

for banner_line in (
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * ./gen-ucd-table.py %sucd.nounihan.grouped.xml hb-script-list.h"
    % language_flag,
    " *",
):
    print(banner_line)
print(" * on file with this description:", ucdxml.description)
print(" */")
print()

# Language-specific opening lines: an include guard for C, a module header
# with its `use` declarations for Rust.  Empty strings print blank lines.
prologue = ()
if language.name == "c":
    prologue = (
        "#ifndef HB_UCD_TABLE_HH",
        "#define HB_UCD_TABLE_HH",
        "",
        '#include "hb.hh"',
        "",
    )
elif language.name == "rust":
    prologue = (
        "pub(crate) mod ucd {",
        "",
        "#![allow(unused_parens)]",
        "#![allow(clippy::unnecessary_cast)]",
        "",
        "use crate::hb::algs::{HB_CODEPOINT_ENCODE3, HB_CODEPOINT_ENCODE3_11_7_14};",
        "use crate::hb::common::script;",
        "use crate::hb::common::Script as hb_script_t;",
        "",
    )
else:
    assert False, "Unknown language: %s" % language.name
for prologue_line in prologue:
    print(prologue_line)
| |
| # Write mapping data |
| |
# Spellings of the fixed-width unsigned types in the target language.
uint16_t, uint32_t, uint64_t = (
    language.type_name(width) for width in ("u16", "u32", "u64")
)

# Flag forwarded to packTab's print_code() / genCode(): set for C output,
# cleared for Rust output.
if language.name in ("c", "rust"):
    private = language.name == "c"
else:
    assert False, "Unknown language: %s" % language.name

# Emit the auxiliary lookup arrays.  Each addArray() call returns a pair
# whose first element replaces the Python list of the same name.
code = packTab.Code("_hb_ucd")
sc_array, _ = code.addArray("hb_script_t", "sc_map", sc_array)
dm1_p0_array, _ = code.addArray(uint16_t, "dm1_p0_map", dm1_p0_array)
dm1_p2_array, _ = code.addArray(uint16_t, "dm1_p2_map", dm1_p2_array)
dm2_u32_array, _ = code.addArray(uint32_t, "dm2_u32_map", dm2_u32_array)
dm2_u64_array, _ = code.addArray(uint64_t, "dm2_u64_map", dm2_u64_array)
code.print_code(language=language, private=private)

# (name, data, default value, value-to-integer mapping) for every table that
# is packed below; a mapping of None means no mapping is passed.
datasets = [
    ("gc", gc, "Cn", gc_order),
    ("ccc", ccc, 0, None),
    ("bmg", bmg, 0, None),
    ("sc", sc, "Zzzz", sc_order),
    ("dm", dm, None, dm_order),
]
| |
| |
| # Write main data |
| |
def _smear_default(values, default, block_size=128):
    """Replace `default` entries of `values` with a neighbour, in place.

    Entries are treated in consecutive blocks of `block_size`.  A first
    left-to-right pass copies the preceding entry into every `default` entry
    that is not the first of its block; a second right-to-left pass copies
    the following entry into every remaining `default` entry that is not the
    last of its block.  A block consisting only of `default` is unchanged.
    """
    for i in range(len(values)):
        if i % block_size and values[i] == default:
            values[i] = values[i - 1]
    for i in range(len(values) - 2, -1, -1):
        if (i + 1) % block_size and values[i] == default:
            values[i] = values[i + 1]


# Text printed before each generated variant, in generation order.  For C
# the variants are alternatives of one preprocessor conditional, and the
# None entry closes it; other languages get the DEFAULT variant only.
if language.name == "c":
    modes = {
        DEFAULT: "#ifndef HB_OPTIMIZE_SIZE",
        COMPACT: "#elif !defined(HB_NO_UCD_UNASSIGNED)",
        SLOPPY: "#else",
        None: "#endif",
    }
else:
    modes = {DEFAULT: ""}

for step, text in modes.items():
    print()
    # NOTE(review): SOURCE lost its indentation; the blank line after `text`
    # is assumed to be printed only when `text` is non-empty -- confirm.
    if text:
        print(text)
        print()
    if step is None:
        # Nothing to generate for the closing entry.
        continue

    compression = compression_level[step]
    logging.info(" Compression=%d:", compression)

    if step == SLOPPY:
        # `datasets` refers to these same list objects, so the smearing is
        # done in place.  It is destructive: this is only correct because
        # SLOPPY is the last variant generated.
        _smear_default(gc, "Cn")
        _smear_default(sc, "Zzzz")

    code = packTab.Code("_hb_ucd")

    for name, data, default, mapping in datasets:
        sol = packTab.pack_table(
            data, default, mapping=mapping, compression=compression
        )
        logging.info(" Dataset=%-8s FullCost=%d", name, sol.fullCost)
        sol.genCode(code, name, private=private, language=language)

    code.print_code(language=language)
| |
print()

# Close whatever the language-specific prologue opened: the include guard
# for C, the module block for Rust.
closing_line = {
    "c": "#endif /* HB_UCD_TABLE_HH */",
    "rust": "}",
}.get(language.name)
if closing_line is None:
    assert False, "Unknown language: %s" % language.name
else:
    print(closing_line)

print()
print("/* == End of generated table == */")
logging.info("Done.")