src/gen-emoji-table.py - third_party/harfbuzz - Git at Google

 #!/usr/bin/env python3

 """usage: ./gen-emoji-table.py [--rust] emoji-data.txt emoji-test.txt

 Input file:
 * https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
 * https://www.unicode.org/Public/emoji/latest/emoji-test.txt
 """

 import os
 import sys
 from collections import OrderedDict
 import packTab

 # An optional leading "--rust" flag selects packTab's "rust" language entry
 # instead of "c".  The flag is removed from sys.argv so that the two input
 # files are always found at argv[1] and argv[2].
 emit_rust = sys.argv[1:2] == ["--rust"]
 if emit_rust:
 	sys.argv.pop(1)
 language = packTab.languages["rust" if emit_rust else "c"]

 # Exactly two positional arguments must remain; otherwise exit with the
 # module docstring as the usage message.
 if len(sys.argv) - 1 != 2:
 	sys.exit(__doc__)

 # Parse emoji-data.txt into `ranges`: property name -> list of inclusive
 # (start, end) code point ranges, in file order.
 #
 # The Unicode data files are UTF-8, so decode explicitly instead of relying
 # on the platform's default encoding, and use a context manager so the file
 # handle is closed as soon as parsing is done.
 ranges = OrderedDict()
 with open(sys.argv[1], encoding="utf-8") as f:
 	# The first ten lines are kept verbatim as `header`.
 	header = [f.readline() for _ in range(10)]

 	for line in f:
 		line = line.strip()
 		if not line or line[0] == '#':
 			continue
 		# Data lines have the shape "START[..END] ; Property # comment".
 		rang, typ = [s.strip() for s in line.split('#')[0].split(';')[:2]]

 		rang = [int(s, 16) for s in rang.split('..')]
 		if len(rang) > 1:
 			start, end = rang
 		else:
 			start = end = rang[0]

 		# Extend the previous range of the same property when this one
 		# starts right after it; otherwise record a new range.
 		prop_ranges = ranges.setdefault(typ, [])
 		if prop_ranges and prop_ranges[-1][1] == start - 1:
 			prop_ranges[-1] = (prop_ranges[-1][0], end)
 		else:
 			prop_ranges.append((start, end))


 # Banner comment: how the table was generated, followed by the stripped
 # header lines captured from the input file.
 rust_flag = "--rust " if language.name == "rust" else ""
 banner = [
 	"/* == Start of generated table == */",
 	"/*",
 	" * The following tables are generated by running:",
 	" *",
 	" *   ./gen-emoji-table.py %semoji-data.txt" % rust_flag,
 	" *",
 	" * on file with this header:",
 	" *",
 ]
 banner += [" * %s" % (h.strip()) for h in header]
 banner.append(" */")
 for text in banner:
 	print(text)
 print()

 # Language-specific prologue: an include guard plus #include for C, lint
 # allowances plus the Codepoint import for Rust.  An empty string prints
 # as a blank line.
 prologues = {
 	"c": [
 		"#ifndef HB_UNICODE_EMOJI_TABLE_HH",
 		"#define HB_UNICODE_EMOJI_TABLE_HH",
 		"",
 		'#include "hb-unicode.hh"',
 		"",
 	],
 	"rust": [
 		"#![allow(unused_parens)]",
 		"#![allow(clippy::unnecessary_cast, clippy::unreadable_literal, clippy::double_parens)]",
 		"",
 		"use crate::hb::unicode::Codepoint;",
 		"",
 	],
 }
 assert language.name in prologues, "Unknown language: %s" % language.name
 for text in prologues.get(language.name, ()):
 	print(text)

 for typ, s in ranges.items():
 	# Only the Extended_Pictographic property gets a lookup table.
 	if typ != "Extended_Pictographic":
 		continue

 	# Sparse map with value 1 for every code point covered by a range.
 	arr = {cp: 1 for first, last in s for cp in range(first, last + 1)}

 	# 0 is passed as pack_table's second argument (presumably the value for
 	# code points absent from `arr` -- confirm against packTab).
 	sol = packTab.pack_table(arr, 0, compression=9)
 	code = packTab.Code('_hb_emoji')
 	if language.name == "rust":
 		# The packed lookup is generated under a "_u8" name; a small
 		# wrapper comparing its result against 0 is printed after it.
 		sol.genCode(code, 'is_%s_u8' % typ, language=language, private=False)
 		code.print_code(language=language, private=False)
 		print()
 		wrapper = (
 			"#[inline]",
 			"pub(crate) fn is_%s (u: Codepoint) -> bool" % typ,
 			"{",
 			"  _hb_emoji_is_%s_u8 (u as usize) != 0" % typ,
 			"}",
 			"",
 		)
 		for text in wrapper:
 			print(text)
 	elif language.name == "c":
 		sol.genCode(code, 'is_%s' % typ, language=language)
 		code.print_code(language=language)
 		print()
 	else:
 		assert False, "Unknown language: %s" % language.name

 # Footer: blank line, the include-guard terminator (C only), blank line,
 # then the end-of-table marker.
 footer = [""]
 if language.name == "c":
 	footer.append("#endif /* HB_UNICODE_EMOJI_TABLE_HH */")
 footer += ["", "/* == End of generated table == */"]
 print("\n".join(footer))


 # Generate test file.
 #
 # Every entry of emoji-test.txt that consists of two or more code points
 # becomes one line of emoji-clusters.tests.
 sequences = []
 # The Unicode data files are UTF-8; decode explicitly rather than relying on
 # the platform's default encoding.
 with open(sys.argv[2], encoding="utf-8") as f:
     for line in f:
         # Keep only the code point field: drop the "#" comment, then
         # everything from the first ";" on.
         line = line.split("#", 1)[0].split(";", 1)[0]
         # split() with no argument ignores repeated whitespace, so no empty
         # fields can end up in a sequence.
         codepoints = line.split()
         if len(codepoints) < 2:
             continue
         sequences.append(codepoints)

 test_path = os.path.join(os.path.dirname(__file__), "..", "test", "shape", "data", "in-house", "tests", "emoji-clusters.tests")
 # Writing the test file is best-effort: it is skipped when the target
 # directory does not exist, and a write failure is reported on stderr
 # (stdout carries the generated table) without aborting the script.
 if os.path.isdir(os.path.dirname(test_path)):
     try:
         with open(test_path, "w") as f:
             for sequence in sequences:
                 f.write("../fonts/AdobeBlank2.ttf;--no-glyph-names --no-positions --font-funcs=ot")
                 f.write(";" + ",".join(sequence))
                 # One "1=0" entry per code point in the sequence.
                 f.write(";[" + "|".join(["1=0"] * len(sequence)) + "]\n")
     except OSError as e:
         print("warning: could not write %s: %s" % (test_path, e), file=sys.stderr)
	#!/usr/bin/env python3

	"""usage: ./gen-emoji-table.py [--rust] emoji-data.txt emoji-test.txt

	Input file:
	* https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
	* https://www.unicode.org/Public/emoji/latest/emoji-test.txt
	"""

	import os
	import sys
	from collections import OrderedDict
	import packTab

	# An optional leading "--rust" flag selects packTab's "rust" language entry
	# instead of "c".  The flag is removed from sys.argv so that the two input
	# files are always found at argv[1] and argv[2].
	if len(sys.argv) > 1 and sys.argv[1] == "--rust":
		del sys.argv[1]
		language = packTab.languages["rust"]
	else:
		language = packTab.languages["c"]

	# Exactly two positional arguments must remain; otherwise exit with the
	# module docstring as the usage message.
	if len(sys.argv) != 3:
		sys.exit(__doc__)

	# Parse emoji-data.txt into `ranges`: property name -> list of inclusive
	# (start, end) code point ranges, in file order.
	#
	# The Unicode data files are UTF-8, so decode explicitly instead of relying
	# on the platform's default encoding, and use a context manager so the file
	# handle is closed as soon as parsing is done.
	ranges = OrderedDict()
	with open(sys.argv[1], encoding="utf-8") as f:
		# The first ten lines are kept verbatim as `header`.
		header = [f.readline() for _ in range(10)]

		for line in f:
			line = line.strip()
			if not line or line[0] == '#':
				continue
			# Data lines have the shape "START[..END] ; Property # comment".
			rang, typ = [s.strip() for s in line.split('#')[0].split(';')[:2]]

			rang = [int(s, 16) for s in rang.split('..')]
			if len(rang) > 1:
				start, end = rang
			else:
				start = end = rang[0]

			# Extend the previous range of the same property when this one
			# starts right after it; otherwise record a new range.
			prop_ranges = ranges.setdefault(typ, [])
			if prop_ranges and prop_ranges[-1][1] == start - 1:
				prop_ranges[-1] = (prop_ranges[-1][0], end)
			else:
				prop_ranges.append((start, end))



	# Banner comment: how the table was generated, followed by the stripped
	# header lines captured from the input file.
	print("/* == Start of generated table == */")
	print("/*")
	print(" * The following tables are generated by running:")
	print(" *")
	print(" *   ./gen-emoji-table.py %semoji-data.txt" %
		("--rust " if language.name == "rust" else ""))
	print(" *")
	print(" * on file with this header:")
	print(" *")
	for l in header:
		print(" * %s" % (l.strip()))
	print(" */")
	print()

	# Language-specific prologue: an include guard plus #include for C, lint
	# allowances plus the Codepoint import for Rust.
	if language.name == "c":
		print("#ifndef HB_UNICODE_EMOJI_TABLE_HH")
		print("#define HB_UNICODE_EMOJI_TABLE_HH")
		print()
		print('#include "hb-unicode.hh"')
		print()
	elif language.name == "rust":
		print("#![allow(unused_parens)]")
		print("#![allow(clippy::unnecessary_cast, clippy::unreadable_literal, clippy::double_parens)]")
		print()
		print("use crate::hb::unicode::Codepoint;")
		print()
	else:
		assert False, "Unknown language: %s" % language.name

	for typ, s in ranges.items():
		# Only the Extended_Pictographic property gets a lookup table.
		if typ != "Extended_Pictographic":
			continue

		# Sparse map with value 1 for every code point covered by a range.
		arr = dict()
		for start, end in s:
			for i in range(start, end + 1):
				arr[i] = 1

		# 0 is passed as pack_table's second argument (presumably the value
		# for code points absent from `arr` -- confirm against packTab).
		sol = packTab.pack_table(arr, 0, compression=9)
		code = packTab.Code('_hb_emoji')
		if language.name == "c":
			sol.genCode(code, 'is_'+typ, language=language)
			code.print_code(language=language)
			print()
		elif language.name == "rust":
			# The packed lookup is generated under a "_u8" name; a small
			# wrapper comparing its result against 0 is printed after it.
			sol.genCode(code, 'is_'+typ+'_u8', language=language, private=False)
			code.print_code(language=language, private=False)
			print()
			print("#[inline]")
			print("pub(crate) fn is_%s (u: Codepoint) -> bool" % typ)
			print("{")
			print("  _hb_emoji_is_%s_u8 (u as usize) != 0" % typ)
			print("}")
			print()
		else:
			assert False, "Unknown language: %s" % language.name

	# Footer: blank line, the include-guard terminator (C only), blank line,
	# then the end-of-table marker.
	print()
	if language.name == "c":
		print("#endif /* HB_UNICODE_EMOJI_TABLE_HH */")
	print()
	print("/* == End of generated table == */")


	# Generate test file.
	#
	# Every entry of emoji-test.txt that consists of two or more code points
	# becomes one line of emoji-clusters.tests.
	sequences = []
	# The Unicode data files are UTF-8; decode explicitly rather than relying on
	# the platform's default encoding.
	with open(sys.argv[2], encoding="utf-8") as f:
		for line in f:
			# Keep only the code point field: drop the "#" comment, then
			# everything from the first ";" on.
			line = line.split("#", 1)[0].split(";", 1)[0]
			# split() with no argument ignores repeated whitespace, so no
			# empty fields can end up in a sequence.
			codepoints = line.split()
			if len(codepoints) < 2:
				continue
			sequences.append(codepoints)

	test_path = os.path.join(os.path.dirname(__file__), "..", "test", "shape", "data", "in-house", "tests", "emoji-clusters.tests")
	# Writing the test file is best-effort: it is skipped when the target
	# directory does not exist, and a write failure is reported on stderr
	# (stdout carries the generated table) without aborting the script.
	if os.path.isdir(os.path.dirname(test_path)):
		try:
			with open(test_path, "w") as f:
				for sequence in sequences:
					f.write("../fonts/AdobeBlank2.ttf;--no-glyph-names --no-positions --font-funcs=ot")
					f.write(";" + ",".join(sequence))
					# One "1=0" entry per code point, separated by a plain "|".
					f.write(";[" + "|".join(["1=0"] * len(sequence)) + "]\n")
		except OSError as e:
			print("warning: could not write %s: %s" % (test_path, e), file=sys.stderr)