[USE] Start moving Unicode-to-USE mapping into Python code

commit: 20e246e674155d5fb6527722fc3ef3accf2413df [log] [tgz]
author: Behdad Esfahbod <behdad@behdad.org> Mon Jul 20 15:56:19 2015 +0100
committer: Behdad Esfahbod <behdad@behdad.org> Mon Jul 20 15:56:19 2015 +0100
tree: 223836f20988365a8d7108db78039e124266025b
parent: eb74535cc2c0d0de41e54e75bdc71825ec969523 [diff] [blame]
diff --git a/src/gen-use-table.py b/src/gen-use-table.py
index 73aa379..a79becb 100755
--- a/src/gen-use-table.py
+++ b/src/gen-use-table.py

@@ -2,15 +2,16 @@
 
 import sys
 
-if len (sys.argv) != 4:
-	print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt"
+if len (sys.argv) != 5:
+	print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
 	sys.exit (1)
 
 BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
 
 files = [file (x) for x in sys.argv[1:]]
 
-headers = [[f.readline () for i in range (2)] for f in files]
+headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
+headers.append (["UnicodeData.txt does not have a header."])
 
 data = [{} for f in files]
 values = [{} for f in files]
@@ -32,29 +33,184 @@
 		else:
 			end = int (uu[1], 16)
 
-		t = fields[1]
+		t = fields[1 if i != 2 else 2]
 
 		for u in range (start, end + 1):
 			data[i][u] = t
 		values[i][t] = values[i].get (t, 0) + end - start + 1
 
 # Merge data into one dict:
-defaults = ('Other', 'Not_Applicable', 'No_Block')
+defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
 for i,v in enumerate (defaults):
 	values[i][v] = values[i].get (v, 0) + 1
 combined = {}
 for i,d in enumerate (data):
 	for u,v in d.items ():
-		if i == 2 and not u in combined:
+		if i >= 2 and not u in combined:
 			continue
 		if not u in combined:
 			combined[u] = list (defaults)
 		combined[u][i] = v
-combined = {k:v for k,v in combined.items() if v[2] not in BLACKLISTED_BLOCKS}
+combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
 data = combined
 del combined
 num = len (data)
 
+
+property_names = [ # each name listed here is wrapped in a PropertyValue and exported as a module global below
+	# General_Category
+	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
+	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
+	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
+	# Indic_Syllabic_Category
+	'Bindu',
+	'Visarga',
+	'Avagraha',
+	'Nukta',
+	'Virama',
+	'Pure_Killer',
+	'Invisible_Stacker',
+	'Vowel_Independent',
+	'Vowel_Dependent',
+	'Vowel',
+	'Consonant_Placeholder',
+	'Consonant',
+	'Consonant_Dead',
+	'Consonant_With_Stacker',
+	'Consonant_Prefixed',
+	'Consonant_Preceding_Repha',
+	'Consonant_Succeeding_Repha',
+	'Consonant_Subjoined',
+	'Consonant_Medial',
+	'Consonant_Final',
+	'Consonant_Head_Letter',
+	'Modifying_Letter',
+	'Tone_Letter',
+	'Tone_Mark',
+	'Gemination_Mark',
+	'Cantillation_Mark',
+	'Register_Shifter',
+	'Syllable_Modifier',
+	'Consonant_Killer',
+	'Non_Joiner',
+	'Joiner',
+	'Number_Joiner',
+	'Number',
+	'Brahmi_Joining_Number',
+	# Indic_Positional_Category
+	'Right',
+	'Left',
+	'Visual_Order_Left',
+	'Left_And_Right',
+	'Top',
+	'Bottom',
+	'Top_And_Bottom',
+	'Top_And_Right',
+	'Top_And_Left',
+	'Top_And_Left_And_Right',
+	'Bottom_And_Right',
+	'Top_And_Bottom_And_Right',
+	'Overstruck',
+]
+
+class PropertyValue(object): # named marker object for one property value; defines no __eq__, so it compares by identity
+	def __init__(self, name_):
+		self.name = name_ # the property-value name as spelled in property_names
+
+property_values = {} # name string -> PropertyValue
+
+for name in property_names:
+	value = PropertyValue(name)
+	assert name not in property_values # keys are name strings, so test the name: catches duplicates in property_names
+	assert name not in globals() # refuse to overwrite an existing module-level name
+	property_values[name] = value
+globals().update(property_values) # expose every value as a bare global (Lo, Virama, ...) for the is_* predicates
+
+# Predicates used by use_mapping: U is tested against code points, UISC/UGC against the Indic_Syllabic_Category / General_Category globals registered above.
+def is_BASE(U, UISC, UGC): # use_mapping key 'B'
+	return (UISC in [Number, Consonant, Consonant_Head_Letter, Consonant_Placeholder, Tone_Letter] or
+		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
+					Consonant_Subjoined, Vowel, Vowel_Dependent]))
+def is_BASE_VOWEL(U, UISC, UGC): # use_mapping key 'IV'
+	return UISC == Vowel_Independent
+def is_BASE_IND(U, UISC, UGC): # use_mapping key 'IND'
+	return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
+def is_BASE_NUM(U, UISC, UGC): # use_mapping key 'N'
+	return UISC == Brahmi_Joining_Number
+def is_BASE_OTHER(U, UISC, UGC): # use_mapping key 'GB'
+	return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC,
+		     0x25FB, 0x25FC, 0x25FD, 0x25FE]
+def is_CGJ(U, UISC, UGC): # use_mapping key 'CGJ'
+	return U == 0x034F
+def is_CONS_FINAL(U, UISC, UGC): # use_mapping key 'F'
+	return ((UISC == Consonant_Final and UGC != Lo) or
+		UISC == Consonant_Succeeding_Repha)
+def is_CONS_FINAL_MOD(U, UISC, UGC): # use_mapping key 'FM'
+	return UISC == Syllable_Modifier # Consonant_Final_Modifier dropped: it is not in property_names, so naming it raised NameError
+def is_CONS_MED(U, UISC, UGC): # use_mapping key 'M'
+	return UISC == Consonant_Medial and UGC != Lo
+def is_CONS_MOD(U, UISC, UGC): # use_mapping key 'CM'
+	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
+def is_CONS_SUB(U, UISC, UGC): # use_mapping key 'SUB'
+	return UISC == Consonant_Subjoined
+def is_HALANT(U, UISC, UGC): # use_mapping key 'H'
+	return UISC in [Virama, Invisible_Stacker]
+def is_HALANT_NUM(U, UISC, UGC): # use_mapping key 'HN'
+	return UISC == Number_Joiner
+def is_ZWNJ(U, UISC, UGC): # use_mapping key 'ZWNJ'
+	return UISC == Non_Joiner
+def is_ZWJ(U, UISC, UGC): # use_mapping key 'ZWJ'
+	return UISC == Joiner
+def is_Word_Joiner(U, UISC, UGC): # use_mapping key 'WJ'
+	return U == 0x2060
+def is_OTHER(U, UISC, UGC): # use_mapping key 'O'
+	return UGC == Zs # or any other SCRIPT_COMMON characters
+def is_Reserved(U, UISC, UGC): # use_mapping key 'Rsv'
+	return UGC == Cn # compare against the registered Cn object, like every other UGC test here, not the string 'Cn'
+def is_REPHA(U, UISC, UGC): # use_mapping key 'R'
+	return UISC == Consonant_Preceding_Repha
+def is_SYM(U, UISC, UGC): # use_mapping key 'S'
+	return UGC in [So, Sc] # Symbol_Letter dropped: it is not in property_names, so naming it raised NameError
+def is_SYM_MOD(U, UISC, UGC): # use_mapping key 'SM'
+	return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
+def is_VARIATION_SELECTOR(U, UISC, UGC): # use_mapping key 'VS'
+	return 0xFE00 <= U <= 0xFE0F
+def is_VOWEL(U, UISC, UGC): # use_mapping key 'V'
+	return (UISC == Pure_Killer or
+		(UGC != Lo and UISC in [Vowel, Vowel_Dependent]))
+def is_VOWEL_MOD(U, UISC, UGC): # use_mapping key 'VM'
+	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
+		(UGC != Lo and UISC == Bindu))
+
+use_mapping = { # short category code -> predicate taking (U, UISC, UGC); not yet consumed, see the commented-out map_to_use call below
+	'B':	is_BASE,
+	'IV':	is_BASE_VOWEL,
+	'IND':	is_BASE_IND,
+	'N':	is_BASE_NUM,
+	'GB':	is_BASE_OTHER,
+	'CGJ':	is_CGJ,
+	'F':	is_CONS_FINAL,
+	'FM':	is_CONS_FINAL_MOD,
+	'M':	is_CONS_MED,
+	'CM':	is_CONS_MOD,
+	'SUB':	is_CONS_SUB,
+	'H':	is_HALANT,
+	'HN':	is_HALANT_NUM,
+	'ZWNJ':	is_ZWNJ,
+	'ZWJ':	is_ZWJ,
+	'WJ':	is_Word_Joiner,
+	'O':	is_OTHER,
+	'Rsv':	is_Reserved,
+	'R':	is_REPHA,
+	'S':	is_SYM,
+	'SM':	is_SYM_MOD,
+	'VS':	is_VARIATION_SELECTOR,
+	'V':	is_VOWEL,
+	'VM':	is_VOWEL_MOD,
+}
+
+#data = map_to_use(data)
+
 # Remove the outliers
 singles = {}
 for u in [0x25CC, 0x1107F]:
@@ -65,7 +221,7 @@
 print "/*"
 print " * The following table is generated by running:"
 print " *"
-print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt"
+print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
 print " *"
 print " * on files with these headers:"
 print " *"
@@ -164,11 +320,11 @@
 for u in uu:
 	if u <= last:
 		continue
-	block = data[u][2]
+	block = data[u][3]
 
 	start = u//8*8
 	end = start+1
-	while end in uu and block == data[end][2]:
+	while end in uu and block == data[end][3]:
 		end += 1
 	end = (end-1)//8*8 + 7
commit	20e246e674155d5fb6527722fc3ef3accf2413df	[log] [tgz]
author	Behdad Esfahbod <behdad@behdad.org>	Mon Jul 20 15:56:19 2015 +0100
committer	Behdad Esfahbod <behdad@behdad.org>	Mon Jul 20 15:56:19 2015 +0100
tree	223836f20988365a8d7108db78039e124266025b
parent	eb74535cc2c0d0de41e54e75bdc71825ec969523 [diff] [blame]