| #!/usr/bin/env python |
| |
| from __future__ import print_function |
| import sys, os, re, difflib, unicodedata, errno, cgi |
| from itertools import * |
| |
# One prefix character per compared input: the i-th input's lines are marked
# with the i-th symbol (see ZipDiffer.diff_files and DiffColorizer).
diff_symbols = '-+=*&^%$#@!~/'
# Highlight colors, looked up with the same index as diff_symbols.
diff_colors = ["red", "green", "blue"]
| |
try:
    unichr = unichr
except NameError:
    # No unichr builtin (Python 3): chr already covers every code point.
    unichr = chr
else:
    if sys.maxunicode < 0x10FFFF:
        # workarounds for Python 2 "narrow" builds with UCS2-only support.

        _narrow_unichr = unichr

        def unichr(i):
            """
            Return the unicode character whose Unicode code is the integer 'i'.
            The valid range is 0 to 0x10FFFF inclusive.

            >>> _narrow_unichr(0xFFFF + 1)
            Traceback (most recent call last):
              File "<stdin>", line 1, in ?
            ValueError: unichr() arg not in range(0x10000) (narrow Python build)
            >>> unichr(0xFFFF + 1) == u'\U00010000'
            True
            >>> unichr(1114111) == u'\U0010FFFF'
            True
            >>> unichr(0x10FFFF + 1)
            Traceback (most recent call last):
              File "<stdin>", line 1, in ?
            ValueError: unichr() arg not in range(0x110000)
            """
            try:
                return _narrow_unichr(i)
            except ValueError:
                # Outside the narrow build's range: fall through and build
                # the character from a \UXXXXXXXX escape instead.
                pass
            escape_str = "\\U" + hex(i)[2:].zfill(8)
            try:
                return escape_str.decode("unicode-escape")
            except UnicodeDecodeError:
                raise ValueError('unichr() arg not in range(0x110000)')
| |
class ColorFormatter:
    """Namespace of output formatters used to highlight text.

    Each nested formatter (Null, ANSI, HTML) offers the same four static
    methods: start_color (c), end_color (), escape (s) and newline ().
    Auto () picks one of them from command-line style flags.
    """

    class Null:
        """Formatter that adds no markup at all."""

        @staticmethod
        def start_color (c):
            return ''

        @staticmethod
        def end_color ():
            return ''

        @staticmethod
        def escape (s):
            return s

        @staticmethod
        def newline ():
            return '\n'

    class ANSI:
        """Formatter emitting ANSI terminal escape sequences."""

        @staticmethod
        def start_color (c):
            """Return the escape sequence for color name c.

            Raises KeyError for names other than 'red', 'green', 'blue'.
            """
            return {
                'red': '\033[41;37;1m',
                'green': '\033[42;37;1m',
                'blue': '\033[44;37;1m',
            }[c]

        @staticmethod
        def end_color ():
            return '\033[m'

        @staticmethod
        def escape (s):
            return s

        @staticmethod
        def newline ():
            return '\n'

    class HTML:
        """Formatter emitting HTML <span> markup."""

        @staticmethod
        def start_color (c):
            return '<span style="background:%s">' % c

        @staticmethod
        def end_color ():
            return '</span>'

        @staticmethod
        def escape (s):
            """Escape '&', '<' and '>' in s; quotes are left untouched."""
            try:
                # cgi.escape was removed in Python 3.8; html.escape is its
                # replacement.  quote=False keeps cgi.escape's default of
                # not touching quote characters.
                from html import escape as html_escape
            except ImportError:
                # Python 2 has no html module.
                return cgi.escape (s)
            return html_escape (s, quote=False)

        @staticmethod
        def newline ():
            return '<br/>\n'

    @staticmethod
    def Auto (argv = None, out = sys.stdout):
        """Choose a formatter class from the flags found in argv.

        Recognized flags are removed from argv in place (first occurrence
        only), so the caller's list no longer contains them afterwards.

        argv: list of argument strings; None means no arguments.
        out:  accepted but not used by this function.

        Returns ColorFormatter.ANSI unless a flag selects otherwise.
        """
        if argv is None:
            argv = []
        chosen = ColorFormatter.ANSI
        # Checked in this fixed order; when several flags are present, the
        # last one in this table wins.
        flag_table = (
            ("--format", ColorFormatter.ANSI),
            ("--format=ansi", ColorFormatter.ANSI),
            ("--format=html", ColorFormatter.HTML),
            ("--no-format", ColorFormatter.Null),
        )
        for flag, formatter in flag_table:
            if flag in argv:
                argv.remove (flag)
                chosen = formatter
        return chosen
| |
| |
class DiffColorizer:
    """Highlights the tokens that differ between paired diff lines.

    formatter: object providing start_color, end_color, escape and newline
               (e.g. one of the ColorFormatter classes).
    colors:    color names, indexed like symbols.
    symbols:   line-prefix characters; only the first two positions can be
               paired by this class (the pending-lines buffer has two slots).
    """

    # Tokenizer: a (possibly empty) run of word characters followed by at
    # most one non-word character.  Upper-case letters are part of the word
    # class so that a mixed-case identifier stays a single token.
    diff_regex = re.compile (r'([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

    def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
        self.formatter = formatter
        self.colors = colors
        self.symbols = symbols

    def _end_open_colors (self, oo, st):
        # Close any color span still open on either side (mutates both lists).
        for i in range (2):
            if st[i]:
                oo[i] += self.formatter.end_color ()
                st[i] = False

    def colorize_lines (self, lines):
        """Return the two given lines with their differing tokens colored.

        lines: pair of strings; a false entry (None, '') counts as ''.

        Returns a list with one output line per side that produced any
        text, each prefixed with that side's symbol and terminated by
        formatter.newline ().
        """
        lines = (l if l else '' for l in lines)
        # Put every token on its own line so Differ compares token by token.
        ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
        oo = ["", ""]        # output text accumulated per side
        st = [False, False]  # whether a color span is currently open per side
        for l in difflib.Differ ().compare (*ss):
            if l[0] == '?':
                # Differ's intraline hint lines carry no text of their own.
                continue
            if l[0] == ' ':
                # Token common to both sides: close open spans, copy to both.
                self._end_open_colors (oo, st)
                oo = [o + self.formatter.escape (l[2:]) for o in oo]
                continue
            if l[0] in self.symbols:
                i = self.symbols.index (l[0])
                if not st[i]:
                    oo[i] += self.formatter.start_color (self.colors[i])
                    st[i] = True
                oo[i] += self.formatter.escape (l[2:])
                continue
        self._end_open_colors (oo, st)
        # Drop the newlines that were inserted by the tokenizer.
        oo = [o.replace ('\n', '') for o in oo]
        return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]

    def colorize_diff (self, f):
        """Yield the lines of diff f with paired changed lines colorized.

        f: iterable of non-empty lines (the first character is inspected).

        Lines not starting with one of self.symbols are escaped and yielded
        immediately.  Symbol lines are buffered per side and flushed through
        colorize_lines once both sides are present, or when a side receives
        a second line, or at end of input.
        """
        lines = [None, None]
        for l in f:
            if l[0] not in self.symbols:
                yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
                continue
            i = self.symbols.index (l[0])
            if lines[i]:
                # This side already holds a line: flush before replacing it.
                for line in self.colorize_lines (lines):
                    yield line
                lines = [None, None]
            lines[i] = l[1:]
            if (all (lines)):
                # Both sides present: flush the pair.
                for line in self.colorize_lines (lines):
                    yield line
                lines = [None, None]
        if (any (lines)):
            # Flush whatever is still pending at end of input.
            for line in self.colorize_lines (lines):
                yield line
| |
| |
class ZipDiffer:
    """Line-by-line ("zipped") comparison of several inputs."""

    @staticmethod
    def diff_files (files, symbols=diff_symbols):
        """Write a positional diff of the given inputs to standard output.

        files:   iterable of line iterables (e.g. open files).
        symbols: prefix characters; the i-th input's lines get symbols[i].

        The inputs are read in lockstep.  When the lines at one position
        are equal in all inputs, a single copy prefixed with " " is
        written; otherwise each input's line is written with its own
        symbol.  Inputs that ran out of lines contribute nothing.

        On IOError the process exits with status 1; a message is printed to
        standard error unless the error is EPIPE.
        """
        # itertools.izip_longest only exists on Python 2; Python 3 names it
        # zip_longest.
        try:
            from itertools import izip_longest as zip_longest
        except ImportError:
            from itertools import zip_longest

        files = tuple (files) # in case it's a generator, copy it
        try:
            for lines in zip_longest (*files):
                if all (lines[0] == line for line in lines[1:]):
                    sys.stdout.writelines ([" ", lines[0]])
                    continue

                for i, l in enumerate (lines):
                    if l:
                        sys.stdout.writelines ([symbols[i], l])
        except IOError as e:
            if e.errno != errno.EPIPE:
                print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
            sys.exit (1)
| |
| |
class DiffFilters:
    """Generators that filter the lines of a diff."""

    @staticmethod
    def filter_failures (f):
        """Yield only the lines of test cases that did not pass.

        Test cases are delimited by DiffHelpers.separate_test_cases and
        judged by DiffHelpers.test_passed.
        """
        for _key, group in DiffHelpers.separate_test_cases (f):
            # Materialize the group: it is scanned once for the verdict and
            # once more when it is emitted.
            case_lines = list (group)
            if DiffHelpers.test_passed (case_lines):
                continue
            for entry in case_lines:
                yield entry
| |
class Stat:
    """Running tally: how many tests were added and the sum of their freq."""

    def __init__ (self):
        self.count = self.freq = 0

    def add (self, test):
        """Record one test, accumulating its 'freq' attribute."""
        self.count = self.count + 1
        self.freq = self.freq + test.freq
| |
class Stats:
    """Pass/fail tallies for a set of tests, with simple derived statistics."""

    def __init__ (self):
        self.passed, self.failed, self.total = Stat (), Stat (), Stat ()

    def add (self, test):
        """Record test in the total and in the passed or failed tally."""
        self.total.add (test)
        bucket = self.passed if test.passed else self.failed
        bucket.add (test)

    def mean (self):
        """Return the fraction of recorded tests that passed."""
        return float (self.passed.count) / self.total.count

    def variance (self):
        """Return pass-fraction times fail-fraction."""
        pass_fraction = float (self.passed.count) / self.total.count
        fail_fraction = float (self.failed.count) / self.total.count
        return pass_fraction * fail_fraction

    def stddev (self):
        """Return the square root of variance ()."""
        return self.variance () ** .5

    def zscore (self, population):
        """Calculate the standard score.
        Population is the Stats for population.
        Self is Stats for sample.
        Returns larger absolute value if sample is highly unlikely to be random.
        Anything outside of -3..+3 is very unlikely to be random.
        See: http://en.wikipedia.org/wiki/Standard_score"""

        deviation = self.mean () - population.mean ()
        return deviation / population.stddev ()
| |
| |
| |
| |
class DiffSinks:
    """Consumers that read a whole diff and print a summary."""

    @staticmethod
    def print_stat (f):
        """Print how many test cases in diff f passed and failed.

        Test cases are delimited by DiffHelpers.separate_test_cases and
        judged by DiffHelpers.test_passed.
        """
        passed = 0
        failed = 0
        # XXX port to Stats, but that would really slow us down here
        for key, lines in DiffHelpers.separate_test_cases (f):
            if DiffHelpers.test_passed (lines):
                passed += 1
            else:
                failed += 1
        total = passed + failed
        # Empty input has no test cases; report 0% rather than divide by zero.
        failed_percent = 100. * failed / total if total else 0.
        print ("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, failed_percent))

    @staticmethod
    def print_ngrams (f, ns=(1,2,3)):
        """Print a z-score line for every frequently failing code-point n-gram.

        f:  iterable of diff lines.
        ns: n-gram lengths to collect from each test's 'unicodes'.

        Only n-grams seen in at least 30 failed tests are reported.
        """
        gens = tuple (Ngram.generator (n) for n in ns)
        allstats = Stats ()
        allgrams = {}
        for key, lines in DiffHelpers.separate_test_cases (f):
            test = Test (lines)
            allstats.add (test)

            for gen in gens:
                for ngram in gen (test.unicodes):
                    if ngram not in allgrams:
                        allgrams[ngram] = Stats ()
                    allgrams[ngram].add (test)

        # dict.items () works on both Python 2 and 3 (iteritems is 2-only).
        importantgrams = dict ((ngram, stats)
                               for ngram, stats in allgrams.items ()
                               if stats.failed.count >= 30) # for statistical reasons
        allgrams = importantgrams
        del importantgrams

        # NOTE(review): zscore divides by the population's stddev, which is
        # zero when every test failed; that case still raises here.
        for ngram, stats in allgrams.items ():
            print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
| |
| |
| |
class Test:
    """One test case parsed from the diff lines that share its identifier.

    Attributes set by the constructor:
      freq        always 1
      passed      False as soon as any line does not start with ' '
      identifier  text between the prefix character and the first ':'
      text        payload of the last '(...)' line, or None
      unicodes    Unicode.parse () of the last '<...>' line, or None
      glyphs      payload of the last '[...]' line, or None
    """

    def __init__ (self, lines):
        """Parse lines of the form '<symbol><identifier>: <bracketed payload>'.

        Each line must be long enough to contain the opening bracket;
        shorter lines raise IndexError.
        """
        self.freq = 1
        self.passed = True
        self.identifier = None
        self.text = None
        self.unicodes = None
        self.glyphs = None
        for l in lines:
            symbol = l[0]
            if symbol != ' ':
                self.passed = False
            i = 1
            if ':' in l:
                i = l.index (':')
            if not self.identifier:
                self.identifier = l[1:i]
            i = i + 2 # Skip colon and space
            # j indexes the closing bracket: the last character, or the one
            # before a trailing newline.
            j = -1
            if l[j] == '\n':
                j -= 1
            brackets = l[i] + l[j]
            # Slice up to j (not a fixed -2) so the payload is also intact
            # when the line has no trailing newline.
            l = l[i+1:j]
            if brackets == '()':
                self.text = l
            elif brackets == '<>':
                self.unicodes = Unicode.parse (l)
            elif brackets == '[]':
                # XXX we don't handle failed tests here
                self.glyphs = l
| |
| |
class DiffHelpers:
    """Helpers for splitting a diff into test cases and judging them."""

    @staticmethod
    def separate_test_cases (f):
        '''Group consecutive lines of f by test identifier.

        A line's identifier is the text between its first character and
        its first colon; a line with no colon after its first character
        is its own key.  Returns the itertools.groupby iterator of
        (identifier, lines) pairs.'''

        def group_key (line):
            if line.find (':', 1) < 0:
                return line
            return line[1:line.index (':')]

        return groupby (f, group_key)

    @staticmethod
    def test_passed (lines):
        '''Return True if the test case made of these lines counts as passed.

        A case passes when every line starts with ' ', or when any '+'
        line contains one of the known placeholder markers.'''
        case = list (lines)
        # XXX This is a hack, but does the job for now.
        markers = (
            "space+0|space+0",
            "uni25CC",
            "dottedcircle",
            "glyph0",
            "gid0",
            "notdef",
        )
        for marker in markers:
            if any (entry.find (marker) >= 0 for entry in case if entry[0] == '+'):
                return True
        for entry in case:
            if entry[0] != ' ':
                return False
        return True
| |
| |
class FilterHelpers:
    """Adapters that turn a filter into a function printing its results."""

    @staticmethod
    def filter_printer_function (filter_callback):
        """Return a function that print ()s every item of filter_callback (f)."""
        def printer (f):
            results = filter_callback (f)
            for item in results:
                print (item)
        return printer

    @staticmethod
    def filter_printer_function_no_newline (filter_callback):
        """Like filter_printer_function, but writes each item to standard
        output as is, without appending a newline."""
        def printer (f):
            results = filter_callback (f)
            for item in results:
                sys.stdout.writelines ((item,))
        return printer
| |
| |
class Ngram:
    """Factory for sliding-window n-gram generators."""

    @staticmethod
    def generator (n):
        """Return a generator function yielding every run of n consecutive
        items of its argument as a tuple.  The function's 'n' attribute
        records the window length."""

        def gen (f):
            window = []
            for item in f:
                window.append (item)
                if len (window) != n:
                    continue
                yield tuple (window)
                # Slide: forget the oldest item.
                del window[0]

        gen.n = n
        return gen
| |
| |
class UtilMains:
    """Command-line drivers: apply a callback to the arguments in sys.argv.

    All drivers print a usage message and exit with status 1 when "--help"
    is among the arguments, and exit with status 1 on IOError.
    """

    @staticmethod
    def _exit_on_io_error (e):
        # Shared IOError policy: report the error unless it is a broken
        # pipe, then exit with status 1 either way.
        if e.errno != errno.EPIPE:
            print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
        sys.exit (1)

    @staticmethod
    def process_multiple_files (callback, mnemonic = "FILE"):
        """Call callback (file) for every file named on the command line.

        With no arguments, standard input is used ('-' also selects it).
        Files opened here are closed once the callback returns; standard
        input is left open.
        """

        if "--help" in sys.argv:
            print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit (1)

        try:
            files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
            for s in files:
                f = FileHelpers.open_file_or_stdin (s)
                try:
                    callback (f)
                finally:
                    if f is not sys.stdin:
                        f.close ()
        except IOError as e:
            UtilMains._exit_on_io_error (e)

    @staticmethod
    def process_multiple_args (callback, mnemonic):
        """Call callback (arg) for every command-line argument.

        Prints usage and exits with status 1 when there are no arguments.
        """

        if len (sys.argv) == 1 or "--help" in sys.argv:
            print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit (1)

        try:
            for s in sys.argv[1:]:
                callback (s)
        except IOError as e:
            UtilMains._exit_on_io_error (e)

    @staticmethod
    def filter_multiple_strings_or_stdin (callback, mnemonic, \
                                          separator = " ", \
                                          concat_separator = False):
        """Print callback (x) for each argument, or for each stdin line.

        With no arguments, every line of standard input (minus its
        trailing newline) is passed to callback and the result printed on
        its own line.  Otherwise the results for all arguments are printed
        on one line joined by separator.

        concat_separator: when not False, the arguments are first joined
        with this string and callback is applied once to the result.
        """

        if "--help" in sys.argv:
            print ("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input." \
                   % (sys.argv[0], mnemonic, sys.argv[0]))
            sys.exit (1)

        try:
            if len (sys.argv) == 1:
                while (1):
                    line = sys.stdin.readline ()
                    if not len (line):
                        break
                    if line[-1] == '\n':
                        line = line[:-1]
                    print (callback (line))
            else:
                args = sys.argv[1:]
                # Compared against False (not truthiness) so that an empty
                # string is still a valid concat_separator.
                if concat_separator != False:
                    args = [concat_separator.join (args)]
                print (separator.join (callback (x) for x in (args)))
        except IOError as e:
            UtilMains._exit_on_io_error (e)
| |
| |
class Unicode:
    """Conversions between text, code-point lists and character names."""

    @staticmethod
    def decode (s):
        """Return the code points of s as 'U+XXXX,U+XXXX,...'.

        s may be text or UTF-8 encoded bytes.  The result is a native str
        on both Python 2 (UTF-8 encoded) and Python 3.
        """
        if isinstance (s, bytes):
            s = s.decode ('utf-8')
        text = u','.join (u"U+%04X" % ord (u) for u in s)
        if sys.version_info[0] == 2: text = text.encode ('utf-8')
        return text

    @staticmethod
    def parse (s):
        """Return the list of code points written in s as hex numbers.

        '0x'/'0X' prefixes and the characters <+>{},;&#\\xXuUnNiI, newline
        and space act as separators; every remaining token is parsed as
        base 16 (ValueError if it is not valid hex).
        """
        s = re.sub (r"0[xX]", " ", s)
        s = re.sub (r"[<+>{},;&#\\xXuUnNiI\n ]", " ", s)
        return [int (x, 16) for x in s.split ()]

    @staticmethod
    def encode (s):
        """Return the text made of the code points parsed from s.

        The result is a native str (UTF-8 encoded on Python 2).
        """
        s = u''.join (unichr (x) for x in Unicode.parse (s))
        if sys.version_info[0] == 2: s = s.encode ('utf-8')
        return s

    # Abbreviations substituted for whole character names by pretty_name.
    shorthands = {
        "ZERO WIDTH NON-JOINER": "ZWNJ",
        "ZERO WIDTH JOINER": "ZWJ",
        "NARROW NO-BREAK SPACE": "NNBSP",
        "COMBINING GRAPHEME JOINER": "CGJ",
        "LEFT-TO-RIGHT MARK": "LRM",
        "RIGHT-TO-LEFT MARK": "RLM",
        "LEFT-TO-RIGHT EMBEDDING": "LRE",
        "RIGHT-TO-LEFT EMBEDDING": "RLE",
        "POP DIRECTIONAL FORMATTING": "PDF",
        "LEFT-TO-RIGHT OVERRIDE": "LRO",
        "RIGHT-TO-LEFT OVERRIDE": "RLO",
    }

    @staticmethod
    def pretty_name (u):
        """Return a shortened form of the Unicode name of character u.

        Returns "XXX" when unicodedata has no name for u.
        """
        try:
            s = unicodedata.name (u)
        except ValueError:
            return "XXX"
        s = re.sub (".* LETTER ", "", s)
        s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
        s = re.sub (".* SIGN ", "", s)
        s = re.sub (".* COMBINING ", "", s)
        # NOTE(review): names of the form "<SCRIPT> SIGN VIRAMA" have already
        # been reduced to "VIRAMA" above and so do not match this pattern;
        # confirm whether they were meant to become "HALANT" too.
        if re.match (".* VIRAMA", s):
            s = "HALANT"
        if s in Unicode.shorthands:
            s = Unicode.shorthands[s]
        return s

    @staticmethod
    def pretty_names (s):
        """Return the pretty names of the code points in s joined by ' + '.

        s holds hex code points separated by commas, spaces or newlines,
        optionally decorated with <, +, >, backslash, u, U or 0x.  The
        result is a native str (UTF-8 encoded on Python 2).
        """
        s = re.sub (r"[<+>\\uU]", " ", s)
        s = re.sub (r"0[xX]", " ", s)
        s = [unichr (int (x, 16)) for x in re.split ('[, \n]', s) if len (x)]
        names = u' + '.join (Unicode.pretty_name (x) for x in s)
        if sys.version_info[0] == 2: names = names.encode ('utf-8')
        return names
| |
| |
class FileHelpers:
    """Small helpers for opening input files."""

    @staticmethod
    def open_file_or_stdin (f):
        """Return sys.stdin when f is '-', otherwise f opened for reading.

        The caller owns the returned file and is responsible for closing it.
        """
        if f == '-':
            return sys.stdin
        # open () works on Python 2 and 3; the file () builtin is 2-only.
        return open (f)
| |
| |
class Manifest:
    """Reading and generating per-directory MANIFEST listings."""

    @staticmethod
    def read (s, strict = True):
        """Yield the files reachable from path s.

        A regular file yields its own normalized path.  A directory is
        expanded through the entries listed, one per line, in its MANIFEST
        file; blank lines are ignored.

        strict: when true, a missing path or MANIFEST prints a message to
        standard error and exits with status 1; otherwise it is skipped.
        """

        if not os.path.exists (s):
            if strict:
                print ("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
                sys.exit (1)
            return

        s = os.path.normpath (s)

        if os.path.isdir (s):

            try:
                with open (os.path.join (s, "MANIFEST")) as m:
                    items = [x.strip () for x in m.readlines ()]
                for f in items:
                    if not f:
                        # A blank entry would join back to the directory
                        # itself and recurse without end.
                        continue
                    # NOTE(review): 'strict' is not passed down, so nested
                    # entries are always read strictly; confirm intent.
                    for p in Manifest.read (os.path.join (s, f)):
                        yield p
            except IOError:
                if strict:
                    print ("%s: %s does not exist" % (sys.argv[0], os.path.join (s, "MANIFEST")), file=sys.stderr)
                    sys.exit (1)
                return
        else:
            yield s

    @staticmethod
    def update_recursive (s):
        """(Re)write a MANIFEST file in s and in every directory below it.

        Each MANIFEST lists the directory's files, then its subdirectories,
        both sorted, one per line.  Entries named MANIFEST, README, LICENSE,
        COPYING, AUTHORS, SOURCES or ChangeLog are left out, and directories
        with those names are not descended into.
        """

        # os.walk already visits every subdirectory, so no explicit
        # recursion is needed.
        for dirpath, dirnames, filenames in os.walk (s, followlinks=True):

            # Pruning dirnames in place also stops os.walk from descending.
            for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
                if f in dirnames:
                    dirnames.remove (f)
                if f in filenames:
                    filenames.remove (f)
            # Sorting in place also fixes the order of the walk.
            dirnames.sort ()
            filenames.sort ()
            ms = os.path.join (dirpath, "MANIFEST")
            print (" GEN %s" % ms)
            with open (ms, "w") as m:
                for f in filenames:
                    print (f, file=m)
                for f in dirnames:
                    print (f, file=m)
| |
if __name__ == '__main__':
    # Library module: nothing to do when run directly.
    pass