test/vector/hb-svg-compare - third_party/harfbuzz - Git at Google

 #!/usr/bin/env python3

 import xml.etree.ElementTree as ET
 import re
 import math

 def tokenize_path_data(d_attribute):
     """
     Split an SVG path 'd' attribute string into a flat list of tokens.

     Tokens are either single-letter path commands (M, m, L, l, C, c, etc.),
     kept as strings, or numeric parameters converted to float. Any other
     characters (whitespace, commas) only separate tokens and are discarded.

     Numbers may carry a sign, a fractional part and an exponent. The integer
     part is optional, so compact forms such as '.5' or '-.25' are read as
     0.5 and -0.25 instead of being truncated to their digits.

     Returns a list of strings/floats,
     e.g. ['M', 10.0, 20.0, 'L', 30.5, 40.7, 'Z'].
     """
     # Group 1: one of the valid path command letters.
     # Group 2: a number -- optional sign, then either digits with an optional
     #          fraction ('12', '12.', '12.5') or a bare fraction ('.5'),
     #          then an optional exponent ('e-3').
     token_pattern = re.compile(
         r'([MmZzLlHhVvCcSsQqTtAa])'
         r'|([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)')

     tokens = []
     for match in token_pattern.finditer(d_attribute):
         command_letter, number_string = match.groups()
         if command_letter:
             # It's a path command like 'M', 'L', etc.
             tokens.append(command_letter)
         else:
             # It's a numeric value
             tokens.append(float(number_string))

     return tokens

 def extract_paths_from_svg(svg_data):
     """
     Parse SVG markup and tokenize the 'd' attribute of every <path>.

     Only <path> elements in the SVG namespace (http://www.w3.org/2000/svg)
     below the root element are considered, in document order. Paths whose
     'd' attribute is missing or empty are skipped.

     Returns a list with one token list (see tokenize_path_data) per path.
     """
     # ElementTree spells namespaced tags as '{namespace-uri}localname'.
     # An SVG written without the default SVG namespace would need the plain
     # './/path' query instead; such paths are not matched here.
     svg_path_query = './/{http://www.w3.org/2000/svg}path'

     document = ET.fromstring(svg_data)
     path_data = (element.get('d')
                  for element in document.iterfind(svg_path_query))
     return [tokenize_path_data(d) for d in path_data if d]

 def normalize_tokens(tokens):
     """
     Normalize path tokens for structural comparison.

     Walks the token stream while tracking the current point and drops
     degenerate linear segments, i.e. segments that do not move the pen:
       - L/l where endpoint equals current point (this includes the
         implicit lineto pairs that follow the first pair of an M/m)
       - H/h where x stays unchanged
       - V/v where y stays unchanged
     A linear command whose segments are all dropped is removed entirely.
     Curve and arc commands (C/c, S/s, Q/q, T/t, A/a) are copied through
     unchanged and are only used to update the current point.

     Malformed input is returned unchanged: a number where a command letter
     is expected (at the start of the path or right after Z/z), or an M/m
     with fewer than two parameters. A dangling coordinate at the end of an
     M/m or L/l parameter list is discarded.

     Returns a new list of tokens, or `tokens` itself when it is malformed.
     """
     # Parameters per segment for the commands that are passed through
     # untouched; a segment's endpoint is always its last two parameters.
     segment_size = {
         letter: size
         for letters, size in (('Cc', 6), ('SsQq', 4), ('Tt', 2), ('Aa', 7))
         for letter in letters
     }

     def keep_moving_pairs(coords, relative, x, y):
         """Filter lineto (x, y) pairs, dropping those that stay in place."""
         kept = []
         for k in range(0, len(coords) - 1, 2):
             px, py = coords[k], coords[k + 1]
             nx, ny = (x + px, y + py) if relative else (px, py)
             if nx != x or ny != y:
                 kept.extend([px, py])
                 x, y = nx, ny
         return kept, x, y

     def keep_moving_scalars(coords, relative, pos):
         """Filter H/V parameters, dropping those that leave pos unchanged."""
         kept = []
         for value in coords:
             new_pos = pos + value if relative else value
             if new_pos != pos:
                 kept.append(value)
                 pos = new_pos
         return kept, pos

     out = []
     cx = cy = 0.0  # current point
     sx = sy = 0.0  # start of the current subpath (where Z/z returns to)
     have_current = False
     count = len(tokens)
     i = 0

     while i < count:
         cmd = tokens[i]
         if not isinstance(cmd, str):
             # Each parameter run is consumed together with its command
             # below, so a number can only show up here at the very start of
             # the path or directly after Z/z. Both are malformed; bail out
             # rather than re-processing the same token forever.
             return tokens
         i += 1

         if cmd in ('Z', 'z'):
             out.append(cmd)
             if have_current:
                 cx, cy = sx, sy
             continue

         # Collect the run of numeric parameters belonging to this command.
         end = i
         while end < count and not isinstance(tokens[end], str):
             end += 1
         values = list(tokens[i:end])
         i = end

         relative = cmd.islower()

         if cmd in ('M', 'm'):
             if len(values) < 2:
                 return tokens

             # First pair is the moveto itself and cannot be dropped.
             out.extend([cmd, values[0], values[1]])
             if relative and have_current:
                 cx += values[0]
                 cy += values[1]
             else:
                 # Absolute moveto, or a relative one with no current point
                 # yet (which is treated as absolute).
                 cx, cy = values[0], values[1]
             sx, sy = cx, cy
             have_current = True

             # Remaining pairs are implicit lineto segments.
             kept, cx, cy = keep_moving_pairs(values[2:], relative, cx, cy)
             out.extend(kept)
             continue

         if cmd in ('L', 'l'):
             kept, cx, cy = keep_moving_pairs(values, relative, cx, cy)
         elif cmd in ('H', 'h'):
             kept, cx = keep_moving_scalars(values, relative, cx)
         elif cmd in ('V', 'v'):
             kept, cy = keep_moving_scalars(values, relative, cy)
         else:
             size = segment_size.get(cmd)
             if size is None:
                 # Unknown command: keep as-is.
                 out.append(cmd)
                 out.extend(values)
                 continue

             # Curve/arc: keep every parameter and just follow the endpoint
             # of each complete segment to track the current point.
             for stop in range(size, len(values) + 1, size):
                 ex, ey = values[stop - 2], values[stop - 1]
                 if relative:
                     cx += ex
                     cy += ey
                 else:
                     cx, cy = ex, ey
             have_current = True
             kept = values

         # A command left without parameters is omitted altogether.
         if kept:
             out.append(cmd)
             out.extend(kept)

     return out

 def compare_token_lists_exact(tokens_a, tokens_b):
     """
     Compare two path token lists position by position.

     The lists match structurally when they have the same length, every
     command (string) token faces an identical command, and every float
     token faces another float.

     Returns the largest absolute difference between paired numeric tokens
     (0 when there are none), or None when the lists differ structurally.
     """
     if len(tokens_a) != len(tokens_b):
         return None  # Different lengths => not a match

     largest = 0
     for left, right in zip(tokens_a, tokens_b):
         if isinstance(left, str) and isinstance(right, str):
             # Commands must be exactly the same letter.
             if left != right:
                 return None
             continue

         if not (isinstance(left, float) and isinstance(right, float)):
             # A command facing a number (or any other type mix) => mismatch
             return None

         largest = max(largest, abs(left - right))

     return largest

 def compare_token_lists(tokens_a, tokens_b):
     """
     Compare two path token streams, tolerating degenerate linear segments.

     The streams are first compared as they are. Only if that reports a
     structural mismatch are both streams normalized (no-op L/l/H/h/V/v
     segments removed) and compared again.

     Returns whatever compare_token_lists_exact returns for the comparison
     that was decisive: the max numeric difference, or None on mismatch.
     """
     exact = compare_token_lists_exact(tokens_a, tokens_b)
     if exact is None:
         # Structural mismatch: retry with degenerate segments stripped.
         return compare_token_lists_exact(normalize_tokens(tokens_a),
                                          normalize_tokens(tokens_b))
     return exact

 def compare_svg_files(svg_file_1, svg_file_2):
     """
     Compare the <path> geometry of two SVG files.

     The files match structurally when they contain the same number of
     <path> elements and each corresponding pair of paths has the same
     structure of commands.

     Returns 0 when the two files have identical text, otherwise the max
     difference between respective numeric values in the paths, or None
     when the path counts or any pair of paths differ structurally.
     """
     # Read through context managers so the handles are closed right away;
     # this function is called once per input pair and would otherwise leave
     # the files open until garbage collection.
     with open(svg_file_1) as handle:
         svg_data_1 = handle.read()
     with open(svg_file_2) as handle:
         svg_data_2 = handle.read()

     # If contents match exactly, return 0 without parsing anything.
     if svg_data_1 == svg_data_2:
         return 0

     paths1 = extract_paths_from_svg(svg_data_1)
     paths2 = extract_paths_from_svg(svg_data_2)

     # Check that we have the same number of <path> elements
     if len(paths1) != len(paths2):
         return None

     # Compare each path token list
     max_diff = 0
     for tokens1, tokens2 in zip(paths1, paths2):
         ret = compare_token_lists(tokens1, tokens2)
         if ret is None:
             return None
         max_diff = max(max_diff, ret)

     return max_diff


 if __name__ == "__main__":
     # Command-line driver: reads pairs of SVG file names from stdin and
     # prints one line for every pair that differs by more than TOLERANCE.
     import sys

     if '--help' in sys.argv[1:]:
         print("Usage: hb-svg-compare TOLERANCE < file_with_svg_pairs.txt")
         sys.exit(1)

     # Optional first argument: largest numeric difference still accepted.
     tolerance = float(sys.argv[1]) if len(sys.argv) > 1 else 0

     # Read all lines of two SVG file paths from stdin and compare
     for line in sys.stdin:
         fields = line.split()
         if not fields:
             # Blank line (e.g. trailing newline in the list): nothing to
             # compare, and unpacking it below would raise ValueError.
             continue
         svg1, svg2 = fields
         diff = compare_svg_files(svg1, svg2)

         if diff is None:
             # Structural mismatch between the two files.
             label = "DIFF"
         elif diff <= tolerance:
             continue
         else:
             label = f"{diff:g}"

         # Flush stdout to make sure output is immediate
         print(f"{label}\t{svg1}\t{svg2}", flush=True)
	#!/usr/bin/env python3

	import xml.etree.ElementTree as ET
	import re
	import math

	def tokenize_path_data(d_attribute):
	    """
	    Split an SVG path 'd' attribute string into a flat list of tokens.

	    Tokens are either single-letter path commands (M, m, L, l, C, c, etc.),
	    kept as strings, or numeric parameters converted to float. Any other
	    characters (whitespace, commas) only separate tokens and are discarded.

	    Numbers may carry a sign, a fractional part and an exponent. The integer
	    part is optional, so compact forms such as '.5' or '-.25' are read as
	    0.5 and -0.25 instead of being truncated to their digits.

	    Returns a list of strings/floats,
	    e.g. ['M', 10.0, 20.0, 'L', 30.5, 40.7, 'Z'].
	    """
	    # Group 1: one of the valid path command letters.
	    # Group 2: a number -- optional sign, then either digits with an optional
	    #          fraction ('12', '12.', '12.5') or a bare fraction ('.5'),
	    #          then an optional exponent ('e-3').
	    # The two groups are alternatives: the separator must be a plain '|',
	    # not an escaped '\|' (which would demand a literal pipe character).
	    token_pattern = re.compile(
	        r'([MmZzLlHhVvCcSsQqTtAa])'
	        r'|([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)')

	    tokens = []
	    for match in token_pattern.finditer(d_attribute):
	        command_letter, number_string = match.groups()
	        if command_letter:
	            # It's a path command like 'M', 'L', etc.
	            tokens.append(command_letter)
	        else:
	            # It's a numeric value
	            tokens.append(float(number_string))

	    return tokens

	def extract_paths_from_svg(svg_data):
	    """
	    Parse SVG markup and tokenize the 'd' attribute of every <path>.

	    Only <path> elements in the SVG namespace (http://www.w3.org/2000/svg)
	    below the root element are considered, in document order. Paths whose
	    'd' attribute is missing or empty are skipped.

	    Returns a list with one token list (see tokenize_path_data) per path.
	    """
	    # ElementTree spells namespaced tags as '{namespace-uri}localname'.
	    # An SVG written without the default SVG namespace would need the plain
	    # './/path' query instead; such paths are not matched here.
	    svg_path_query = './/{http://www.w3.org/2000/svg}path'

	    root = ET.fromstring(svg_data)

	    all_token_lists = []
	    for path_elem in root.findall(svg_path_query):
	        d_attribute = path_elem.get('d')
	        if d_attribute:
	            all_token_lists.append(tokenize_path_data(d_attribute))

	    return all_token_lists

	def normalize_tokens(tokens):
	    """
	    Normalize path tokens for structural comparison.

	    Walks the token stream while tracking the current point and drops
	    degenerate linear segments, i.e. segments that do not move the pen:
	      - L/l where endpoint equals current point (this includes the
	        implicit lineto pairs that follow the first pair of an M/m)
	      - H/h where x stays unchanged
	      - V/v where y stays unchanged
	    A linear command whose segments are all dropped is removed entirely.
	    Curve and arc commands (C/c, S/s, Q/q, T/t, A/a) are copied through
	    unchanged and are only used to update the current point.

	    Malformed input is returned unchanged: a number where a command letter
	    is expected (at the start of the path or right after Z/z), or an M/m
	    with fewer than two parameters. A dangling coordinate at the end of an
	    M/m or L/l parameter list is discarded.

	    Returns a new list of tokens, or `tokens` itself when it is malformed.
	    """
	    # Parameters per segment for the commands that are passed through
	    # untouched; a segment's endpoint is always its last two parameters.
	    segment_size = {
	        letter: size
	        for letters, size in (('Cc', 6), ('SsQq', 4), ('Tt', 2), ('Aa', 7))
	        for letter in letters
	    }

	    def keep_moving_pairs(coords, relative, x, y):
	        """Filter lineto (x, y) pairs, dropping those that stay in place."""
	        kept = []
	        for k in range(0, len(coords) - 1, 2):
	            px, py = coords[k], coords[k + 1]
	            nx, ny = (x + px, y + py) if relative else (px, py)
	            if nx != x or ny != y:
	                kept.extend([px, py])
	                x, y = nx, ny
	        return kept, x, y

	    def keep_moving_scalars(coords, relative, pos):
	        """Filter H/V parameters, dropping those that leave pos unchanged."""
	        kept = []
	        for value in coords:
	            new_pos = pos + value if relative else value
	            if new_pos != pos:
	                kept.append(value)
	                pos = new_pos
	        return kept, pos

	    out = []
	    cx = cy = 0.0  # current point
	    sx = sy = 0.0  # start of the current subpath (where Z/z returns to)
	    have_current = False
	    count = len(tokens)
	    i = 0

	    while i < count:
	        cmd = tokens[i]
	        if not isinstance(cmd, str):
	            # Each parameter run is consumed together with its command
	            # below, so a number can only show up here at the very start of
	            # the path or directly after Z/z. Both are malformed; bail out
	            # rather than re-processing the same token forever.
	            return tokens
	        i += 1

	        if cmd in ('Z', 'z'):
	            out.append(cmd)
	            if have_current:
	                cx, cy = sx, sy
	            continue

	        # Collect the run of numeric parameters belonging to this command.
	        end = i
	        while end < count and not isinstance(tokens[end], str):
	            end += 1
	        values = list(tokens[i:end])
	        i = end

	        relative = cmd.islower()

	        if cmd in ('M', 'm'):
	            if len(values) < 2:
	                return tokens

	            # First pair is the moveto itself and cannot be dropped.
	            out.extend([cmd, values[0], values[1]])
	            if relative and have_current:
	                cx += values[0]
	                cy += values[1]
	            else:
	                # Absolute moveto, or a relative one with no current point
	                # yet (which is treated as absolute).
	                cx, cy = values[0], values[1]
	            sx, sy = cx, cy
	            have_current = True

	            # Remaining pairs are implicit lineto segments.
	            kept, cx, cy = keep_moving_pairs(values[2:], relative, cx, cy)
	            out.extend(kept)
	            continue

	        if cmd in ('L', 'l'):
	            kept, cx, cy = keep_moving_pairs(values, relative, cx, cy)
	        elif cmd in ('H', 'h'):
	            kept, cx = keep_moving_scalars(values, relative, cx)
	        elif cmd in ('V', 'v'):
	            kept, cy = keep_moving_scalars(values, relative, cy)
	        else:
	            size = segment_size.get(cmd)
	            if size is None:
	                # Unknown command: keep as-is.
	                out.append(cmd)
	                out.extend(values)
	                continue

	            # Curve/arc: keep every parameter and just follow the endpoint
	            # of each complete segment to track the current point.
	            for stop in range(size, len(values) + 1, size):
	                ex, ey = values[stop - 2], values[stop - 1]
	                if relative:
	                    cx += ex
	                    cy += ey
	                else:
	                    cx, cy = ex, ey
	            have_current = True
	            kept = values

	        # A command left without parameters is omitted altogether.
	        if kept:
	            out.append(cmd)
	            out.extend(kept)

	    return out

	def compare_token_lists_exact(tokens_a, tokens_b):
	    """
	    Compare two path token lists position by position.

	    The lists match structurally when they have the same length, every
	    command (string) token faces an identical command, and every float
	    token faces another float.

	    Returns the largest absolute difference between paired numeric tokens
	    (0 when there are none), or None when the lists differ structurally.
	    """
	    if len(tokens_a) != len(tokens_b):
	        return None  # Different lengths => not a match

	    max_diff = 0
	    for a, b in zip(tokens_a, tokens_b):
	        if isinstance(a, str) and isinstance(b, str):
	            # Must match exactly the same command letter
	            if a != b:
	                return None
	        elif isinstance(a, float) and isinstance(b, float):
	            # Compare numeric values
	            max_diff = max(max_diff, abs(a - b))
	        else:
	            # One is command, the other is float => mismatch
	            return None

	    return max_diff

	def compare_token_lists(tokens_a, tokens_b):
	    """
	    Compare two path token streams, tolerating degenerate linear segments.

	    The streams are first compared as they are. Only if that reports a
	    structural mismatch are both streams normalized (no-op L/l/H/h/V/v
	    segments removed) and compared again.

	    Returns whatever compare_token_lists_exact returns for the comparison
	    that was decisive: the max numeric difference, or None on mismatch.
	    """
	    ret = compare_token_lists_exact(tokens_a, tokens_b)
	    if ret is not None:
	        return ret

	    # Structural mismatch: retry with degenerate segments stripped.
	    return compare_token_lists_exact(normalize_tokens(tokens_a),
	                                     normalize_tokens(tokens_b))

	def compare_svg_files(svg_file_1, svg_file_2):
	    """
	    Compare the <path> geometry of two SVG files.

	    The files match structurally when they contain the same number of
	    <path> elements and each corresponding pair of paths has the same
	    structure of commands.

	    Returns 0 when the two files have identical text, otherwise the max
	    difference between respective numeric values in the paths, or None
	    when the path counts or any pair of paths differ structurally.
	    """
	    # Read through context managers so the handles are closed right away;
	    # this function is called once per input pair and would otherwise leave
	    # the files open until garbage collection.
	    with open(svg_file_1) as handle:
	        svg_data_1 = handle.read()
	    with open(svg_file_2) as handle:
	        svg_data_2 = handle.read()

	    # If contents match exactly, return 0 without parsing anything.
	    if svg_data_1 == svg_data_2:
	        return 0

	    paths1 = extract_paths_from_svg(svg_data_1)
	    paths2 = extract_paths_from_svg(svg_data_2)

	    # Check that we have the same number of <path> elements
	    if len(paths1) != len(paths2):
	        return None

	    # Compare each path token list
	    max_diff = 0
	    for tokens1, tokens2 in zip(paths1, paths2):
	        ret = compare_token_lists(tokens1, tokens2)
	        if ret is None:
	            return None
	        max_diff = max(max_diff, ret)

	    return max_diff


	if __name__ == "__main__":
	    # Command-line driver: reads pairs of SVG file names from stdin and
	    # prints one line for every pair that differs by more than TOLERANCE.
	    import sys

	    if '--help' in sys.argv[1:]:
	        print("Usage: hb-svg-compare TOLERANCE < file_with_svg_pairs.txt")
	        sys.exit(1)

	    # Optional first argument: largest numeric difference still accepted.
	    tolerance = 0
	    if len(sys.argv) > 1:
	        tolerance = float(sys.argv[1])

	    # Read all lines of two SVG file paths from stdin and compare
	    for line in sys.stdin:
	        fields = line.split()
	        if not fields:
	            # Blank line (e.g. trailing newline in the list): nothing to
	            # compare, and unpacking it below would raise ValueError.
	            continue
	        svg1, svg2 = fields
	        diff = compare_svg_files(svg1, svg2)

	        if diff is None:
	            # Structural mismatch between the two files.
	            diff = "DIFF"
	        elif diff <= tolerance:
	            continue
	        else:
	            diff = f"{diff:g}"

	        print(f"{diff}\t{svg1}\t{svg2}")
	        # Flush stdout to make sure output is immediate
	        sys.stdout.flush()