blob: 13ba90d72604e7bacf02b02fc16977846043dafd [file] [log] [blame]
#!/usr/bin/env python3
import xml.etree.ElementTree as ET
import re
import math
# Matches either a single path command letter (group 1) or a number (group 2).
# The number form follows the SVG path grammar: optional sign, then either
# digits with an optional fractional part ("12", "12.", "12.5") or a bare
# fractional part (".5"), then an optional exponent.
_PATH_TOKEN_RE = re.compile(
    r'([MmZzLlHhVvCcSsQqTtAa])'
    r'|([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)'
)


def tokenize_path_data(d_attribute):
    """
    Splits an SVG path 'd' attribute string into a list of tokens:
    - Single-letter commands (M, m, L, l, C, c, etc.) are kept as strings.
    - Numbers (including leading-dot forms such as '.5', trailing-dot forms
      such as '1.', and scientific notation) are converted to float.
    Anything that is neither a command letter nor a number (whitespace,
    commas, stray characters) is skipped.
    Returns a list of strings/floats, e.g.
    ['M', 10.0, 20.0, 'L', 30.5, 40.7, 'Z'].
    """
    tokens = []
    for match in _PATH_TOKEN_RE.finditer(d_attribute):
        command_letter, number_string = match.groups()
        if command_letter:
            # It's a path command like 'M', 'L', etc.
            tokens.append(command_letter)
        else:
            # It's a numeric value; exactly one of the two groups matches.
            tokens.append(float(number_string))
    return tokens
def extract_paths_from_svg(svg_data):
    """
    Parses SVG document text and extracts the 'd' attribute from
    all <path> elements below the root, in document order.

    Both namespace-qualified paths (documents declaring the standard SVG
    namespace, as Inkscape / Illustrator output usually does) and unqualified
    paths (documents with no default namespace) are recognized. Paths with a
    missing or empty 'd' attribute are skipped.

    Returns a list of token lists, one per path, as produced by
    tokenize_path_data().
    """
    root = ET.fromstring(svg_data)
    # ElementTree spells qualified tags as '{namespace-uri}localname'.
    path_tags = ('{http://www.w3.org/2000/svg}path', 'path')
    all_token_lists = []
    for elem in root.iter():
        # iter() also yields the root itself; only descendants are considered.
        if elem is root or elem.tag not in path_tags:
            continue
        d_attribute = elem.get('d')
        if d_attribute:
            all_token_lists.append(tokenize_path_data(d_attribute))
    return all_token_lists
# Parameter count per segment for curve/arc commands. In every one of these
# commands the segment's end point is its final two parameters.
_CURVE_ARITY = {
    'C': 6, 'c': 6,
    'S': 4, 's': 4,
    'Q': 4, 'q': 4,
    'T': 2, 't': 2,
    'A': 7, 'a': 7,
}


def _filter_lineto_pairs(values, start, absolute, cx, cy):
    """
    Walk (x, y) pairs in values[start:], dropping pairs that do not move the
    current point. Returns (kept_values, new_cx, new_cy). A trailing unpaired
    value is ignored.
    """
    kept = []
    for k in range(start, len(values) - 1, 2):
        x = values[k]
        y = values[k + 1]
        if absolute:
            nx, ny = x, y
        else:
            nx, ny = cx + x, cy + y
        if nx != cx or ny != cy:
            kept.extend([x, y])
        cx, cy = nx, ny
    return kept, cx, cy


def normalize_tokens(tokens):
    """
    Normalize path tokens for structural comparison.
    Currently this drops degenerate linear segments:
    - L/l where endpoint equals current point
    - H/h where x stays unchanged
    - V/v where y stays unchanged
    Implicit lineto pairs following M/m are filtered the same way; the
    moveto pair itself is always kept. A linear command whose segments are
    all dropped is omitted entirely. Curve and arc commands are copied
    through unchanged and only used to track the current point.

    Malformed input is returned unchanged (the same list object): a number
    before any command, a number following Z/z (closepath takes no
    parameters), or an M/m with fewer than two parameters.

    Returns a new token list otherwise.
    """
    out = []
    i = 0
    n = len(tokens)
    cmd = None
    # Current point and start point of the current subpath.
    cx = 0.0
    cy = 0.0
    sx = 0.0
    sy = 0.0
    have_current = False

    while i < n:
        token = tokens[i]
        if isinstance(token, str):
            cmd = token
            i += 1
        elif cmd is None or cmd in ('Z', 'z'):
            # Malformed path: a number with no command to apply, or a number
            # after closepath. Bail out instead of re-processing the same
            # token forever.
            return tokens

        if cmd in ('Z', 'z'):
            out.append(cmd)
            if have_current:
                # Closepath returns to the start of the subpath.
                cx, cy = sx, sy
            continue

        # Collect every number up to the next command letter.
        start = i
        while i < n and not isinstance(tokens[i], str):
            i += 1
        values = list(tokens[start:i])

        if cmd in ('M', 'm'):
            if len(values) < 2:
                return tokens
            # First pair is moveto and cannot be dropped.
            out.extend([cmd, values[0], values[1]])
            if cmd == 'M' or not have_current:
                # A relative moveto with no current point acts as absolute.
                cx, cy = values[0], values[1]
            else:
                cx += values[0]
                cy += values[1]
            sx, sy = cx, cy
            have_current = True
            # Remaining pairs are implicit lineto.
            kept, cx, cy = _filter_lineto_pairs(values, 2, cmd == 'M', cx, cy)
            out.extend(kept)
            continue

        if cmd in ('L', 'l'):
            kept, cx, cy = _filter_lineto_pairs(values, 0, cmd == 'L', cx, cy)
            if kept:
                out.append(cmd)
                out.extend(kept)
            continue

        if cmd in ('H', 'h'):
            kept = []
            for x in values:
                nx = x if cmd == 'H' else cx + x
                if nx != cx:
                    kept.append(x)
                cx = nx
            if kept:
                out.append(cmd)
                out.extend(kept)
            continue

        if cmd in ('V', 'v'):
            kept = []
            for y in values:
                ny = y if cmd == 'V' else cy + y
                if ny != cy:
                    kept.append(y)
                cy = ny
            if kept:
                out.append(cmd)
                out.extend(kept)
            continue

        # For curve/arc commands, keep original parameters and just track
        # the current point through each complete segment.
        arity = _CURVE_ARITY.get(cmd)
        if arity is not None:
            if values:
                out.append(cmd)
                out.extend(values)
            absolute = cmd.isupper()
            for k in range(0, len(values) - arity + 1, arity):
                end_x = values[k + arity - 2]
                end_y = values[k + arity - 1]
                if absolute:
                    cx, cy = end_x, end_y
                else:
                    cx += end_x
                    cy += end_y
            have_current = True
            continue

        # Unknown command: keep as-is.
        out.append(cmd)
        out.extend(values)

    return out
def compare_token_lists_exact(tokens_a, tokens_b):
    """
    Compares two lists of path tokens (command strings and floats)
    position by position.
    Returns None when the lists differ structurally: different lengths,
    different command letters at the same position, or a position where the
    two tokens are not both strings or both floats.
    Otherwise returns the largest absolute difference between corresponding
    numeric values (0 when there are none).
    """
    if len(tokens_a) != len(tokens_b):
        return None  # Different lengths => not a match

    worst = 0
    for left, right in zip(tokens_a, tokens_b):
        if isinstance(left, str) and isinstance(right, str):
            # Command letters must be identical.
            if left != right:
                return None
            continue
        if not (isinstance(left, float) and isinstance(right, float)):
            # Command vs. number (or any other type mix) => mismatch
            return None
        delta = abs(left - right)
        if delta > worst:
            worst = delta
    return worst
def compare_token_lists(tokens_a, tokens_b):
    """
    Compare two path token streams.
    The raw streams are compared first. Only if that comparison reports a
    structural mismatch (None) are both streams normalized, which removes
    degenerate linear segments (no-op L/l/H/h/V/v), and compared again.
    Returns whatever the deciding comparison returned: the max numeric
    difference, or None if the streams still do not line up.
    """
    exact_result = compare_token_lists_exact(tokens_a, tokens_b)
    if exact_result is None:
        normalized_a = normalize_tokens(tokens_a)
        normalized_b = normalize_tokens(tokens_b)
        return compare_token_lists_exact(normalized_a, normalized_b)
    return exact_result
def compare_svg_files(svg_file_1, svg_file_2):
    """
    Compares two SVG files to check if they have the same number of <path>
    elements, and each corresponding path has the same structure of commands.

    Returns 0 if the file contents are identical, None if the number of
    paths differs or any pair of corresponding paths does not match
    structurally, and otherwise the max difference between respective
    numeric values in the paths.
    """
    # Context managers ensure both file handles are closed promptly.
    with open(svg_file_1) as handle:
        svg_data_1 = handle.read()
    with open(svg_file_2) as handle:
        svg_data_2 = handle.read()

    # If contents match exactly, return 0 without parsing anything.
    if svg_data_1 == svg_data_2:
        return 0

    paths1 = extract_paths_from_svg(svg_data_1)
    paths2 = extract_paths_from_svg(svg_data_2)

    # Check that we have the same number of <path> elements
    if len(paths1) != len(paths2):
        return None

    # Compare each path token list
    max_diff = 0
    for tokens1, tokens2 in zip(paths1, paths2):
        ret = compare_token_lists(tokens1, tokens2)
        if ret is None:
            return None
        max_diff = max(max_diff, ret)
    return max_diff
if __name__ == "__main__":
    # Command-line driver: reads whitespace-separated pairs of SVG file
    # paths from stdin (one pair per line) and prints one tab-separated
    # line for every pair whose difference exceeds the tolerance.
    import sys

    if '--help' in sys.argv[1:]:
        print("Usage: hb-svg-compare TOLERANCE < file_with_svg_pairs.txt")
        sys.exit(1)

    # Optional first argument: largest numeric difference still considered
    # a match. Defaults to 0.
    tolerance = 0
    if len(sys.argv) > 1:
        tolerance = float(sys.argv[1])

    # Read all lines of two SVG file paths from stdin and compare
    for line in sys.stdin:
        fields = line.split()
        if not fields:
            # Blank line (e.g. a trailing newline in the list): nothing to
            # compare.
            continue
        svg1, svg2 = fields
        diff = compare_svg_files(svg1, svg2)
        if diff is None:
            # Structural mismatch between the two files.
            diff = "DIFF"
        elif diff <= tolerance:
            continue
        else:
            diff = f"{diff:g}"
        # Flush stdout to make sure output is immediate
        print(f"{diff}\t{svg1}\t{svg2}", flush=True)