| #!/usr/bin/env python3 |
| |
| import xml.etree.ElementTree as ET |
| import re |
| import math |
| |
def tokenize_path_data(d_attribute):
    """
    Split an SVG path 'd' attribute string into a flat list of tokens.

    Command letters (M, m, L, l, C, c, ...) are kept as one-character
    strings and every number is converted to float, e.g.
    'M10 20L30.5,40.7Z' -> ['M', 10.0, 20.0, 'L', 30.5, 40.7, 'Z'].

    Numbers follow the SVG path grammar: an optional sign, then either
    digits with an optional fraction ('10', '10.5', '10.') or a fraction
    with no integer part ('.5'), optionally followed by an exponent.
    Anything else (whitespace, commas, other characters) acts as a
    separator and is skipped.
    """
    # Group 1: one of the valid path command letters.
    # Group 2: a number. The '\.\d+' alternative is what lets '.5' and the
    # second half of '0.5.5' be read as 0.5 instead of 5.
    token_pattern = re.compile(
        r'([MmZzLlHhVvCcSsQqTtAa])'
        r'|([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)'
    )

    tokens = []
    for match in token_pattern.finditer(d_attribute):
        command_letter, number_string = match.groups()
        if command_letter:
            # It's a path command like 'M', 'L', etc.
            tokens.append(command_letter)
        else:
            # It's a numeric value.
            tokens.append(float(number_string))

    return tokens
| |
def extract_paths_from_svg(svg_data):
    """
    Parse SVG markup and tokenize the 'd' attribute of every <path> element.

    svg_data is the SVG document as a string. Path elements are collected
    in document order from the descendants of the root element. Both
    namespace-qualified paths (the usual case, where the document declares
    the default SVG namespace) and unqualified <path> tags (documents with
    no namespace declaration) are recognized. Paths with a missing or empty
    'd' attribute are skipped.

    Returns a list with one token list per path, as produced by
    tokenize_path_data().
    """
    root = ET.fromstring(svg_data)

    # ElementTree spells qualified tags as '{namespace-uri}localname'.
    # Accepting the bare name too keeps namespace-less SVGs from silently
    # producing an empty path list.
    path_tags = ('{http://www.w3.org/2000/svg}path', 'path')

    all_token_lists = []
    for elem in root.iter():
        # iter() yields the root first; only descendants are of interest.
        if elem is root or elem.tag not in path_tags:
            continue
        d_attribute = elem.get('d')
        if d_attribute:
            all_token_lists.append(tokenize_path_data(d_attribute))

    return all_token_lists
| |
def normalize_tokens(tokens):
    """
    Normalize path tokens for structural comparison.

    Walks the token stream while tracking the current point and drops
    degenerate linear segments:
    - L/l (and implicit lineto pairs following M/m) whose endpoint equals
      the current point
    - H/h where x stays unchanged
    - V/v where y stays unchanged
    A linear command whose segments are all dropped is removed entirely.
    A trailing unpaired value in an M/m or L/l coordinate list is
    discarded. C/S/Q/T/A parameters are copied through unchanged and are
    only used to advance the current point.

    Returns a new token list. Malformed input (numbers before any command,
    a moveto with fewer than two values, or numbers directly after Z/z)
    is returned unchanged.
    """
    # Parameters per segment for the commands that are passed through;
    # the last two parameters of every segment are its endpoint.
    passthrough_arity = {
        'C': 6, 'c': 6,
        'S': 4, 's': 4,
        'Q': 4, 'q': 4,
        'T': 2, 't': 2,
        'A': 7, 'a': 7,
    }

    def keep_moving_pairs(coords, absolute, x0, y0):
        """Drop no-op lineto pairs; return (kept coords, end x, end y)."""
        kept = []
        for k in range(0, len(coords) - 1, 2):
            x, y = coords[k], coords[k + 1]
            if absolute:
                nx, ny = x, y
            else:
                nx, ny = x0 + x, y0 + y
            if nx != x0 or ny != y0:
                kept.extend([x, y])
            x0, y0 = nx, ny
        return kept, x0, y0

    out = []
    i = 0
    count = len(tokens)
    cmd = None
    cx = cy = 0.0          # current point
    sx = sy = 0.0          # start of the current subpath (target of Z/z)
    have_current = False

    while i < count:
        token = tokens[i]
        if isinstance(token, str):
            cmd = token
            i += 1
        elif cmd is None:
            # Malformed path: no command to apply.
            return tokens

        if cmd in ('Z', 'z'):
            if not isinstance(token, str):
                # Closepath takes no parameters. Numbers here are malformed
                # and would otherwise never be consumed, looping forever.
                return tokens
            out.append(cmd)
            if have_current:
                cx, cy = sx, sy
            continue

        # Collect every number up to the next command letter.
        start = i
        while i < count and not isinstance(tokens[i], str):
            i += 1
        values = list(tokens[start:i])

        if cmd in ('M', 'm'):
            if len(values) < 2:
                return tokens

            # First pair is moveto and cannot be dropped.
            out.extend([cmd, values[0], values[1]])
            if cmd == 'M' or not have_current:
                # A relative moveto with no current point acts as absolute.
                cx, cy = values[0], values[1]
            else:
                cx += values[0]
                cy += values[1]
            sx, sy = cx, cy
            have_current = True

            # Remaining pairs are implicit lineto.
            kept, cx, cy = keep_moving_pairs(values[2:], cmd == 'M', cx, cy)
            out.extend(kept)
            continue

        if cmd in ('L', 'l'):
            kept, cx, cy = keep_moving_pairs(values, cmd == 'L', cx, cy)
            if kept:
                out.append(cmd)
                out.extend(kept)
            continue

        if cmd in ('H', 'h'):
            kept = []
            for x in values:
                nx = x if cmd == 'H' else cx + x
                if nx != cx:
                    kept.append(x)
                cx = nx
            if kept:
                out.append(cmd)
                out.extend(kept)
            continue

        if cmd in ('V', 'v'):
            kept = []
            for y in values:
                ny = y if cmd == 'V' else cy + y
                if ny != cy:
                    kept.append(y)
                cy = ny
            if kept:
                out.append(cmd)
                out.extend(kept)
            continue

        # For curve/arc commands, keep original parameters and just track
        # the current point through each complete segment.
        if cmd in passthrough_arity:
            arity = passthrough_arity[cmd]
            if values:
                out.append(cmd)
                out.extend(values)
            absolute = cmd.isupper()
            for k in range(0, len(values) - arity + 1, arity):
                end_x = values[k + arity - 2]
                end_y = values[k + arity - 1]
                if absolute:
                    cx, cy = end_x, end_y
                else:
                    cx += end_x
                    cy += end_y
            have_current = True
            continue

        # Unknown command: keep as-is.
        out.append(cmd)
        out.extend(values)

    return out
| |
def compare_token_lists_exact(tokens_a, tokens_b):
    """
    Compare two path token lists position by position.

    The lists match structurally when they have the same length, every
    command letter sits at the same index in both, and every other position
    holds a float in both lists.

    Returns the largest absolute difference between corresponding floats
    (0 when there are no floats or all are equal), or None when the lists
    do not match structurally.
    """
    if len(tokens_a) != len(tokens_b):
        # Different lengths => not a match
        return None

    worst = 0
    for left, right in zip(tokens_a, tokens_b):
        if isinstance(left, str) and isinstance(right, str):
            # Command letters must be identical.
            if left != right:
                return None
            continue

        if not (isinstance(left, float) and isinstance(right, float)):
            # A command paired with a number (or any non-float) => mismatch
            return None

        delta = abs(left - right)
        if delta > worst:
            worst = delta

    return worst
| |
def compare_token_lists(tokens_a, tokens_b):
    """
    Compare two path token streams, tolerating no-op linear segments.

    The raw streams are compared first. Only if they do not line up
    structurally are both streams normalized (dropping degenerate
    L/l/H/h/V/v segments) and compared again.

    Returns whatever compare_token_lists_exact() returns for the first
    comparison that matches structurally, or None if neither does.
    """
    exact = compare_token_lists_exact(tokens_a, tokens_b)
    if exact is None:
        # Structural mismatch: retry with degenerate segments removed.
        normalized_a = normalize_tokens(tokens_a)
        normalized_b = normalize_tokens(tokens_b)
        return compare_token_lists_exact(normalized_a, normalized_b)

    return exact
| |
def compare_svg_files(svg_file_1, svg_file_2):
    """
    Compare the <path> geometry of two SVG files.

    svg_file_1 and svg_file_2 are file names. The files match structurally
    when they contain the same number of <path> elements and each pair of
    corresponding paths has the same command structure.

    Returns 0 when the file contents are identical, otherwise the largest
    difference between corresponding numeric path values, or None when the
    files do not match structurally.
    """
    # Context managers close each file as soon as it has been read instead
    # of leaving the handles to the garbage collector.
    with open(svg_file_1) as svg_file:
        svg_data_1 = svg_file.read()
    with open(svg_file_2) as svg_file:
        svg_data_2 = svg_file.read()

    # If contents match exactly, return 0 without parsing anything.
    if svg_data_1 == svg_data_2:
        return 0

    paths1 = extract_paths_from_svg(svg_data_1)
    paths2 = extract_paths_from_svg(svg_data_2)

    # Check that we have the same number of <path> elements
    if len(paths1) != len(paths2):
        return None

    # Compare each path token list
    max_diff = 0
    for tokens1, tokens2 in zip(paths1, paths2):
        ret = compare_token_lists(tokens1, tokens2)
        if ret is None:
            return None
        max_diff = max(max_diff, ret)

    return max_diff
| |
| |
if __name__ == "__main__":
    # Command-line driver: reads "file1 file2" pairs from stdin, one pair
    # per line, and prints a line for every pair whose difference exceeds
    # the tolerance ("DIFF" when the pair does not match structurally).
    import sys

    if '--help' in sys.argv[1:]:
        print("Usage: hb-svg-compare TOLERANCE < file_with_svg_pairs.txt")
        sys.exit(1)

    # Optional first argument: largest numeric difference still accepted.
    tolerance = float(sys.argv[1]) if len(sys.argv) > 1 else 0

    # Read all lines of two SVG file paths from stdin and compare
    for line in sys.stdin:
        fields = line.split()
        if not fields:
            # A blank line names no files; skip it instead of failing to
            # unpack below.
            continue
        svg1, svg2 = fields
        diff = compare_svg_files(svg1, svg2)

        if diff is None:
            label = "DIFF"
        elif diff <= tolerance:
            continue
        else:
            label = f"{diff:g}"

        # Flush stdout to make sure output is immediate
        print(f"{label}\t{svg1}\t{svg2}", flush=True)