| # Protocol Buffers - Google's data interchange format |
| # Copyright 2026 Google LLC. All rights reserved. |
| # |
| # Use of this source code is governed by a BSD-style |
| # license that can be found in the LICENSE file or at |
| # https://developers.google.com/open-source/licenses/bsd |
| |
| """Implements encode_raw_string_as_crate_name. |
| |
| Based on an implementation in rules_rust: |
| * https://github.com/bazelbuild/rules_rust/blob/cdaf15f5796e3e934b074526272823284bbaed01/rust/private/utils.bzl#L643 |
| """ |
| |
# Table of (character, code) pairs. Each character is legal in a Bazel package
# or target name but not in a crate name; the code is the text that stands in
# for that character (wrapped in underscores) in the encoded crate name.
_encodings = (
    (":", "x"),
    ("!", "excl"),
    ("%", "prc"),
    ("@", "ao"),
    ("^", "caret"),
    ("`", "bt"),
    (" ", "sp"),
    ("\"", "dq"),
    ("#", "octo"),
    ("$", "dllr"),
    ("&", "amp"),
    ("'", "sq"),
    ("(", "lp"),
    (")", "rp"),
    ("*", "astr"),
    ("-", "d"),
    ("+", "pl"),
    (",", "cm"),
    (";", "sm"),
    ("<", "la"),
    ("=", "eq"),
    (">", "ra"),
    ("?", "qm"),
    ("[", "lbk"),
    ("]", "rbk"),
    ("{", "lbe"),
    ("|", "pp"),
    ("}", "rbe"),
    ("~", "td"),
    ("/", "y"),
    (".", "pd"),
)

# Ordered (pattern, replacement) rules derived from the table above. Every
# table entry contributes two rules, in this order:
#   1. an escape rule, "_<code>_" -> "_z<code>_", so that text in the input
#      which already looks like an encoding is not mistaken for one later;
#   2. the encoding rule itself, "<character>" -> "_<code>_".
# The list starts with "_z" -> "_zz_", which protects the "_z" prefix used by
# the escape rules from being clobbered in turn.
_substitutions = [("_z", "_zz_")] + [
    rule
    for (char, code) in _encodings
    for rule in (
        ("_" + code + "_", "_z" + code + "_"),
        (char, "_" + code + "_"),
    )
]

# Alias that exposes the rule list to tests; not meant for any other use.
substitutions_for_testing = _substitutions
| |
def _replace_all(string, substitutions):
    """Replaces occurrences of the given patterns in `string`.

    The input is scanned once, left to right. At every index that is not
    already covered by an earlier match, the first pattern in `substitutions`
    that matches at that index is replaced. Consequently:
      * Patterns listed first in `substitutions` have higher priority than
        patterns listed later.
      * Replacements never overlap: the characters consumed by a match are
        skipped before any further matching is attempted, and replacement
        text is never re-scanned.
      * In cases of overlap the leftmost match wins. This keeps the
        "_z" -> "_zz_" rule from introducing new pattern matches later in the
        string during decoding. E.g. "_zpd_" encodes to "_zz_pd_"; both
        "_zz_" and "_pd_" occur in that string and overlap, and decoding must
        pick "_zz_" to get back "_zpd_" (picking "_pd_" would give "_zz.").

    Args:
        string (string): the string in which the replacements should be
            performed.
        substitutions: the list of (pattern, replacement) pairs to apply, in
            priority order.

    Returns:
        A string with the appropriate substitutions performed.
    """

    # Output fragments, in order: untouched slices of `string` interleaved
    # with replacements. Joined once at the end instead of re-slicing the
    # whole string for every match.
    pieces = []

    # Everything in string[:consumed] has already been emitted, either
    # verbatim or as part of a replaced match.
    consumed = 0

    for start in range(len(string)):
        # Indices inside a previous match must not start a new match.
        if start < consumed:
            continue
        for (pattern, replacement) in substitutions:
            if string.startswith(pattern, start):
                pieces.append(string[consumed:start])
                pieces.append(replacement)
                consumed = start + len(pattern)

                # Only the highest-priority pattern applies at this index.
                break

    # Unmatched tail (the whole string when nothing matched).
    pieces.append(string[consumed:])
    return "".join(pieces)
| |
def encode_raw_string_as_crate_name(str):
    """Encodes a raw string by applying the rules in `_substitutions`.

    Each character listed in `_encodings` is rewritten as "_<code>_", and
    input text that already looks like such an encoding is escaped with a
    "_z" prefix.

    Args:
        str (string): The string to be encoded.

    Returns:
        The encoded form of the input string.
    """
    encoded = _replace_all(str, _substitutions)
    return encoded
| |
def decode_crate_name_as_raw_string_for_testing(crate_name):
    """Reverses encode_raw_string_as_crate_name.

    Applies the rules in `_substitutions` with pattern and replacement
    swapped, keeping their priority order. It exists so tests can check that
    the encoding round-trips; it is expected to only be used in tests.

    Args:
        crate_name (string): The name of the crate.

    Returns:
        A string representing the Bazel label (package and target).
    """
    inverse_rules = [
        (encoded, raw)
        for (raw, encoded) in _substitutions
    ]
    return _replace_all(crate_name, inverse_rules)