| // Protocol Buffers - Google's data interchange format |
| // Copyright 2023 Google LLC. All rights reserved. |
| // |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file or at |
| // https://developers.google.com/open-source/licenses/bsd |
| |
| #include "upb/text/encode.h" |
| |
| #include <ctype.h> |
| #include <float.h> |
| #include <inttypes.h> |
| #include <stdarg.h> |
| #include <stdint.h> |
| #include <string.h> |
| |
| #include "upb/base/descriptor_constants.h" |
| #include "upb/base/string_view.h" |
| #include "upb/lex/round_trip.h" |
| #include "upb/message/array.h" |
| #include "upb/message/internal/map_entry.h" |
| #include "upb/message/internal/map_sorter.h" |
| #include "upb/message/map.h" |
| #include "upb/message/message.h" |
| #include "upb/message/value.h" |
| #include "upb/port/vsnprintf_compat.h" |
| #include "upb/reflection/def.h" |
| #include "upb/reflection/message.h" |
| #include "upb/wire/eps_copy_input_stream.h" |
| #include "upb/wire/reader.h" |
| #include "upb/wire/types.h" |
| #include "utf8_range.h" |
| |
| // Must be last. |
| #include "upb/port/def.inc" |
| |
| typedef struct { |
| char *buf, *ptr, *end; |
| size_t overflow; |
| int indent_depth; |
| int options; |
| const upb_DefPool* ext_pool; |
| _upb_mapsorter sorter; |
| } txtenc; |
| |
| static void txtenc_msg(txtenc* e, const upb_Message* msg, |
| const upb_MessageDef* m); |
| |
| static void txtenc_putbytes(txtenc* e, const void* data, size_t len) { |
| size_t have = e->end - e->ptr; |
| if (UPB_LIKELY(have >= len)) { |
| memcpy(e->ptr, data, len); |
| e->ptr += len; |
| } else { |
| if (have) { |
| memcpy(e->ptr, data, have); |
| e->ptr += have; |
| } |
| e->overflow += (len - have); |
| } |
| } |
| |
| static void txtenc_putstr(txtenc* e, const char* str) { |
| txtenc_putbytes(e, str, strlen(str)); |
| } |
| |
| static void txtenc_printf(txtenc* e, const char* fmt, ...) { |
| size_t n; |
| size_t have = e->end - e->ptr; |
| va_list args; |
| |
| va_start(args, fmt); |
| n = _upb_vsnprintf(e->ptr, have, fmt, args); |
| va_end(args); |
| |
| if (UPB_LIKELY(have > n)) { |
| e->ptr += n; |
| } else { |
| e->ptr = UPB_PTRADD(e->ptr, have); |
| e->overflow += (n - have); |
| } |
| } |
| |
| static void txtenc_indent(txtenc* e) { |
| if ((e->options & UPB_TXTENC_SINGLELINE) == 0) { |
| int i = e->indent_depth; |
| while (i-- > 0) { |
| txtenc_putstr(e, " "); |
| } |
| } |
| } |
| |
| static void txtenc_endfield(txtenc* e) { |
| if (e->options & UPB_TXTENC_SINGLELINE) { |
| txtenc_putstr(e, " "); |
| } else { |
| txtenc_putstr(e, "\n"); |
| } |
| } |
| |
| static void txtenc_enum(int32_t val, const upb_FieldDef* f, txtenc* e) { |
| const upb_EnumDef* e_def = upb_FieldDef_EnumSubDef(f); |
| const upb_EnumValueDef* ev = upb_EnumDef_FindValueByNumber(e_def, val); |
| |
| if (ev) { |
| txtenc_printf(e, "%s", upb_EnumValueDef_Name(ev)); |
| } else { |
| txtenc_printf(e, "%" PRId32, val); |
| } |
| } |
| |
| static void txtenc_escaped(txtenc* e, unsigned char ch) { |
| switch (ch) { |
| case '\n': |
| txtenc_putstr(e, "\\n"); |
| break; |
| case '\r': |
| txtenc_putstr(e, "\\r"); |
| break; |
| case '\t': |
| txtenc_putstr(e, "\\t"); |
| break; |
| case '\"': |
| txtenc_putstr(e, "\\\""); |
| break; |
| case '\'': |
| txtenc_putstr(e, "\\'"); |
| break; |
| case '\\': |
| txtenc_putstr(e, "\\\\"); |
| break; |
| default: |
| txtenc_printf(e, "\\%03o", ch); |
| break; |
| } |
| } |
| |
| // Returns true if `ch` needs to be escaped in TextFormat, independent of any |
| // UTF-8 validity issues. |
| static bool upb_DefinitelyNeedsEscape(unsigned char ch) { |
| if (ch < 32) return true; |
| switch (ch) { |
| case '\"': |
| case '\'': |
| case '\\': |
| case 127: |
| return true; |
| } |
| return false; |
| } |
| |
| static bool upb_AsciiIsPrint(unsigned char ch) { return ch >= 32 && ch < 127; } |
| |
| // Returns true if this is a high byte that requires UTF-8 validation. If the |
| // UTF-8 validation fails, we must escape the byte. |
| static bool upb_NeedsUtf8Validation(unsigned char ch) { return ch > 127; } |
| |
| // Returns the number of bytes in the prefix of `val` that do not need escaping. |
| // This is like utf8_range::SpanStructurallyValid(), except that it also |
| // terminates at any ASCII char that needs to be escaped in TextFormat (any char |
| // that has `DefinitelyNeedsEscape(ch) == true`). |
| // |
| // If we could get a variant of utf8_range::SpanStructurallyValid() that could |
| // terminate on any of these chars, that might be more efficient, but it would |
| // be much more complicated to modify that heavily SIMD code. |
| static size_t SkipPassthroughBytes(const char* ptr, size_t size) { |
| for (size_t i = 0; i < size; i++) { |
| unsigned char uc = ptr[i]; |
| if (upb_DefinitelyNeedsEscape(uc)) return i; |
| if (upb_NeedsUtf8Validation(uc)) { |
| // Find the end of this region of consecutive high bytes, so that we only |
| // give high bytes to the UTF-8 checker. This avoids needing to perform |
| // a second scan of the ASCII characters looking for characters that |
| // need escaping. |
| // |
| // We assume that high bytes are less frequent than plain, printable ASCII |
| // bytes, so we accept the double-scan of high bytes. |
| size_t end = i + 1; |
| for (; end < size; end++) { |
| if (!upb_NeedsUtf8Validation(ptr[end])) break; |
| } |
| size_t n = end - i; |
| size_t ok = utf8_range_ValidPrefix(ptr + i, n); |
| if (ok != n) return i + ok; |
| i += ok - 1; |
| } |
| } |
| return size; |
| } |
| |
| static void upb_HardenedPrintString(txtenc* e, const char* ptr, size_t len) { |
| // Print as UTF-8, while guarding against any invalid UTF-8 in the string |
| // field. |
| // |
| // If in the future we have a guaranteed invariant that invalid UTF-8 will |
| // never be present, we could avoid the UTF-8 check here. |
| txtenc_putstr(e, "\""); |
| const char* end = ptr + len; |
| while (ptr < end) { |
| size_t n = SkipPassthroughBytes(ptr, end - ptr); |
| if (n != 0) { |
| txtenc_putbytes(e, ptr, n); |
| ptr += n; |
| if (ptr == end) break; |
| } |
| |
| // If repeated calls to CEscape() and PrintString() are expensive, we could |
| // consider batching them, at the cost of some complexity. |
| txtenc_escaped(e, *ptr); |
| ptr++; |
| } |
| txtenc_putstr(e, "\""); |
| } |
| |
| static void txtenc_bytes(txtenc* e, upb_StringView data) { |
| const char* ptr = data.data; |
| const char* end = ptr + data.size; |
| txtenc_putstr(e, "\""); |
| for (; ptr < end; ptr++) { |
| unsigned char uc = *ptr; |
| if (upb_AsciiIsPrint(uc)) { |
| txtenc_putbytes(e, ptr, 1); |
| } else { |
| txtenc_escaped(e, uc); |
| } |
| } |
| txtenc_putstr(e, "\""); |
| } |
| |
| static void txtenc_field(txtenc* e, upb_MessageValue val, |
| const upb_FieldDef* f) { |
| txtenc_indent(e); |
| const upb_CType ctype = upb_FieldDef_CType(f); |
| const bool is_ext = upb_FieldDef_IsExtension(f); |
| const char* full = upb_FieldDef_FullName(f); |
| const char* name = upb_FieldDef_Name(f); |
| |
| if (ctype == kUpb_CType_Message) { |
| // begin:google_only |
| // // TODO: Turn this into a feature check and opensource it. |
| // if (_upb_FieldDef_IsGroupLike(f)) { |
| // const upb_MessageDef* m = upb_FieldDef_MessageSubDef(f); |
| // name = upb_MessageDef_Name(m); |
| // } |
| // end:google_only |
| if (is_ext) { |
| txtenc_printf(e, "[%s] {", full); |
| } else { |
| txtenc_printf(e, "%s {", name); |
| } |
| txtenc_endfield(e); |
| e->indent_depth++; |
| txtenc_msg(e, val.msg_val, upb_FieldDef_MessageSubDef(f)); |
| e->indent_depth--; |
| txtenc_indent(e); |
| txtenc_putstr(e, "}"); |
| txtenc_endfield(e); |
| return; |
| } |
| |
| if (is_ext) { |
| txtenc_printf(e, "[%s]: ", full); |
| } else { |
| txtenc_printf(e, "%s: ", name); |
| } |
| |
| switch (ctype) { |
| case kUpb_CType_Bool: |
| txtenc_putstr(e, val.bool_val ? "true" : "false"); |
| break; |
| case kUpb_CType_Float: { |
| char buf[32]; |
| _upb_EncodeRoundTripFloat(val.float_val, buf, sizeof(buf)); |
| txtenc_putstr(e, buf); |
| break; |
| } |
| case kUpb_CType_Double: { |
| char buf[32]; |
| _upb_EncodeRoundTripDouble(val.double_val, buf, sizeof(buf)); |
| txtenc_putstr(e, buf); |
| break; |
| } |
| case kUpb_CType_Int32: |
| txtenc_printf(e, "%" PRId32, val.int32_val); |
| break; |
| case kUpb_CType_UInt32: |
| txtenc_printf(e, "%" PRIu32, val.uint32_val); |
| break; |
| case kUpb_CType_Int64: |
| txtenc_printf(e, "%" PRId64, val.int64_val); |
| break; |
| case kUpb_CType_UInt64: |
| txtenc_printf(e, "%" PRIu64, val.uint64_val); |
| break; |
| case kUpb_CType_String: |
| upb_HardenedPrintString(e, val.str_val.data, val.str_val.size); |
| break; |
| case kUpb_CType_Bytes: |
| txtenc_bytes(e, val.str_val); |
| break; |
| case kUpb_CType_Enum: |
| txtenc_enum(val.int32_val, f, e); |
| break; |
| default: |
| UPB_UNREACHABLE(); |
| } |
| |
| txtenc_endfield(e); |
| } |
| |
| /* |
| * Arrays print as simple repeated elements, eg. |
| * |
| * foo_field: 1 |
| * foo_field: 2 |
| * foo_field: 3 |
| */ |
| static void txtenc_array(txtenc* e, const upb_Array* arr, |
| const upb_FieldDef* f) { |
| size_t i; |
| size_t size = upb_Array_Size(arr); |
| |
| for (i = 0; i < size; i++) { |
| txtenc_field(e, upb_Array_Get(arr, i), f); |
| } |
| } |
| |
| static void txtenc_mapentry(txtenc* e, upb_MessageValue key, |
| upb_MessageValue val, const upb_FieldDef* f) { |
| const upb_MessageDef* entry = upb_FieldDef_MessageSubDef(f); |
| const upb_FieldDef* key_f = upb_MessageDef_Field(entry, 0); |
| const upb_FieldDef* val_f = upb_MessageDef_Field(entry, 1); |
| txtenc_indent(e); |
| txtenc_printf(e, "%s {", upb_FieldDef_Name(f)); |
| txtenc_endfield(e); |
| e->indent_depth++; |
| |
| txtenc_field(e, key, key_f); |
| txtenc_field(e, val, val_f); |
| |
| e->indent_depth--; |
| txtenc_indent(e); |
| txtenc_putstr(e, "}"); |
| txtenc_endfield(e); |
| } |
| |
| /* |
| * Maps print as messages of key/value, etc. |
| * |
| * foo_map: { |
| * key: "abc" |
| * value: 123 |
| * } |
| * foo_map: { |
| * key: "def" |
| * value: 456 |
| * } |
| */ |
| static void txtenc_map(txtenc* e, const upb_Map* map, const upb_FieldDef* f) { |
| if (e->options & UPB_TXTENC_NOSORT) { |
| size_t iter = kUpb_Map_Begin; |
| upb_MessageValue key, val; |
| while (upb_Map_Next(map, &key, &val, &iter)) { |
| txtenc_mapentry(e, key, val, f); |
| } |
| } else { |
| if (upb_Map_Size(map) == 0) return; |
| |
| const upb_MessageDef* entry = upb_FieldDef_MessageSubDef(f); |
| const upb_FieldDef* key_f = upb_MessageDef_Field(entry, 0); |
| _upb_sortedmap sorted; |
| upb_MapEntry ent; |
| |
| _upb_mapsorter_pushmap(&e->sorter, upb_FieldDef_Type(key_f), map, &sorted); |
| while (_upb_sortedmap_next(&e->sorter, map, &sorted, &ent)) { |
| upb_MessageValue key, val; |
| memcpy(&key, &ent.k, sizeof(key)); |
| memcpy(&val, &ent.v, sizeof(val)); |
| txtenc_mapentry(e, key, val, f); |
| } |
| _upb_mapsorter_popmap(&e->sorter, &sorted); |
| } |
| } |
| |
| #define CHK(x) \ |
| do { \ |
| if (!(x)) { \ |
| return false; \ |
| } \ |
| } while (0) |
| |
| /* |
| * Unknown fields are printed by number. |
| * |
| * 1001: 123 |
| * 1002: "hello" |
| * 1006: 0xdeadbeef |
| * 1003: { |
| * 1: 111 |
| * } |
| */ |
| static const char* txtenc_unknown(txtenc* e, const char* ptr, |
| upb_EpsCopyInputStream* stream, |
| int groupnum) { |
| // We are guaranteed that the unknown data is valid wire format, and will not |
| // contain tag zero. |
| uint32_t end_group = groupnum > 0 |
| ? ((groupnum << kUpb_WireReader_WireTypeBits) | |
| kUpb_WireType_EndGroup) |
| : 0; |
| |
| while (!upb_EpsCopyInputStream_IsDone(stream, &ptr)) { |
| uint32_t tag; |
| CHK(ptr = upb_WireReader_ReadTag(ptr, &tag)); |
| if (tag == end_group) return ptr; |
| |
| txtenc_indent(e); |
| txtenc_printf(e, "%d: ", (int)upb_WireReader_GetFieldNumber(tag)); |
| |
| switch (upb_WireReader_GetWireType(tag)) { |
| case kUpb_WireType_Varint: { |
| uint64_t val; |
| CHK(ptr = upb_WireReader_ReadVarint(ptr, &val)); |
| txtenc_printf(e, "%" PRIu64, val); |
| break; |
| } |
| case kUpb_WireType_32Bit: { |
| uint32_t val; |
| ptr = upb_WireReader_ReadFixed32(ptr, &val); |
| txtenc_printf(e, "0x%08" PRIu32, val); |
| break; |
| } |
| case kUpb_WireType_64Bit: { |
| uint64_t val; |
| ptr = upb_WireReader_ReadFixed64(ptr, &val); |
| txtenc_printf(e, "0x%016" PRIu64, val); |
| break; |
| } |
| case kUpb_WireType_Delimited: { |
| int size; |
| char* start = e->ptr; |
| size_t start_overflow = e->overflow; |
| CHK(ptr = upb_WireReader_ReadSize(ptr, &size)); |
| CHK(upb_EpsCopyInputStream_CheckDataSizeAvailable(stream, ptr, size)); |
| |
| // Speculatively try to parse as message. |
| txtenc_putstr(e, "{"); |
| txtenc_endfield(e); |
| |
| // EpsCopyInputStream can't back up, so create a sub-stream for the |
| // speculative parse. |
| upb_EpsCopyInputStream sub_stream; |
| const char* sub_ptr = upb_EpsCopyInputStream_GetAliasedPtr(stream, ptr); |
| upb_EpsCopyInputStream_Init(&sub_stream, &sub_ptr, size, true); |
| |
| e->indent_depth++; |
| if (txtenc_unknown(e, sub_ptr, &sub_stream, -1)) { |
| ptr = upb_EpsCopyInputStream_Skip(stream, ptr, size); |
| e->indent_depth--; |
| txtenc_indent(e); |
| txtenc_putstr(e, "}"); |
| } else { |
| // Didn't work out, print as raw bytes. |
| e->indent_depth--; |
| e->ptr = start; |
| e->overflow = start_overflow; |
| const char* str = ptr; |
| ptr = upb_EpsCopyInputStream_ReadString(stream, &str, size, NULL); |
| UPB_ASSERT(ptr); |
| txtenc_bytes(e, (upb_StringView){.data = str, .size = size}); |
| } |
| break; |
| } |
| case kUpb_WireType_StartGroup: |
| txtenc_putstr(e, "{"); |
| txtenc_endfield(e); |
| e->indent_depth++; |
| CHK(ptr = txtenc_unknown(e, ptr, stream, |
| upb_WireReader_GetFieldNumber(tag))); |
| e->indent_depth--; |
| txtenc_indent(e); |
| txtenc_putstr(e, "}"); |
| break; |
| default: |
| return NULL; |
| } |
| txtenc_endfield(e); |
| } |
| |
| return end_group == 0 && !upb_EpsCopyInputStream_IsError(stream) ? ptr : NULL; |
| } |
| |
| #undef CHK |
| |
| static void txtenc_msg(txtenc* e, const upb_Message* msg, |
| const upb_MessageDef* m) { |
| size_t iter = kUpb_Message_Begin; |
| const upb_FieldDef* f; |
| upb_MessageValue val; |
| |
| while (upb_Message_Next(msg, m, e->ext_pool, &f, &val, &iter)) { |
| if (upb_FieldDef_IsMap(f)) { |
| txtenc_map(e, val.map_val, f); |
| } else if (upb_FieldDef_IsRepeated(f)) { |
| txtenc_array(e, val.array_val, f); |
| } else { |
| txtenc_field(e, val, f); |
| } |
| } |
| |
| if ((e->options & UPB_TXTENC_SKIPUNKNOWN) == 0) { |
| size_t size; |
| const char* ptr = upb_Message_GetUnknown(msg, &size); |
| if (size != 0) { |
| char* start = e->ptr; |
| upb_EpsCopyInputStream stream; |
| upb_EpsCopyInputStream_Init(&stream, &ptr, size, true); |
| if (!txtenc_unknown(e, ptr, &stream, -1)) { |
| /* Unknown failed to parse, back up and don't print it at all. */ |
| e->ptr = start; |
| } |
| } |
| } |
| } |
| |
| size_t txtenc_nullz(txtenc* e, size_t size) { |
| size_t ret = e->ptr - e->buf + e->overflow; |
| |
| if (size > 0) { |
| if (e->ptr == e->end) e->ptr--; |
| *e->ptr = '\0'; |
| } |
| |
| return ret; |
| } |
| |
| size_t upb_TextEncode(const upb_Message* msg, const upb_MessageDef* m, |
| const upb_DefPool* ext_pool, int options, char* buf, |
| size_t size) { |
| txtenc e; |
| |
| e.buf = buf; |
| e.ptr = buf; |
| e.end = UPB_PTRADD(buf, size); |
| e.overflow = 0; |
| e.indent_depth = 0; |
| e.options = options; |
| e.ext_pool = ext_pool; |
| _upb_mapsorter_init(&e.sorter); |
| |
| txtenc_msg(&e, msg, m); |
| _upb_mapsorter_destroy(&e.sorter); |
| return txtenc_nullz(&e, size); |
| } |