| // Protocol Buffers - Google's data interchange format |
| // Copyright 2008 Google Inc. All rights reserved. |
| // https://developers.google.com/protocol-buffers/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #ifndef GOOGLE_PROTOBUF_PARSE_CONTEXT_H__ |
| #define GOOGLE_PROTOBUF_PARSE_CONTEXT_H__ |
| |
| #include <string> |
| |
| #include <google/protobuf/port.h> |
| |
| #if GOOGLE_PROTOBUF_ENABLE_EXPERIMENTAL_PARSER |
| #include <google/protobuf/io/coded_stream.h> |
| #include <google/protobuf/wire_format_lite.h> |
| #include <google/protobuf/stubs/common.h> |
| #include "third_party/absl/base/optimization.h" |
| #include <google/protobuf/stubs/strutil.h> |
| #include "util/coding/varint.h" |
| |
| #include <google/protobuf/port_def.inc> |
| |
| namespace google { |
| namespace protobuf { |
| |
| class UnknownFieldSet; |
| class DescriptorPool; |
| class MessageFactory; |
| |
| namespace internal { |
| |
| // Template code below needs to know about the existence of these functions. |
| void WriteVarint(uint32 num, uint64 val, std::string* s); |
| void WriteLengthDelimited(uint32 num, StringPiece val, std::string* s); |
| // Inline because it is just forwarding to s->WriteVarint |
| inline void WriteVarint(uint32 num, uint64 val, UnknownFieldSet* s); |
| inline void WriteLengthDelimited(uint32 num, StringPiece val, |
| UnknownFieldSet* s); |
| |
| |
| // ParseContext contains state that needs to be preserved across buffer seams. |
| |
| class ParseContext; |
| |
| // The parser works by composing elementary parse functions, that are generated |
| // by the compiler, together to perform the full parse. To accomplish this the |
| // functionality of the elementary parse function is slightly increased which |
| // allows it to become composable. |
| |
| // The basic abstraction ParseContext is designed for is a slight modification |
| // of the ZeroCopyInputStream (ZCIS) abstraction. A ZCIS presents a serialized |
| // stream as a series of buffers that concatenate to the full stream. |
| // Pictorially a ZCIS presents a stream in chunks like so |
| // [---------------------------------------------------------------] |
| // [---------------------] chunk 1 |
| // [----------------------------] chunk 2 |
| // chunk 3 [--------------] |
| // |
| // Where the '-' represent the bytes which are vertically lined up with the |
| // bytes of the stream. |
// ParseContext requires its input to be presented similarly, with the extra
// property that the last kSlopBytes of a chunk overlap with the first
// kSlopBytes of the next chunk, or if there is no next chunk it is at least
// still valid to read those bytes. Again, pictorially, we now have
| // |
| // [---------------------------------------------------------------] |
| // [-------------------....] chunk 1 |
| // [------------------------....] chunk 2 |
| // chunk 3 [------------------..**] |
| // chunk 4 [--****] |
| // Here '-' mean the bytes of the stream or chunk and '.' means bytes past the |
| // chunk that match up with the start of the next chunk. Above each chunk has |
| // 4 '.' after the chunk. In the case these 'overflow' bytes represents bytes |
| // past the stream, indicated by '*' above, their values are unspecified. It is |
| // still legal to read them (ie. should not segfault). Reading past the |
| // end should be detected by the user and indicated as an error. |
| // |
| // The reason for this, admittedly, unconventional invariant is to ruthlessly |
| // optimize the protobuf parser. Having an overlap helps in two important ways. |
// Firstly, it alleviates having to perform bounds checks if a piece of code
| // will never read more than kSlopBytes. Secondly, and more importantly, the |
| // protobuf wireformat is such that there is always a fresh start of a tag |
| // within kSlopBytes. This allows the parser to exit parsing a chunk leaving |
| // the parse on a position inside the overlap where a fresh tag starts. |
| |
| // The elementary parse function has the following signature |
| |
| typedef const char* (*ParseFunc)(const char* ptr, const char* end, void* object, |
| ParseContext* ctx); |
| |
| // which parses the serialized data stored in the range [ptr, end) into object. |
| // A parse function together with its object forms a callable closure. |
// which parses the serialized data stored in the range [ptr, end) into object.
// A parse function together with its object forms a callable closure.
struct ParseClosure {
  ParseFunc func;  // compiler-generated elementary parse function
  void* object;    // the object (message, string, ...) being parsed into

  // Pre-conditions
  //   ptr < end is a non-empty range where ptr points to the start of a tag
  //   and it's okay to read the bytes in [end, end + kSlopBytes).
  //   Those slop bytes will contain the bytes of the next chunk if the stream
  //   continues, or are undefined, in which case the parse is guaranteed to
  //   fail.
  //
  // Post-conditions
  //   Parses all tag/value pairs starting before end; if a group end tag is
  //   encountered, returns the pointer just after that tag.
  //   If a group end is encountered it is verified against the start-group
  //   tag that was pushed, and the stack is popped.
  //   Otherwise it parses the entire range; if end falls inside one of the
  //   children, those children are pushed on the stack (via ParseContext) so
  //   the parse can resume in the next chunk.
  //
  //   If an element is popped from the stack (it ended on the correct end
  //   group), returns the pointer after the end-group tag (possibly in the
  //   overlap, but the start of the end-group tag is before end).
  //   If the stack is the same or deeper, returns a pointer in the overlap
  //   region (end <= retval < end + kSlopBytes).
  //   All tag/value pairs in [begin, retval) are parsed, and retval points to
  //   the start of a tag.
  const char* operator()(const char* ptr, const char* end, ParseContext* ctx) {
    ABSL_ASSERT(ptr < end);
    return func(ptr, end, object, ctx);
  }
};
| |
| // To fully parse a stream, a driver loop repeatedly calls the parse function |
// at the top of the stack, popping and resuming parsing of the parent message
| // according to the recursive structure of the wireformat. This loop will also |
| // need to provide new buffer chunks and align the ptr correctly over the seams. |
| // The point of this framework is that chunk refresh logic is located in the |
| // outer loop, while the inner loop is almost free of it. The two code paths in |
| // the parse code dealing with seams are located in fallback paths whose checks |
| // are folded with input limit checks that are necessary anyway. In other words, |
| // all the parser code that deals with seams is located in what would otherwise |
| // be error paths of a parser that wouldn't need to deal with seams. |
| |
| class ParseContext { |
| public: |
| enum { |
| // Tag is atmost 5 bytes, varint is atmost 10 resulting in 15 bytes. We |
| // choose |
| // 16 bytes for the obvious reason of alignment. |
| kSlopBytes = 16, |
| // Inlined stack size |
| kInlinedDepth = 15, |
| }; |
| |
| // Arghh!!! here be tech-debt dragons |
| struct ExtraParseData { |
| const DescriptorPool* pool = nullptr; |
| MessageFactory* factory = nullptr; |
| |
| // payload is used for MessageSetItem and maps |
| std::string payload; |
| bool (*parse_map)(const char* begin, const char* end, void* map_field, |
| ParseContext* ctx); |
| |
| void SetEnumValidator(bool (*validator)(int), void* unknown, |
| int field_num) { |
| enum_validator = validator; |
| unknown_fields = unknown; |
| field_number = field_num; |
| } |
| void SetEnumValidatorArg(bool (*validator)(const void*, int), |
| const void* arg, void* unknown, int field_num) { |
| arg_enum_validator = {validator, arg}; |
| unknown_fields = unknown; |
| field_number = field_num; |
| } |
| template <typename Unknown> |
| bool ValidateEnum(int val) const { |
| if (enum_validator(val)) return true; |
| WriteVarint(field_number, val, static_cast<Unknown*>(unknown_fields)); |
| return false; |
| } |
| template <typename Unknown> |
| bool ValidateEnumArg(int val) const { |
| if (arg_enum_validator(val)) return true; |
| WriteVarint(field_number, val, static_cast<Unknown*>(unknown_fields)); |
| return false; |
| } |
| |
| void SetFieldName(const void* name) { |
| unknown_fields = const_cast<void*>(name); |
| } |
| const char* FieldName() const { |
| return static_cast<const char*>(unknown_fields); |
| } |
| |
| union { |
| bool (*enum_validator)(int); |
| struct { |
| bool operator()(int val) const { return validator(arg, val); } |
| bool (*validator)(const void*, int); |
| const void* arg; |
| } arg_enum_validator; |
| }; |
| void* unknown_fields; |
| int field_number; |
| // 0 means no aliasing. If not zero aliasing is the delta between the |
| // ptr and the buffer that needs to be aliased. If the value is |
| // kNoDelta (1) this means delta is actually 0 (we're working directly in |
| // the buffer). |
| enum { kNoDelta = 1 }; |
| std::uintptr_t aliasing = 0; |
| }; |
| |
| ExtraParseData& extra_parse_data() { return extra_parse_data_; } |
| const ExtraParseData& extra_parse_data() const { return extra_parse_data_; } |
| |
| // Helpers to detect if a parse of length delimited field is completed. |
| bool AtLimit() const { return limit_ == 0; } |
| int32 CurrentLimit() const { return limit_; } |
| |
| // Initializes ParseContext with a specific recursion limit (rec_limit) |
| explicit ParseContext(int rec_limit) |
| : depth_(rec_limit), |
| start_depth_(rec_limit), |
| stack_(inline_stack_ + kInlinedDepth - rec_limit), |
| inlined_depth_(std::max(0, rec_limit - kInlinedDepth)) {} |
| |
| ~ParseContext() { |
| if (inlined_depth_ == -1) delete stack_; |
| } |
| |
| void StartParse(ParseClosure parser) { parser_ = parser; } |
| |
| // Parses a chunk of memory given the current state of parse context (ie. |
| // the active parser and stack) and overrun. |
| // Pre-condition: |
| // chunk_ is not empty. |
| // limit_ > 0 (limit from begin) or -1 (no limit) |
| // Post-condition: |
| // returns true on success with overrun_ptr adjusted to the new value, or |
| // false is the parse is finished. False means either a parse failure or |
| // or because the top-level was terminated on a 0 or end-group tag in which |
| // case overrun points to the position after the ending tag. You can call |
| // EndedOnTag() to find if the parse failed due to an error or ended on |
| // terminating tag. |
| bool ParseRange(StringPiece chunk, int* overrun_ptr) { |
| ABSL_ASSERT(!chunk.empty()); |
| int& overrun = *overrun_ptr; |
| if (overrun >= chunk.size()) { |
| // This case can easily happen in patch buffers and we like to inline |
| // this case. |
| overrun -= chunk.size(); |
| return true; |
| } |
| auto res = ParseRangeWithLimit(chunk.begin() + overrun, chunk.end()); |
| overrun = res.second; |
| return res.first; |
| } |
| |
| bool ValidEnd(int overrun) { return depth_ == start_depth_ && overrun == 0; } |
| bool EndedOnTag() const { return last_tag_minus_1_ != 0; } |
| uint32 LastTag() const { return last_tag_minus_1_ + 1; } |
| |
| // Generically verifies for the slop region [begin, begin + kSlopBytes) if |
| // the parse will be terminated by 0 or end-group tag. If true than you can |
| // safely parse the slop region without having to load more data. |
| bool ParseEndsInSlopRegion(const char* begin, int overrun) const; |
| |
| // Should only be called by Parse code. |
| |
| ////////////////////////////////////////////////////////////////////////////// |
| // Fast path helpers. These helpers maintain the state in parse context |
| // through recursive calls. The whole design is to make this as minimal as |
| // possible. Only recursion depth and limit are maintained at every recursion. |
| ////////////////////////////////////////////////////////////////////////////// |
| |
| bool ParseExactRange(ParseClosure parser, const char* begin, |
| const char* end) { |
| if (PROTOBUF_PREDICT_FALSE(--depth_ < 0)) return false; |
| auto old_limit = limit_; |
| limit_ = 0; |
| auto ptr = begin; |
| if (ptr < end) ptr = parser(ptr, end, this); |
| if (ptr != end || EndedOnTag()) return false; |
| limit_ = old_limit; |
| ++depth_; |
| return true; |
| } |
| |
| // Returns a pair of the pointer the parse is left and a boolean indicating |
| // if the group is still continuing. |
| std::pair<const char*, bool> ParseGroup(uint32 tag, ParseClosure parser, |
| const char* begin, const char* end, |
| int* depth) { |
| if (PROTOBUF_PREDICT_FALSE(--depth_ < 0)) return {}; |
| *depth = depth_; |
| auto ptr = begin; |
| if (ptr < end) ptr = parser(ptr, end, this); |
| if (ptr == nullptr) return {}; |
| if (!EndedOnTag()) { |
| // The group hasn't been terminated by an end-group and thus continues, |
| // hence it must have ended because it crossed "end". |
| ABSL_ASSERT(ptr >= end); |
| return {ptr, true}; |
| } |
| // Verify that the terminating tag matches the start group tag. As an extra |
| // subtlety it could have been terminated by an end-group tag but in a |
| // length delimited sub field of the group. So we must also check that depth |
| // matches, if it doesn't match it means a length delimited subfield got |
| // terminated by an end group which is an error. |
| if (tag != last_tag_minus_1_ || *depth != depth_) return {}; |
| last_tag_minus_1_ = 0; // It must always be cleared. |
| ++depth_; |
| return {ptr, false}; |
| } |
| |
| void EndGroup(uint32 tag) { |
| ABSL_ASSERT(tag == 0 || (tag & 7) == 4); |
| // Because of the above assert last_tag_minus_1 is never set to 0, and the |
| // caller can verify the child parser was terminated, by comparing to 0. |
| last_tag_minus_1_ = tag - 1; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////////// |
| // Slow path helper functions when a child crosses the "end" of range. |
| // This is either an error (if limit_ = 0) OR we need to store state. |
| // These functions manage the task of updating the state correctly. |
| ////////////////////////////////////////////////////////////////////////////// |
| |
| // Helper function called by generated code in case of a length delimited |
| // field that is going to cross the boundary. |
| const char* StoreAndTailCall(const char* ptr, const char* end, |
| ParseClosure current_parser, |
| ParseClosure child_parser, int32 size) { |
| // if size was bigger than 2GB we should fail |
| if (size < 0) return nullptr; |
| // At this point ptr could be past end. Hence a malicious size could |
| // overflow. |
| int64 safe_new_limit = size - static_cast<int64>(end - ptr); |
| if (safe_new_limit > INT_MAX) return nullptr; |
| ABSL_ASSERT(safe_new_limit > 0); // only call this if it's crossing end |
| int32 new_limit = static_cast<int32>(safe_new_limit); |
| int32 delta; |
| if (limit_ != -1) { |
| if (PROTOBUF_PREDICT_FALSE(new_limit > limit_)) return nullptr; |
| delta = limit_ - new_limit; |
| } else { |
| delta = -1; // special value |
| } |
| limit_ = new_limit; |
| // Save the current closure on the stack. |
| if (!Push(current_parser, delta)) return nullptr; |
| // Ensure the active state is set correctly. |
| parser_ = child_parser; |
| return ptr < end ? child_parser(ptr, end, this) : ptr; |
| } |
| |
| // Helper function for a child group that has crossed the boundary. |
| bool StoreGroup(ParseClosure current_parser, ParseClosure child_parser, |
| int depth, uint32 tag) { |
| // The group must still read an end-group tag, so it can't be at a limit. |
| // By having this check we ensure that when limit_ = 0 we can't end in some |
| // deeper recursion. Hence ParseExactRange does not need to check for |
| // matching depth. |
| if (limit_ == 0) return false; |
| if (depth == depth_) { |
| // This child group is the active parser. The fast path code assumes |
| // everything will be parsed within a chunk and doesn't modify |
| // parse context in this case. We need to make the child parser active. |
| parser_ = child_parser; |
| } |
| if (ABSL_PREDICT_FALSE(depth < inlined_depth_)) SwitchStack(); |
| stack_[depth] = {current_parser, static_cast<int32>(~(tag >> 3))}; |
| return true; |
| } |
| |
| private: |
| // This the "active" or current parser. |
| ParseClosure parser_; |
| // The context keeps an internal stack to keep track of the recursive |
| // part of the parse state. |
| // Current depth of the active parser, depth counts down. |
| // This is used to limit recursion depth (to prevent overflow on malicious |
| // data), but is also used to index in stack_ to store the current state. |
| int depth_; |
| int32 limit_ = -1; |
| |
| // A state is on the stack to save it, in order to continue parsing after |
| // child is done. |
| struct State { |
| ParseClosure parser; |
| // This element describes how to adjust the parse state after finishing |
| // the child. If the child was a length delimited field, delta describes |
| // the limit relative to the child's limit (hence >= 0). |
| // If child was a sub group limit contains ~field num (hence < 0) in order |
| // to verify the group ended on a correct end tag. No limit adjusting. |
| // Note above the sign of delta is meaningful |
| int32 delta_or_group_num; |
| }; |
| int start_depth_; |
| // This is used to return the end group (or 0 tag) that terminated the parse. |
| // Actually it contains last_tag minus 1. Which is either the start group tag |
| // or -1. This member should always be zero and the caller should immediately |
| // check this member to verify what state the parser ended on and clear its |
| // value. |
| uint32 last_tag_minus_1_ = 0; |
| |
| ExtraParseData extra_parse_data_; |
| State* stack_; |
| State inline_stack_[kInlinedDepth]; |
| int inlined_depth_; |
| |
| bool Push(ParseClosure parser, int32 delta) { |
| ABSL_ASSERT(delta >= -1); // Make sure it's a valid len-delim |
| if (PROTOBUF_PREDICT_FALSE(--depth_ < 0)) return false; |
| if (ABSL_PREDICT_FALSE(depth_ < inlined_depth_)) SwitchStack(); |
| stack_[depth_] = {parser, delta}; |
| return true; |
| } |
| |
| State Pop() { return stack_[depth_++]; } |
| |
| void SwitchStack(); |
| |
| // Parses a chunk of memory given the current state of parse context (ie. |
| // the active parser and stack). |
| // Pre-condition: |
| // begin < end (non-empty range) |
| // limit_ > 0 (limit from begin) or -1 (no limit) |
| // Post-condition: |
| // returns either (true, overrun) for a successful parse that can continue, |
| // or (false, overrun) for a parse that can't continue. Either due to a |
| // corrupt data (parse failure) or because the top-level was terminated on a |
| // 0 or end-group tag in which case overrun points to the position after the |
| // end. |
| std::pair<bool, int> ParseRangeWithLimit(const char* begin, const char* end); |
| }; |
| |
| // This is wrapper to parse a sequence of buffers without the overlap property, |
| // like the sequence given by ZeroCopyInputStream (ZCIS) or ByteSource. This is |
| // done by copying data around the seams, hence the name EpsCopyParser. |
| // Pictorially if ZCIS presents a stream in chunks like so |
| // [---------------------------------------------------------------] |
| // [---------------------] chunk 1 |
| // [----------------------------] chunk 2 |
| // chunk 3 [--------------] |
// where '-' depicts bytes of the stream or chunks vertically aligned with the
| // corresponding bytes between stream and chunk. |
| // |
| // This class will present chunks to the ParseContext like this |
| // [-----------------....] chunk 1 |
| // [----....] patch |
| // [------------------------....] chunk 2 |
| // [----....] patch |
| // chunk 3 [----------....] |
| // patch [----****] |
| // by using a fixed size buffer to patch over the seams. This requires |
// copying of an "epsilon" neighborhood around the seams. In the picture above
// dots mean bytes beyond the end of the new chunks. Each chunk is kSlopBytes
// smaller than its original chunk (above depicted as 4 dots) and the number
// of chunks is doubled because each seam in the original stream introduces a
// new patch.
| // |
| // The algorithm is simple but not entirely trivial. Two complications arise |
| // 1) The original chunk could be less than kSlopBytes. Hence we can't simply |
| // chop the last kSlopBytes of a chunk. |
// 2) In some (infrequent) use cases, we don't necessarily parse until the end
| // of a stream, but instead the parse is terminated by 0 or end-group tag. If |
| // this is allowed we must take care to leave the underlying stream at a |
| // position precisely after the terminating tag. If this happens in the slop |
| // region of a buffer we will already have loaded the next buffer. Not all |
| // streams allow backing up to a previous buffer blocking us from leaving the |
| // stream in the proper state. If terminating on 0 is allowed (in the old parser |
| // this means a call to MergePartialFromCodedStream without a subsequent call to |
| // ConsumedEntireMessage), this algorithm needs to ensure the parse won't end |
| // in the slop region before moving the next buffer. |
| // |
| // The core idea of EpsCopyParser is to parse ranges except the last kSlopBytes |
| // and store those in the patch buffer, until the next parse provides additional |
| // data to fill the slop region. So parsing a range means first parsing the slop |
| // bytes of the previous range using the new range to provide slop bytes for the |
| // patch, followed by parsing the actual range except the last kSlopBytes and |
| // store those. If no more data is available a call to Done finishes the parse |
| // by parsing the remaining slopbytes. |
| // |
| // In order to deal with problem 1, we need to deal with the case that a new |
| // chunk can be less or equal than kSlopBytes big. We can just copy the chunk |
| // to the end and return (buffer, chunk->size). Pictorially |
| // [--------] chunk 1 |
| // [--] chunk 2 |
| // [---] chunk 3 |
| // will become |
| // [----....] chunk 1 |
| // [--....] patch (not full range of the patch buffer, only two hyphens) |
| // [--] chunk 2 (too small so never parsed directly) |
| // [---....] patch (not full range of the buffer, only three hyphens) |
| // [---] chunk 3 (too small so never parsed directly) |
| // [----****] patch (full range, last bytes are garbage) |
| // Because of this the source (the dots in above) can overlap with the |
| // destination buffer and so we have to use memmove. |
| // |
| // To solve problem 2, we use a generic parser together with knowledge of the |
| // nesting from the side stack to verify if the parse will be terminated in the |
| // slop region. If it terminates inside the slop region, we just parse it as |
| // well. See ParseEndsInSlopRegion in ParseContext for the implementation. This |
| // is only done if ensure_non_negative_skip is true, if it's false Skip() could |
| // return a negative number. |
template <bool ensure_non_negative_skip>
class EpsCopyParser {
 public:
  // Registers the root parser with the context. The context's lifetime must
  // exceed this object's; EpsCopyParser does not own it.
  EpsCopyParser(ParseClosure parser, ParseContext* ctx) : ctx_(ctx) {
    ctx_->StartParse(parser);
  }

  // Parse the bytes as provided by the non-empty range.
  // Returns true on a successful parse ready to accept more data; if there is
  // no more data call Done() to finish the parse.
  // Returns false if the parse is terminated. Termination is either due to a
  // parse error or due to termination on an end-group or 0 tag. You can call
  // EndedOnTag() on the underlying ParseContext to find out if the parse ended
  // correctly on a terminating tag.
  bool Parse(StringPiece range) {
    ABSL_ASSERT(!range.empty());
    auto size = range.size();
    if (size > kSlopBytes) {
      // The buffer is large enough to be able to parse the (size - kSlopBytes)
      // prefix directly. However we still need to parse the data in buffer_,
      // that holds the slop region of the previous buffer, patched with the
      // first kSlopBytes of this range.
      if (overrun_ == kSlopBytes) {
        // We overran the whole slop region of the previous buffer.
        // Optimization: we can skip the patch buffer entirely.
        overrun_ = 0;
      } else {
        // Fill the second half of the patch buffer from the new range, then
        // parse the leftover slop bytes of the previous buffer.
        std::memcpy(buffer_ + kSlopBytes, range.begin(), kSlopBytes);
        if (!ParseRange({buffer_, kSlopBytes}, 0)) return false;
      }
      // Hold back the last kSlopBytes; they become the next patch region.
      range.remove_suffix(kSlopBytes);
    } else {
      // Chunk is too small to parse directly (problem 1 in the class
      // comment): stash it entirely in the patch buffer.
      std::memcpy(buffer_ + kSlopBytes, range.begin(), size);
      range = {buffer_, size};
    }
    // NOTE(review): size is unsigned, so when size <= kSlopBytes the delta
    // below relies on unsigned wraparound converting back to a negative int —
    // presumably intentional, but worth confirming.
    if (!ParseRange(range, size - kSlopBytes)) return false;
    // Source and destination may overlap when range points into buffer_
    // (small-chunk case above), hence memmove rather than memcpy.
    std::memmove(buffer_, range.end(), kSlopBytes);
    if (ensure_non_negative_skip &&
        ctx_->ParseEndsInSlopRegion(buffer_, overrun_)) {
      // We care about leaving the stream at the right place and the stream
      // will indeed terminate, so just parse it (problem 2 in the class
      // comment).
      auto res = ParseRange({buffer_, kSlopBytes}, size);
      ABSL_ASSERT(!res);  // a terminating parse must report "can't continue"
      return false;
    }
    return true;
  }

  // Finish the parse by parsing the remaining slop data and verify success.
  bool Done() {
    return ParseRange({buffer_, kSlopBytes}, 0) && ctx_->ValidEnd(overrun_);
  }

  // If the parse was terminated by an end-group or 0 tag, Skip returns the
  // offset where the parse left off relative to the start of the last range
  // parsed.
  // NOTE: This could be negative unless ensure_non_negative_skip is true.
  int Skip() {
    // The reason for ensure_non_negative_skip and ParseEndsInSlopRegion is
    // that the following assert holds, which implies the stream doesn't need
    // to back up.
    ABSL_ASSERT(!ensure_non_negative_skip || overrun_ >= 0);
    return overrun_;
  }

 private:
  constexpr static int kSlopBytes = ParseContext::kSlopBytes;
  // overrun_ stores where in the slop region of the previous parse the parse
  // was left off. This is used to start the parse of the next region at the
  // correct point. Initially overrun_ is set to kSlopBytes, which means
  // the parse starts at precisely the beginning of the new buffer provided.
  int overrun_ = kSlopBytes;
  // The first kSlopBytes of buffer_ contains the slop region of the previously
  // parsed range; the second half is patched in from the next range.
  char buffer_[2 * kSlopBytes] = {};
  ParseContext* ctx_;  // not owned

  // Forwards to ParseContext::ParseRange; on termination, delta adjusts
  // overrun_ so it is relative to the start of the last caller-provided range.
  bool ParseRange(StringPiece range, int delta) {
    auto res = ctx_->ParseRange(range, &overrun_);
    if (!res) overrun_ += delta;
    return res;
  }
};
| |
| // Add any of the following lines to debug which parse function is failing. |
| |
| #define GOOGLE_PROTOBUF_ASSERT_RETURN(predicate, ret) \ |
| if (!(predicate)) { \ |
| /* raise(SIGINT); */ \ |
| /* GOOGLE_LOG(ERROR) << "Parse failure"; */ \ |
| return ret; \ |
| } |
| |
| #define GOOGLE_PROTOBUF_PARSER_ASSERT(predicate) \ |
| GOOGLE_PROTOBUF_ASSERT_RETURN(predicate, nullptr) |
| |
// Parses a single tag/value pair starting at 'begin' and dispatches it to
// field_parser (a policy object with AddVarint/AddFixed64/AddLengthDelimited/
// StartGroup/EndGroup/AddFixed32 members).
// Returns {ptr, false} to continue the loop at ptr, or {ptr, true} when the
// caller must stop: either the parse ended on a 0/end-group tag, or a
// length-delimited field / group crossed the chunk boundary and its state was
// stored in ctx. A nullptr first element signals a parse error.
// NOTE: the fixed32/fixed64 reads below are unconditional; they rely on the
// kSlopBytes overlap invariant making reads past 'end' legal.
template <typename T>
std::pair<const char*, bool> FieldParser(uint64 tag, ParseClosure parent,
                                         T field_parser, const char* begin,
                                         const char* end, ParseContext* ctx) {
  auto ptr = begin;
  uint32 number = tag >> 3;
  if (ABSL_PREDICT_FALSE(number == 0)) {
    // Field number 0 is only legal as the lone 0 byte (terminator).
    GOOGLE_PROTOBUF_ASSERT_RETURN(tag == 0, {});
    // Special case scenario of 0 termination.
    ctx->EndGroup(tag);
    return {ptr, true};
  }
  using WireType = internal::WireFormatLite::WireType;
  switch (tag & 7) {  // low 3 bits of the tag encode the wire type
    case WireType::WIRETYPE_VARINT: {
      uint64 value;
      ptr = Varint::Parse64(ptr, &value);
      GOOGLE_PROTOBUF_ASSERT_RETURN(ptr != nullptr, {});
      field_parser.AddVarint(number, value);
      break;
    }
    case WireType::WIRETYPE_FIXED64: {
      uint64 value = io::UnalignedLoad<uint64>(ptr);
      ptr += 8;
      field_parser.AddFixed64(number, value);
      break;
    }
    case WireType::WIRETYPE_LENGTH_DELIMITED: {
      uint32 size;
      ptr = Varint::Parse32(ptr, &size);
      GOOGLE_PROTOBUF_ASSERT_RETURN(ptr != nullptr, {});
      ParseClosure child = field_parser.AddLengthDelimited(number, size);
      if (size > end - ptr) {
        // Field crosses the chunk boundary: save state and tail-call into the
        // child; the caller must stop iterating this chunk.
        return {ctx->StoreAndTailCall(ptr, end, parent, child, size), true};
      }
      // Field fits in this chunk: parse it exactly and continue after it.
      auto newend = ptr + size;
      GOOGLE_PROTOBUF_ASSERT_RETURN(ctx->ParseExactRange(child, ptr, newend),
                                    {});
      ptr = newend;
      break;
    }
    case WireType::WIRETYPE_START_GROUP: {
      int depth;
      ParseClosure child = field_parser.StartGroup(number);
      auto res = ctx->ParseGroup(tag, child, ptr, end, &depth);
      ptr = res.first;
      GOOGLE_PROTOBUF_ASSERT_RETURN(ptr != nullptr, {});
      if (res.second) {
        // Group crossed the boundary; store it for resumption and stop.
        GOOGLE_PROTOBUF_ASSERT_RETURN(
            ctx->StoreGroup(parent, child, depth, tag), {});
        return {ptr, true};
      }
      break;
    }
    case WireType::WIRETYPE_END_GROUP: {
      // Terminates the enclosing group; validity is checked by the caller
      // (ParseGroup compares the recorded tag).
      field_parser.EndGroup(number);
      ctx->EndGroup(tag);
      return {ptr, true};
    }
    case WireType::WIRETYPE_FIXED32: {
      uint32 value = io::UnalignedLoad<uint32>(ptr);
      ptr += 4;
      field_parser.AddFixed32(number, value);
      break;
    }
    default:
      // Wire types 6 and 7 are not defined.
      GOOGLE_PROTOBUF_ASSERT_RETURN(false, {});
  }
  ABSL_ASSERT(ptr != nullptr);
  return {ptr, false};
}
| |
| template <typename T> |
| const char* WireFormatParser(ParseClosure parent, T field_parser, |
| const char* begin, const char* end, |
| ParseContext* ctx) { |
| auto ptr = begin; |
| while (ptr < end) { |
| uint32 tag; |
| ptr = Varint::Parse32(ptr, &tag); |
| GOOGLE_PROTOBUF_PARSER_ASSERT(ptr != nullptr); |
| auto res = FieldParser(tag, parent, field_parser, ptr, end, ctx); |
| ptr = res.first; |
| GOOGLE_PROTOBUF_PARSER_ASSERT(ptr != nullptr); |
| if (res.second) return ptr; |
| } |
| return ptr; |
| } |
| |
// Here are the elementary parsers for length delimited subfields that contain
// plain data (ie not a protobuf). These are trivial as they don't recurse,
// except for the UnknownGroupLiteParse that parses a group into a string.
// Some functions need extra arguments that the function signature allows,
// these are passed through variables in ParseContext::ExtraParseData that the
// caller needs to set prior to the call.
//
// All parsers below share the same shape:
//   const char* Parser(const char* begin, const char* end, void* object,
//                      ParseContext* ctx);
// where [begin, end) is the input span and `object` points to the field
// being populated. NOTE(review): judging by the
// GOOGLE_PROTOBUF_PARSER_ASSERT(ptr != nullptr) checks in WireFormatParser, a
// nullptr return appears to signal parse failure — confirm against the
// definitions.

// The null parser does not do anything, but is useful as a substitute.
const char* NullParser(const char* begin, const char* end, void* object,
                       ParseContext*);

// Helper for verification of utf8
bool VerifyUTF8(StringPiece s, ParseContext* ctx);
// All the string parsers with or without UTF checking and for all CTypes
// (std::string, Cord, StringPiece). The UTF8 variants check, the
// UTF8Verify variants verify (via VerifyUTF8), the plain ones do neither.
const char* StringParser(const char* begin, const char* end, void* object,
                         ParseContext*);
const char* CordParser(const char* begin, const char* end, void* object,
                       ParseContext*);
const char* StringPieceParser(const char* begin, const char* end, void* object,
                              ParseContext*);
const char* StringParserUTF8(const char* begin, const char* end, void* object,
                             ParseContext*);
const char* CordParserUTF8(const char* begin, const char* end, void* object,
                           ParseContext*);
const char* StringPieceParserUTF8(const char* begin, const char* end,
                                  void* object, ParseContext*);
const char* StringParserUTF8Verify(const char* begin, const char* end,
                                   void* object, ParseContext*);
const char* CordParserUTF8Verify(const char* begin, const char* end,
                                 void* object, ParseContext*);
const char* StringPieceParserUTF8Verify(const char* begin, const char* end,
                                        void* object, ParseContext*);
// Parsers that also eat the slopbytes if possible. Can only be called in a
// ParseContext where limit_ is set properly.
const char* GreedyStringParser(const char* begin, const char* end, void* object,
                       ParseContext*);
const char* GreedyStringParserUTF8(const char* begin, const char* end, void* object,
                       ParseContext*);
const char* GreedyStringParserUTF8Verify(const char* begin, const char* end,
                                   void* object, ParseContext*);

// This is the only recursive parser.
const char* UnknownGroupLiteParse(const char* begin, const char* end,
                                  void* object, ParseContext* ctx);
// This is a helper for the UnknownGroupLiteParse but is actually also
// useful in the generated code. It uses overload on string* vs
// UnknownFieldSet* to make the generated code isomorphic between full and lite.
std::pair<const char*, bool> UnknownFieldParse(uint32 tag, ParseClosure parent,
                                               const char* begin,
                                               const char* end, std::string* unknown,
                                               ParseContext* ctx);

// The packed parsers parse repeated numeric primitives directly into the
// corresponding field

// These are packed varints (int32/int64/uint32/uint64/sint32/sint64/bool).
const char* PackedInt32Parser(const char* begin, const char* end, void* object,
                              ParseContext* ctx);
const char* PackedUInt32Parser(const char* begin, const char* end, void* object,
                               ParseContext* ctx);
const char* PackedInt64Parser(const char* begin, const char* end, void* object,
                              ParseContext* ctx);
const char* PackedUInt64Parser(const char* begin, const char* end, void* object,
                               ParseContext* ctx);
const char* PackedSInt32Parser(const char* begin, const char* end, void* object,
                               ParseContext* ctx);
const char* PackedSInt64Parser(const char* begin, const char* end, void* object,
                               ParseContext* ctx);
const char* PackedBoolParser(const char* begin, const char* end, void* object,
                             ParseContext* ctx);

// Enums in proto3 do not require verification
const char* PackedEnumParser(const char* begin, const char* end, void* object,
                             ParseContext* ctx);
// Enums in proto2 require verification. So an additional verification function
// needs to be passed into ExtraParseData.
// If it's a generated verification function we only need the function pointer.
const char* PackedValidEnumParserLite(const char* begin, const char* end,
                                      void* object, ParseContext* ctx);
// If it's reflective we need a function that takes an additional argument.
const char* PackedValidEnumParserLiteArg(const char* begin, const char* end,
                                         void* object, ParseContext* ctx);

// These are the packed fixed field parsers
// (fixed32/sfixed32/fixed64/sfixed64/float/double).
const char* PackedFixed32Parser(const char* begin, const char* end,
                                void* object, ParseContext* ctx);
const char* PackedSFixed32Parser(const char* begin, const char* end,
                                 void* object, ParseContext* ctx);
const char* PackedFixed64Parser(const char* begin, const char* end,
                                void* object, ParseContext* ctx);
const char* PackedSFixed64Parser(const char* begin, const char* end,
                                 void* object, ParseContext* ctx);
const char* PackedFloatParser(const char* begin, const char* end, void* object,
                              ParseContext* ctx);
const char* PackedDoubleParser(const char* begin, const char* end, void* object,
                               ParseContext* ctx);

// Map key/values are stored in a MapEntry length delimited field. If this
// crosses a seam we fallback to first store in payload. The object points
// to a MapField in which we parse the payload upon done (we detect this when
// this function is called with limit_ == 0), by calling parse_map (also stored
// in ctx) on the resulting string.
const char* SlowMapEntryParser(const char* begin, const char* end, void* object,
                               internal::ParseContext* ctx);
| |
| } // namespace internal |
| } // namespace protobuf |
| } // namespace google |
| |
| #include <google/protobuf/port_undef.inc> |
| |
| #endif // GOOGLE_PROTOBUF_ENABLE_EXPERIMENTAL_PARSER |
| #endif // GOOGLE_PROTOBUF_PARSE_CONTEXT_H__ |