protozero: add filtering bytecode v2 with support for string filtering

This CL introduces a new bytecode v2 format which adds a new opcode
for filtering strings in protos. This CL does not implement the actual
filtering logic yet, instead making the field a noop at the moment.

Bug: 283246642
Change-Id: I4e1dfbeafa7b7728faf3dc063f868c9a36f1672a
diff --git a/protos/perfetto/config/perfetto_config.proto b/protos/perfetto/config/perfetto_config.proto
index 5fbcf8d..14dac10 100644
--- a/protos/perfetto/config/perfetto_config.proto
+++ b/protos/perfetto/config/perfetto_config.proto
@@ -3170,9 +3170,19 @@
   // design.
   //
   // Introduced in Android S, but it was broken (b/195065199). Reintroduced in
-  // Android T with a different field number.
+  // Android T with a different field number. Updated in Android U with a new
+  // bytecode version which supports string filtering.
   message TraceFilter {
+    // =========================
+    // Filter bytecode.
+    // =========================
+
+    // The bytecode as implemented in Android T.
     optional bytes bytecode = 1;
+
+    // The bytecode as implemented in Android U. Adds support for string
+    // filtering.
+    optional bytes bytecode_v2 = 2;
   }
   // old field number for trace_filter
   reserved 32;
diff --git a/protos/perfetto/config/trace_config.proto b/protos/perfetto/config/trace_config.proto
index 67c5f3d..05d4520 100644
--- a/protos/perfetto/config/trace_config.proto
+++ b/protos/perfetto/config/trace_config.proto
@@ -491,9 +491,19 @@
   // design.
   //
   // Introduced in Android S, but it was broken (b/195065199). Reintroduced in
-  // Android T with a different field number.
+  // Android T with a different field number. Updated in Android U with a new
+  // bytecode version which supports string filtering.
   message TraceFilter {
+    // =========================
+    // Filter bytecode.
+    // =========================
+
+    // The bytecode as implemented in Android T.
     optional bytes bytecode = 1;
+
+    // The bytecode as implemented in Android U. Adds support for string
+    // filtering.
+    optional bytes bytecode_v2 = 2;
   }
   // old field number for trace_filter
   reserved 32;
diff --git a/protos/perfetto/trace/perfetto_trace.proto b/protos/perfetto/trace/perfetto_trace.proto
index b93244d..a6c4823 100644
--- a/protos/perfetto/trace/perfetto_trace.proto
+++ b/protos/perfetto/trace/perfetto_trace.proto
@@ -3170,9 +3170,19 @@
   // design.
   //
   // Introduced in Android S, but it was broken (b/195065199). Reintroduced in
-  // Android T with a different field number.
+  // Android T with a different field number. Updated in Android U with a new
+  // bytecode version which supports string filtering.
   message TraceFilter {
+    // =========================
+    // Filter bytecode.
+    // =========================
+
+    // The bytecode as implemented in Android T.
     optional bytes bytecode = 1;
+
+    // The bytecode as implemented in Android U. Adds support for string
+    // filtering.
+    optional bytes bytecode_v2 = 2;
   }
   // old field number for trace_filter
   reserved 32;
diff --git a/src/protozero/filtering/filter_bytecode_common.h b/src/protozero/filtering/filter_bytecode_common.h
index 9c0e6f2..02a6cd0 100644
--- a/src/protozero/filtering/filter_bytecode_common.h
+++ b/src/protozero/filtering/filter_bytecode_common.h
@@ -36,7 +36,14 @@
   // (without any shifting) is the index of the filter that should be used to
   // recurse into the nested message.
   kFilterOpcode_NestedField = 3,
+
+  // The imediate value is the id of the allowed field. The behaviour of this
+  // opcode is the same as kFilterOpcode_SimpleField, with the further semantic
+  // that the field is a string and needs to be processed using the string
+  // filtering fules.
+  kFilterOpcode_FilterString = 4,
 };
+
 }  // namespace protozero
 
 #endif  // SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_COMMON_H_
diff --git a/src/protozero/filtering/filter_bytecode_generator.cc b/src/protozero/filtering/filter_bytecode_generator.cc
index b8a059a..487534b 100644
--- a/src/protozero/filtering/filter_bytecode_generator.cc
+++ b/src/protozero/filtering/filter_bytecode_generator.cc
@@ -43,6 +43,14 @@
   endmessage_called_ = false;
 }
 
+// Allows a string field which needs to be rewritten using the given chain.
+void FilterBytecodeGenerator::AddFilterStringField(uint32_t field_id) {
+  PERFETTO_CHECK(field_id > last_field_id_);
+  bytecode_.push_back(field_id << 3 | kFilterOpcode_FilterString);
+  last_field_id_ = field_id;
+  endmessage_called_ = false;
+}
+
 // Allows a range of simple fields. |range_start| is the id of the first field
 // in range, |range_len| the number of fields in the range.
 // AddSimpleFieldRange(N,1) is semantically equivalent to AddSimpleField(N).
diff --git a/src/protozero/filtering/filter_bytecode_generator.h b/src/protozero/filtering/filter_bytecode_generator.h
index 809799c..ecff87e 100644
--- a/src/protozero/filtering/filter_bytecode_generator.h
+++ b/src/protozero/filtering/filter_bytecode_generator.h
@@ -43,6 +43,9 @@
   // Allows a simple field (varint, fixed32/64, string or bytes).
   void AddSimpleField(uint32_t field_id);
 
+  // Allows a string field which needs to be filtered.
+  void AddFilterStringField(uint32_t field_id);
+
   // Allows a range of simple fields. |range_start| is the id of the first field
   // in range, |range_len| the number of fields in the range.
   // AddSimpleFieldRange(N,1) is semantically equivalent to AddSimpleField(N)
diff --git a/src/protozero/filtering/filter_bytecode_parser.cc b/src/protozero/filtering/filter_bytecode_parser.cc
index e8712c7..6c22722 100644
--- a/src/protozero/filtering/filter_bytecode_parser.cc
+++ b/src/protozero/filtering/filter_bytecode_parser.cc
@@ -105,15 +105,19 @@
     }
 
     if (opcode == kFilterOpcode_SimpleField ||
-        opcode == kFilterOpcode_NestedField) {
+        opcode == kFilterOpcode_NestedField ||
+        opcode == kFilterOpcode_FilterString) {
       // Field words are organized as follow:
       // MSB: 1 if allowed, 0 if not allowed.
       // Remaining bits:
       //   Message index in the case of nested (non-simple) messages.
-      //   0x7f..f in the case of simple messages.
+      //   0x7f..e in the case of string fields which need filtering.
+      //   0x7f..f in the case of simple fields.
       uint32_t msg_id;
       if (opcode == kFilterOpcode_SimpleField) {
         msg_id = kSimpleField;
+      } else if (opcode == kFilterOpcode_FilterString) {
+        msg_id = kFilterStringField;
       } else {  // FILTER_OPCODE_NESTED_FIELD
         // The next word in the bytecode contains the message index.
         if (!has_next_word) {
diff --git a/src/protozero/filtering/filter_bytecode_parser.h b/src/protozero/filtering/filter_bytecode_parser.h
index 8744a06..fa07be2 100644
--- a/src/protozero/filtering/filter_bytecode_parser.h
+++ b/src/protozero/filtering/filter_bytecode_parser.h
@@ -20,6 +20,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <optional>
 #include <vector>
 
 namespace protozero {
@@ -55,10 +56,20 @@
     // fixed32/64, string or byte).
     bool simple_field() const { return nested_msg_index == kSimpleField; }
 
+    // If |allowed|==true, specifies if this field is a string field that needs
+    // to be filtered.
+    bool filter_string_field() const {
+      return nested_msg_index == kFilterStringField;
+    }
+
     // If |allowed|==true, specifies if the field is a nested field that needs
     // recursion. The caller is expected to use |nested_msg_index| for the next
     // Query() calls.
-    bool nested_msg_field() const { return nested_msg_index < kSimpleField; }
+    bool nested_msg_field() const {
+      static_assert(kFilterStringField < kSimpleField,
+                    "kFilterStringField < kSimpleField");
+      return nested_msg_index < kFilterStringField;
+    }
   };
 
   // Loads a filter. The filter data consists of a sequence of varints which
@@ -77,6 +88,7 @@
   static constexpr uint32_t kDirectlyIndexLimit = 128;
   static constexpr uint32_t kAllowed = 1u << 31u;
   static constexpr uint32_t kSimpleField = 0x7fffffff;
+  static constexpr uint32_t kFilterStringField = 0x7ffffffe;
 
   bool LoadInternal(const uint8_t* filter_data, size_t len);
 
@@ -101,17 +113,17 @@
   // [N + 6] -> Field state for fields in range 2 (below)
 
   // The "field state" word is as follows:
-  // Bit 31: 0 if the field is disallowed, 1 if allowed.
+  // Bit 31: 1 if the field is allowed, 0 if disallowed.
   //         Only directly indexed fields can be 0 (it doesn't make sense to add
   //         a range and then say "btw it's NOT allowed".. don't add it then.
   //         0 is only used for filling gaps in the directly indexed bucket.
   // Bits [30..0] (only when MSB == allowed):
   //  0x7fffffff: The field is "simple" (varint, fixed32/64, string, bytes) and
   //      can be directly passed through in output. No recursion is needed.
-  //  [0, 7ffffffe]: The field is a nested submessage. The value is the index
+  //  0x7ffffffe: The field is string field which needs to be filtered.
+  //  [0, 7ffffffd]: The field is a nested submessage. The value is the index
   //     that must be passed as first argument to the next Query() calls.
   //     Note that the message index is purely a monotonic counter in the
-  //     filter bytecode, has no proto-equivalent match (unlike field ids).
   std::vector<uint32_t> words_;
 
   // One entry for each message index stored in the filter plus a sentinel at
diff --git a/src/protozero/filtering/message_filter.cc b/src/protozero/filtering/message_filter.cc
index fe5e1c5..ff283f8 100644
--- a/src/protozero/filtering/message_filter.cc
+++ b/src/protozero/filtering/message_filter.cc
@@ -154,11 +154,17 @@
   if (state->eat_next_bytes > 0) {
     // This is the case where the previous tokenizer_.Push() call returned a
     // length delimited message which is NOT a submessage (a string or a bytes
-    // field). We just want to consume it, and pass it through in output
+    // field). We just want to consume it, and pass it through/filter strings
     // if the field was allowed.
     --state->eat_next_bytes;
-    if (state->passthrough_eaten_bytes)
+    if (state->action == StackState::kPassthrough) {
       *(out_++) = octet;
+    } else if (state->action == StackState::kFilterString) {
+      *(out_++) = octet;
+      if (state->eat_next_bytes == 0) {
+        // TODO(lalitm): do the filtering using |filter_string_ptr|.
+      }
+    }
   } else {
     MessageTokenizer::Token token = tokenizer_.Push(octet);
     // |token| will not be valid() in most cases and this is WAI. When pushing
@@ -224,9 +230,16 @@
           } else {
             // A string or bytes field, or a 0 length submessage.
             state->eat_next_bytes = submessage_len;
-            state->passthrough_eaten_bytes = filter.allowed;
-            if (filter.allowed)
+            if (filter.allowed && filter.filter_string_field()) {
+              state->action = StackState::kFilterString;
               AppendLenDelim(token.field_id, submessage_len, &out_);
+              state->filter_string_ptr = out_;
+            } else if (filter.allowed) {
+              state->action = StackState::kPassthrough;
+              AppendLenDelim(token.field_id, submessage_len, &out_);
+            } else {
+              state->action = StackState::kDrop;
+            }
           }
           break;
       }  // switch(type)
@@ -278,7 +291,7 @@
   auto& state = stack_[0];
   state.eat_next_bytes = UINT32_MAX;
   state.in_bytes_limit = UINT32_MAX;
-  state.passthrough_eaten_bytes = false;
+  state.action = StackState::kDrop;
   out_ = out_buf_.get();  // Reset the write pointer.
 }
 
diff --git a/src/protozero/filtering/message_filter.h b/src/protozero/filtering/message_filter.h
index c752b2a..e67467b 100644
--- a/src/protozero/filtering/message_filter.h
+++ b/src/protozero/filtering/message_filter.h
@@ -181,11 +181,20 @@
     uint8_t* size_field = nullptr;
     uint32_t size_field_len = 0;
 
-    // When true the next |eat_next_bytes| are copied as-is in output.
-    // It seems that keeping this field at the end rather than next to
-    // |eat_next_bytes| makes the filter a little (but measurably) faster.
-    // (likely something related with struct layout vs cache sizes).
-    bool passthrough_eaten_bytes = false;
+    // The pointer to the start of the string to update the string if it is
+    // filtered.
+    uint8_t* filter_string_ptr = nullptr;
+
+    // How |eat_next_bytes| should be handled. It seems that keeping this field
+    // at the end rather than next to |eat_next_bytes| makes the filter a little
+    // (but measurably) faster. (likely something related with struct layout vs
+    // cache sizes).
+    enum FilterAction {
+      kDrop,
+      kPassthrough,
+      kFilterString,
+    };
+    FilterAction action = FilterAction::kDrop;
   };
 
   uint32_t out_written() { return static_cast<uint32_t>(out_ - &out_buf_[0]); }