tp: optimize SelectBvWithIv function in RowMap

This function will be important for go/perfetto-cpp-views so add an
extra algorithm which will kick in when the selector is "small".

To find the value of the threshold constant, add some algorithm
benchmarks which calculate the ratio where one algorithm is faster than
the other.

Also while I'm here, add some non-controversial optimizations to RowMap
(true fast-paths).

Bug: 220373202
Doc: go/perfetto-cpp-views
Change-Id: Ib4d5d18bb1843d6836927586da0aa1c0fa95b6c7
diff --git a/BUILD b/BUILD
index a981ee3..d9cfb6b 100644
--- a/BUILD
+++ b/BUILD
@@ -952,6 +952,7 @@
         "src/trace_processor/containers/null_term_string_view.h",
         "src/trace_processor/containers/nullable_vector.h",
         "src/trace_processor/containers/row_map.h",
+        "src/trace_processor/containers/row_map_algorithms.h",
         "src/trace_processor/containers/string_pool.h",
     ],
     deps = [
diff --git a/src/trace_processor/containers/BUILD.gn b/src/trace_processor/containers/BUILD.gn
index 1f843e1..1e92c62 100644
--- a/src/trace_processor/containers/BUILD.gn
+++ b/src/trace_processor/containers/BUILD.gn
@@ -26,6 +26,7 @@
     "null_term_string_view.h",
     "nullable_vector.h",
     "row_map.h",
+    "row_map_algorithms.h",
     "string_pool.h",
   ]
   sources = [
@@ -69,6 +70,7 @@
     sources = [
       "bit_vector_benchmark.cc",
       "nullable_vector_benchmark.cc",
+      "row_map_algorithms_benchmark.cc",
       "row_map_benchmark.cc",
     ]
   }
diff --git a/src/trace_processor/containers/bit_vector.h b/src/trace_processor/containers/bit_vector.h
index 5f9c27a..06297f0 100644
--- a/src/trace_processor/containers/bit_vector.h
+++ b/src/trace_processor/containers/bit_vector.h
@@ -51,7 +51,7 @@
   explicit BitVector(std::initializer_list<bool> init);
 
   // Creates a bitvector of |count| size filled with |value|.
-  BitVector(uint32_t count, bool value = false);
+  explicit BitVector(uint32_t count, bool value = false);
 
   // Enable moving bitvectors as they have no unmovable state.
   BitVector(BitVector&&) noexcept = default;
diff --git a/src/trace_processor/containers/row_map.cc b/src/trace_processor/containers/row_map.cc
index ceacf95..16c476a 100644
--- a/src/trace_processor/containers/row_map.cc
+++ b/src/trace_processor/containers/row_map.cc
@@ -16,6 +16,8 @@
 
 #include "src/trace_processor/containers/row_map.h"
 
+#include "src/trace_processor/containers/row_map_algorithms.h"
+
 namespace perfetto {
 namespace trace_processor {
 
@@ -77,7 +79,13 @@
   PERFETTO_DCHECK(selector_start <= selector_end);
   PERFETTO_DCHECK(selector_end <= bv.GetNumBitsSet());
 
+  // If we're simply selecting every element in the bitvector, just
+  // return a copy of the BitVector without iterating.
   BitVector ret = bv.Copy();
+  if (selector_start == 0 && selector_end == bv.GetNumBitsSet()) {
+    return RowMap(std::move(ret));
+  }
+
   for (auto it = ret.IterateSetBits(); it; it.Next()) {
     auto set_idx = it.ordinal();
     if (set_idx < selector_start || set_idx >= selector_end)
@@ -94,12 +102,32 @@
 
 RowMap SelectBvWithIv(const BitVector& bv,
                       const std::vector<uint32_t>& selector) {
-  std::vector<uint32_t> iv(selector.size());
-  for (uint32_t i = 0; i < selector.size(); ++i) {
-    // TODO(lalitm): this is pretty inefficient.
-    iv[i] = bv.IndexOfNthSet(selector[i]);
+  // The value of this constant was found by considering the benchmarks
+  // |BM_SelectBvWithIvByConvertToIv| and |BM_SelectBvWithIvByIndexOfNthSet|.
+  //
+  // We use this to find the ratio between |bv.GetNumBitsSet()| and
+  // |selector.size()| where |SelectBvWithIvByIndexOfNthSet| was found to be
+  // faster than |SelectBvWithIvByConvertToIv|.
+  //
+  // Note: as of writing this, the benchmarks do not take into account the fill
+  // ratio of the BitVector; they assume 50% rate which almost never happens in
+  // practice. In the future, we could also take this into account (by
+  // considering the ratio between bv.size() and bv.GetNumBitsSet()) but this
+  // causes an explosion in the state space for the benchmark so we're not
+  // considering this today.
+  //
+  // The current value of the constant was picked by running these benchmarks on
+  // a E5-2690 v4 and finding the crossover point using a spreadsheet.
+  constexpr uint32_t kIndexOfSetBitToSelectorRatio = 4;
+
+  // If the selector is larger than a threshold, it's more efficient to convert
+  // the entire BitVector to an index vector and use SelectIvWithIv instead.
+  if (bv.GetNumBitsSet() / kIndexOfSetBitToSelectorRatio < selector.size()) {
+    return RowMap(
+        row_map_algorithms::SelectBvWithIvByConvertToIv(bv, selector));
   }
-  return RowMap(std::move(iv));
+  return RowMap(
+      row_map_algorithms::SelectBvWithIvByIndexOfNthSet(bv, selector));
 }
 
 RowMap SelectIvWithRange(const std::vector<uint32_t>& iv,
@@ -132,12 +160,7 @@
 
 RowMap SelectIvWithIv(const std::vector<uint32_t>& iv,
                       const std::vector<uint32_t>& selector) {
-  std::vector<uint32_t> ret(selector.size());
-  for (uint32_t i = 0; i < selector.size(); ++i) {
-    PERFETTO_DCHECK(selector[i] < iv.size());
-    ret[i] = iv[selector[i]];
-  }
-  return RowMap(std::move(ret));
+  return RowMap(row_map_algorithms::SelectIvWithIv(iv, selector));
 }
 
 }  // namespace
diff --git a/src/trace_processor/containers/row_map.h b/src/trace_processor/containers/row_map.h
index 5ca3c9c..49bbdcb 100644
--- a/src/trace_processor/containers/row_map.h
+++ b/src/trace_processor/containers/row_map.h
@@ -238,6 +238,12 @@
   // Creates a RowMap backed by an std::vector<uint32_t>.
   explicit RowMap(std::vector<OutputIndex> vec);
 
+  RowMap(const RowMap&) noexcept = delete;
+  RowMap& operator=(const RowMap&) = delete;
+
+  RowMap(RowMap&&) noexcept = default;
+  RowMap& operator=(RowMap&&) = default;
+
   // Creates a RowMap containing just |index|.
   // By default this will be implemented using a range.
   static RowMap SingleRow(OutputIndex index) {
@@ -651,8 +657,16 @@
   void InsertIntoBitVector(uint32_t row) {
     PERFETTO_DCHECK(mode_ == Mode::kBitVector);
 
-    if (row >= bit_vector_.size())
+    // If we're adding a row to precisely the end of the BitVector, just append
+    // true instead of resizing and then setting.
+    if (row == bit_vector_.size()) {
+      bit_vector_.AppendTrue();
+      return;
+    }
+
+    if (row > bit_vector_.size()) {
       bit_vector_.Resize(row + 1, false);
+    }
     bit_vector_.Set(row);
   }
 
diff --git a/src/trace_processor/containers/row_map_algorithms.h b/src/trace_processor/containers/row_map_algorithms.h
new file mode 100644
index 0000000..7d585c3
--- /dev/null
+++ b/src/trace_processor/containers/row_map_algorithms.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_TRACE_PROCESSOR_CONTAINERS_ROW_MAP_ALGORITHMS_H_
+#define SRC_TRACE_PROCESSOR_CONTAINERS_ROW_MAP_ALGORITHMS_H_
+
+#include <vector>
+
+#include "perfetto/base/logging.h"
+#include "src/trace_processor/containers/bit_vector.h"
+#include "src/trace_processor/containers/bit_vector_iterators.h"
+
+// This file contains fundamental algorithms used by RowMap.
+//
+// This file is structured in a way to make benchmarking easy. The intention is
+// to use this to decide which heurustics to use and the value of magic
+// constants in RowMap algorithms.
+
+namespace perfetto {
+namespace trace_processor {
+namespace row_map_algorithms {
+
+// Returns a vector containing elements from |iv| selected by indices from
+// |selector|.
+inline std::vector<uint32_t> SelectIvWithIv(
+    const std::vector<uint32_t>& iv,
+    const std::vector<uint32_t>& selector) {
+  std::vector<uint32_t> ret(selector.size());
+  for (uint32_t i = 0; i < selector.size(); ++i) {
+    PERFETTO_DCHECK(selector[i] < iv.size());
+    ret[i] = iv[selector[i]];
+  }
+  return ret;
+}
+
+// Returns a vector containing elements from |bv| by first converting to an
+// index vector and then selecting indices from |selector|.
+inline std::vector<uint32_t> SelectBvWithIvByConvertToIv(
+    const BitVector& bv,
+    const std::vector<uint32_t>& selector) {
+  std::vector<uint32_t> bv_conv(bv.GetNumBitsSet());
+  for (auto it = bv.IterateSetBits(); it; it.Next()) {
+    bv_conv[it.ordinal()] = it.index();
+  }
+  return SelectIvWithIv(bv_conv, selector);
+}
+
+// Returns a vector containing elements from |bv| by selecting indices from
+// |selector| using IndexOfNthSet calls.
+inline std::vector<uint32_t> SelectBvWithIvByIndexOfNthSet(
+    const BitVector& bv,
+    const std::vector<uint32_t>& selector) {
+  std::vector<uint32_t> iv(selector.size());
+  for (uint32_t i = 0; i < selector.size(); ++i) {
+    iv[i] = bv.IndexOfNthSet(selector[i]);
+  }
+  return iv;
+}
+
+}  // namespace row_map_algorithms
+}  // namespace trace_processor
+}  // namespace perfetto
+
+#endif  // SRC_TRACE_PROCESSOR_CONTAINERS_ROW_MAP_ALGORITHMS_H_
diff --git a/src/trace_processor/containers/row_map_algorithms_benchmark.cc b/src/trace_processor/containers/row_map_algorithms_benchmark.cc
new file mode 100644
index 0000000..805172c
--- /dev/null
+++ b/src/trace_processor/containers/row_map_algorithms_benchmark.cc
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <random>
+
+#include <benchmark/benchmark.h>
+
+#include "src/trace_processor/containers/row_map_algorithms.h"
+
+using perfetto::trace_processor::BitVector;
+
+namespace {
+
+using namespace perfetto::trace_processor::row_map_algorithms;
+
+bool IsBenchmarkFunctionalOnly() {
+  return getenv("BENCHMARK_FUNCTIONAL_TEST_ONLY") != nullptr;
+}
+
+BitVector BvWithSetBits(uint32_t bv_set_bits) {
+  static constexpr uint32_t kRandomSeed = 29;
+  std::minstd_rand0 rnd_engine(kRandomSeed);
+
+  BitVector bv;
+  for (uint32_t i = 0; i < bv_set_bits;) {
+    if (rnd_engine() % 2 == 0) {
+      bv.AppendTrue();
+      ++i;
+    } else {
+      bv.AppendFalse();
+    }
+  }
+  return bv;
+}
+
+std::vector<uint32_t> IvWithSizeAndMaxIndex(
+    uint32_t bv_set_bits,
+    uint32_t set_bit_to_selector_ratio) {
+  static constexpr uint32_t kRandomSeed = 78;
+  std::minstd_rand0 rnd_engine(kRandomSeed);
+
+  uint32_t size = bv_set_bits / set_bit_to_selector_ratio;
+  std::vector<uint32_t> indices(size);
+  for (uint32_t i = 0; i < size; ++i) {
+    indices[i] = rnd_engine() % bv_set_bits;
+  }
+  return indices;
+}
+
+void BvWithIvArgs(benchmark::internal::Benchmark* b) {
+  std::vector<int> set_bit_to_selector_ratios;
+  if (IsBenchmarkFunctionalOnly()) {
+    set_bit_to_selector_ratios = std::vector<int>{2};
+  } else {
+    set_bit_to_selector_ratios = std::vector<int>{2, 4, 6, 8, 10, 12, 16, 32};
+  }
+
+  std::vector<int> bv_set_bits;
+  if (IsBenchmarkFunctionalOnly()) {
+    bv_set_bits = std::vector<int>{1024};
+  } else {
+    bv_set_bits = std::vector<int>{1024, 4096, 1024 * 1024};
+  }
+
+  for (int bv_set_bit : bv_set_bits) {
+    for (int set_bit_to_selector_ratio : set_bit_to_selector_ratios) {
+      b->Args({bv_set_bit, set_bit_to_selector_ratio});
+    }
+  }
+}
+
+// |BM_SelectBvWithIvByConvertToIv| and |BM_SelectBvWithIvByIndexOfNthSet| are
+// used together to find the ratio between the selector size and the number of
+// set bits in the BitVector where |SelectBvWithIvByIndexOfNthSet| is faster
+// than |SelectBvWithIvByConvertToIv|.
+//
+// See the comment in SelectBvWithIv in row_map.cc for more information on this.
+
+static void BM_SelectBvWithIvByConvertToIv(benchmark::State& state) {
+  uint32_t bv_set_bit = static_cast<uint32_t>(state.range(0));
+  uint32_t set_bit_to_selector_ratio = static_cast<uint32_t>(state.range(1));
+
+  BitVector bv = BvWithSetBits(bv_set_bit);
+  std::vector<uint32_t> iv =
+      IvWithSizeAndMaxIndex(bv_set_bit, set_bit_to_selector_ratio);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(SelectBvWithIvByConvertToIv(bv, iv));
+  }
+}
+BENCHMARK(BM_SelectBvWithIvByConvertToIv)->Apply(BvWithIvArgs);
+
+static void BM_SelectBvWithIvByIndexOfNthSet(benchmark::State& state) {
+  uint32_t bv_set_bit = static_cast<uint32_t>(state.range(0));
+  uint32_t set_bit_to_selector_ratio = static_cast<uint32_t>(state.range(1));
+
+  BitVector bv = BvWithSetBits(bv_set_bit);
+  std::vector<uint32_t> iv =
+      IvWithSizeAndMaxIndex(bv_set_bit, set_bit_to_selector_ratio);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(SelectBvWithIvByIndexOfNthSet(bv, iv));
+  }
+}
+BENCHMARK(BM_SelectBvWithIvByIndexOfNthSet)->Apply(BvWithIvArgs);
+
+}  // namespace