Merge "tp: optimize SelectBvWithIv function in RowMap"
diff --git a/BUILD b/BUILD
index 3c90cb2..fa04d14 100644
--- a/BUILD
+++ b/BUILD
@@ -952,6 +952,7 @@
         "src/trace_processor/containers/null_term_string_view.h",
         "src/trace_processor/containers/nullable_vector.h",
         "src/trace_processor/containers/row_map.h",
+        "src/trace_processor/containers/row_map_algorithms.h",
         "src/trace_processor/containers/string_pool.h",
     ],
     deps = [
diff --git a/src/trace_processor/containers/BUILD.gn b/src/trace_processor/containers/BUILD.gn
index 1f843e1..1e92c62 100644
--- a/src/trace_processor/containers/BUILD.gn
+++ b/src/trace_processor/containers/BUILD.gn
@@ -26,6 +26,7 @@
     "null_term_string_view.h",
     "nullable_vector.h",
     "row_map.h",
+    "row_map_algorithms.h",
     "string_pool.h",
   ]
   sources = [
@@ -69,6 +70,7 @@
     sources = [
       "bit_vector_benchmark.cc",
       "nullable_vector_benchmark.cc",
+      "row_map_algorithms_benchmark.cc",
       "row_map_benchmark.cc",
     ]
   }
diff --git a/src/trace_processor/containers/bit_vector.h b/src/trace_processor/containers/bit_vector.h
index 5f9c27a..06297f0 100644
--- a/src/trace_processor/containers/bit_vector.h
+++ b/src/trace_processor/containers/bit_vector.h
@@ -51,7 +51,7 @@
   explicit BitVector(std::initializer_list<bool> init);
 
   // Creates a bitvector of |count| size filled with |value|.
-  BitVector(uint32_t count, bool value = false);
+  explicit BitVector(uint32_t count, bool value = false);
 
   // Enable moving bitvectors as they have no unmovable state.
   BitVector(BitVector&&) noexcept = default;
diff --git a/src/trace_processor/containers/row_map.cc b/src/trace_processor/containers/row_map.cc
index ceacf95..16c476a 100644
--- a/src/trace_processor/containers/row_map.cc
+++ b/src/trace_processor/containers/row_map.cc
@@ -16,6 +16,8 @@
 
 #include "src/trace_processor/containers/row_map.h"
 
+#include "src/trace_processor/containers/row_map_algorithms.h"
+
 namespace perfetto {
 namespace trace_processor {
 
@@ -77,7 +79,13 @@
   PERFETTO_DCHECK(selector_start <= selector_end);
   PERFETTO_DCHECK(selector_end <= bv.GetNumBitsSet());
 
+  // If we're simply selecting every element in the bitvector, just
+  // return a copy of the BitVector without iterating.
   BitVector ret = bv.Copy();
+  if (selector_start == 0 && selector_end == bv.GetNumBitsSet()) {
+    return RowMap(std::move(ret));
+  }
+
   for (auto it = ret.IterateSetBits(); it; it.Next()) {
     auto set_idx = it.ordinal();
     if (set_idx < selector_start || set_idx >= selector_end)
@@ -94,12 +102,32 @@
 
 RowMap SelectBvWithIv(const BitVector& bv,
                       const std::vector<uint32_t>& selector) {
-  std::vector<uint32_t> iv(selector.size());
-  for (uint32_t i = 0; i < selector.size(); ++i) {
-    // TODO(lalitm): this is pretty inefficient.
-    iv[i] = bv.IndexOfNthSet(selector[i]);
+  // The value of this constant was found by considering the benchmarks
+  // |BM_SelectBvWithIvByConvertToIv| and |BM_SelectBvWithIvByIndexOfNthSet|.
+  //
+  // We use this to find the ratio between |bv.GetNumBitsSet()| and
+  // |selector.size()| where |SelectBvWithIvByIndexOfNthSet| was found to be
+  // faster than |SelectBvWithIvByConvertToIv|.
+  //
+  // Note: as of writing this, the benchmarks do not take into account the fill
+  // ratio of the BitVector; they assume 50% rate which almost never happens in
+  // practice. In the future, we could also take this into account (by
+  // considering the ratio between bv.size() and bv.GetNumBitsSet()) but this
+  // causes an explosion in the state space for the benchmark so we're not
+  // considering this today.
+  //
+  // The current value of the constant was picked by running these benchmarks on
+  // a E5-2690 v4 and finding the crossover point using a spreadsheet.
+  constexpr uint32_t kIndexOfSetBitToSelectorRatio = 4;
+
+  // If the selector is larger than a threshold, it's more efficient to convert
+  // the entire BitVector to an index vector and use SelectIvWithIv instead.
+  if (bv.GetNumBitsSet() / kIndexOfSetBitToSelectorRatio < selector.size()) {
+    return RowMap(
+        row_map_algorithms::SelectBvWithIvByConvertToIv(bv, selector));
   }
-  return RowMap(std::move(iv));
+  return RowMap(
+      row_map_algorithms::SelectBvWithIvByIndexOfNthSet(bv, selector));
 }
 
 RowMap SelectIvWithRange(const std::vector<uint32_t>& iv,
@@ -132,12 +160,7 @@
 
 RowMap SelectIvWithIv(const std::vector<uint32_t>& iv,
                       const std::vector<uint32_t>& selector) {
-  std::vector<uint32_t> ret(selector.size());
-  for (uint32_t i = 0; i < selector.size(); ++i) {
-    PERFETTO_DCHECK(selector[i] < iv.size());
-    ret[i] = iv[selector[i]];
-  }
-  return RowMap(std::move(ret));
+  return RowMap(row_map_algorithms::SelectIvWithIv(iv, selector));
 }
 
 }  // namespace
diff --git a/src/trace_processor/containers/row_map.h b/src/trace_processor/containers/row_map.h
index 5ca3c9c..49bbdcb 100644
--- a/src/trace_processor/containers/row_map.h
+++ b/src/trace_processor/containers/row_map.h
@@ -238,6 +238,12 @@
   // Creates a RowMap backed by an std::vector<uint32_t>.
   explicit RowMap(std::vector<OutputIndex> vec);
 
+  RowMap(const RowMap&) noexcept = delete;
+  RowMap& operator=(const RowMap&) = delete;
+
+  RowMap(RowMap&&) noexcept = default;
+  RowMap& operator=(RowMap&&) = default;
+
   // Creates a RowMap containing just |index|.
   // By default this will be implemented using a range.
   static RowMap SingleRow(OutputIndex index) {
@@ -651,8 +657,16 @@
   void InsertIntoBitVector(uint32_t row) {
     PERFETTO_DCHECK(mode_ == Mode::kBitVector);
 
-    if (row >= bit_vector_.size())
+    // If we're adding a row to precisely the end of the BitVector, just append
+    // true instead of resizing and then setting.
+    if (row == bit_vector_.size()) {
+      bit_vector_.AppendTrue();
+      return;
+    }
+
+    if (row > bit_vector_.size()) {
       bit_vector_.Resize(row + 1, false);
+    }
     bit_vector_.Set(row);
   }
 
diff --git a/src/trace_processor/containers/row_map_algorithms.h b/src/trace_processor/containers/row_map_algorithms.h
new file mode 100644
index 0000000..7d585c3
--- /dev/null
+++ b/src/trace_processor/containers/row_map_algorithms.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_TRACE_PROCESSOR_CONTAINERS_ROW_MAP_ALGORITHMS_H_
+#define SRC_TRACE_PROCESSOR_CONTAINERS_ROW_MAP_ALGORITHMS_H_
+
+#include <vector>
+
+#include "perfetto/base/logging.h"
+#include "src/trace_processor/containers/bit_vector.h"
+#include "src/trace_processor/containers/bit_vector_iterators.h"
+
+// This file contains fundamental algorithms used by RowMap.
+//
+// This file is structured in a way to make benchmarking easy. The intention is
+// to use this to decide which heurustics to use and the value of magic
+// constants in RowMap algorithms.
+
+namespace perfetto {
+namespace trace_processor {
+namespace row_map_algorithms {
+
+// Returns a vector containing elements from |iv| selected by indices from
+// |selector|.
+inline std::vector<uint32_t> SelectIvWithIv(
+    const std::vector<uint32_t>& iv,
+    const std::vector<uint32_t>& selector) {
+  std::vector<uint32_t> ret(selector.size());
+  for (uint32_t i = 0; i < selector.size(); ++i) {
+    PERFETTO_DCHECK(selector[i] < iv.size());
+    ret[i] = iv[selector[i]];
+  }
+  return ret;
+}
+
+// Returns a vector containing elements from |bv| by first converting to an
+// index vector and then selecting indices from |selector|.
+inline std::vector<uint32_t> SelectBvWithIvByConvertToIv(
+    const BitVector& bv,
+    const std::vector<uint32_t>& selector) {
+  std::vector<uint32_t> bv_conv(bv.GetNumBitsSet());
+  for (auto it = bv.IterateSetBits(); it; it.Next()) {
+    bv_conv[it.ordinal()] = it.index();
+  }
+  return SelectIvWithIv(bv_conv, selector);
+}
+
+// Returns a vector containing elements from |bv| by selecting indices from
+// |selector| using IndexOfNthSet calls.
+inline std::vector<uint32_t> SelectBvWithIvByIndexOfNthSet(
+    const BitVector& bv,
+    const std::vector<uint32_t>& selector) {
+  std::vector<uint32_t> iv(selector.size());
+  for (uint32_t i = 0; i < selector.size(); ++i) {
+    iv[i] = bv.IndexOfNthSet(selector[i]);
+  }
+  return iv;
+}
+
+}  // namespace row_map_algorithms
+}  // namespace trace_processor
+}  // namespace perfetto
+
+#endif  // SRC_TRACE_PROCESSOR_CONTAINERS_ROW_MAP_ALGORITHMS_H_
diff --git a/src/trace_processor/containers/row_map_algorithms_benchmark.cc b/src/trace_processor/containers/row_map_algorithms_benchmark.cc
new file mode 100644
index 0000000..805172c
--- /dev/null
+++ b/src/trace_processor/containers/row_map_algorithms_benchmark.cc
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <random>
+
+#include <benchmark/benchmark.h>
+
+#include "src/trace_processor/containers/row_map_algorithms.h"
+
+using perfetto::trace_processor::BitVector;
+
+namespace {
+
+using namespace perfetto::trace_processor::row_map_algorithms;
+
+bool IsBenchmarkFunctionalOnly() {
+  return getenv("BENCHMARK_FUNCTIONAL_TEST_ONLY") != nullptr;
+}
+
+BitVector BvWithSetBits(uint32_t bv_set_bits) {
+  static constexpr uint32_t kRandomSeed = 29;
+  std::minstd_rand0 rnd_engine(kRandomSeed);
+
+  BitVector bv;
+  for (uint32_t i = 0; i < bv_set_bits;) {
+    if (rnd_engine() % 2 == 0) {
+      bv.AppendTrue();
+      ++i;
+    } else {
+      bv.AppendFalse();
+    }
+  }
+  return bv;
+}
+
+std::vector<uint32_t> IvWithSizeAndMaxIndex(
+    uint32_t bv_set_bits,
+    uint32_t set_bit_to_selector_ratio) {
+  static constexpr uint32_t kRandomSeed = 78;
+  std::minstd_rand0 rnd_engine(kRandomSeed);
+
+  uint32_t size = bv_set_bits / set_bit_to_selector_ratio;
+  std::vector<uint32_t> indices(size);
+  for (uint32_t i = 0; i < size; ++i) {
+    indices[i] = rnd_engine() % bv_set_bits;
+  }
+  return indices;
+}
+
+void BvWithIvArgs(benchmark::internal::Benchmark* b) {
+  std::vector<int> set_bit_to_selector_ratios;
+  if (IsBenchmarkFunctionalOnly()) {
+    set_bit_to_selector_ratios = std::vector<int>{2};
+  } else {
+    set_bit_to_selector_ratios = std::vector<int>{2, 4, 6, 8, 10, 12, 16, 32};
+  }
+
+  std::vector<int> bv_set_bits;
+  if (IsBenchmarkFunctionalOnly()) {
+    bv_set_bits = std::vector<int>{1024};
+  } else {
+    bv_set_bits = std::vector<int>{1024, 4096, 1024 * 1024};
+  }
+
+  for (int bv_set_bit : bv_set_bits) {
+    for (int set_bit_to_selector_ratio : set_bit_to_selector_ratios) {
+      b->Args({bv_set_bit, set_bit_to_selector_ratio});
+    }
+  }
+}
+
+// |BM_SelectBvWithIvByConvertToIv| and |BM_SelectBvWithIvByIndexOfNthSet| are
+// used together to find the ratio between the selector size and the number of
+// set bits in the BitVector where |SelectBvWithIvByIndexOfNthSet| is faster
+// than |SelectBvWithIvByConvertToIv|.
+//
+// See the comment in SelectBvWithIv in row_map.cc for more information on this.
+
+static void BM_SelectBvWithIvByConvertToIv(benchmark::State& state) {
+  uint32_t bv_set_bit = static_cast<uint32_t>(state.range(0));
+  uint32_t set_bit_to_selector_ratio = static_cast<uint32_t>(state.range(1));
+
+  BitVector bv = BvWithSetBits(bv_set_bit);
+  std::vector<uint32_t> iv =
+      IvWithSizeAndMaxIndex(bv_set_bit, set_bit_to_selector_ratio);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(SelectBvWithIvByConvertToIv(bv, iv));
+  }
+}
+BENCHMARK(BM_SelectBvWithIvByConvertToIv)->Apply(BvWithIvArgs);
+
+static void BM_SelectBvWithIvByIndexOfNthSet(benchmark::State& state) {
+  uint32_t bv_set_bit = static_cast<uint32_t>(state.range(0));
+  uint32_t set_bit_to_selector_ratio = static_cast<uint32_t>(state.range(1));
+
+  BitVector bv = BvWithSetBits(bv_set_bit);
+  std::vector<uint32_t> iv =
+      IvWithSizeAndMaxIndex(bv_set_bit, set_bit_to_selector_ratio);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(SelectBvWithIvByIndexOfNthSet(bv, iv));
+  }
+}
+BENCHMARK(BM_SelectBvWithIvByIndexOfNthSet)->Apply(BvWithIvArgs);
+
+}  // namespace