Merge "processor: Fix order of async events in json export"
diff --git a/docs/trace-processor.md b/docs/trace-processor.md
index cea8846..532056b 100644
--- a/docs/trace-processor.md
+++ b/docs/trace-processor.md
@@ -8,8 +8,10 @@
 through SQL queries. The trace processor is used:
 * By the [Perfetto UI](https://ui.perfetto.dev/), in the form of a
   Web Assembly module.
-* Standalone, using the `trace_processor_shell` target
-  (`ninja -C out/xxx trace_processor_shell`).
+* Standalone (see the example below):
+  * using the [prebuilt](http://get.perfetto.dev/trace_processor) binaries.
+  * using the `trace_processor_shell` target from source
+    (`ninja -C out/xxx trace_processor_shell`).
 * In internal pipelines for batch processing.
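+
+For example, the standalone shell can be run directly on a trace file, which
+opens an interactive SQL prompt (a minimal sketch assuming a Linux/Mac shell;
+`my_trace.pftrace` is a placeholder):
+
+```bash
+# Download the prebuilt binary and make it executable.
+curl -LO http://get.perfetto.dev/trace_processor
+chmod +x trace_processor
+
+# Load the trace and start the interactive shell.
+./trace_processor my_trace.pftrace
+```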
 
 Supported input formats:
diff --git a/gn/perfetto.gni b/gn/perfetto.gni
index 3d756d0..828a493 100644
--- a/gn/perfetto.gni
+++ b/gn/perfetto.gni
@@ -66,6 +66,10 @@
   build_with_chromium = false
 }
 
+# |is_nacl| is normally defined by the embedder (e.g. the Chromium build);
+# default it to false everywhere else.
+if (!defined(is_nacl)) {
+  is_nacl = false
+}
+
 declare_args() {
   # The Android blueprint file generator set this to true (as well as
   # is_perfetto_build_generator). This is just about being built in the
@@ -135,7 +139,7 @@
   # system backend in the client library.
   # This includes building things that rely on POSIX sockets, this places
   # limitations on the supported operating systems.
-  enable_perfetto_ipc = !is_win && !is_fuchsia &&
+  enable_perfetto_ipc = !is_win && !is_fuchsia && !is_nacl &&
                         (perfetto_build_standalone ||
                          perfetto_build_with_android || build_with_chromium)
 
diff --git a/gn/standalone/BUILDCONFIG.gn b/gn/standalone/BUILDCONFIG.gn
index d980a05..3668c0f 100644
--- a/gn/standalone/BUILDCONFIG.gn
+++ b/gn/standalone/BUILDCONFIG.gn
@@ -36,10 +36,11 @@
 is_linux_host = host_os == "linux"
 is_mac = current_os == "mac"
 
-# Building with Windows/Fuchsia is currently only supported in the Chromium
+# Building with Windows/Fuchsia/NaCl is currently only supported in the Chromium
 # tree so always set this to false.
 is_win = false
 is_fuchsia = false
+is_nacl = false
 
 if (target_cpu == "") {
   target_cpu = host_cpu
diff --git a/src/profiling/memory/unwinding.cc b/src/profiling/memory/unwinding.cc
index 6b030c0..ad18031 100644
--- a/src/profiling/memory/unwinding.cc
+++ b/src/profiling/memory/unwinding.cc
@@ -58,6 +58,8 @@
 namespace profiling {
 namespace {
 
+// Do not reparse the target's /proc/<pid>/maps more often than this.
+constexpr base::TimeMillis kMapsReparseInterval{500};
+
 constexpr size_t kMaxFrames = 1000;
 
 // We assume average ~300us per unwind. If we handle up to 1000 unwinds, this
@@ -204,8 +206,14 @@
   uint8_t error_code = 0;
   for (int attempt = 0; attempt < 2; ++attempt) {
     if (attempt > 0) {
+      if (metadata->last_maps_reparse_time + kMapsReparseInterval >
+          base::GetWallTimeMs()) {
+        PERFETTO_DLOG("Skipping reparse due to rate limit.");
+        break;
+      }
       PERFETTO_DLOG("Reparsing maps");
       metadata->ReparseMaps();
+      metadata->last_maps_reparse_time = base::GetWallTimeMs();
       // Regs got invalidated by libuwindstack's speculative jump.
       // Reset.
       ReadFromRawData(regs.get(), alloc_metadata->register_data);
diff --git a/src/profiling/memory/unwinding.h b/src/profiling/memory/unwinding.h
index 39216fe..a702c64 100644
--- a/src/profiling/memory/unwinding.h
+++ b/src/profiling/memory/unwinding.h
@@ -27,6 +27,7 @@
 #include <unwindstack/JitDebug.h>
 #endif
 
+#include "perfetto/base/time.h"
 #include "perfetto/ext/base/scoped_file.h"
 #include "perfetto/ext/base/thread_task_runner.h"
 #include "perfetto/ext/tracing/core/basic_types.h"
@@ -129,6 +130,7 @@
   // The API of libunwindstack expects shared_ptr for Memory.
   std::shared_ptr<unwindstack::Memory> fd_mem;
   uint64_t reparses = 0;
+  base::TimeMillis last_maps_reparse_time{0};
 #if PERFETTO_BUILDFLAG(PERFETTO_ANDROID_BUILD)
   std::unique_ptr<unwindstack::JitDebug> jit_debug;
   std::unique_ptr<unwindstack::DexFiles> dex_files;
diff --git a/src/trace_processor/containers/bit_vector_benchmark.cc b/src/trace_processor/containers/bit_vector_benchmark.cc
index b4a0c27..206bc5d 100644
--- a/src/trace_processor/containers/bit_vector_benchmark.cc
+++ b/src/trace_processor/containers/bit_vector_benchmark.cc
@@ -17,6 +17,7 @@
 #include <benchmark/benchmark.h>
 
 #include "src/trace_processor/containers/bit_vector.h"
+#include "src/trace_processor/containers/bit_vector_iterators.h"
 
 namespace {
 
@@ -27,15 +28,40 @@
 }
 
 void BitVectorArgs(benchmark::internal::Benchmark* b) {
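+  // Every benchmark is parameterized by {bit vector size, percentage of bits
+  // which are set}.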
-  b->Arg(64);
+  std::vector<int> set_percentages;
+  if (IsBenchmarkFunctionalOnly()) {
+    set_percentages = std::vector<int>{50};
+  } else {
+    set_percentages = std::vector<int>{0, 1, 5, 50, 95, 99, 100};
+  }
 
-  if (!IsBenchmarkFunctionalOnly()) {
-    b->Arg(512);
-    b->Arg(8192);
-    b->Arg(123456);
-    b->Arg(1234567);
+  for (int percentage : set_percentages) {
+    b->Args({64, percentage});
+
+    if (!IsBenchmarkFunctionalOnly()) {
+      b->Args({512, percentage});
+      b->Args({8192, percentage});
+      b->Args({123456, percentage});
+      b->Args({1234567, percentage});
+    }
   }
 }
+
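+// Creates a BitVector of |size| bits where approximately |set_percentage|% of
+// the bits are set, using a fixed seed so runs are reproducible.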
+BitVector BvWithSizeAndSetPercentage(uint32_t size, uint32_t set_percentage) {
+  static constexpr uint32_t kRandomSeed = 29;
+  std::minstd_rand0 rnd_engine(kRandomSeed);
+
+  BitVector bv;
+  for (uint32_t i = 0; i < size; ++i) {
+    if (rnd_engine() % 100 < set_percentage) {
+      bv.AppendTrue();
+    } else {
+      bv.AppendFalse();
+    }
+  }
+  return bv;
+}
+
 }  // namespace
 
 static void BM_BitVectorAppendTrue(benchmark::State& state) {
@@ -61,15 +87,9 @@
   std::minstd_rand0 rnd_engine(kRandomSeed);
 
   uint32_t size = static_cast<uint32_t>(state.range(0));
+  uint32_t set_percentage = static_cast<uint32_t>(state.range(1));
 
-  BitVector bv;
-  for (uint32_t i = 0; i < size; ++i) {
-    if (rnd_engine() % 2) {
-      bv.AppendTrue();
-    } else {
-      bv.AppendFalse();
-    }
-  }
+  BitVector bv = BvWithSizeAndSetPercentage(size, set_percentage);
 
   static constexpr uint32_t kPoolSize = 1024 * 1024;
   std::vector<bool> bit_pool(kPoolSize);
@@ -93,15 +113,9 @@
   std::minstd_rand0 rnd_engine(kRandomSeed);
 
   uint32_t size = static_cast<uint32_t>(state.range(0));
+  uint32_t set_percentage = static_cast<uint32_t>(state.range(1));
 
-  BitVector bv;
-  for (uint32_t i = 0; i < size; ++i) {
-    if (rnd_engine() % 2) {
-      bv.AppendTrue();
-    } else {
-      bv.AppendFalse();
-    }
-  }
+  BitVector bv = BvWithSizeAndSetPercentage(size, set_percentage);
 
   static constexpr uint32_t kPoolSize = 1024 * 1024;
   std::vector<uint32_t> row_pool(kPoolSize);
@@ -123,16 +137,9 @@
   std::minstd_rand0 rnd_engine(kRandomSeed);
 
   uint32_t size = static_cast<uint32_t>(state.range(0));
+  uint32_t set_percentage = static_cast<uint32_t>(state.range(1));
 
-  BitVector bv;
-  for (uint32_t i = 0; i < size; ++i) {
-    if (rnd_engine() % 2) {
-      bv.AppendTrue();
-    } else {
-      bv.AppendFalse();
-    }
-  }
-
+  BitVector bv = BvWithSizeAndSetPercentage(size, set_percentage);
   static constexpr uint32_t kPoolSize = 1024 * 1024;
   std::vector<uint32_t> row_pool(kPoolSize);
   uint32_t set_bit_count = bv.GetNumBitsSet();
@@ -153,11 +160,12 @@
   std::minstd_rand0 rnd_engine(kRandomSeed);
 
   uint32_t size = static_cast<uint32_t>(state.range(0));
+  uint32_t set_percentage = static_cast<uint32_t>(state.range(1));
 
   uint32_t count = 0;
   BitVector bv;
   for (uint32_t i = 0; i < size; ++i) {
-    bool value = rnd_engine() % 2;
+    bool value = rnd_engine() % 100 < set_percentage;
     if (value) {
       bv.AppendTrue();
     } else {
@@ -205,11 +213,12 @@
   std::minstd_rand0 rnd_engine(kRandomSeed);
 
   uint32_t size = static_cast<uint32_t>(state.range(0));
+  uint32_t set_percentage = static_cast<uint32_t>(state.range(1));
 
   BitVector bv;
   BitVector picker;
   for (uint32_t i = 0; i < size; ++i) {
-    bool value = rnd_engine() % 2;
+    bool value = rnd_engine() % 100 < set_percentage;
     if (value) {
       bv.AppendTrue();
 
@@ -234,3 +243,16 @@
   }
 }
 BENCHMARK(BM_BitVectorUpdateSetBits)->Apply(BitVectorArgs);
+
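+// Benchmarks walking all set bits of the vector via the set-bits iterator.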
+static void BM_BitVectorSetBitsIterator(benchmark::State& state) {
+  uint32_t size = static_cast<uint32_t>(state.range(0));
+  uint32_t set_percentage = static_cast<uint32_t>(state.range(1));
+
+  BitVector bv = BvWithSizeAndSetPercentage(size, set_percentage);
+  for (auto _ : state) {
+    for (auto it = bv.IterateSetBits(); it; it.Next()) {
+      benchmark::DoNotOptimize(it.index());
+    }
+  }
+}
+BENCHMARK(BM_BitVectorSetBitsIterator)->Apply(BitVectorArgs);
diff --git a/src/trace_processor/heap_profile_tracker.cc b/src/trace_processor/heap_profile_tracker.cc
index 4bf0756..a2aff3a 100644
--- a/src/trace_processor/heap_profile_tracker.cc
+++ b/src/trace_processor/heap_profile_tracker.cc
@@ -24,6 +24,201 @@
 namespace perfetto {
 namespace trace_processor {
 
+namespace {
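+// Key used to merge callsites which share the same frame name, mapping name
+// and (merged) parent.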
+struct MergedCallsite {
+  StringId frame_name;
+  StringId mapping_name;
+  base::Optional<uint32_t> parent_idx;
+  bool operator<(const MergedCallsite& o) const {
+    return std::tie(frame_name, mapping_name, parent_idx) <
+           std::tie(o.frame_name, o.mapping_name, o.parent_idx);
+  }
+};
+
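+// Returns the merged callsites for |callstack_row|: a single entry keyed by
+// (frame name, mapping name), or one entry per symbol in the frame's symbol
+// set when symbolization data is available.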
+std::vector<MergedCallsite> GetMergedCallsites(TraceStorage* storage,
+                                               uint32_t callstack_row) {
+  const tables::StackProfileCallsiteTable& callsites_tbl =
+      storage->stack_profile_callsite_table();
+  const tables::StackProfileFrameTable& frames_tbl =
+      storage->stack_profile_frame_table();
+  const tables::SymbolTable& symbols_tbl = storage->symbol_table();
+  const tables::StackProfileMappingTable& mapping_tbl =
+      storage->stack_profile_mapping_table();
+
+  // TODO(fmayer): Clean up types and remove the static_cast.
+  uint32_t frame_idx = *frames_tbl.id().IndexOf(
+      FrameId(static_cast<uint32_t>(callsites_tbl.frame_id()[callstack_row])));
+
+  // TODO(fmayer): Clean up types and remove the static_cast.
+  uint32_t mapping_idx = *mapping_tbl.id().IndexOf(
+      MappingId(static_cast<uint32_t>(frames_tbl.mapping()[frame_idx])));
+  StringId mapping_name = mapping_tbl.name()[mapping_idx];
+
+  base::Optional<uint32_t> symbol_set_id =
+      frames_tbl.symbol_set_id()[frame_idx];
+
+  if (!symbol_set_id) {
+    StringId frame_name = frames_tbl.name()[frame_idx];
+    return {{frame_name, mapping_name, base::nullopt}};
+  }
+
+  std::vector<MergedCallsite> result;
+  // id == symbol_set_id for the bottommost frame.
+  // TODO(lalitm): Encode this optimization in the table and remove this
+  // custom optimization.
+  uint32_t symbol_set_idx = *symbols_tbl.id().IndexOf(SymbolId(*symbol_set_id));
+  for (uint32_t i = symbol_set_idx;
+       i < symbols_tbl.row_count() &&
+       symbols_tbl.symbol_set_id()[i] == *symbol_set_id;
+       ++i) {
+    result.emplace_back(
+        MergedCallsite{symbols_tbl.name()[i], mapping_name, base::nullopt});
+  }
+  return result;
+}
+}  // namespace
+
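+// Builds a flamegraph by merging callsites which share frame and mapping
+// names (using symbol data where available), attributing allocation sizes and
+// counts to the merged nodes and propagating cumulative values to parents.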
+std::unique_ptr<tables::ExperimentalFlamegraphNodesTable> BuildNativeFlamegraph(
+    TraceStorage* storage,
+    UniquePid upid,
+    int64_t timestamp) {
+  const tables::HeapProfileAllocationTable& allocation_tbl =
+      storage->heap_profile_allocation_table();
+  const tables::StackProfileCallsiteTable& callsites_tbl =
+      storage->stack_profile_callsite_table();
+
+  StringId profile_type = storage->InternString("native");
+
+  std::vector<uint32_t> callsite_to_merged_callsite(callsites_tbl.row_count(),
+                                                    0);
+  std::map<MergedCallsite, uint32_t> merged_callsites_to_table_idx;
+
+  std::unique_ptr<tables::ExperimentalFlamegraphNodesTable> tbl(
+      new tables::ExperimentalFlamegraphNodesTable(
+          storage->mutable_string_pool(), nullptr));
+
+  // FORWARD PASS:
+  // Aggregate callstacks by frame name / mapping name. Use symbolization data.
+  for (uint32_t i = 0; i < callsites_tbl.row_count(); ++i) {
+    base::Optional<uint32_t> parent_idx;
+    // TODO(fmayer): Clean up types and remove the conditional and static_cast.
+    if (callsites_tbl.parent_id()[i] != -1) {
+      auto parent_id = static_cast<uint32_t>(callsites_tbl.parent_id()[i]);
+      parent_idx = callsites_tbl.id().IndexOf(CallsiteId(parent_id));
+      parent_idx = callsite_to_merged_callsite[*parent_idx];
+      PERFETTO_CHECK(*parent_idx < i);
+    }
+
+    auto callsites = GetMergedCallsites(storage, i);
+    for (MergedCallsite& merged_callsite : callsites) {
+      merged_callsite.parent_idx = parent_idx;
+      auto it = merged_callsites_to_table_idx.find(merged_callsite);
+      if (it == merged_callsites_to_table_idx.end()) {
+        std::tie(it, std::ignore) = merged_callsites_to_table_idx.emplace(
+            merged_callsite, merged_callsites_to_table_idx.size());
+        tables::ExperimentalFlamegraphNodesTable::Row row{};
+        if (parent_idx) {
+          row.depth = tbl->depth()[*parent_idx] + 1;
+        } else {
+          row.depth = 0;
+        }
+        row.ts = timestamp;
+        row.upid = upid;
+        row.profile_type = profile_type;
+        row.name = merged_callsite.frame_name;
+        row.map_name = merged_callsite.mapping_name;
+        if (parent_idx)
+          row.parent_id = tbl->id()[*parent_idx].value;
+
+        parent_idx = *tbl->id().IndexOf(tbl->Insert(std::move(row)));
+        PERFETTO_CHECK(merged_callsites_to_table_idx.size() ==
+                       tbl->row_count());
+      }
+      parent_idx = it->second;
+    }
+    PERFETTO_CHECK(parent_idx);
+    callsite_to_merged_callsite[i] = *parent_idx;
+  }
+
+  // PASS OVER ALLOCATIONS:
+  // Aggregate allocations into the newly built tree.
+  auto filtered = allocation_tbl.Filter(
+      {allocation_tbl.ts().eq(timestamp), allocation_tbl.upid().eq(upid)});
+
+  if (filtered.row_count() == 0) {
+    return nullptr;
+  }
+
+  for (auto it = filtered.IterateRows(); it; it.Next()) {
+    int64_t size =
+        it.Get(static_cast<uint32_t>(
+                   tables::HeapProfileAllocationTable::ColumnIndex::size))
+            .long_value;
+    int64_t count =
+        it.Get(static_cast<uint32_t>(
+                   tables::HeapProfileAllocationTable::ColumnIndex::count))
+            .long_value;
+    int64_t callsite_id =
+        it
+            .Get(static_cast<uint32_t>(
+                tables::HeapProfileAllocationTable::ColumnIndex::callsite_id))
+            .long_value;
+
+    PERFETTO_CHECK((size < 0 && count < 0) || (size >= 0 && count >= 0));
+    uint32_t merged_idx =
+        callsite_to_merged_callsite[*callsites_tbl.id().IndexOf(
+            CallsiteId(static_cast<uint32_t>(callsite_id)))];
+    if (size > 0) {
+      // TODO(fmayer): Clean up types and remove the static_cast.
+      tbl->mutable_alloc_size()->Set(merged_idx,
+                                     tbl->alloc_size()[merged_idx] + size);
+      tbl->mutable_alloc_count()->Set(merged_idx,
+                                      tbl->alloc_count()[merged_idx] + count);
+    }
+
+    // TODO(fmayer): Clean up types and remove the static_cast.
+    tbl->mutable_size()->Set(merged_idx, tbl->size()[merged_idx] + size);
+    tbl->mutable_count()->Set(merged_idx, tbl->count()[merged_idx] + count);
+  }
+
+  // BACKWARD PASS:
+  // Propagate sizes to parents.
+  for (int64_t i = tbl->row_count() - 1; i >= 0; --i) {
+    uint32_t idx = static_cast<uint32_t>(i);
+
+    tbl->mutable_cumulative_size()->Set(
+        idx, tbl->cumulative_size()[idx] + tbl->size()[idx]);
+    tbl->mutable_cumulative_count()->Set(
+        idx, tbl->cumulative_count()[idx] + tbl->count()[idx]);
+
+    tbl->mutable_cumulative_alloc_size()->Set(
+        idx, tbl->cumulative_alloc_size()[idx] + tbl->alloc_size()[idx]);
+    tbl->mutable_cumulative_alloc_count()->Set(
+        idx, tbl->cumulative_alloc_count()[idx] + tbl->alloc_count()[idx]);
+
+    auto parent = tbl->parent_id()[idx];
+    if (parent) {
+      uint32_t parent_idx = *tbl->id().IndexOf(
+          tables::ExperimentalFlamegraphNodesTable::Id(*parent));
+      tbl->mutable_cumulative_size()->Set(
+          parent_idx,
+          tbl->cumulative_size()[parent_idx] + tbl->cumulative_size()[idx]);
+      tbl->mutable_cumulative_count()->Set(
+          parent_idx,
+          tbl->cumulative_count()[parent_idx] + tbl->cumulative_count()[idx]);
+
+      tbl->mutable_cumulative_alloc_size()->Set(
+          parent_idx, tbl->cumulative_alloc_size()[parent_idx] +
+                          tbl->cumulative_alloc_size()[idx]);
+      tbl->mutable_cumulative_alloc_count()->Set(
+          parent_idx, tbl->cumulative_alloc_count()[parent_idx] +
+                          tbl->cumulative_alloc_count()[idx]);
+    }
+  }
+
+  return tbl;
+}
+
 HeapProfileTracker::HeapProfileTracker(TraceProcessorContext* context)
     : context_(context), empty_(context_->storage->InternString({"", 0})) {}
 
diff --git a/src/trace_processor/heap_profile_tracker.h b/src/trace_processor/heap_profile_tracker.h
index 93a2440..53e7637 100644
--- a/src/trace_processor/heap_profile_tracker.h
+++ b/src/trace_processor/heap_profile_tracker.h
@@ -30,6 +30,11 @@
 namespace perfetto {
 namespace trace_processor {
 
+// Builds a flamegraph table for the native (heapprofd) heap profile of |upid|
+// at |timestamp|. Returns nullptr if no matching allocations exist.
+std::unique_ptr<tables::ExperimentalFlamegraphNodesTable> BuildNativeFlamegraph(
+    TraceStorage* storage,
+    UniquePid upid,
+    int64_t timestamp);
+
 class TraceProcessorContext;
 
 class HeapProfileTracker {
diff --git a/src/trace_processor/importers/ftrace/rss_stat_tracker.cc b/src/trace_processor/importers/ftrace/rss_stat_tracker.cc
index db444a7..f536a02 100644
--- a/src/trace_processor/importers/ftrace/rss_stat_tracker.cc
+++ b/src/trace_processor/importers/ftrace/rss_stat_tracker.cc
@@ -69,24 +69,34 @@
 base::Optional<UniqueTid> RssStatTracker::FindUtidForMmId(int64_t mm_id,
                                                           bool is_curr,
                                                           uint32_t pid) {
-  auto it = mm_id_to_utid_.find(mm_id);
-  if (!is_curr) {
-    return it == mm_id_to_utid_.end() ? base::nullopt
-                                      : base::make_optional(it->second);
+  // If curr is true, we can just overwrite the state in the map and return
+  // the utid corresponding to |pid|.
+  if (is_curr) {
+    UniqueTid utid = context_->process_tracker->GetOrCreateThread(pid);
+    mm_id_to_utid_[mm_id] = utid;
+    return utid;
   }
 
+  // If curr is false, try to look up the utid we previously saw for this
+  // mm id.
+  auto it = mm_id_to_utid_.find(mm_id);
+  if (it == mm_id_to_utid_.end())
+    return base::nullopt;
+
+  // If the utid in the map is the same as our current utid but curr is false,
+  // that means we are in the middle of a process changing mm structs (i.e. in
+  // the middle of a vfork + exec). Therefore, we should discard the association
+  // of this mm struct with this thread.
   UniqueTid utid = context_->process_tracker->GetOrCreateThread(pid);
-  if (it != mm_id_to_utid_.end() && it->second != utid) {
-    // Since both of these structs have the same mm hash and both say that
-    // the mm hash is for the current project, we can assume they belong to
-    // the same process so we can associate them together.
-    // TODO(lalitm): investigate if it's possible for mm_id to be reused
-    // between different processes if we have pid reuse and get unlucky. If
-    // so, we'll need to do some more careful tracking here.
-    context_->process_tracker->AssociateThreads(it->second, utid);
+  if (it->second == utid) {
+    mm_id_to_utid_.erase(it);
+    return base::nullopt;
   }
-  mm_id_to_utid_[mm_id] = utid;
-  return utid;
+
+  // This case happens when a process is changing the VM of another process and
+  // we know the utid corresponding to the target process. Just return that
+  // utid.
+  return it->second;
 }
 
 }  // namespace trace_processor
diff --git a/src/trace_processor/importers/proto/heap_graph_tracker.cc b/src/trace_processor/importers/proto/heap_graph_tracker.cc
index 124fab9..f96cf6e 100644
--- a/src/trace_processor/importers/proto/heap_graph_tracker.cc
+++ b/src/trace_processor/importers/proto/heap_graph_tracker.cc
@@ -234,17 +234,17 @@
                                             // artificial root in the database.
 
     tables::ExperimentalFlamegraphNodesTable::Row alloc_row{
-        current_ts,
-        current_upid,
-        profile_type,
-        depth,
-        StringId::Raw(static_cast<uint32_t>(node.class_name)),
-        java_mapping,
+        current_ts, current_upid, profile_type, depth,
+        StringId::Raw(static_cast<uint32_t>(node.class_name)), java_mapping,
         static_cast<int64_t>(node.count),
         static_cast<int64_t>(node_to_cumulative_count[i]),
         static_cast<int64_t>(node.size),
         static_cast<int64_t>(node_to_cumulative_size[i]),
-        parent_id};
+        // For Java dumps, set alloc_count == count, etc.
+        static_cast<int64_t>(node.count),
+        static_cast<int64_t>(node_to_cumulative_count[i]),
+        static_cast<int64_t>(node.size),
+        static_cast<int64_t>(node_to_cumulative_size[i]), parent_id};
     node_to_row_idx[i] = *tbl->id().IndexOf(tbl->Insert(alloc_row));
   }
   return tbl;
diff --git a/src/trace_processor/importers/proto/proto_trace_parser_unittest.cc b/src/trace_processor/importers/proto/proto_trace_parser_unittest.cc
index 5fd1d74..ff27f76 100644
--- a/src/trace_processor/importers/proto/proto_trace_parser_unittest.cc
+++ b/src/trace_processor/importers/proto/proto_trace_parser_unittest.cc
@@ -2390,11 +2390,6 @@
 
     auto mapping = interned_data->add_mappings();
     mapping->set_iid(1);
-    mapping->set_build_id(1);
-
-    auto build_id = interned_data->add_build_ids();
-    build_id->set_iid(1);
-    build_id->set_str("3BBCFBD372448A727265C3E7C4D954F91");
 
     auto frame = interned_data->add_frames();
     frame->set_iid(1);
@@ -2456,13 +2451,6 @@
   EXPECT_EQ(samples.ts()[2], 68000);
   EXPECT_EQ(samples.callsite_id()[2], 0);
   EXPECT_EQ(samples.utid()[2], 1u);
-
-  // Breakpad build_ids should not be modified/mangled.
-  ASSERT_STREQ(
-      context_.storage
-          ->GetString(storage_->stack_profile_mapping_table().build_id()[0])
-          .c_str(),
-      "3BBCFBD372448A727265C3E7C4D954F91");
 }
 
 }  // namespace
diff --git a/src/trace_processor/metrics/android/unmapped_java_symbols.sql b/src/trace_processor/metrics/android/unmapped_java_symbols.sql
index 7cc314f..d13d948 100644
--- a/src/trace_processor/metrics/android/unmapped_java_symbols.sql
+++ b/src/trace_processor/metrics/android/unmapped_java_symbols.sql
@@ -24,6 +24,7 @@
   AND INSTR(type_name, '.') = 0
   AND RTRIM(type_name, '[]') NOT IN ('byte', 'char', 'short', 'int', 'long', 'boolean', 'float', 'double')
   AND type_name NOT LIKE '$Proxy%'
+  AND LENGTH(type_name) > 0
 )
 SELECT upid, RepeatedField(type_name) AS types
 FROM distinct_unmapped_type_names GROUP BY 1;
@@ -35,6 +36,7 @@
   WHERE deobfuscated_type_name IS NULL
   AND field_name NOT LIKE '%.%.%'
   AND field_name NOT LIKE '$Proxy%'
+  AND LENGTH(field_name) > 0
 )
 SELECT upid, RepeatedField(field_name) AS fields
 FROM distinct_unmapped_field_names GROUP BY 1;
diff --git a/src/trace_processor/sqlite/db_sqlite_table.cc b/src/trace_processor/sqlite/db_sqlite_table.cc
index 054f387..83789ea 100644
--- a/src/trace_processor/sqlite/db_sqlite_table.cc
+++ b/src/trace_processor/sqlite/db_sqlite_table.cc
@@ -222,9 +222,12 @@
     } else if (sqlite_utils::IsOpEq(c.op)) {
       // If there is only a single equality constraint, we have special logic
       // to sort by that column and then binary search if we see the constraint
-      // set often. Model this by dividing but the log of the number of rows as
+      // set often. Model this by dividing by the log of the number of rows as
       // a good approximation. Otherwise, we'll need to do a full table scan.
-      filter_cost += cs.size() == 1
+      // Alternatively, if the column is sorted, we can use the same binary
+      // search logic so we have the same low cost (even better because we don't
+      // have to sort at all).
+      filter_cost += cs.size() == 1 || col.IsSorted()
                          ? (2 * current_row_count) / log2(current_row_count)
                          : current_row_count;
 
diff --git a/src/trace_processor/sqlite/db_sqlite_table_unittest.cc b/src/trace_processor/sqlite/db_sqlite_table_unittest.cc
index 4f39839..2fbc3e4 100644
--- a/src/trace_processor/sqlite/db_sqlite_table_unittest.cc
+++ b/src/trace_processor/sqlite/db_sqlite_table_unittest.cc
@@ -33,12 +33,18 @@
         Column("a", &a_, Column::Flag::kNoFlag, this, 1u, 0u));
     columns_.emplace_back(
         Column("sorted", &sorted_, Column::Flag::kSorted, this, 2u, 0u));
+    columns_.emplace_back(
+        Column("other", &other_, Column::Flag::kNoFlag, this, 3u, 0u));
+    columns_.emplace_back(
+        Column("other2", &other_, Column::Flag::kNoFlag, this, 4u, 0u));
   }
 
  private:
   StringPool pool_;
   SparseVector<uint32_t> a_;
   SparseVector<uint32_t> sorted_;
+  SparseVector<uint32_t> other_;
+  SparseVector<uint32_t> other2_;
 };
 
 TEST(DbSqliteTable, IdEqCheaperThanOtherEq) {
@@ -95,6 +101,27 @@
   ASSERT_GT(single_cost.rows, multi_cost.rows);
 }
 
+TEST(DbSqliteTable, MultiSortedEqCheaperThanMultiUnsortedEq) {
+  TestTable table(1234);
+
+  QueryConstraints sorted_eq;
+  sorted_eq.AddConstraint(2u, SQLITE_INDEX_CONSTRAINT_EQ, 0u);
+  sorted_eq.AddConstraint(3u, SQLITE_INDEX_CONSTRAINT_EQ, 0u);
+
+  auto sorted_cost = DbSqliteTable::EstimateCost(table, sorted_eq);
+
+  QueryConstraints unsorted_eq;
+  unsorted_eq.AddConstraint(3u, SQLITE_INDEX_CONSTRAINT_EQ, 0u);
+  unsorted_eq.AddConstraint(4u, SQLITE_INDEX_CONSTRAINT_EQ, 0u);
+
+  auto unsorted_cost = DbSqliteTable::EstimateCost(table, unsorted_eq);
+
+  // The number of rows should be the same but the cost of the sorted
+  // query should be less.
+  ASSERT_LT(sorted_cost.cost, unsorted_cost.cost);
+  ASSERT_EQ(sorted_cost.rows, unsorted_cost.rows);
+}
+
 TEST(DbSqliteTable, EmptyTableCosting) {
   TestTable table(0u);
 
diff --git a/src/trace_processor/sqlite_experimental_flamegraph_table.cc b/src/trace_processor/sqlite_experimental_flamegraph_table.cc
index 68dd50f..12a9bb6 100644
--- a/src/trace_processor/sqlite_experimental_flamegraph_table.cc
+++ b/src/trace_processor/sqlite_experimental_flamegraph_table.cc
@@ -17,6 +17,7 @@
 
 #include "src/trace_processor/sqlite_experimental_flamegraph_table.h"
 
+#include "src/trace_processor/heap_profile_tracker.h"
 #include "src/trace_processor/importers/proto/heap_graph_tracker.h"
 #include "src/trace_processor/trace_processor_context.h"
 
@@ -154,11 +155,14 @@
   // Get the input column values and compute the flamegraph using them.
   values_ = GetInputValues(qc, argv);
 
-  // TODO(fmayer): extend this to support native profile as well.
   if (values_.profile_type == "graph") {
     auto* tracker = HeapGraphTracker::GetOrCreate(context_);
     table_ = tracker->BuildFlamegraph(values_.ts, values_.upid);
   }
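+  // Native (heapprofd) profiles are built directly from the storage tables
+  // rather than through a tracker.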
+  if (values_.profile_type == "native") {
+    table_ = BuildNativeFlamegraph(context_->storage.get(),
+                                   values_.upid, values_.ts);
+  }
 
   // table_ can be nullptr precisely where the constraints passed to us don't
   // make sense. Therefore, we can just return this to SQLite.
diff --git a/src/trace_processor/stack_profile_tracker.cc b/src/trace_processor/stack_profile_tracker.cc
index e9185e0..63443f4 100644
--- a/src/trace_processor/stack_profile_tracker.cc
+++ b/src/trace_processor/stack_profile_tracker.cc
@@ -68,18 +68,9 @@
       context_->storage->GetString(raw_build_id);
   StringId build_id = GetEmptyStringId();
   if (raw_build_id_str.size() > 0) {
-    // If the build_id is 33 characters long, we assume it's a Breakpad debug
-    // identifier which is already in Hex and doesn't need conversion.
-    // TODO(b/148109467): Remove workaround once all active Chrome versions
-    // write raw bytes instead of a string as build_id.
-    if (raw_build_id_str.size() == 33) {
-      build_id = raw_build_id;
-    } else {
-      std::string hex_build_id =
-          base::ToHex(raw_build_id_str.c_str(), raw_build_id_str.size());
-      build_id =
-          context_->storage->InternString(base::StringView(hex_build_id));
-    }
+    std::string hex_build_id =
+        base::ToHex(raw_build_id_str.c_str(), raw_build_id_str.size());
+    build_id = context_->storage->InternString(base::StringView(hex_build_id));
   }
 
   tables::StackProfileMappingTable::Row row{
diff --git a/src/trace_processor/tables/profiler_tables.h b/src/trace_processor/tables/profiler_tables.h
index 5f529eb..04dae8d 100644
--- a/src/trace_processor/tables/profiler_tables.h
+++ b/src/trace_processor/tables/profiler_tables.h
@@ -100,6 +100,10 @@
   C(int64_t, cumulative_count)                                            \
   C(int64_t, size)                                                        \
   C(int64_t, cumulative_size)                                             \
+  C(int64_t, alloc_count)                                                 \
+  C(int64_t, cumulative_alloc_count)                                      \
+  C(int64_t, alloc_size)                                                  \
+  C(int64_t, cumulative_alloc_size)                                       \
   C(base::Optional<uint32_t>, parent_id)
 
 PERFETTO_TP_TABLE(PERFETTO_TP_EXPERIMENTAL_FLAMEGRAPH_NODES);
@@ -124,7 +128,7 @@
 #define PERFETTO_TP_HEAP_GRAPH_REFERENCE_DEF(NAME, PARENT, C) \
   NAME(HeapGraphReferenceTable, "heap_graph_reference")       \
   PERFETTO_TP_ROOT_TABLE(PARENT, C)                           \
-  C(int64_t, reference_set_id)                                \
+  C(int64_t, reference_set_id, Column::Flag::kSorted)         \
   C(int64_t, owner_id)                                        \
   C(int64_t, owned_id)                                        \
   C(StringPool::Id, field_name)                               \
diff --git a/src/trace_processor/trace_storage.h b/src/trace_processor/trace_storage.h
index db6ac60..67c6358 100644
--- a/src/trace_processor/trace_storage.h
+++ b/src/trace_processor/trace_storage.h
@@ -75,6 +75,8 @@
 
 using FrameId = tables::StackProfileFrameTable::Id;
 
+using SymbolId = tables::SymbolTable::Id;
+
 using CallsiteId = tables::StackProfileCallsiteTable::Id;
 
 using MetadataId = tables::MetadataTable::Id;
diff --git a/test/synth_common.py b/test/synth_common.py
index f950294..09bf07e 100644
--- a/test/synth_common.py
+++ b/test/synth_common.py
@@ -19,6 +19,8 @@
 from google.protobuf.pyext import _message
 
 CLONE_THREAD = 0x00010000
+CLONE_VFORK = 0x00004000
+CLONE_VM = 0x00000100
 
 
 class Trace(object):
diff --git a/test/trace_processor/heap_profile_flamegraph.sql b/test/trace_processor/heap_profile_flamegraph.sql
new file mode 100644
index 0000000..fa90b15
--- /dev/null
+++ b/test/trace_processor/heap_profile_flamegraph.sql
@@ -0,0 +1 @@
+select * from experimental_flamegraph(605908369259172, 1, 'native')  limit 10;
diff --git a/test/trace_processor/heap_profile_flamegraph_system-server-native-profile.out b/test/trace_processor/heap_profile_flamegraph_system-server-native-profile.out
new file mode 100644
index 0000000..42ab65d
--- /dev/null
+++ b/test/trace_processor/heap_profile_flamegraph_system-server-native-profile.out
@@ -0,0 +1,11 @@
+"id","type","depth","name","map_name","count","cumulative_count","size","cumulative_size","alloc_count","cumulative_alloc_count","alloc_size","cumulative_alloc_size","parent_id"
+0,"experimental_flamegraph_nodes",0,"__start_thread","/apex/com.android.runtime/lib64/bionic/libc.so",0,8,0,84848,0,210,0,1084996,"[NULL]"
+1,"experimental_flamegraph_nodes",1,"_ZL15__pthread_startPv","/apex/com.android.runtime/lib64/bionic/libc.so",0,8,0,84848,0,210,0,1084996,0
+2,"experimental_flamegraph_nodes",2,"_ZN7android14AndroidRuntime15javaThreadShellEPv","/system/lib64/libandroid_runtime.so",0,5,0,27704,0,77,0,348050,1
+3,"experimental_flamegraph_nodes",3,"_ZN7android6Thread11_threadLoopEPv","/system/lib64/libutils.so",0,5,0,27704,0,77,0,348050,2
+4,"experimental_flamegraph_nodes",4,"_ZN7android10PoolThread10threadLoopEv","/system/lib64/libbinder.so",0,1,0,4096,0,64,0,279182,3
+5,"experimental_flamegraph_nodes",5,"_ZN7android14IPCThreadState14joinThreadPoolEb","/system/lib64/libbinder.so",0,1,0,4096,0,64,0,279182,4
+6,"experimental_flamegraph_nodes",6,"_ZN7android14IPCThreadState20getAndExecuteCommandEv","/system/lib64/libbinder.so",0,1,0,4096,0,64,0,279182,5
+7,"experimental_flamegraph_nodes",7,"_ZN7android14IPCThreadState14executeCommandEi","/system/lib64/libbinder.so",0,1,0,4096,0,64,0,279182,6
+8,"experimental_flamegraph_nodes",8,"_ZN7android7BBinder8transactEjRKNS_6ParcelEPS1_j","/system/lib64/libbinder.so",0,1,0,4096,0,64,0,279182,7
+9,"experimental_flamegraph_nodes",9,"_ZN11JavaBBinder10onTransactEjRKN7android6ParcelEPS1_j","/system/lib64/libandroid_runtime.so",0,0,0,0,0,60,0,262730,8
diff --git a/test/trace_processor/index b/test/trace_processor/index
index 612d838..afd8064 100644
--- a/test/trace_processor/index
+++ b/test/trace_processor/index
@@ -57,6 +57,8 @@
 
 # Rss stats
 rss_stat_mm_id.py rss_stat.sql rss_stat_mm_id.out
+rss_stat_mm_id_clone.py rss_stat.sql rss_stat_mm_id_clone.out
+rss_stat_mm_id_reuse.py rss_stat.sql rss_stat_mm_id_reuse.out
 rss_stat_legacy.py rss_stat.sql rss_stat_legacy.out
 
 # Memory counters
@@ -152,6 +154,7 @@
 heap_graph_interleaved.textproto heap_graph_object.sql heap_graph_interleaved_object.out
 heap_graph_interleaved.textproto heap_graph_reference.sql heap_graph_interleaved_reference.out
 ../data/system-server-heap-graph.pftrace heap_graph_flamegraph.sql heap_graph_flamegraph_system-server-heap-graph.out
+../data/system-server-native-profile heap_profile_flamegraph.sql heap_profile_flamegraph_system-server-native-profile.out
 
 # TrackEvent tests.
 track_event_same_tids.textproto process_tracking.sql track_event_same_tids_threads.out
diff --git a/test/trace_processor/rss_stat_mm_id copy.py b/test/trace_processor/rss_stat_mm_id copy.py
new file mode 100644
index 0000000..d2b5e5a
--- /dev/null
+++ b/test/trace_processor/rss_stat_mm_id copy.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python
+# Copyright (C) 2019 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This synthetic trace tests handling of the mm_id field in the rss_stat
+# event.
+
+from os import sys, path
+
+sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
+import synth_common
+
+trace = synth_common.create_trace()
+
+trace.add_process_tree_packet(ts=1)
+trace.add_process(10, 0, "process")
+
+trace.add_ftrace_packet(0)
+
+# Create a new child process (treated internally as a thread) of kthreadd.
+trace.add_newtask(ts=50, tid=2, new_tid=3, new_comm="kthread_child", flags=0)
+
+# Add an event on tid 3 which affects its own rss.
+trace.add_rss_stat(ts=90, tid=3, member=0, size=9, mm_id=4321, curr=True)
+
+# Try to add an event for tid 10. However, as we've not seen an event
+# with curr == True for tid == 10, this event will be dropped.
+trace.add_rss_stat(ts=91, tid=3, member=0, size=900, mm_id=1234, curr=False)
+
+# Add an event for tid 3 from tid 10. This emulates e.g. direct reclaim
+# where a process reaches into another process' mm struct.
+trace.add_rss_stat(ts=99, tid=10, member=0, size=10, mm_id=4321, curr=False)
+
+# Add an event on tid 10 which affects its own rss.
+trace.add_rss_stat(ts=100, tid=10, member=0, size=1000, mm_id=1234, curr=True)
+
+# Add an event on tid 10 from tid 3. This emulates e.g. background reclaim
+# where kthreadd is cleaning up the mm struct of another process.
+trace.add_rss_stat(ts=101, tid=3, member=0, size=900, mm_id=1234, curr=False)
+
+print(trace.trace.SerializeToString())
diff --git a/test/trace_processor/rss_stat_mm_id_clone.out b/test/trace_processor/rss_stat_mm_id_clone.out
new file mode 100644
index 0000000..0969481
--- /dev/null
+++ b/test/trace_processor/rss_stat_mm_id_clone.out
@@ -0,0 +1,9 @@
+"ts","name","pid","name","value"
+100,"mem.rss.file",10,"parent_process",100.000000
+100,"mem.rss.file",2,"kthreadd",10.000000
+102,"mem.rss.file",2,"kthreadd",20.000000
+102,"mem.rss.file",11,"child_process",90.000000
+104,"mem.rss.file",11,"child_process",10.000000
+105,"mem.rss.file",10,"parent_process",95.000000
+107,"mem.rss.file",10,"parent_process",105.000000
+108,"mem.rss.file",10,"parent_process",110.000000
diff --git a/test/trace_processor/rss_stat_mm_id_clone.py b/test/trace_processor/rss_stat_mm_id_clone.py
new file mode 100644
index 0000000..308b6b4
--- /dev/null
+++ b/test/trace_processor/rss_stat_mm_id_clone.py
@@ -0,0 +1,98 @@
+#!/usr/bin/python
+# Copyright (C) 2019 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This synthetic trace tests handling of the mm_id field in the rss_stat
+# event during clone events which have various flag combinations set.
+
+from os import sys, path
+
+sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
+import synth_common
+
+trace = synth_common.create_trace()
+
+trace.add_process_tree_packet(ts=1)
+trace.add_process(10, 1, "parent_process")
+trace.add_process(3, 2, "kernel_thread")
+
+# In this packet, check what happens to userspace processes with different
+# clone flags.
+trace.add_ftrace_packet(1)
+
+# Emit an rss stat event for the main thread of the process to associate it
+# with an mm_id.
+trace.add_rss_stat(100, tid=10, member=0, size=100, mm_id=0x1234, curr=1)
+
+# Create a newtask event emulating vfork/posix_spawn (i.e. CLONE_VM and
+# CLONE_VFORK set).
+trace.add_newtask(
+    101,
+    tid=10,
+    new_tid=11,
+    new_comm="child_process",
+    flags=synth_common.CLONE_VFORK | synth_common.CLONE_VM)
+
+# The child process will now change its own (and parent's) VM space with
+# |curr| set to 1 (emulating cleaning up some memory in parent).
+trace.add_rss_stat(102, tid=11, member=0, size=90, mm_id=0x1234, curr=1)
+
+# At this point, the child process will obtain a new mm struct. From this
+# point on, all mm_ids from the child should be different from the parent.
+
+# The child process will now change its parent's VM space with curr set to
+# 0 (emulating e.g. cleaning up its stack).
+trace.add_rss_stat(103, tid=11, member=0, size=85, mm_id=0x1234, curr=0)
+
+# Now the child process should exec another process.
+
+# The child can now change its own memory.
+trace.add_rss_stat(104, tid=11, member=0, size=10, mm_id=0x5678, curr=1)
+
+# The parent can now resume execution and may emit another rss event.
+trace.add_rss_stat(105, tid=10, member=0, size=95, mm_id=0x1234, curr=1)
+
+# The parent can now go ahead and start a new thread.
+trace.add_newtask(
+    106,
+    tid=10,
+    new_tid=12,
+    new_comm="parent_thread",
+    flags=synth_common.CLONE_VM | synth_common.CLONE_THREAD)
+
+# Since this thread shares mm space with the parent, it should have the
+# same mm id and have curr set to 1.
+trace.add_rss_stat(107, tid=12, member=0, size=105, mm_id=0x1234, curr=1)
+
+# The parent can also emit events with the same mm struct at the same time.
+trace.add_rss_stat(108, tid=10, member=0, size=110, mm_id=0x1234, curr=1)
+
+# In this packet, we check what happens to kernel threads in RSS stat.
+trace.add_ftrace_packet(1)
+
+# Emit an rss stat event for the existing kernel thread.
+trace.add_rss_stat(100, tid=3, member=0, size=10, mm_id=0x2345, curr=1)
+
+# Start a new kernel thread.
+trace.add_newtask(
+    101,
+    tid=2,
+    new_tid=4,
+    new_comm="kernel_thread2",
+    flags=synth_common.CLONE_VM)
+
+# Emit a rss stat for the new kernel thread.
+trace.add_rss_stat(102, tid=4, member=0, size=20, mm_id=0x2345, curr=1)
+
+print(trace.trace.SerializeToString())
diff --git a/test/trace_processor/rss_stat_mm_id_reuse.out b/test/trace_processor/rss_stat_mm_id_reuse.out
new file mode 100644
index 0000000..396f5f7
--- /dev/null
+++ b/test/trace_processor/rss_stat_mm_id_reuse.out
@@ -0,0 +1,3 @@
+"ts","name","pid","name","value"
+100,"mem.rss.file",10,"parent_process",100.000000
+103,"mem.rss.file",10,"new_process",10.000000
diff --git a/test/trace_processor/rss_stat_mm_id_reuse.py b/test/trace_processor/rss_stat_mm_id_reuse.py
new file mode 100644
index 0000000..58d7b4d
--- /dev/null
+++ b/test/trace_processor/rss_stat_mm_id_reuse.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python
+# Copyright (C) 2019 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This synthetic trace tests handling of the mm_id field in the rss_stat
+# event when mm_structs are reused on process death.
+
+from os import sys, path
+
+sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
+import synth_common
+
+trace = synth_common.create_trace()
+
+trace.add_process_tree_packet(ts=1)
+trace.add_process(10, 1, "parent_process")
+
+trace.add_ftrace_packet(1)
+
+# Emit an event for the process.
+trace.add_rss_stat(100, tid=10, member=0, size=100, mm_id=0x1234, curr=1)
+
+# Now kill the process.
+trace.add_process_free(ts=101, tid=10, comm="parent_process", prio=0)
+
+# Create a new thread which reuses the pid and mm struct.
+trace.add_newtask(102, tid=1, new_tid=10, new_comm="new_process", flags=0)
+
+# Emit an event for the new thread.
+trace.add_rss_stat(103, tid=10, member=0, size=10, mm_id=0x1234, curr=1)
+
+print(trace.trace.SerializeToString())
diff --git a/tools/install-build-deps b/tools/install-build-deps
index db5aa59..1ed1fab 100755
--- a/tools/install-build-deps
+++ b/tools/install-build-deps
@@ -146,8 +146,8 @@
     # Example traces for regression tests.
     (
         'buildtools/test_data.zip',
-        'https://storage.googleapis.com/perfetto/test-data-20200121-101157.zip',
-        'ddfe8aa6ceca7be17f222740927dc50aa9f8ed02',
+        'https://storage.googleapis.com/perfetto/test-data-20200122-100845.zip',
+        '56ac45b5239fda50d33cf05bfd329dcb3efb0b2a',
         'all',
     ),
 
diff --git a/ui/src/controller/heap_profile_controller.ts b/ui/src/controller/heap_profile_controller.ts
index 973a92f..eadb704 100644
--- a/ui/src/controller/heap_profile_controller.ts
+++ b/ui/src/controller/heap_profile_controller.ts
@@ -177,23 +177,23 @@
     let sizeIndex = 4;
     switch (viewingOption) {
       case SPACE_MEMORY_ALLOCATED_NOT_FREED_KEY:
-        orderBy = `where size > 0 order by depth, parent_hash, size desc, name`;
+        orderBy = `where cumulative_size > 0 order by depth, parent_id,
+            cumulative_size desc, name`;
         sizeIndex = 4;
         break;
       case ALLOC_SPACE_MEMORY_ALLOCATED_KEY:
-        orderBy =
-            `where alloc_size > 0 order by depth, parent_hash, alloc_size desc,
-            name`;
+        orderBy = `where cumulative_alloc_size > 0 order by depth, parent_id,
+            cumulative_alloc_size desc, name`;
         sizeIndex = 5;
         break;
       case OBJECTS_ALLOCATED_NOT_FREED_KEY:
-        orderBy =
-            `where count > 0 order by depth, parent_hash, count desc, name`;
+        orderBy = `where cumulative_count > 0 order by depth, parent_id,
+            cumulative_count desc, name`;
         sizeIndex = 6;
         break;
       case OBJECTS_ALLOCATED_KEY:
-        orderBy = `where alloc_count > 0 order by depth, parent_hash,
-            alloc_count desc, name`;
+        orderBy = `where cumulative_alloc_count > 0 order by depth, parent_id,
+            cumulative_alloc_count desc, name`;
         sizeIndex = 7;
         break;
       default:
@@ -201,8 +201,9 @@
     }
 
     const callsites = await this.args.engine.query(
-        `SELECT hash, name, parent_hash, depth, size, alloc_size, count,
-        alloc_count, map_name, self_size from ${tableName} ${orderBy}`);
+        `SELECT id, name, parent_id, depth, cumulative_size,
+        cumulative_alloc_size, cumulative_count,
+        cumulative_alloc_count, map_name, size from ${tableName} ${orderBy}`);
 
     const flamegraphData: CallsiteInfo[] = new Array();
     const hashToindex: Map<number, number> = new Map();
@@ -230,102 +231,15 @@
       Promise<string> {
     // Creating unique names for views so we can reuse and not delete them
     // for each marker.
-    const tableNameCallsiteNameSize =
-        this.tableName(`callsite_with_name_and_size`);
-    const tableNameCallsiteHashNameSize =
-        this.tableName(`callsite_hash_name_size`);
     const tableNameGroupedCallsitesForFlamegraph =
         this.tableName(`grouped_callsites_for_flamegraph`);
-    // Joining the callsite table with frame table then with alloc table to get
-    // the size and name for each callsite.
-    // TODO(taylori): Make frame name nullable in the trace processor for
-    // consistency with the other columns.
-    await this.args.engine.query(
-        `create view if not exists ${tableNameCallsiteNameSize} as
-         select id, parent_id, depth, IFNULL(DEMANGLE(name), name) as name,
-            map_name, size, alloc_size, count, alloc_count from (
-         select cs.id as id, parent_id, depth,
-            coalesce(symbols.name,
-                case when fr.name != '' then fr.name else map.name end) as name,
-            map.name as map_name,
-            SUM(IFNULL(size, 0)) as size,
-            SUM(IFNULL(size, 0)) as size,
-            SUM(case when size > 0 then size else 0 end) as alloc_size,
-            SUM(IFNULL(count, 0)) as count,
-            SUM(case when count > 0 then count else 0 end) as alloc_count
-         from stack_profile_callsite cs
-         join stack_profile_frame fr on cs.frame_id = fr.id
-         join stack_profile_mapping map on fr.mapping = map.id
-         left join (
-              select symbol_set_id, FIRST_VALUE(name) OVER(PARTITION BY
-                symbol_set_id) as name
-              from stack_profile_symbol GROUP BY symbol_set_id
-            ) as symbols using(symbol_set_id)
-         left join heap_profile_allocation alloc on alloc.callsite_id = cs.id
-         and alloc.ts <= ${ts} and alloc.upid = ${upid} group by cs.id)`);
 
-    // Recursive query to compute the hash for each callsite based on names
-    // rather than ids.
-    // We get all the children of the row in question and emit a row with hash
-    // equal hash(name, parent.hash). Roots without the parent will have -1 as
-    // hash.  Slices will be merged into a big slice.
-    await this.args.engine.query(
-        `create view if not exists ${tableNameCallsiteHashNameSize} as
-        with recursive callsite_table_names(
-          id, hash, name, map_name, size, alloc_size, count, alloc_count,
-          parent_hash, depth) AS (
-        select id, hash(name) as hash, name, map_name, size, alloc_size, count,
-          alloc_count, -1, depth
-        from ${tableNameCallsiteNameSize}
-        where depth = 0
-        union all
-        select cs.id, hash(cs.name, ctn.hash) as hash, cs.name, cs.map_name,
-          cs.size, cs.alloc_size, cs.count, cs.alloc_count, ctn.hash, cs.depth
-        from callsite_table_names ctn
-        inner join ${tableNameCallsiteNameSize} cs ON ctn.id = cs.parent_id
-        )
-        select hash, name, map_name, parent_hash, depth, SUM(size) as size,
-          SUM(case when alloc_size > 0 then alloc_size else 0 end)
-            as alloc_size, SUM(count) as count,
-          SUM(case when alloc_count > 0 then alloc_count else 0 end)
-            as alloc_count
-        from callsite_table_names
-        group by hash`);
-
-    // Recursive query to compute the cumulative size of each callsite.
-    // Base case: We get all the callsites where the size is non-zero.
-    // Recursive case: We get the callsite which is the parent of the current
-    //  callsite(in terms of hashes) and emit a row with the size of the current
-    //  callsite plus all the info of the parent.
-    // Grouping: For each callsite, our recursive table has n rows where n is
-    //  the number of descendents with a non-zero self size. We need to group on
-    //  the hash and sum all the sizes to get the cumulative size for each
-    //  callsite hash.
     await this.args.engine.query(`create temp table if not exists ${
-        tableNameGroupedCallsitesForFlamegraph}
-        as with recursive callsite_children(
-          hash, name, map_name, parent_hash, depth, size, alloc_size, count,
-          alloc_count, self_size, self_alloc_size, self_count, self_alloc_count)
-        as (
-        select hash, name, map_name, parent_hash, depth, size, alloc_size,
-          count, alloc_count, size as self_size, alloc_size as self_alloc_size,
-          count as self_count, alloc_count as self_alloc_count
-        from ${tableNameCallsiteHashNameSize}
-        union all
-        select chns.hash, chns.name, chns.map_name, chns.parent_hash,
-          chns.depth, cc.size, cc.alloc_size, cc.count, cc.alloc_count,
-          chns.size, chns.alloc_size, chns.count, chns.alloc_count
-        from ${tableNameCallsiteHashNameSize} chns
-        inner join callsite_children cc on chns.hash = cc.parent_hash
-        )
-        select hash, name, map_name, parent_hash, depth, SUM(size) as size,
-          SUM(case when alloc_size > 0 then alloc_size else 0 end)
-            as alloc_size, SUM(count) as count,
-          SUM(case when alloc_count > 0 then alloc_count else 0 end) as
-            alloc_count,
-          self_size, self_alloc_size, self_count, self_alloc_count
-        from callsite_children
-        group by hash`);
+        tableNameGroupedCallsitesForFlamegraph} as
+        select id, name, map_name, parent_id, depth, cumulative_size,
+          cumulative_alloc_size, cumulative_count, cumulative_alloc_count,
+          size, alloc_size, count, alloc_count
+        from experimental_flamegraph(${ts}, ${upid}, 'native')`);
     return tableNameGroupedCallsitesForFlamegraph;
   }
 
diff --git a/ui/src/controller/trace_controller.ts b/ui/src/controller/trace_controller.ts
index f819244..d7b07b6 100644
--- a/ui/src/controller/trace_controller.ts
+++ b/ui/src/controller/trace_controller.ts
@@ -467,6 +467,30 @@
       });
     }
 
+    // Add global slice tracks.
+    const globalSliceTracks = await engine.query(`
+      select
+        track.name as track_name,
+        track.id as track_id,
+        max(depth) as max_depth
+      from track join slice on track.id = slice.track_id
+      where track.type = 'track'
+    `);
+
+    for (let i = 0; i < globalSliceTracks.numRecords; i++) {
+      const name = globalSliceTracks.columns[0].stringValues![i];
+      const trackId = +globalSliceTracks.columns[1].longValues![i];
+      const maxDepth = +globalSliceTracks.columns[2].longValues![i];
+
+      tracksToAdd.push({
+        engineId: this.engineId,
+        kind: SLICE_TRACK_KIND,
+        name: `${name}`,
+        trackGroup: SCROLLING_TRACK_GROUP,
+        config: {maxDepth, trackId},
+      });
+    }
+
     interface CounterTrack {
       name: string;
       trackId: number;
@@ -687,8 +711,6 @@
           name: `${threadName} [${tid}]`,
           trackGroup: pUuid,
           config: {
-            upid,
-            utid,
             maxDepth: threadTrack.maxDepth,
             trackId: threadTrack.trackId
           },
diff --git a/ui/src/tracks/chrome_slices/common.ts b/ui/src/tracks/chrome_slices/common.ts
index faa40c3..41eb3c2 100644
--- a/ui/src/tracks/chrome_slices/common.ts
+++ b/ui/src/tracks/chrome_slices/common.ts
@@ -18,8 +18,6 @@
 
 export interface Config {
   maxDepth: number;
-  upid: number;
-  utid: number;
   trackId: number;
 }