Merge "perfetto-ui: Improve rendering of note panels"
diff --git a/Android.bp b/Android.bp
index 217284a..87e7112 100644
--- a/Android.bp
+++ b/Android.bp
@@ -168,7 +168,7 @@
     "src/profiling/memory/client.cc",
     "src/profiling/memory/malloc_hooks.cc",
     "src/profiling/memory/proc_utils.cc",
-    "src/profiling/memory/sampler.cc",
+    "src/profiling/memory/scoped_spinlock.cc",
     "src/profiling/memory/wire_protocol.cc",
   ],
   shared_libs: [
@@ -588,7 +588,7 @@
     "src/profiling/memory/proc_utils.cc",
     "src/profiling/memory/process_matcher.cc",
     "src/profiling/memory/record_reader.cc",
-    "src/profiling/memory/sampler.cc",
+    "src/profiling/memory/scoped_spinlock.cc",
     "src/profiling/memory/socket_listener.cc",
     "src/profiling/memory/system_property.cc",
     "src/profiling/memory/unwinding.cc",
@@ -2854,8 +2854,8 @@
     "src/profiling/memory/process_matcher_unittest.cc",
     "src/profiling/memory/record_reader.cc",
     "src/profiling/memory/record_reader_unittest.cc",
-    "src/profiling/memory/sampler.cc",
     "src/profiling/memory/sampler_unittest.cc",
+    "src/profiling/memory/scoped_spinlock.cc",
     "src/profiling/memory/shared_ring_buffer.cc",
     "src/profiling/memory/shared_ring_buffer_unittest.cc",
     "src/profiling/memory/socket_listener.cc",
diff --git a/BUILD.gn b/BUILD.gn
index eebb27b..971d25d 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -254,6 +254,8 @@
       "include/perfetto/tracing/core",
       "protos/perfetto/trace:zero",
       "protos/perfetto/trace/chrome:zero",
+      "protos/perfetto/trace/interned_data:zero",
+      "protos/perfetto/trace/track_event:zero",
     ]
   }
 }
diff --git a/include/perfetto/base/unix_socket.h b/include/perfetto/base/unix_socket.h
index c6c9775..c1d1745 100644
--- a/include/perfetto/base/unix_socket.h
+++ b/include/perfetto/base/unix_socket.h
@@ -67,6 +67,7 @@
   bool Listen();
   bool Connect(const std::string& socket_name);
   bool SetTxTimeout(uint32_t timeout_ms);
+  bool SetRxTimeout(uint32_t timeout_ms);
   void Shutdown();
   void SetBlocking(bool);
   bool IsBlocking() const;
diff --git a/protos/perfetto/trace/perfetto_trace.proto b/protos/perfetto/trace/perfetto_trace.proto
index ee6fb8f..338e665 100644
--- a/protos/perfetto/trace/perfetto_trace.proto
+++ b/protos/perfetto/trace/perfetto_trace.proto
@@ -2894,7 +2894,7 @@
   repeated InternedString strings = 1;
   message InternedString {
     optional uint64 id = 1;
-    optional string str = 2;
+    optional bytes str = 2;
   }
 
   repeated Frame frames = 2;
diff --git a/protos/perfetto/trace/profiling/profile_packet.proto b/protos/perfetto/trace/profiling/profile_packet.proto
index 723eec2..91818b4 100644
--- a/protos/perfetto/trace/profiling/profile_packet.proto
+++ b/protos/perfetto/trace/profiling/profile_packet.proto
@@ -24,7 +24,7 @@
   repeated InternedString strings = 1;
   message InternedString {
     optional uint64 id = 1;
-    optional string str = 2;
+    optional bytes str = 2;
   }
 
   repeated Frame frames = 2;
diff --git a/src/base/unix_socket.cc b/src/base/unix_socket.cc
index ed95044..63bbd11 100644
--- a/src/base/unix_socket.cc
+++ b/src/base/unix_socket.cc
@@ -347,6 +347,19 @@
                     sizeof(timeout)) == 0;
 }
 
+bool UnixSocketRaw::SetRxTimeout(uint32_t timeout_ms) {
+  PERFETTO_DCHECK(fd_);
+  struct timeval timeout {};
+  uint32_t timeout_sec = timeout_ms / 1000;
+  timeout.tv_sec = static_cast<decltype(timeout.tv_sec)>(timeout_sec);
+  timeout.tv_usec = static_cast<decltype(timeout.tv_usec)>(
+      (timeout_ms - (timeout_sec * 1000)) * 1000);
+
+  return setsockopt(*fd_, SOL_SOCKET, SO_RCVTIMEO,
+                    reinterpret_cast<const char*>(&timeout),
+                    sizeof(timeout)) == 0;
+}
+
 #pragma GCC diagnostic pop
 
 // +--------------------+
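
For reference, a minimal standalone sketch (not part of the patch) of the same millisecond-to-timeval split that the new SetRxTimeout performs, here as a hypothetical free function on an arbitrary fd; e.g. 1500 ms becomes tv_sec = 1, tv_usec = 500000:

    #include <sys/socket.h>
    #include <sys/time.h>
    #include <cstdint>

    // Hypothetical helper mirroring UnixSocketRaw::SetRxTimeout.
    bool SetRecvTimeoutMs(int fd, uint32_t timeout_ms) {
      struct timeval timeout {};
      timeout.tv_sec = static_cast<time_t>(timeout_ms / 1000);
      timeout.tv_usec = static_cast<suseconds_t>((timeout_ms % 1000) * 1000);
      return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
                        reinterpret_cast<const char*>(&timeout),
                        sizeof(timeout)) == 0;
    }
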
diff --git a/src/base/unix_socket_unittest.cc b/src/base/unix_socket_unittest.cc
index 50a6133..5ea646e 100644
--- a/src/base/unix_socket_unittest.cc
+++ b/src/base/unix_socket_unittest.cc
@@ -733,7 +733,7 @@
   task_runner_.RunUntilCheckpoint("connected");
   srv->Shutdown(true);
 
-  cli->Send("test");
+  cli->Send("test", UnixSocket::BlockingMode::kBlocking);
 
   ASSERT_NE(peer, nullptr);
   auto raw_sock = peer->ReleaseSocket();
diff --git a/src/profiling/memory/BUILD.gn b/src/profiling/memory/BUILD.gn
index a8cd362..d75d87f 100644
--- a/src/profiling/memory/BUILD.gn
+++ b/src/profiling/memory/BUILD.gn
@@ -48,14 +48,26 @@
   ]
 }
 
-source_set("ring_buffer") {
+source_set("scoped_spinlock") {
   deps = [
     "../../../gn:default_deps",
     "../../base",
   ]
   sources = [
+    "scoped_spinlock.cc",
+    "scoped_spinlock.h",
+  ]
+}
+
+source_set("ring_buffer") {
+  deps = [
+    ":scoped_spinlock",
+    "../../../gn:default_deps",
+    "../../base",
+  ]
+  sources = [
     "shared_ring_buffer.cc",
-    "shared_ring_buffer.ch",
+    "shared_ring_buffer.h",
   ]
 }
 
@@ -112,6 +124,7 @@
   public_configs = [ "../../../buildtools:libunwindstack_config" ]
   deps = [
     ":proc_utils",
+    ":scoped_spinlock",
     ":wire_protocol",
     "../../../buildtools:libunwindstack",
     "../../../gn:default_deps",
@@ -121,7 +134,6 @@
   sources = [
     "client.cc",
     "client.h",
-    "sampler.cc",
     "sampler.h",
   ]
 }
@@ -182,6 +194,7 @@
   deps = [
     ":client",
     ":proc_utils",
+    ":scoped_spinlock",
     ":wire_protocol",
     "../../../gn:default_deps",
     "../../base",
diff --git a/src/profiling/memory/bookkeeping.cc b/src/profiling/memory/bookkeeping.cc
index 3d5b8da..642fc54 100644
--- a/src/profiling/memory/bookkeeping.cc
+++ b/src/profiling/memory/bookkeeping.cc
@@ -297,7 +297,8 @@
 
     auto interned_string = current_profile_packet->add_strings();
     interned_string->set_id(str.id());
-    interned_string->set_str(str->c_str(), str->size());
+    interned_string->set_str(reinterpret_cast<const uint8_t*>(str->c_str()),
+                             str->size());
   }
 }
 
diff --git a/src/profiling/memory/client.cc b/src/profiling/memory/client.cc
index 807b091..e194f59 100644
--- a/src/profiling/memory/client.cc
+++ b/src/profiling/memory/client.cc
@@ -39,6 +39,7 @@
 #include "perfetto/base/unix_socket.h"
 #include "perfetto/base/utils.h"
 #include "src/profiling/memory/sampler.h"
+#include "src/profiling/memory/scoped_spinlock.h"
 #include "src/profiling/memory/wire_protocol.h"
 
 namespace perfetto {
@@ -47,8 +48,6 @@
 
 constexpr std::chrono::seconds kLockTimeout{1};
 
-// TODO(rsavitski): consider setting a receive timeout as well, otherwise the
-// constructor can block indefinitely (while waiting on the client config).
 std::vector<base::UnixSocketRaw> ConnectPool(const std::string& sock_name,
                                              size_t n) {
   std::vector<base::UnixSocketRaw> res;
@@ -59,8 +58,12 @@
       PERFETTO_PLOG("Failed to connect to %s", sock_name.c_str());
       continue;
     }
-    if (!sock.SetTxTimeout(kClientSockTxTimeoutMs)) {
-      PERFETTO_PLOG("Failed to set timeout for %s", sock_name.c_str());
+    if (!sock.SetTxTimeout(kClientSockTimeoutMs)) {
+      PERFETTO_PLOG("Failed to set send timeout for %s", sock_name.c_str());
+      continue;
+    }
+    if (!sock.SetRxTimeout(kClientSockTimeoutMs)) {
+      PERFETTO_PLOG("Failed to set receive timeout for %s", sock_name.c_str());
       continue;
     }
     res.emplace_back(std::move(sock));
@@ -204,12 +207,10 @@
 
 Client::Client(std::vector<base::UnixSocketRaw> socks)
     : generation_(++max_generation_),
-      pthread_key_(ThreadLocalSamplingData::KeyDestructor),
+      sampler_(8192),  // placeholder until we receive the config (within ctor)
       socket_pool_(std::move(socks)),
       free_page_(generation_),
       main_thread_stack_base_(FindMainThreadStack()) {
-  PERFETTO_DCHECK(pthread_key_.valid());
-
   // We might be running in a process that is not dumpable (such as app
   // processes on user builds), in which case the /proc/self/mem will be chown'd
   // to root:root, and will not be accessible even to the process itself (see
@@ -256,6 +257,8 @@
     return;
   }
   PERFETTO_DCHECK(client_config_.interval >= 1);
+  sampler_ = Sampler(client_config_.interval);
+
   PERFETTO_DLOG("Initialized client.");
   inited_.store(true, std::memory_order_release);
 }
@@ -342,28 +345,6 @@
   return success;
 }
 
-ssize_t Client::ShouldSampleAlloc(uint64_t alloc_size,
-                                  void* (*unhooked_malloc)(size_t),
-                                  void (*unhooked_free)(void*)) {
-  if (!inited_.load(std::memory_order_acquire))
-    return -1;
-  return static_cast<ssize_t>(SampleSize(pthread_key_.get(), alloc_size,
-                                         client_config_.interval,
-                                         unhooked_malloc, unhooked_free));
-}
-
-bool Client::MaybeSampleAlloc(uint64_t alloc_size,
-                              uint64_t alloc_address,
-                              void* (*unhooked_malloc)(size_t),
-                              void (*unhooked_free)(void*)) {
-  ssize_t total_size =
-      ShouldSampleAlloc(alloc_size, unhooked_malloc, unhooked_free);
-  if (total_size > 0)
-    return RecordMalloc(alloc_size, static_cast<size_t>(total_size),
-                        alloc_address);
-  return total_size != -1;
-}
-
 void Client::Shutdown() {
   socket_pool_.Shutdown();
   inited_.store(false, std::memory_order_release);
diff --git a/src/profiling/memory/client.h b/src/profiling/memory/client.h
index ead3e1b..5bcfff5 100644
--- a/src/profiling/memory/client.h
+++ b/src/profiling/memory/client.h
@@ -17,14 +17,15 @@
 #ifndef SRC_PROFILING_MEMORY_CLIENT_H_
 #define SRC_PROFILING_MEMORY_CLIENT_H_
 
-#include <pthread.h>
 #include <stddef.h>
 
+#include <atomic>
 #include <condition_variable>
 #include <mutex>
 #include <vector>
 
 #include "perfetto/base/unix_socket.h"
+#include "src/profiling/memory/sampler.h"
 #include "src/profiling/memory/wire_protocol.h"
 
 namespace perfetto {
@@ -89,7 +90,7 @@
 
   // Add address to buffer. Flush if necessary using a socket borrowed from
   // pool.
-  // Can be called from any thread. Must not hold mutex_.`
+  // Can be called from any thread. Must not hold mutex_.
   bool Add(const uint64_t addr, uint64_t sequence_number, SocketPool* pool);
 
  private:
@@ -103,33 +104,15 @@
 
 const char* GetThreadStackBase();
 
-// RAII wrapper around pthread_key_t. This is different from a ScopedResource
-// because it needs a separate boolean indicating validity.
-class PThreadKey {
- public:
-  PThreadKey(const PThreadKey&) = delete;
-  PThreadKey& operator=(const PThreadKey&) = delete;
+constexpr uint32_t kClientSockTimeoutMs = 1000;
 
-  PThreadKey(void (*destructor)(void*)) noexcept
-      : valid_(pthread_key_create(&key_, destructor) == 0) {}
-  ~PThreadKey() noexcept {
-    if (valid_)
-      pthread_key_delete(key_);
-  }
-  bool valid() const { return valid_; }
-  pthread_key_t get() const {
-    PERFETTO_DCHECK(valid_);
-    return key_;
-  }
-
- private:
-  pthread_key_t key_;
-  bool valid_;
-};
-
-constexpr uint32_t kClientSockTxTimeoutMs = 1000;
-
-// This is created and owned by the malloc hooks.
+// Profiling client, used to sample and record the malloc/free family of calls,
+// and communicate the necessary state to a separate profiling daemon process.
+//
+// Created and owned by the malloc hooks.
+//
+// Methods of this class are thread-safe unless otherwise stated, in which case
+// the caller needs to synchronize calls behind a mutex or similar.
 class Client {
  public:
   Client(std::vector<base::UnixSocketRaw> sockets);
@@ -138,27 +121,38 @@
                     uint64_t total_size,
                     uint64_t alloc_address);
   bool RecordFree(uint64_t alloc_address);
-  bool MaybeSampleAlloc(uint64_t alloc_size,
-                        uint64_t alloc_address,
-                        void* (*unhooked_malloc)(size_t),
-                        void (*unhooked_free)(void*));
   void Shutdown();
 
+  // Returns the number of bytes to assign to an allocation with the given
+  // |alloc_size|, based on the current sampling rate. A return value of zero
+  // means that the allocation should not be recorded. Not idempotent, each
+  // invocation mutates the sampler state.
+  //
+  // Not thread-safe.
+  size_t GetSampleSizeLocked(size_t alloc_size) {
+    if (!inited_.load(std::memory_order_acquire))
+      return 0;
+    return sampler_.SampleSize(alloc_size);
+  }
+
   ClientConfiguration client_config_for_testing() { return client_config_; }
   bool inited() { return inited_; }
 
  private:
-  ssize_t ShouldSampleAlloc(uint64_t alloc_size,
-                            void* (*unhooked_malloc)(size_t),
-                            void (*unhooked_free)(void*));
   const char* GetStackBase();
 
   static std::atomic<uint64_t> max_generation_;
   const uint64_t generation_;
 
+  // TODO(rsavitski): used to check if the client is completely initialized
+  // after construction. The reads in RecordFree & GetSampleSizeLocked are no
+  // longer necessary (was an optimization to not do redundant work after
+  // shutdown). Turn into a normal bool, or indicate construction failures
+  // differently.
   std::atomic<bool> inited_{false};
   ClientConfiguration client_config_;
-  PThreadKey pthread_key_;
+  // sampler_ operations are not thread-safe.
+  Sampler sampler_;
   SocketPool socket_pool_;
   FreePage free_page_;
   const char* main_thread_stack_base_ = nullptr;
diff --git a/src/profiling/memory/heapprofd_end_to_end_test.cc b/src/profiling/memory/heapprofd_end_to_end_test.cc
index 39e6e24..1dd3eda 100644
--- a/src/profiling/memory/heapprofd_end_to_end_test.cc
+++ b/src/profiling/memory/heapprofd_end_to_end_test.cc
@@ -109,15 +109,19 @@
 
 constexpr size_t kStartupAllocSize = 10;
 
+void AllocateAndFree(size_t bytes) {
+  // This volatile is needed to prevent the compiler from trying to be
+  // helpful and compiling a "useless" malloc + free into a noop.
+  volatile char* x = static_cast<char*>(malloc(bytes));
+  if (x) {
+    x[1] = 'x';
+    free(const_cast<char*>(x));
+  }
+}
+
 void __attribute__((noreturn)) ContinuousMalloc(size_t bytes) {
   for (;;) {
-    // This volatile is needed to prevent the compiler from trying to be
-    // helpful and compiling a "useless" malloc + free into a noop.
-    volatile char* x = static_cast<char*>(malloc(bytes));
-    if (x) {
-      x[1] = 'x';
-      free(const_cast<char*>(x));
-    }
+    AllocateAndFree(bytes);
     usleep(10 * kMsToUs);
   }
 }
@@ -362,16 +366,13 @@
       signal_pipe.wr.reset();
       ack_pipe.rd.reset();
       for (;;) {
-        // This volatile is needed to prevent the compiler from trying to be
-        // helpful and compiling a "useless" malloc + free into a noop.
-        volatile char* x = static_cast<char*>(malloc(bytes));
-        if (x) {
-          x[1] = 'x';
-          free(const_cast<char*>(x));
-        }
+        AllocateAndFree(bytes);
         char buf[1];
         if (bool(signal_pipe.rd) &&
             read(*signal_pipe.rd, buf, sizeof(buf)) == 0) {
+          // Make sure the client has noticed that the session has stopped.
+          AllocateAndFree(bytes);
+
           bytes = kSecondIterationBytes;
           signal_pipe.rd.reset();
           ack_pipe.wr.reset();
@@ -408,6 +409,11 @@
   ASSERT_EQ(read(*ack_pipe.rd, buf, sizeof(buf)), 0);
   ack_pipe.rd.reset();
 
+  // TODO(rsavitski): this sleep is to compensate for the heapprofd delaying in
+  // closing the sockets (and therefore the client noticing that the session is
+  // over). Clarify where the delays are coming from.
+  usleep(100 * kMsToUs);
+
   PERFETTO_LOG("HeapprofdEndToEnd::Reinit: Starting second");
   TraceAndValidate(trace_config, pid, kSecondIterationBytes);
 
diff --git a/src/profiling/memory/malloc_hooks.cc b/src/profiling/memory/malloc_hooks.cc
index 8096e7a..531fccf 100644
--- a/src/profiling/memory/malloc_hooks.cc
+++ b/src/profiling/memory/malloc_hooks.cc
@@ -36,32 +36,10 @@
 #include "perfetto/base/utils.h"
 #include "src/profiling/memory/client.h"
 #include "src/profiling/memory/proc_utils.h"
+#include "src/profiling/memory/scoped_spinlock.h"
 #include "src/profiling/memory/wire_protocol.h"
 
-// The real malloc function pointers we get in initialize.
-static std::atomic<const MallocDispatch*> g_dispatch{nullptr};
-static std::atomic<perfetto::profiling::Client*> g_client{nullptr};
-static constexpr size_t kNumConnections = 2;
-
-static constexpr char kHeapprofdBinPath[] = "/system/bin/heapprofd";
-
-// The only writes are in the initialization function. Because Bionic does a
-// release write after initialization and an acquire read to retrieve the hooked
-// malloc functions, we can use relaxed memory mode for both writing, and more
-// importantly because in the fast-path, reading.
-static constexpr std::memory_order write_order = std::memory_order_relaxed;
-
-static perfetto::profiling::Client* GetClient() {
-  return g_client.load(std::memory_order_relaxed);
-}
-
-static const MallocDispatch* GetDispatch() {
-  return g_dispatch.load(std::memory_order_relaxed);
-}
-
-static void MallocDispatchReset(const MallocDispatch* dispatch) {
-  android_mallopt(M_RESET_HOOKS, nullptr, 0);
-}
+using perfetto::profiling::ScopedSpinlock;
 
 // This is so we can make an so that we can swap out with the existing
 // libc_malloc_hooks.so
@@ -120,6 +98,52 @@
 
 namespace {
 
+// The real malloc function pointers we get in initialize. Set once in the first
+// initialize invocation, and never changed afterwards. Because bionic does a
+// release write after initialization and an acquire read to retrieve the hooked
+// malloc functions, we can use relaxed memory mode for both writing and
+// reading.
+std::atomic<const MallocDispatch*> g_dispatch{nullptr};
+
+// Holds the active profiling client. Is empty at the start, or after we've
+// started shutting down a profiling session. Hook invocations take shared_ptr
+// copies (ensuring that the client stays alive until no longer needed), and do
+// nothing if this master pointer is empty.
+//
+// This shared_ptr itself is protected by g_client_lock. Note that shared_ptr
+// handles are not thread-safe by themselves:
+// https://en.cppreference.com/w/cpp/memory/shared_ptr/atomic
+std::shared_ptr<perfetto::profiling::Client> g_client;
+
+// Protects g_client, and serves as an external lock for sampling decisions (see
+// perfetto::profiling::Sampler).
+//
+// TODO(rsavitski): consider lifting Sampler into this global scope. Nesting
+// under client is not necessary (though it does highlight that their validity
+// is tied together).
+std::atomic<bool> g_client_lock{false};
+
+constexpr size_t kNumConnections = 2;
+constexpr char kHeapprofdBinPath[] = "/system/bin/heapprofd";
+
+const MallocDispatch* GetDispatch() {
+  return g_dispatch.load(std::memory_order_relaxed);
+}
+
+// Note: android_mallopt(M_RESET_HOOKS) is mutually exclusive with initialize
+// (concurrent calls get discarded).
+void ShutdownLazy() {
+  ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
+  if (!g_client)  // other invocation already initiated shutdown
+    return;
+
+  // Clear primary shared pointer, such that later hook invocations become nops.
+  g_client.reset();
+
+  if (!android_mallopt(M_RESET_HOOKS, nullptr, 0))
+    PERFETTO_PLOG("Unpatching heapprofd hooks failed.");
+}
+
 std::string ReadSystemProperty(const char* key) {
   std::string prop_value;
   const prop_info* prop = __system_property_find(key);
@@ -156,18 +180,15 @@
   return true;
 }
 
-std::unique_ptr<perfetto::profiling::Client> CreateClientForCentralDaemon() {
+std::shared_ptr<perfetto::profiling::Client> CreateClientForCentralDaemon() {
   PERFETTO_DLOG("Constructing client for central daemon.");
 
-  using perfetto::profiling::Client;
-  return std::unique_ptr<Client>(new (std::nothrow) Client(
-      perfetto::profiling::kHeapprofdSocketFile, kNumConnections));
+  return std::make_shared<perfetto::profiling::Client>(
+      perfetto::profiling::kHeapprofdSocketFile, kNumConnections);
 }
 
-std::unique_ptr<perfetto::profiling::Client> CreateClientAndPrivateDaemon() {
+std::shared_ptr<perfetto::profiling::Client> CreateClientAndPrivateDaemon() {
   PERFETTO_DLOG("Setting up fork mode profiling.");
-  // TODO(rsavitski): create kNumConnections socketpairs to match central mode
-  // behavior.
   perfetto::base::UnixSocketRaw parent_sock;
   perfetto::base::UnixSocketRaw child_sock;
   std::tie(parent_sock, child_sock) = perfetto::base::UnixSocketRaw::CreatePair(
@@ -216,11 +237,16 @@
   }  // else - parent continuing the client setup
 
   child_sock.ReleaseFd().reset();  // close child socket's fd
-  if (!parent_sock.SetTxTimeout(perfetto::profiling::kClientSockTxTimeoutMs)) {
+  if (!parent_sock.SetTxTimeout(perfetto::profiling::kClientSockTimeoutMs)) {
     PERFETTO_PLOG("Failed to set socket transmit timeout.");
     return nullptr;
   }
 
+  if (!parent_sock.SetRxTimeout(perfetto::profiling::kClientSockTimeoutMs)) {
+    PERFETTO_PLOG("Failed to set socket receive timeout.");
+    return nullptr;
+  }
+
   // Wait on the immediate child to exit (allow for ECHILD in the unlikely case
   // we're in a process that has made its children unwaitable).
   siginfo_t unused = {};
@@ -230,26 +256,38 @@
     return nullptr;
   }
 
-  using perfetto::profiling::Client;
   std::vector<perfetto::base::UnixSocketRaw> client_sockets;
   client_sockets.emplace_back(std::move(parent_sock));
-  return std::unique_ptr<Client>(new (std::nothrow)
-                                     Client(std::move(client_sockets)));
+  return std::make_shared<perfetto::profiling::Client>(
+      std::move(client_sockets));
 }
 
 }  // namespace
 
+// Setup for the rest of profiling. The first time profiling is triggered in a
+// process, this is called after this client library is dlopened, but before the
+// rest of the hooks are patched in. However, as we support multiple profiling
+// sessions within a process' lifetime, this function can also be legitimately
+// called any number of times afterwards (note: bionic guarantees that at most
+// one initialize call is active at a time).
+//
+// Note: if profiling is triggered at runtime, this runs on a dedicated pthread
+// (which is safe to block). If profiling is triggered at startup, then this
+// code runs synchronously.
 bool HEAPPROFD_ADD_PREFIX(_initialize)(const MallocDispatch* malloc_dispatch,
                                        int*,
                                        const char*) {
-  perfetto::profiling::Client* old_client = GetClient();
-  if (old_client)
-    old_client->Shutdown();
-
   // Table of pointers to backing implementation.
-  g_dispatch.store(malloc_dispatch, write_order);
+  g_dispatch.store(malloc_dispatch, std::memory_order_relaxed);
 
-  std::unique_ptr<perfetto::profiling::Client> client =
+  ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
+
+  if (g_client) {
+    PERFETTO_LOG("Rejecting concurrent profiling initialization.");
+    return true;  // success as we're in a valid state
+  }
+
+  std::shared_ptr<perfetto::profiling::Client> client =
       ShouldForkPrivateDaemon() ? CreateClientAndPrivateDaemon()
                                 : CreateClientForCentralDaemon();
 
@@ -258,16 +296,142 @@
     return false;
   }
 
-  g_client.store(client.release());
+  g_client = std::move(client);
   return true;
 }
 
 void HEAPPROFD_ADD_PREFIX(_finalize)() {
-  // TODO(fmayer): This should not leak.
-  perfetto::profiling::Client* client = GetClient();
-  if (client)
-    client->Shutdown();
-  MallocDispatchReset(GetDispatch());
+  // At the time of writing, invoked only as an atexit handler. We don't have
+  // any specific action to take, and cleanup can be left to the OS.
+}
+
+// Decides whether an allocation with the given address and size needs to be
+// sampled, and if so, records it. Performs the necessary synchronization (holds
+// |g_client_lock| spinlock) while accessing the shared sampler, and obtaining a
+// profiling client handle (shared_ptr).
+//
+// If the allocation is to be sampled, the recording is done without holding
+// |g_client_lock|. The client handle is guaranteed to not be invalidated while
+// the allocation is being recorded.
+//
+// If the attempt to record the allocation fails, initiates lazy shutdown of the
+// client & hooks.
+static void MaybeSampleAllocation(size_t size, void* addr) {
+  size_t sampled_alloc_sz = 0;
+  std::shared_ptr<perfetto::profiling::Client> client;
+  {
+    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
+    if (!g_client)  // no active client (most likely shutting down)
+      return;
+
+    sampled_alloc_sz = g_client->GetSampleSizeLocked(size);
+    if (sampled_alloc_sz == 0)  // not sampling
+      return;
+
+    client = g_client;  // owning copy
+  }                     // unlock
+
+  if (!client->RecordMalloc(size, sampled_alloc_sz,
+                            reinterpret_cast<uint64_t>(addr))) {
+    ShutdownLazy();
+  }
+}
+
+void* HEAPPROFD_ADD_PREFIX(_malloc)(size_t size) {
+  const MallocDispatch* dispatch = GetDispatch();
+  void* addr = dispatch->malloc(size);
+  MaybeSampleAllocation(size, addr);
+  return addr;
+}
+
+void* HEAPPROFD_ADD_PREFIX(_calloc)(size_t nmemb, size_t size) {
+  const MallocDispatch* dispatch = GetDispatch();
+  void* addr = dispatch->calloc(nmemb, size);
+  MaybeSampleAllocation(size, addr);
+  return addr;
+}
+
+void* HEAPPROFD_ADD_PREFIX(_aligned_alloc)(size_t alignment, size_t size) {
+  const MallocDispatch* dispatch = GetDispatch();
+  void* addr = dispatch->aligned_alloc(alignment, size);
+  MaybeSampleAllocation(size, addr);
+  return addr;
+}
+
+void* HEAPPROFD_ADD_PREFIX(_memalign)(size_t alignment, size_t size) {
+  const MallocDispatch* dispatch = GetDispatch();
+  void* addr = dispatch->memalign(alignment, size);
+  MaybeSampleAllocation(size, addr);
+  return addr;
+}
+
+int HEAPPROFD_ADD_PREFIX(_posix_memalign)(void** memptr,
+                                          size_t alignment,
+                                          size_t size) {
+  const MallocDispatch* dispatch = GetDispatch();
+  int res = dispatch->posix_memalign(memptr, alignment, size);
+  if (res != 0)
+    return res;
+
+  MaybeSampleAllocation(size, *memptr);
+  return 0;
+}
+
+// Note: we record the free before calling the backing implementation to make
+// sure that the address is not reused before we've processed the deallocation
+// (which includes assigning a sequence id to it).
+void HEAPPROFD_ADD_PREFIX(_free)(void* pointer) {
+  const MallocDispatch* dispatch = GetDispatch();
+  std::shared_ptr<perfetto::profiling::Client> client;
+  {
+    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
+    client = g_client;  // owning copy (or empty)
+  }
+
+  if (client) {
+    if (!client->RecordFree(reinterpret_cast<uint64_t>(pointer)))
+      ShutdownLazy();
+  }
+  return dispatch->free(pointer);
+}
+
+// Approach to recording realloc: under the initial lock, get a safe copy of the
+// client, and make the sampling decision in advance. Then record the
+// deallocation, call the real realloc, and finally record the sample if one is
+// necessary.
+//
+// As with the free, we record the deallocation before calling the backing
+// implementation to make sure the address is still exclusive while we're
+// processing it.
+void* HEAPPROFD_ADD_PREFIX(_realloc)(void* pointer, size_t size) {
+  const MallocDispatch* dispatch = GetDispatch();
+
+  size_t sampled_alloc_sz = 0;
+  std::shared_ptr<perfetto::profiling::Client> client;
+  {
+    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
+    // If there is no active client, we still want to reach the backing realloc,
+    // so keep going.
+    if (g_client) {
+      client = g_client;  // owning copy
+      sampled_alloc_sz = g_client->GetSampleSizeLocked(size);
+    }
+  }  // unlock
+
+  if (client && pointer) {
+    if (!client->RecordFree(reinterpret_cast<uint64_t>(pointer)))
+      ShutdownLazy();
+  }
+  void* addr = dispatch->realloc(pointer, size);
+
+  if (size == 0 || sampled_alloc_sz == 0)
+    return addr;
+
+  if (!client->RecordMalloc(size, sampled_alloc_sz,
+                            reinterpret_cast<uint64_t>(addr))) {
+    ShutdownLazy();
+  }
+  return addr;
 }
 
 void HEAPPROFD_ADD_PREFIX(_dump_heap)(const char*) {}
@@ -290,78 +454,6 @@
   return dispatch->malloc_usable_size(pointer);
 }
 
-void* HEAPPROFD_ADD_PREFIX(_malloc)(size_t size) {
-  const MallocDispatch* dispatch = GetDispatch();
-  perfetto::profiling::Client* client = GetClient();
-  void* addr = dispatch->malloc(size);
-  if (client) {
-    if (!client->MaybeSampleAlloc(size, reinterpret_cast<uint64_t>(addr),
-                                  dispatch->malloc, dispatch->free))
-      MallocDispatchReset(GetDispatch());
-  }
-  return addr;
-}
-
-void HEAPPROFD_ADD_PREFIX(_free)(void* pointer) {
-  const MallocDispatch* dispatch = GetDispatch();
-  perfetto::profiling::Client* client = GetClient();
-  if (client)
-    if (!client->RecordFree(reinterpret_cast<uint64_t>(pointer)))
-      MallocDispatchReset(GetDispatch());
-  return dispatch->free(pointer);
-}
-
-void* HEAPPROFD_ADD_PREFIX(_aligned_alloc)(size_t alignment, size_t size) {
-  const MallocDispatch* dispatch = GetDispatch();
-  perfetto::profiling::Client* client = GetClient();
-  void* addr = dispatch->aligned_alloc(alignment, size);
-  if (client) {
-    if (!client->MaybeSampleAlloc(size, reinterpret_cast<uint64_t>(addr),
-                                  dispatch->malloc, dispatch->free))
-      MallocDispatchReset(GetDispatch());
-  }
-  return addr;
-}
-
-void* HEAPPROFD_ADD_PREFIX(_memalign)(size_t alignment, size_t size) {
-  const MallocDispatch* dispatch = GetDispatch();
-  perfetto::profiling::Client* client = GetClient();
-  void* addr = dispatch->memalign(alignment, size);
-  if (client) {
-    if (!client->MaybeSampleAlloc(size, reinterpret_cast<uint64_t>(addr),
-                                  dispatch->malloc, dispatch->free))
-      MallocDispatchReset(GetDispatch());
-  }
-  return addr;
-}
-
-void* HEAPPROFD_ADD_PREFIX(_realloc)(void* pointer, size_t size) {
-  const MallocDispatch* dispatch = GetDispatch();
-  perfetto::profiling::Client* client = GetClient();
-  if (client && pointer)
-    if (!client->RecordFree(reinterpret_cast<uint64_t>(pointer)))
-      MallocDispatchReset(GetDispatch());
-  void* addr = dispatch->realloc(pointer, size);
-  if (client && size > 0) {
-    if (!client->MaybeSampleAlloc(size, reinterpret_cast<uint64_t>(addr),
-                                  dispatch->malloc, dispatch->free))
-      MallocDispatchReset(GetDispatch());
-  }
-  return addr;
-}
-
-void* HEAPPROFD_ADD_PREFIX(_calloc)(size_t nmemb, size_t size) {
-  const MallocDispatch* dispatch = GetDispatch();
-  perfetto::profiling::Client* client = GetClient();
-  void* addr = dispatch->calloc(nmemb, size);
-  if (client) {
-    if (!client->MaybeSampleAlloc(size, reinterpret_cast<uint64_t>(addr),
-                                  dispatch->malloc, dispatch->free))
-      MallocDispatchReset(GetDispatch());
-  }
-  return addr;
-}
-
 struct mallinfo HEAPPROFD_ADD_PREFIX(_mallinfo)() {
   const MallocDispatch* dispatch = GetDispatch();
   return dispatch->mallinfo();
@@ -372,20 +464,6 @@
   return dispatch->mallopt(param, value);
 }
 
-int HEAPPROFD_ADD_PREFIX(_posix_memalign)(void** memptr,
-                                          size_t alignment,
-                                          size_t size) {
-  const MallocDispatch* dispatch = GetDispatch();
-  perfetto::profiling::Client* client = GetClient();
-  int res = dispatch->posix_memalign(memptr, alignment, size);
-  if (res == 0 && client) {
-    if (!client->MaybeSampleAlloc(size, reinterpret_cast<uint64_t>(*memptr),
-                                  dispatch->malloc, dispatch->free))
-      MallocDispatchReset(GetDispatch());
-  }
-  return res;
-}
-
 int HEAPPROFD_ADD_PREFIX(_iterate)(uintptr_t,
                                    size_t,
                                    void (*)(uintptr_t base,
diff --git a/src/profiling/memory/sampler.cc b/src/profiling/memory/sampler.cc
deleted file mode 100644
index e255715..0000000
--- a/src/profiling/memory/sampler.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "src/profiling/memory/sampler.h"
-
-#include "perfetto/base/utils.h"
-
-namespace perfetto {
-namespace profiling {
-namespace {
-ThreadLocalSamplingData* GetSpecific(pthread_key_t key,
-                                     uint64_t interval,
-                                     void* (*unhooked_malloc)(size_t),
-                                     void (*unhooked_free)(void*)) {
-  // This should not be used with glibc as it might re-enter into malloc, see
-  // http://crbug.com/776475.
-  void* specific = pthread_getspecific(key);
-  if (specific == nullptr) {
-    specific = unhooked_malloc(sizeof(ThreadLocalSamplingData));
-    new (specific) ThreadLocalSamplingData(unhooked_free, interval);
-    pthread_setspecific(key, specific);
-  }
-  return reinterpret_cast<ThreadLocalSamplingData*>(specific);
-}
-}  // namespace
-
-// The algorithm below is inspired by the Chromium sampling algorithm at
-// https://cs.chromium.org/search/?q=f:cc+symbol:AllocatorShimLogAlloc+package:%5Echromium$&type=cs
-
-int64_t ThreadLocalSamplingData::NextSampleInterval() {
-  std::exponential_distribution<double> dist(rate_);
-  int64_t next = static_cast<int64_t>(dist(random_engine_));
-  // The +1 corrects the distribution of the first value in the interval.
-  // TODO(fmayer): Figure out why.
-  return next + 1;
-}
-
-size_t ThreadLocalSamplingData::NumberOfSamples(size_t sz) {
-  interval_to_next_sample_ -= sz;
-  size_t sz_multiplier = 0;
-  while (PERFETTO_UNLIKELY(interval_to_next_sample_ <= 0)) {
-    interval_to_next_sample_ += NextSampleInterval();
-    ++sz_multiplier;
-  }
-  return sz_multiplier;
-}
-
-std::atomic<uint64_t> ThreadLocalSamplingData::seed(1);
-
-size_t SampleSize(pthread_key_t key,
-                  size_t sz,
-                  uint64_t interval,
-                  void* (*unhooked_malloc)(size_t),
-                  void (*unhooked_free)(void*)) {
-  if (PERFETTO_UNLIKELY(sz >= interval))
-    return sz;
-  return interval * GetSpecific(key, interval, unhooked_malloc, unhooked_free)
-                        ->NumberOfSamples(sz);
-}
-
-void ThreadLocalSamplingData::KeyDestructor(void* ptr) {
-  ThreadLocalSamplingData* thread_local_data =
-      reinterpret_cast<ThreadLocalSamplingData*>(ptr);
-  void (*unhooked_free)(void*) = thread_local_data->unhooked_free_;
-  thread_local_data->~ThreadLocalSamplingData();
-  unhooked_free(ptr);
-}
-
-}  // namespace profiling
-}  // namespace perfetto
diff --git a/src/profiling/memory/sampler.h b/src/profiling/memory/sampler.h
index 4397e87..0108634 100644
--- a/src/profiling/memory/sampler.h
+++ b/src/profiling/memory/sampler.h
@@ -17,66 +17,73 @@
 #ifndef SRC_PROFILING_MEMORY_SAMPLER_H_
 #define SRC_PROFILING_MEMORY_SAMPLER_H_
 
-#include <atomic>
-
-#include <pthread.h>
 #include <stdint.h>
 
+#include <atomic>
 #include <random>
 
+#include "perfetto/base/utils.h"
+
 namespace perfetto {
 namespace profiling {
 
-// This is the thread-local state needed to apply poission sampling to malloc
-// samples.
+constexpr uint64_t kSamplerSeed = 1;
+
+// Poisson sampler for memory allocations. We apply sampling individually to
+// each byte. The whole allocation gets accounted as often as the number of
+// sampled bytes it contains.
 //
-// We apply poisson sampling individually to each byte. The whole
-// allocation gets accounted as often as the number of sampled bytes it
-// contains.
+// The algorithm is inspired by the Chromium sampling algorithm at
+// https://cs.chromium.org/search/?q=f:cc+symbol:AllocatorShimLogAlloc+package:%5Echromium$&type=cs
+// Googlers: see go/chrome-shp for more details.
 //
-// Googlers see go/chrome-shp for more details about the sampling (from
-// Chrome's heap profiler).
-class ThreadLocalSamplingData {
+// NB: not thread-safe, requires external synchronization.
+class Sampler {
  public:
-  ThreadLocalSamplingData(void (*unhooked_free)(void*), uint64_t interval)
-      : unhooked_free_(unhooked_free),
-        rate_(1 / static_cast<double>(interval)),
-        random_engine_(seed.load(std::memory_order_relaxed)),
+  Sampler(uint64_t sampling_interval)
+      : sampling_interval_(sampling_interval),
+        sampling_rate_(1.0 / static_cast<double>(sampling_interval)),
+        random_engine_(kSamplerSeed),
         interval_to_next_sample_(NextSampleInterval()) {}
-  // Returns number of times a sample should be accounted. Due to how the
-  // poission sampling works, some samples should be accounted multiple times.
-  size_t NumberOfSamples(size_t sz);
 
-  // Destroy a TheadLocalSamplingData object after the pthread key has been
-  // deleted or when the thread shuts down. This uses unhooked_free passed in
-  // the constructor.
-  static void KeyDestructor(void* ptr);
-
-  static std::atomic<uint64_t> seed;
+  // Returns the number of bytes that should be attributed to the sample.
+  // If the returned size is 0, the allocation should not be sampled.
+  //
+  // Due to how the Poisson sampling works, some samples should be accounted
+  // multiple times.
+  size_t SampleSize(size_t alloc_sz) {
+    if (PERFETTO_UNLIKELY(alloc_sz >= sampling_interval_))
+      return alloc_sz;
+    return sampling_interval_ * NumberOfSamples(alloc_sz);
+  }
 
  private:
-  int64_t NextSampleInterval();
-  void (*unhooked_free_)(void*);
-  double rate_;
+  int64_t NextSampleInterval() {
+    std::exponential_distribution<double> dist(sampling_rate_);
+    int64_t next = static_cast<int64_t>(dist(random_engine_));
+    // The +1 corrects the distribution of the first value in the interval.
+    // TODO(fmayer): Figure out why.
+    return next + 1;
+  }
+
+  // Returns number of times a sample should be accounted. Due to how the
+  // Poisson sampling works, some samples should be accounted multiple times.
+  size_t NumberOfSamples(size_t alloc_sz) {
+    interval_to_next_sample_ -= alloc_sz;
+    size_t num_samples = 0;
+    while (PERFETTO_UNLIKELY(interval_to_next_sample_ <= 0)) {
+      interval_to_next_sample_ += NextSampleInterval();
+      ++num_samples;
+    }
+    return num_samples;
+  }
+
+  uint64_t sampling_interval_;
+  double sampling_rate_;
   std::default_random_engine random_engine_;
   int64_t interval_to_next_sample_;
 };
 
-// Returns number of bytes that should be be attributed to the sample.
-// If returned size is 0, the allocation should not be sampled.
-//
-// Due to how the poission sampling works, some samples should be accounted
-// multiple times.
-//
-// Delegate to this thread's ThreadLocalSamplingData.
-//
-// We have to pass through the real malloc in order to allocate the TLS.
-size_t SampleSize(pthread_key_t key,
-                  size_t sz,
-                  uint64_t rate,
-                  void* (*unhooked_malloc)(size_t),
-                  void (*unhooked_free)(void*));
-
 }  // namespace profiling
 }  // namespace perfetto
 
diff --git a/src/profiling/memory/sampler_unittest.cc b/src/profiling/memory/sampler_unittest.cc
index b8cf76e..4e53f24 100644
--- a/src/profiling/memory/sampler_unittest.cc
+++ b/src/profiling/memory/sampler_unittest.cc
@@ -20,36 +20,25 @@
 
 #include <thread>
 
-#include "src/profiling/memory/client.h"  // For PThreadKey.
-
 namespace perfetto {
 namespace profiling {
 namespace {
 
 TEST(SamplerTest, TestLarge) {
-  PThreadKey key(ThreadLocalSamplingData::KeyDestructor);
-  ASSERT_TRUE(key.valid());
-  EXPECT_EQ(SampleSize(key.get(), 1024, 512, malloc, free), 1024);
+  Sampler sampler(512);
+  EXPECT_EQ(sampler.SampleSize(1024), 1024);
 }
 
 TEST(SamplerTest, TestSmall) {
-  PThreadKey key(ThreadLocalSamplingData::KeyDestructor);
-  ASSERT_TRUE(key.valid());
-  EXPECT_EQ(SampleSize(key.get(), 511, 512, malloc, free), 512);
+  Sampler sampler(512);
+  EXPECT_EQ(sampler.SampleSize(511), 512);
 }
 
-TEST(SamplerTest, TestSmallFromThread) {
-  PThreadKey key(ThreadLocalSamplingData::KeyDestructor);
-  ASSERT_TRUE(key.valid());
-  std::thread th([&key] {
-    EXPECT_EQ(SampleSize(key.get(), 511, 512, malloc, free), 512);
-  });
-  std::thread th2([&key] {
-    // The threads should have separate state.
-    EXPECT_EQ(SampleSize(key.get(), 511, 512, malloc, free), 512);
-  });
-  th.join();
-  th2.join();
+TEST(SamplerTest, TestSequence) {
+  Sampler sampler(1);
+  EXPECT_EQ(sampler.SampleSize(3), 3);
+  EXPECT_EQ(sampler.SampleSize(7), 7);
+  EXPECT_EQ(sampler.SampleSize(5), 5);
 }
 
 }  // namespace
diff --git a/src/profiling/memory/scoped_spinlock.cc b/src/profiling/memory/scoped_spinlock.cc
new file mode 100644
index 0000000..6d522ff
--- /dev/null
+++ b/src/profiling/memory/scoped_spinlock.cc
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/profiling/memory/scoped_spinlock.h"
+
+#include <unistd.h>
+
+#include <atomic>
+
+#include "perfetto/base/utils.h"
+
+namespace perfetto {
+namespace profiling {
+
+void ScopedSpinlock::LockSlow(Mode mode) {
+  // Slowpath.
+  for (size_t attempt = 0; mode == Mode::Blocking || attempt < 1024 * 10;
+       attempt++) {
+    if (!lock_->load(std::memory_order_relaxed) &&
+        PERFETTO_LIKELY(!lock_->exchange(true, std::memory_order_acquire))) {
+      locked_ = true;
+      return;
+    }
+    if (attempt && attempt % 1024 == 0)
+      usleep(1000);
+  }
+}
+
+}  // namespace profiling
+}  // namespace perfetto
diff --git a/src/profiling/memory/scoped_spinlock.h b/src/profiling/memory/scoped_spinlock.h
new file mode 100644
index 0000000..af65ba6
--- /dev/null
+++ b/src/profiling/memory/scoped_spinlock.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_PROFILING_MEMORY_SCOPED_SPINLOCK_H_
+#define SRC_PROFILING_MEMORY_SCOPED_SPINLOCK_H_
+
+#include "perfetto/base/logging.h"
+#include "perfetto/base/utils.h"
+
+#include <atomic>
+#include <new>
+#include <utility>
+
+namespace perfetto {
+namespace profiling {
+
+class ScopedSpinlock {
+ public:
+  enum class Mode { Try, Blocking };
+
+  ScopedSpinlock(std::atomic<bool>* lock, Mode mode) : lock_(lock) {
+    if (PERFETTO_LIKELY(!lock_->exchange(true, std::memory_order_acquire))) {
+      locked_ = true;
+      return;
+    }
+    LockSlow(mode);
+  }
+
+  ScopedSpinlock(const ScopedSpinlock&) = delete;
+  ScopedSpinlock& operator=(const ScopedSpinlock&) = delete;
+
+  ScopedSpinlock(ScopedSpinlock&& other) noexcept
+      : lock_(other.lock_), locked_(other.locked_) {
+    other.locked_ = false;
+  }
+
+  ScopedSpinlock& operator=(ScopedSpinlock&& other) {
+    if (this != &other) {
+      this->~ScopedSpinlock();
+      new (this) ScopedSpinlock(std::move(other));
+    }
+    return *this;
+  }
+
+  ~ScopedSpinlock() { Unlock(); }
+
+  void Unlock() {
+    if (locked_) {
+      PERFETTO_DCHECK(lock_->load());
+      lock_->store(false, std::memory_order_release);
+    }
+    locked_ = false;
+  }
+
+  bool locked() const { return locked_; }
+
+ private:
+  void LockSlow(Mode mode);
+  std::atomic<bool>* lock_;
+  bool locked_ = false;
+};
+
+}  // namespace profiling
+}  // namespace perfetto
+
+#endif  // SRC_PROFILING_MEMORY_SCOPED_SPINLOCK_H_
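
For context, a minimal usage sketch of the relocated ScopedSpinlock (standalone, not part of the patch; g_lock and both functions are hypothetical), showing the Blocking mode used by the hooks and the Try mode, where the caller must check locked() because acquisition can fail:

    #include <atomic>
    #include "src/profiling/memory/scoped_spinlock.h"

    // Hypothetical lock word, analogous to g_client_lock in malloc_hooks.cc.
    std::atomic<bool> g_lock{false};

    void BlockingExample() {
      perfetto::profiling::ScopedSpinlock s(
          &g_lock, perfetto::profiling::ScopedSpinlock::Mode::Blocking);
      // Critical section; released when |s| goes out of scope (or via s.Unlock()).
    }

    bool TryExample() {
      perfetto::profiling::ScopedSpinlock s(
          &g_lock, perfetto::profiling::ScopedSpinlock::Mode::Try);
      if (!s.locked())
        return false;  // Try mode gives up after a bounded number of attempts.
      // Critical section.
      return true;
    }
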
diff --git a/src/profiling/memory/shared_ring_buffer.cc b/src/profiling/memory/shared_ring_buffer.cc
index b8e222e..52d87a0 100644
--- a/src/profiling/memory/shared_ring_buffer.cc
+++ b/src/profiling/memory/shared_ring_buffer.cc
@@ -27,6 +27,7 @@
 #include "perfetto/base/build_config.h"
 #include "perfetto/base/scoped_file.h"
 #include "perfetto/base/temp_file.h"
+#include "src/profiling/memory/scoped_spinlock.h"
 
 #if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID)
 #include <linux/memfd.h>
@@ -48,36 +49,6 @@
 
 }  // namespace
 
-void ScopedSpinlock::LockSlow(Mode mode) {
-  // Slowpath.
-  for (size_t attempt = 0; mode == Mode::Blocking || attempt < 1024 * 10;
-       attempt++) {
-    if (!lock_->load(std::memory_order_relaxed) &&
-        PERFETTO_LIKELY(!lock_->exchange(true, std::memory_order_acquire))) {
-      locked_ = true;
-      return;
-    }
-    if (attempt && attempt % 1024 == 0)
-      usleep(1000);
-  }
-}
-
-ScopedSpinlock::ScopedSpinlock(ScopedSpinlock&& other) noexcept
-    : lock_(other.lock_), locked_(other.locked_) {
-  other.locked_ = false;
-}
-
-ScopedSpinlock& ScopedSpinlock::operator=(ScopedSpinlock&& other) {
-  if (this != &other) {
-    this->~ScopedSpinlock();
-    new (this) ScopedSpinlock(std::move(other));
-  }
-  return *this;
-}
-
-ScopedSpinlock::~ScopedSpinlock() {
-  Unlock();
-}
 
 SharedRingBuffer::SharedRingBuffer(CreateFlag, size_t size) {
   size_t size_with_meta = size + kMetaPageSize;
diff --git a/src/profiling/memory/shared_ring_buffer.h b/src/profiling/memory/shared_ring_buffer.h
index a09de71..9ec09e9 100644
--- a/src/profiling/memory/shared_ring_buffer.h
+++ b/src/profiling/memory/shared_ring_buffer.h
@@ -20,6 +20,7 @@
 #include "perfetto/base/optional.h"
 #include "perfetto/base/unix_socket.h"
 #include "perfetto/base/utils.h"
+#include "src/profiling/memory/scoped_spinlock.h"
 
 #include <atomic>
 #include <map>
@@ -30,42 +31,6 @@
 namespace perfetto {
 namespace profiling {
 
-class ScopedSpinlock {
- public:
-  enum class Mode { Try, Blocking };
-
-  ScopedSpinlock(std::atomic<bool>* lock, Mode mode) : lock_(lock) {
-    if (PERFETTO_LIKELY(!lock_->exchange(true, std::memory_order_acquire))) {
-      locked_ = true;
-      return;
-    }
-    LockSlow(mode);
-  }
-
-  ScopedSpinlock(const ScopedSpinlock&) = delete;
-  ScopedSpinlock& operator=(const ScopedSpinlock&) = delete;
-
-  ScopedSpinlock(ScopedSpinlock&&) noexcept;
-  ScopedSpinlock& operator=(ScopedSpinlock&&);
-
-  ~ScopedSpinlock();
-
-  void Unlock() {
-    if (locked_) {
-      PERFETTO_DCHECK(lock_->load());
-      lock_->store(false, std::memory_order_release);
-    }
-    locked_ = false;
-  }
-
-  bool locked() const { return locked_; }
-
- private:
-  void LockSlow(Mode mode);
-  std::atomic<bool>* lock_;
-  bool locked_ = false;
-};
-
 // A concurrent, multi-writer single-reader ring buffer FIFO, based on a
 // circular buffer over shared memory. It has similar semantics to a SEQ_PACKET
 // + O_NONBLOCK socket, specifically:
diff --git a/src/profiling/memory/shared_ring_buffer_fuzzer.cc b/src/profiling/memory/shared_ring_buffer_fuzzer.cc
index 02aa6f8..7101a15 100644
--- a/src/profiling/memory/shared_ring_buffer_fuzzer.cc
+++ b/src/profiling/memory/shared_ring_buffer_fuzzer.cc
@@ -54,7 +54,9 @@
   auto fd = base::TempFile::CreateUnlinked().ReleaseFD();
   PERFETTO_CHECK(fd);
 
-  // Put the remaining fuzzer input into the data portion of the ring buffer.
+  // Use fuzzer input to first fill the MetadataHeader in the first page, and
+  // then put the remainder into the data portion of the ring buffer (2nd+
+  // pages).
   size_t payload_size = size - sizeof(MetadataHeader);
   const uint8_t* payload = data + sizeof(MetadataHeader);
   size_t payload_size_pages =
@@ -63,9 +65,14 @@
   // for the metadata.
   size_t total_size_pages = 1 + RoundToPow2(payload_size_pages);
 
-  PERFETTO_CHECK(ftruncate(*fd, total_size_pages * base::kPageSize) == 0);
+  // Clear spinlock field, as otherwise the read will wait indefinitely (it
+  // defaults to indefinite blocking mode).
+  MetadataHeader header = {};
+  memcpy(&header, data, sizeof(header));
+  header.spinlock = 0;
 
-  PERFETTO_CHECK(base::WriteAll(*fd, data, sizeof(MetadataHeader)) != -1);
+  PERFETTO_CHECK(ftruncate(*fd, total_size_pages * base::kPageSize) == 0);
+  PERFETTO_CHECK(base::WriteAll(*fd, &header, sizeof(header)) != -1);
   PERFETTO_CHECK(lseek(*fd, base::kPageSize, SEEK_SET) != -1);
   PERFETTO_CHECK(base::WriteAll(*fd, payload, payload_size) != -1);
 
diff --git a/ui/package-lock.json b/ui/package-lock.json
index bb0a4e4..66c7197 100644
--- a/ui/package-lock.json
+++ b/ui/package-lock.json
@@ -6478,17 +6478,6 @@
         }
       }
     },
-    "rollup-plugin-replace": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/rollup-plugin-replace/-/rollup-plugin-replace-2.1.0.tgz",
-      "integrity": "sha512-SxrAIgpH/B5/W4SeULgreOemxcpEgKs2gcD42zXw50bhqGWmcnlXneVInQpAqzA/cIly4bJrOpeelmB9p4YXSQ==",
-      "dev": true,
-      "requires": {
-        "magic-string": "^0.25.1",
-        "minimatch": "^3.0.2",
-        "rollup-pluginutils": "^2.0.1"
-      }
-    },
     "rollup-pluginutils": {
       "version": "2.3.3",
       "resolved": "https://registry.npmjs.org/rollup-pluginutils/-/rollup-pluginutils-2.3.3.tgz",
diff --git a/ui/package.json b/ui/package.json
index f6be004..32244cc 100644
--- a/ui/package.json
+++ b/ui/package.json
@@ -31,7 +31,6 @@
     "rollup": "^1.2.2",
     "rollup-plugin-commonjs": "^9.2.0",
     "rollup-plugin-node-resolve": "^4.0.0",
-    "rollup-plugin-replace": "^2.1.0",
     "sorcery": "^0.10.0",
     "tslib": "^1.9.3",
     "tslint": "^5.12.1",
diff --git a/ui/rollup.config.js b/ui/rollup.config.js
index e63c7be..65d2ce5 100644
--- a/ui/rollup.config.js
+++ b/ui/rollup.config.js
@@ -1,11 +1,10 @@
 import commonjs from 'rollup-plugin-commonjs';
 import nodeResolve from 'rollup-plugin-node-resolve';
-import replace from 'rollup-plugin-replace';
 
 export default {
   output: {name: 'perfetto'},
   plugins: [
-    nodeResolve({browser: true}),
+    nodeResolve({module: false, browser: true}),
 
     // emscripten conditionally executes require('fs') (likewise for others),
     // when running under node. Rollup can't find those libraries so expects
@@ -18,10 +17,5 @@
         'crypto',
       ]
     }),
-
-    replace({
-      'immer_1.produce': 'immer_1',
-    })
-
   ]
 }
diff --git a/ui/src/common/actions.ts b/ui/src/common/actions.ts
index f0989ae..ccf6d0d 100644
--- a/ui/src/common/actions.ts
+++ b/ui/src/common/actions.ts
@@ -231,11 +231,15 @@
   },
 
   // TODO(hjd): Remove setState - it causes problems due to reuse of ids.
-  setState(_state: StateDraft, _args: {newState: State}): void {
-    // This has to be handled at a higher level since we can't
-    // replace the whole tree here however we still need a method here
-    // so it appears on the proxy Actions class.
-    throw new Error('Called setState on StateActions.');
+  setState(state: StateDraft, args: {newState: State}): void {
+    for (const key of Object.keys(state)) {
+      // tslint:disable-next-line no-any
+      delete (state as any)[key];
+    }
+    for (const key of Object.keys(args.newState)) {
+      // tslint:disable-next-line no-any
+      (state as any)[key] = (args.newState as any)[key];
+    }
   },
 
   setRecordConfig(state: StateDraft, args: {config: RecordConfig;}): void {
@@ -292,8 +296,8 @@
     };
   },
 
-  selectTimeSpan(state: StateDraft,
-                 args: {startTs: number, endTs: number}): void {
+  selectTimeSpan(
+      state: StateDraft, args: {startTs: number, endTs: number}): void {
     state.currentSelection = {
       kind: 'TIMESPAN',
       startTs: args.startTs,
diff --git a/ui/src/controller/globals.ts b/ui/src/controller/globals.ts
index ac9f70f..174934a 100644
--- a/ui/src/controller/globals.ts
+++ b/ui/src/controller/globals.ts
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-import {produce} from 'immer';
+import {Patch, produce} from 'immer';
 
 import {assertExists} from '../base/logging';
 import {Remote} from '../base/remote';
@@ -73,14 +73,13 @@
 
     // Run controllers locally until all state machines reach quiescence.
     let runAgain = false;
-    let summary = this._queuedActions.map(action => action.type).join(', ');
-    summary = `Controllers loop (${summary})`;
+    const patches: Patch[] = [];
     for (let iter = 0; runAgain || this._queuedActions.length > 0; iter++) {
       if (iter > 100) throw new Error('Controllers are stuck in a livelock');
       const actions = this._queuedActions;
       this._queuedActions = new Array<DeferredAction>();
       for (const action of actions) {
-        this.applyAction(action);
+        patches.push(...this.applyAction(action));
       }
       this._runningControllers = true;
       try {
@@ -89,7 +88,7 @@
         this._runningControllers = false;
       }
     }
-    assertExists(this._frontend).send<void>('updateState', [this.state]);
+    assertExists(this._frontend).send<void>('patchState', [patches]);
   }
 
   createEngine(): Engine {
@@ -115,21 +114,23 @@
     return assertExists(this._state);
   }
 
-  applyAction(action: DeferredAction): void {
+  applyAction(action: DeferredAction): Patch[] {
     assertExists(this._state);
-    // We need a special case for when we want to replace the whole tree.
-    if (action.type === 'setState') {
-      const args = (action as DeferredAction<{newState: State}>).args;
-      this._state = args.newState;
-      return;
-    }
+    const patches: Patch[] = [];
+
     // 'produce' creates a immer proxy which wraps the current state turning
     // all imperative mutations of the state done in the callback into
     // immutable changes to the returned state.
-    this._state = produce(this.state, draft => {
-      // tslint:disable-next-line no-any
-      (StateActions as any)[action.type](draft, action.args);
-    });
+    this._state = produce(
+        this.state,
+        draft => {
+          // tslint:disable-next-line no-any
+          (StateActions as any)[action.type](draft, action.args);
+        },
+        (morePatches, _) => {
+          patches.push(...morePatches);
+        });
+    return patches;
   }
 
   resetForTesting() {
diff --git a/ui/src/frontend/index.ts b/ui/src/frontend/index.ts
index 1ed31ea..c0ecccf 100644
--- a/ui/src/frontend/index.ts
+++ b/ui/src/frontend/index.ts
@@ -14,13 +14,13 @@
 
 import '../tracks/all_frontend';
 
+import {applyPatches, Patch} from 'immer';
 import * as m from 'mithril';
 
 import {forwardRemoteCalls} from '../base/remote';
 import {Actions} from '../common/actions';
-import {State} from '../common/state';
 
-import {globals, QuantizedLoad, ThreadDesc, SliceDetails} from './globals';
+import {globals, QuantizedLoad, SliceDetails, ThreadDesc} from './globals';
 import {HomePage} from './home_page';
 import {openBufferWithLegacyTraceViewer} from './legacy_trace_viewer';
 import {RecordPage} from './record_page';
@@ -33,12 +33,12 @@
 class FrontendApi {
   constructor(private router: Router) {}
 
-  updateState(state: State) {
-    globals.state = state;
+  patchState(patches: Patch[]) {
+    globals.state = applyPatches(globals.state, patches);
     // If the visible time in the global state has been updated more recently
     // than the visible time handled by the frontend @ 60fps, update it. This
     // typically happens when restoring the state from a permalink.
-    globals.frontendLocalState.mergeState(state.frontendLocalState);
+    globals.frontendLocalState.mergeState(globals.state.frontendLocalState);
     this.redraw();
   }