Separate client management from hooks.

Currently, we include the client in the same library as the malloc
hooks. This is incorrect, as the malloc hooks should ship within the
com.android.runtime APEX, while the client should ship with the
platform (or wherever the heapprofd executable ships from).

The separated client library can then also be used for ART object
allocation callstack tracking.

Diff of HEAD~1:malloc_hooks.cc and client_ext.cc:
https://gist.github.com/segfaulthunter/05b7f0253a79c18b851d595a5d1df2a5

Test: get system_server profile.

Change-Id: If2b96e9a8c6baadcac8eea999653c6dd16977dca
Bug: 141241849
diff --git a/Android.bp b/Android.bp
index 6d3a72d..c2d71b6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -209,12 +209,45 @@
   srcs: [
     ":perfetto_include_perfetto_base_base",
     ":perfetto_include_perfetto_ext_base_base",
+    ":perfetto_src_base_base",
+    ":perfetto_src_profiling_memory_malloc_hooks",
+  ],
+  shared_libs: [
+    "heapprofd_client_api",
+  ],
+  static_libs: [
+    "libasync_safe",
+  ],
+  export_include_dirs: [
+    "include",
+    "include/perfetto/base/build_configs/android_tree",
+  ],
+  defaults: [
+    "perfetto_defaults",
+  ],
+  cflags: [
+    "-DPERFETTO_ANDROID_ASYNC_SAFE_LOG",
+  ],
+  include_dirs: [
+    "bionic/libc",
+  ],
+  header_libs: [
+    "bionic_libc_platform_headers",
+  ],
+}
+
+// GN: //src/profiling/memory:heapprofd_client_api
+cc_library_shared {
+  name: "heapprofd_client_api",
+  srcs: [
+    ":perfetto_include_perfetto_base_base",
+    ":perfetto_include_perfetto_ext_base_base",
     ":perfetto_include_perfetto_profiling_normalize",
     ":perfetto_src_base_base",
     ":perfetto_src_base_unix_socket",
     ":perfetto_src_profiling_common_proc_utils",
     ":perfetto_src_profiling_memory_client",
-    ":perfetto_src_profiling_memory_malloc_hooks",
+    ":perfetto_src_profiling_memory_client_ext",
     ":perfetto_src_profiling_memory_ring_buffer",
     ":perfetto_src_profiling_memory_scoped_spinlock",
     ":perfetto_src_profiling_memory_wire_protocol",
@@ -6132,6 +6165,14 @@
   ],
 }
 
+// GN: //src/profiling/memory:client_ext
+filegroup {
+  name: "perfetto_src_profiling_memory_client_ext",
+  srcs: [
+    "src/profiling/memory/client_ext.cc",
+  ],
+}
+
 // GN: //src/profiling/memory:daemon
 filegroup {
   name: "perfetto_src_profiling_memory_daemon",
diff --git a/BUILD.gn b/BUILD.gn
index cb5ee8a..0a1b23c 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -46,7 +46,10 @@
 if (enable_perfetto_heapprofd) {
   all_targets += [ "src/profiling/memory:heapprofd" ]
   if (perfetto_build_with_android) {
-    all_targets += [ "src/profiling/memory:heapprofd_client" ]
+    all_targets += [
+      "src/profiling/memory:heapprofd_client",
+      "src/profiling/memory:heapprofd_client_api",
+    ]
   }
 }
 
diff --git a/include/perfetto/profiling/memory/client_ext.h b/include/perfetto/profiling/memory/client_ext.h
new file mode 100644
index 0000000..324dbf0
--- /dev/null
+++ b/include/perfetto/profiling/memory/client_ext.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INCLUDE_PERFETTO_PROFILING_MEMORY_CLIENT_EXT_H_
+#define INCLUDE_PERFETTO_PROFILING_MEMORY_CLIENT_EXT_H_
+
+#include <inttypes.h>
+#include <stdlib.h>
+
+extern "C" bool heapprofd_init_session(void* (*malloc_fn)(size_t),
+                                       void (*free_fn)(void*));
+
+extern "C" uint32_t heapprofd_register_heap(const char* heap_name);
+
+extern "C" bool heapprofd_report_allocation(uint32_t heap_id,
+                                            uint64_t id,
+                                            uint64_t size);
+
+extern "C" void heapprofd_report_free(uint32_t heap_id, uint64_t id);
+
+#endif  // INCLUDE_PERFETTO_PROFILING_MEMORY_CLIENT_EXT_H_
diff --git a/src/profiling/memory/BUILD.gn b/src/profiling/memory/BUILD.gn
index 18f81b3..d2a70b2 100644
--- a/src/profiling/memory/BUILD.gn
+++ b/src/profiling/memory/BUILD.gn
@@ -43,22 +43,36 @@
 #
 # This builds only in the Android tree, when using the generated Android.bp.
 if (perfetto_build_with_android) {
+  shared_library("heapprofd_client_api") {
+    configs -= [ "//gn/standalone:android_liblog" ]
+    cflags = [ "-DPERFETTO_ANDROID_ASYNC_SAFE_LOG" ]
+    deps = [ ":client_ext" ]
+  }
+
+  source_set("client_ext") {
+    cflags = [ "-DPERFETTO_ANDROID_ASYNC_SAFE_LOG" ]
+    deps = [
+      ":client",
+      "../../../gn:default_deps",
+      "../../base",
+    ]
+    sources = [ "client_ext.cc" ]
+  }
+
   shared_library("heapprofd_client") {
     configs -= [ "//gn/standalone:android_liblog" ]
     cflags = [ "-DPERFETTO_ANDROID_ASYNC_SAFE_LOG" ]
-    deps = [ ":malloc_hooks" ]
+    deps = [
+      ":heapprofd_client_api",
+      ":malloc_hooks",
+    ]
   }
 
   # This will export publicly visible symbols for the malloc_hooks.
   source_set("malloc_hooks") {
     deps = [
-      ":client",
-      ":scoped_spinlock",
-      ":wire_protocol",
       "../../../gn:default_deps",
       "../../base",
-      "../../base:unix_socket",
-      "../common:proc_utils",
     ]
     cflags = [
       "-isystem",
diff --git a/src/profiling/memory/client_ext.cc b/src/profiling/memory/client_ext.cc
new file mode 100644
index 0000000..d5aa73c
--- /dev/null
+++ b/src/profiling/memory/client_ext.cc
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "perfetto/profiling/memory/client_ext.h"
+
+#include <android/fdsan.h>
+#include <bionic/malloc.h>
+#include <inttypes.h>
+#include <malloc.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/system_properties.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <tuple>
+
+#include "perfetto/base/build_config.h"
+#include "perfetto/base/logging.h"
+#include "perfetto/ext/base/no_destructor.h"
+#include "perfetto/ext/base/unix_socket.h"
+#include "perfetto/ext/base/utils.h"
+
+#include "src/profiling/common/proc_utils.h"
+#include "src/profiling/memory/client.h"
+#include "src/profiling/memory/scoped_spinlock.h"
+#include "src/profiling/memory/unhooked_allocator.h"
+#include "src/profiling/memory/wire_protocol.h"
+
+using perfetto::profiling::ScopedSpinlock;
+using perfetto::profiling::UnhookedAllocator;
+
+namespace {
+// Holds the active profiling client. Is empty at the start, or after we've
+// started shutting down a profiling session. Hook invocations take shared_ptr
+// copies (ensuring that the client stays alive until no longer needed), and do
+// nothing if this primary pointer is empty.
+//
+// This shared_ptr itself is protected by g_client_lock. Note that shared_ptr
+// handles are not thread-safe by themselves:
+// https://en.cppreference.com/w/cpp/memory/shared_ptr/atomic
+//
+// To avoid on-destruction re-entrancy issues, this shared_ptr needs to be
+// constructed with an allocator that uses the unhooked malloc & free functions.
+// See UnhookedAllocator.
+//
+// NoDestructor<> wrapper is used to avoid destructing the shared_ptr at program
+// exit. The rationale is:
+// * Avoiding the atexit destructor racing against other threads that are
+//   possibly running within the hooks.
+// * Making sure that atexit handlers running after this global's destructor
+//   can still safely enter the hooks.
+perfetto::base::NoDestructor<std::shared_ptr<perfetto::profiling::Client>>
+    g_client;
+
+// Protects g_client, and serves as an external lock for sampling decisions (see
+// perfetto::profiling::Sampler).
+//
+// We rely on this atomic's destruction being a nop, as it is possible for the
+// hooks to attempt to acquire the spinlock after its destructor should have run
+// (technically a use-after-destruct scenario).
+std::atomic<bool> g_client_lock{false};
+
+constexpr char kHeapprofdBinPath[] = "/system/bin/heapprofd";
+
+int CloneWithoutSigchld() {
+  auto ret = clone(nullptr, nullptr, 0, nullptr);
+  if (ret == 0)
+    android_fdsan_set_error_level(ANDROID_FDSAN_ERROR_LEVEL_DISABLED);
+  return ret;
+}
+
+int ForklikeClone() {
+  auto ret = clone(nullptr, nullptr, SIGCHLD, nullptr);
+  if (ret == 0)
+    android_fdsan_set_error_level(ANDROID_FDSAN_ERROR_LEVEL_DISABLED);
+  return ret;
+}
+
+// Like daemon(), but using clone to avoid invoking pthread_atfork(3) handlers.
+int Daemonize() {
+  switch (ForklikeClone()) {
+    case -1:
+      PERFETTO_PLOG("Daemonize.clone");
+      return -1;
+      break;
+    case 0:
+      break;
+    default:
+      _exit(0);
+      break;
+  }
+  if (setsid() == -1) {
+    PERFETTO_PLOG("Daemonize.setsid");
+    return -1;
+  }
+  // best effort chdir & fd close
+  chdir("/");
+  int fd = open("/dev/null", O_RDWR, 0);
+  if (fd != -1) {
+    dup2(fd, STDIN_FILENO);
+    dup2(fd, STDOUT_FILENO);
+    dup2(fd, STDERR_FILENO);
+    if (fd > STDERR_FILENO)
+      close(fd);
+  }
+  return 0;
+}
+
+// Called only if |g_client_lock| acquisition fails, which shouldn't happen
+// unless we're in a completely unexpected state (which we won't know how to
+// recover from). Tries to abort (SIGABRT) the whole process to serve as an
+// explicit indication of a bug.
+//
+// Doesn't use PERFETTO_FATAL as that is a single attempt to self-signal (in
+// practice - SIGTRAP), while abort() tries to make sure the process has
+// exited one way or another.
+__attribute__((noreturn, noinline)) void AbortOnSpinlockTimeout() {
+  PERFETTO_ELOG(
+      "Timed out on the spinlock - something is horribly wrong. "
+      "Aborting whole process.");
+  abort();
+}
+
+std::string ReadSystemProperty(const char* key) {
+  std::string prop_value;
+  const prop_info* prop = __system_property_find(key);
+  if (!prop) {
+    return prop_value;  // empty
+  }
+  __system_property_read_callback(
+      prop,
+      [](void* cookie, const char* name, const char* value, uint32_t) {
+        std::string* prop_value = reinterpret_cast<std::string*>(cookie);
+        *prop_value = value;
+      },
+      &prop_value);
+  return prop_value;
+}
+
+bool ForceForkPrivateDaemon() {
+  // Note: if renaming the property, also update system_property.cc
+  std::string mode = ReadSystemProperty("heapprofd.userdebug.mode");
+  return mode == "fork";
+}
+
+std::shared_ptr<perfetto::profiling::Client> CreateClientForCentralDaemon(
+    UnhookedAllocator<perfetto::profiling::Client> unhooked_allocator) {
+  PERFETTO_LOG("Constructing client for central daemon.");
+  using perfetto::profiling::Client;
+
+  perfetto::base::Optional<perfetto::base::UnixSocketRaw> sock =
+      Client::ConnectToHeapprofd(perfetto::profiling::kHeapprofdSocketFile);
+  if (!sock) {
+    PERFETTO_ELOG("Failed to connect to %s. This is benign on user builds.",
+                  perfetto::profiling::kHeapprofdSocketFile);
+    return nullptr;
+  }
+  return Client::CreateAndHandshake(std::move(sock.value()),
+                                    unhooked_allocator);
+}
+
+std::shared_ptr<perfetto::profiling::Client> CreateClientAndPrivateDaemon(
+    UnhookedAllocator<perfetto::profiling::Client> unhooked_allocator) {
+  PERFETTO_LOG("Setting up fork mode profiling.");
+  perfetto::base::UnixSocketRaw parent_sock;
+  perfetto::base::UnixSocketRaw child_sock;
+  std::tie(parent_sock, child_sock) = perfetto::base::UnixSocketRaw::CreatePair(
+      perfetto::base::SockFamily::kUnix, perfetto::base::SockType::kStream);
+
+  if (!parent_sock || !child_sock) {
+    PERFETTO_PLOG("Failed to create socketpair.");
+    return nullptr;
+  }
+
+  child_sock.RetainOnExec();
+
+  // Record own pid and cmdline, to pass down to the forked heapprofd.
+  pid_t target_pid = getpid();
+  std::string target_cmdline;
+  if (!perfetto::profiling::GetCmdlineForPID(target_pid, &target_cmdline)) {
+    target_cmdline = "failed-to-read-cmdline";
+    PERFETTO_ELOG(
+        "Failed to read own cmdline, proceeding as this might be a by-pid "
+        "profiling request (which will still work).");
+  }
+
+  // Prepare arguments for heapprofd.
+  std::string pid_arg =
+      std::string("--exclusive-for-pid=") + std::to_string(target_pid);
+  std::string cmd_arg =
+      std::string("--exclusive-for-cmdline=") + target_cmdline;
+  std::string fd_arg =
+      std::string("--inherit-socket-fd=") + std::to_string(child_sock.fd());
+  const char* argv[] = {kHeapprofdBinPath, pid_arg.c_str(), cmd_arg.c_str(),
+                        fd_arg.c_str(), nullptr};
+
+  // Use fork-like clone to avoid invoking the host's pthread_atfork(3)
+  // handlers. Also avoid sending the current process a SIGCHLD to further
+  // reduce our interference.
+  pid_t clone_pid = CloneWithoutSigchld();
+  if (clone_pid == -1) {
+    PERFETTO_PLOG("Failed to clone.");
+    return nullptr;
+  }
+  if (clone_pid == 0) {  // child
+    // Daemonize clones again, terminating the calling thread (i.e. the direct
+    // child of the original process). So the rest of this codepath will be
+    // executed in a new reparented process.
+    if (Daemonize() == -1) {
+      PERFETTO_PLOG("Daemonization failed.");
+      _exit(1);
+    }
+    execv(kHeapprofdBinPath, const_cast<char**>(argv));
+    PERFETTO_PLOG("Failed to execute private heapprofd.");
+    _exit(1);
+  }  // else - parent continuing the client setup
+
+  child_sock.ReleaseFd().reset();  // close child socket's fd
+  if (!parent_sock.SetTxTimeout(perfetto::profiling::kClientSockTimeoutMs)) {
+    PERFETTO_PLOG("Failed to set socket transmit timeout.");
+    return nullptr;
+  }
+
+  if (!parent_sock.SetRxTimeout(perfetto::profiling::kClientSockTimeoutMs)) {
+    PERFETTO_PLOG("Failed to set socket receive timeout.");
+    return nullptr;
+  }
+
+  // Wait on the immediate child to exit (allow for ECHILD in the unlikely case
+  // we're in a process that has made its children unwaitable).
+  int unused = 0;
+  if (PERFETTO_EINTR(waitpid(clone_pid, &unused, __WCLONE)) == -1 &&
+      errno != ECHILD) {
+    PERFETTO_PLOG("Failed to waitpid on immediate child.");
+    return nullptr;
+  }
+
+  return perfetto::profiling::Client::CreateAndHandshake(std::move(parent_sock),
+                                                         unhooked_allocator);
+}
+
+// Note: android_mallopt(M_RESET_HOOKS) is mutually exclusive with
+// heapprofd_initialize. Concurrent calls get discarded, which might be our
+// unpatching attempt if there is a concurrent re-initialization running due to
+// a new signal.
+//
+// Note: g_client can be reset by heapprofd_initialize without calling this
+// function.
+void ShutdownLazy() {
+  ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
+  if (PERFETTO_UNLIKELY(!s.locked()))
+    AbortOnSpinlockTimeout();
+
+  if (!g_client.ref())  // other invocation already initiated shutdown
+    return;
+
+  // Clear primary shared pointer, such that later hook invocations become nops.
+  g_client.ref().reset();
+
+  if (!android_mallopt(M_RESET_HOOKS, nullptr, 0))
+    PERFETTO_PLOG("Unpatching heapprofd hooks failed.");
+}
+
+// We're a library loaded into a potentially-multithreaded process, which might
+// not be explicitly aware of this possiblity. Deadling with forks/clones is
+// extremely complicated in such situations, but we attempt to handle certain
+// cases.
+//
+// There are two classes of forking processes to consider:
+//  * well-behaved processes that fork only when their threads (if any) are at a
+//    safe point, and therefore not in the middle of our hooks/client.
+//  * processes that fork with other threads in an arbitrary state. Though
+//    technically buggy, such processes exist in practice.
+//
+// This atfork handler follows a crude lowest-common-denominator approach, where
+// to handle the latter class of processes, we systematically leak any |Client|
+// state (present only when actively profiling at the time of fork) in the
+// postfork-child path.
+//
+// The alternative of acquiring all relevant locks in the prefork handler, and
+// releasing the state in the postfork handlers, poses a separate class of edge
+// cases, and is not deemed to be better as a result.
+//
+// Notes:
+// * this atfork handler fires only for the |fork| libc entrypoint, *not*
+//   |clone|. See client.cc's |IsPostFork| for some best-effort detection
+//   mechanisms for clone/vfork.
+// * it should be possible to start a new profiling session in this child
+//   process, modulo the bionic's heapprofd-loading state machine being in the
+//   right state.
+// * we cannot avoid leaks in all cases anyway (e.g. during shutdown sequence,
+//   when only individual straggler threads hold onto the Client).
+void AtForkChild() {
+  PERFETTO_LOG("heapprofd_client: handling atfork.");
+
+  // A thread (that has now disappeared across the fork) could have been holding
+  // the spinlock. We're now the only thread post-fork, so we can reset the
+  // spinlock, though the state it protects (the |g_client| shared_ptr) might
+  // not be in a consistent state.
+  g_client_lock.store(false);
+
+  // Leak the existing shared_ptr contents, including the profiling |Client| if
+  // profiling was active at the time of the fork.
+  // Note: this code assumes that the creation of the empty shared_ptr does not
+  // allocate, which should be the case for all implementations as the
+  // constructor has to be noexcept.
+  std::shared_ptr<perfetto::profiling::Client>& ref = g_client.ref();
+  new (&ref) std::shared_ptr<perfetto::profiling::Client>();
+}
+
+}  // namespace
+
+// TODO(fmayer): Keep track of the heap names and return a proper ID here.
+// For now, we are returning a placeholder so we don't need to change the API.
+__attribute__((visibility("default"))) uint32_t heapprofd_register_heap(
+    const char*) {
+  return 0;
+}
+
+__attribute__((visibility("default"))) bool
+heapprofd_report_allocation(uint32_t heap_id, uint64_t id, uint64_t size) {
+  size_t sampled_alloc_sz = 0;
+  std::shared_ptr<perfetto::profiling::Client> client;
+  {
+    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
+    if (PERFETTO_UNLIKELY(!s.locked()))
+      AbortOnSpinlockTimeout();
+
+    if (!g_client.ref())  // no active client (most likely shutting down)
+      return false;
+
+    sampled_alloc_sz = g_client.ref()->GetSampleSizeLocked(size);
+    if (sampled_alloc_sz == 0)  // not sampling
+      return false;
+
+    client = g_client.ref();  // owning copy
+  }                           // unlock
+
+  if (!client->RecordMalloc(sampled_alloc_sz, size, id)) {
+    ShutdownLazy();
+  }
+  return true;
+}
+
+__attribute__((visibility("default"))) void heapprofd_report_free(
+    uint32_t heap_id,
+    uint64_t id) {
+  std::shared_ptr<perfetto::profiling::Client> client;
+  {
+    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
+    if (PERFETTO_UNLIKELY(!s.locked()))
+      AbortOnSpinlockTimeout();
+
+    client = g_client.ref();  // owning copy (or empty)
+  }
+
+  if (client) {
+    if (!client->RecordFree(id))
+      ShutdownLazy();
+  }
+}
+
+__attribute__((visibility("default"))) bool heapprofd_init_session(
+    void* (*malloc_fn)(size_t),
+    void (*free_fn)(void*)) {
+  static bool first_init = true;
+  // Install an atfork handler to deal with *some* cases of the host forking.
+  // The handler will be unpatched automatically if we're dlclosed.
+  if (first_init && pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr,
+                                   &AtForkChild) != 0) {
+    PERFETTO_PLOG("%s: pthread_atfork failed, not installing hooks.",
+                  getprogname());
+    return false;
+  }
+  first_init = false;
+
+  // TODO(fmayer): Check other destructions of client and make a decision
+  // whether we want to ban heap objects in the client or not.
+  std::shared_ptr<perfetto::profiling::Client> old_client;
+  {
+    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
+    if (PERFETTO_UNLIKELY(!s.locked()))
+      AbortOnSpinlockTimeout();
+
+    if (g_client.ref()) {
+      PERFETTO_LOG("%s: Rejecting concurrent profiling initialization.",
+                   getprogname());
+      return true;  // success as we're in a valid state
+    }
+    old_client = g_client.ref();
+    g_client.ref().reset();
+  }
+
+  old_client.reset();
+
+  // The dispatch table never changes, so let the custom allocator retain the
+  // function pointers directly.
+  UnhookedAllocator<perfetto::profiling::Client> unhooked_allocator(malloc_fn,
+                                                                    free_fn);
+
+  // These factory functions use heap objects, so we need to run them without
+  // the spinlock held.
+  std::shared_ptr<perfetto::profiling::Client> client;
+  if (!ForceForkPrivateDaemon())
+    client = CreateClientForCentralDaemon(unhooked_allocator);
+  if (!client)
+    client = CreateClientAndPrivateDaemon(unhooked_allocator);
+
+  if (!client) {
+    PERFETTO_LOG("%s: heapprofd_client not initialized, not installing hooks.",
+                 getprogname());
+    return false;
+  }
+  PERFETTO_LOG("%s: heapprofd_client initialized.", getprogname());
+  {
+    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
+    if (PERFETTO_UNLIKELY(!s.locked()))
+      AbortOnSpinlockTimeout();
+
+    // This cannot have been set in the meantime. There are never two concurrent
+    // calls to this function, as Bionic uses atomics to guard against that.
+    PERFETTO_DCHECK(g_client.ref() == nullptr);
+    g_client.ref() = std::move(client);
+  }
+  return true;
+}
diff --git a/src/profiling/memory/malloc_hooks.cc b/src/profiling/memory/malloc_hooks.cc
index d2c3894..be2ab1d 100644
--- a/src/profiling/memory/malloc_hooks.cc
+++ b/src/profiling/memory/malloc_hooks.cc
@@ -14,35 +14,15 @@
  * limitations under the License.
  */
 
-#include <android/fdsan.h>
 #include <bionic/malloc.h>
 #include <inttypes.h>
 #include <malloc.h>
 #include <private/bionic_malloc_dispatch.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/system_properties.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
 
 #include <atomic>
-#include <tuple>
 
-#include "perfetto/base/build_config.h"
-#include "perfetto/base/logging.h"
-#include "perfetto/ext/base/no_destructor.h"
-#include "perfetto/ext/base/unix_socket.h"
 #include "perfetto/ext/base/utils.h"
-#include "src/profiling/common/proc_utils.h"
-#include "src/profiling/memory/client.h"
-#include "src/profiling/memory/scoped_spinlock.h"
-#include "src/profiling/memory/unhooked_allocator.h"
-#include "src/profiling/memory/wire_protocol.h"
-
-using perfetto::profiling::ScopedSpinlock;
-using perfetto::profiling::UnhookedAllocator;
+#include "perfetto/profiling/memory/client_ext.h"
 
 // This is so we can make an so that we can swap out with the existing
 // libc_malloc_hooks.so
@@ -109,287 +89,11 @@
 // reading.
 std::atomic<const MallocDispatch*> g_dispatch{nullptr};
 
-// Holds the active profiling client. Is empty at the start, or after we've
-// started shutting down a profiling session. Hook invocations take shared_ptr
-// copies (ensuring that the client stays alive until no longer needed), and do
-// nothing if this master pointer is empty.
-//
-// This shared_ptr itself is protected by g_client_lock. Note that shared_ptr
-// handles are not thread-safe by themselves:
-// https://en.cppreference.com/w/cpp/memory/shared_ptr/atomic
-//
-// To avoid on-destruction re-entrancy issues, this shared_ptr needs to be
-// constructed with an allocator that uses the unhooked malloc & free functions.
-// See UnhookedAllocator.
-//
-// NoDestructor<> wrapper is used to avoid destructing the shared_ptr at program
-// exit. The rationale is:
-// * Avoiding the atexit destructor racing against other threads that are
-//   possibly running within the hooks.
-// * Making sure that atexit handlers running after this global's destructor
-//   can still safely enter the hooks.
-perfetto::base::NoDestructor<std::shared_ptr<perfetto::profiling::Client>>
-    g_client;
-
-// Protects g_client, and serves as an external lock for sampling decisions (see
-// perfetto::profiling::Sampler).
-//
-// We rely on this atomic's destuction being a nop, as it is possible for the
-// hooks to attempt to acquire the spinlock after its destructor should have run
-// (technically a use-after-destruct scenario).
-std::atomic<bool> g_client_lock{false};
-
-constexpr char kHeapprofdBinPath[] = "/system/bin/heapprofd";
-
 const MallocDispatch* GetDispatch() {
   return g_dispatch.load(std::memory_order_relaxed);
 }
 
-int CloneWithoutSigchld() {
-  auto ret = clone(nullptr, nullptr, 0, nullptr);
-  if (ret == 0)
-    android_fdsan_set_error_level(ANDROID_FDSAN_ERROR_LEVEL_DISABLED);
-  return ret;
-}
-
-int ForklikeClone() {
-  auto ret = clone(nullptr, nullptr, SIGCHLD, nullptr);
-  if (ret == 0)
-    android_fdsan_set_error_level(ANDROID_FDSAN_ERROR_LEVEL_DISABLED);
-  return ret;
-}
-
-// Like daemon(), but using clone to avoid invoking pthread_atfork(3) handlers.
-int Daemonize() {
-  switch (ForklikeClone()) {
-    case -1:
-      PERFETTO_PLOG("Daemonize.clone");
-      return -1;
-      break;
-    case 0:
-      break;
-    default:
-      _exit(0);
-      break;
-  }
-  if (setsid() == -1) {
-    PERFETTO_PLOG("Daemonize.setsid");
-    return -1;
-  }
-  // best effort chdir & fd close
-  chdir("/");
-  int fd = open("/dev/null", O_RDWR, 0);
-  if (fd != -1) {
-    dup2(fd, STDIN_FILENO);
-    dup2(fd, STDOUT_FILENO);
-    dup2(fd, STDERR_FILENO);
-    if (fd > STDERR_FILENO)
-      close(fd);
-  }
-  return 0;
-}
-
-// Called only if |g_client_lock| acquisition fails, which shouldn't happen
-// unless we're in a completely unexpected state (which we won't know how to
-// recover from). Tries to abort (SIGABRT) the whole process to serve as an
-// explicit indication of a bug.
-//
-// Doesn't use PERFETTO_FATAL as that is a single attempt to self-signal (in
-// practice - SIGTRAP), while abort() tries to make sure the process has
-// exited one way or another.
-__attribute__((noreturn, noinline)) void AbortOnSpinlockTimeout() {
-  PERFETTO_ELOG(
-      "Timed out on the spinlock - something is horribly wrong. "
-      "Aborting whole process.");
-  abort();
-}
-
-std::string ReadSystemProperty(const char* key) {
-  std::string prop_value;
-  const prop_info* prop = __system_property_find(key);
-  if (!prop) {
-    return prop_value;  // empty
-  }
-  __system_property_read_callback(
-      prop,
-      [](void* cookie, const char* name, const char* value, uint32_t) {
-        std::string* prop_value = reinterpret_cast<std::string*>(cookie);
-        *prop_value = value;
-      },
-      &prop_value);
-  return prop_value;
-}
-
-bool ForceForkPrivateDaemon() {
-  // Note: if renaming the property, also update system_property.cc
-  std::string mode = ReadSystemProperty("heapprofd.userdebug.mode");
-  return mode == "fork";
-}
-
-std::shared_ptr<perfetto::profiling::Client> CreateClientForCentralDaemon(
-    UnhookedAllocator<perfetto::profiling::Client> unhooked_allocator) {
-  PERFETTO_LOG("Constructing client for central daemon.");
-  using perfetto::profiling::Client;
-
-  perfetto::base::Optional<perfetto::base::UnixSocketRaw> sock =
-      Client::ConnectToHeapprofd(perfetto::profiling::kHeapprofdSocketFile);
-  if (!sock) {
-    PERFETTO_ELOG("Failed to connect to %s. This is benign on user builds.",
-                  perfetto::profiling::kHeapprofdSocketFile);
-    return nullptr;
-  }
-  return Client::CreateAndHandshake(std::move(sock.value()),
-                                    unhooked_allocator);
-}
-
-std::shared_ptr<perfetto::profiling::Client> CreateClientAndPrivateDaemon(
-    UnhookedAllocator<perfetto::profiling::Client> unhooked_allocator) {
-  PERFETTO_LOG("Setting up fork mode profiling.");
-  perfetto::base::UnixSocketRaw parent_sock;
-  perfetto::base::UnixSocketRaw child_sock;
-  std::tie(parent_sock, child_sock) = perfetto::base::UnixSocketRaw::CreatePair(
-      perfetto::base::SockFamily::kUnix, perfetto::base::SockType::kStream);
-
-  if (!parent_sock || !child_sock) {
-    PERFETTO_PLOG("Failed to create socketpair.");
-    return nullptr;
-  }
-
-  child_sock.RetainOnExec();
-
-  // Record own pid and cmdline, to pass down to the forked heapprofd.
-  pid_t target_pid = getpid();
-  std::string target_cmdline;
-  if (!perfetto::profiling::GetCmdlineForPID(target_pid, &target_cmdline)) {
-    target_cmdline = "failed-to-read-cmdline";
-    PERFETTO_ELOG(
-        "Failed to read own cmdline, proceeding as this might be a by-pid "
-        "profiling request (which will still work).");
-  }
-
-  // Prepare arguments for heapprofd.
-  std::string pid_arg =
-      std::string("--exclusive-for-pid=") + std::to_string(target_pid);
-  std::string cmd_arg =
-      std::string("--exclusive-for-cmdline=") + target_cmdline;
-  std::string fd_arg =
-      std::string("--inherit-socket-fd=") + std::to_string(child_sock.fd());
-  const char* argv[] = {kHeapprofdBinPath, pid_arg.c_str(), cmd_arg.c_str(),
-                        fd_arg.c_str(), nullptr};
-
-  // Use fork-like clone to avoid invoking the host's pthread_atfork(3)
-  // handlers. Also avoid sending the current process a SIGCHILD to further
-  // reduce our interference.
-  pid_t clone_pid = CloneWithoutSigchld();
-  if (clone_pid == -1) {
-    PERFETTO_PLOG("Failed to clone.");
-    return nullptr;
-  }
-  if (clone_pid == 0) {  // child
-    // Daemonize clones again, terminating the calling thread (i.e. the direct
-    // child of the original process). So the rest of this codepath will be
-    // executed in a new reparented process.
-    if (Daemonize() == -1) {
-      PERFETTO_PLOG("Daemonization failed.");
-      _exit(1);
-    }
-    execv(kHeapprofdBinPath, const_cast<char**>(argv));
-    PERFETTO_PLOG("Failed to execute private heapprofd.");
-    _exit(1);
-  }  // else - parent continuing the client setup
-
-  child_sock.ReleaseFd().reset();  // close child socket's fd
-  if (!parent_sock.SetTxTimeout(perfetto::profiling::kClientSockTimeoutMs)) {
-    PERFETTO_PLOG("Failed to set socket transmit timeout.");
-    return nullptr;
-  }
-
-  if (!parent_sock.SetRxTimeout(perfetto::profiling::kClientSockTimeoutMs)) {
-    PERFETTO_PLOG("Failed to set socket receive timeout.");
-    return nullptr;
-  }
-
-  // Wait on the immediate child to exit (allow for ECHILD in the unlikely case
-  // we're in a process that has made its children unwaitable).
-  int unused = 0;
-  if (PERFETTO_EINTR(waitpid(clone_pid, &unused, __WCLONE)) == -1 &&
-      errno != ECHILD) {
-    PERFETTO_PLOG("Failed to waitpid on immediate child.");
-    return nullptr;
-  }
-
-  return perfetto::profiling::Client::CreateAndHandshake(std::move(parent_sock),
-                                                         unhooked_allocator);
-}
-
-// Note: android_mallopt(M_RESET_HOOKS) is mutually exclusive with
-// heapprofd_initialize. Concurrent calls get discarded, which might be our
-// unpatching attempt if there is a concurrent re-initialization running due to
-// a new signal.
-//
-// Note: g_client can be reset by heapprofd_initialize without calling this
-// function.
-void ShutdownLazy() {
-  ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
-  if (PERFETTO_UNLIKELY(!s.locked()))
-    AbortOnSpinlockTimeout();
-
-  if (!g_client.ref())  // other invocation already initiated shutdown
-    return;
-
-  // Clear primary shared pointer, such that later hook invocations become nops.
-  g_client.ref().reset();
-
-  if (!android_mallopt(M_RESET_HOOKS, nullptr, 0))
-    PERFETTO_PLOG("Unpatching heapprofd hooks failed.");
-}
-
-// We're a library loaded into a potentially-multithreaded process, which might
-// not be explicitly aware of this possiblity. Deadling with forks/clones is
-// extremely complicated in such situations, but we attempt to handle certain
-// cases.
-//
-// There are two classes of forking processes to consider:
-//  * well-behaved processes that fork only when their threads (if any) are at a
-//    safe point, and therefore not in the middle of our hooks/client.
-//  * processes that fork with other threads in an arbitrary state. Though
-//    technically buggy, such processes exist in practice.
-//
-// This atfork handler follows a crude lowest-common-denominator approach, where
-// to handle the latter class of processes, we systematically leak any |Client|
-// state (present only when actively profiling at the time of fork) in the
-// postfork-child path.
-//
-// The alternative with acquiring all relevant locks in the prefork handler, and
-// releasing the state postfork handlers, poses a separate class of edge cases,
-// and is not deemed to be better as a result.
-//
-// Notes:
-// * this atfork handler fires only for the |fork| libc entrypoint, *not*
-//   |clone|. See client.cc's |IsPostFork| for some best-effort detection
-//   mechanisms for clone/vfork.
-// * it should be possible to start a new profiling session in this child
-//   process, modulo the bionic's heapprofd-loading state machine being in the
-//   right state.
-// * we cannot avoid leaks in all cases anyway (e.g. during shutdown sequence,
-//   when only individual straggler threads hold onto the Client).
-void AtForkChild() {
-  PERFETTO_LOG("heapprofd_client: handling atfork.");
-
-  // A thread (that has now disappeared across the fork) could have been holding
-  // the spinlock. We're now the only thread post-fork, so we can reset the
-  // spinlock, though the state it protects (the |g_client| shared_ptr) might
-  // not be in a consistent state.
-  g_client_lock.store(false);
-
-  // Leak the existing shared_ptr contents, including the profiling |Client| if
-  // profiling was active at the time of the fork.
-  // Note: this code assumes that the creation of the empty shared_ptr does not
-  // allocate, which should be the case for all implementations as the
-  // constructor has to be noexcept.
-  std::shared_ptr<perfetto::profiling::Client>& ref = g_client.ref();
-  new (&ref) std::shared_ptr<perfetto::profiling::Client>();
-}
+uint32_t g_heap_id = heapprofd_register_heap("malloc");
 
 }  // namespace
 
@@ -406,70 +110,9 @@
 bool HEAPPROFD_ADD_PREFIX(_initialize)(const MallocDispatch* malloc_dispatch,
                                        bool*,
                                        const char*) {
-  using ::perfetto::profiling::Client;
-
   // Table of pointers to backing implementation.
-  bool first_init = g_dispatch.load() == nullptr;
   g_dispatch.store(malloc_dispatch);
-
-  // Install an atfork handler to deal with *some* cases of the host forking.
-  // The handler will be unpatched automatically if we're dlclosed.
-  if (first_init && pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr,
-                                   &AtForkChild) != 0) {
-    PERFETTO_PLOG("%s: pthread_atfork failed, not installing hooks.",
-                  getprogname());
-    return false;
-  }
-
-  // TODO(fmayer): Check other destructions of client and make a decision
-  // whether we want to ban heap objects in the client or not.
-  std::shared_ptr<Client> old_client;
-  {
-    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
-    if (PERFETTO_UNLIKELY(!s.locked()))
-      AbortOnSpinlockTimeout();
-
-    if (g_client.ref()) {
-      PERFETTO_LOG("%s: Rejecting concurrent profiling initialization.",
-                   getprogname());
-      return true;  // success as we're in a valid state
-    }
-    old_client = g_client.ref();
-    g_client.ref().reset();
-  }
-
-  old_client.reset();
-
-  // The dispatch table never changes, so let the custom allocator retain the
-  // function pointers directly.
-  UnhookedAllocator<Client> unhooked_allocator(malloc_dispatch->malloc,
-                                               malloc_dispatch->free);
-
-  // These factory functions use heap objects, so we need to run them without
-  // the spinlock held.
-  std::shared_ptr<Client> client;
-  if (!ForceForkPrivateDaemon())
-    client = CreateClientForCentralDaemon(unhooked_allocator);
-  if (!client)
-    client = CreateClientAndPrivateDaemon(unhooked_allocator);
-
-  if (!client) {
-    PERFETTO_LOG("%s: heapprofd_client not initialized, not installing hooks.",
-                 getprogname());
-    return false;
-  }
-  PERFETTO_LOG("%s: heapprofd_client initialized.", getprogname());
-  {
-    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
-    if (PERFETTO_UNLIKELY(!s.locked()))
-      AbortOnSpinlockTimeout();
-
-    // This cannot have been set in the meantime. There are never two concurrent
-    // calls to this function, as Bionic uses atomics to guard against that.
-    PERFETTO_DCHECK(g_client.ref() == nullptr);
-    g_client.ref() = std::move(client);
-  }
-  return true;
+  return heapprofd_init_session(malloc_dispatch->malloc, malloc_dispatch->free);
 }
 
 void HEAPPROFD_ADD_PREFIX(_finalize)() {
@@ -477,66 +120,35 @@
   // any specific action to take, and cleanup can be left to the OS.
 }
 
-// Decides whether an allocation with the given address and size needs to be
-// sampled, and if so, records it. Performs the necessary synchronization (holds
-// |g_client_lock| spinlock) while accessing the shared sampler, and obtaining a
-// profiling client handle (shared_ptr).
-//
-// If the allocation is to be sampled, the recording is done without holding
-// |g_client_lock|. The client handle is guaranteed to not be invalidated while
-// the allocation is being recorded.
-//
-// If the attempt to record the allocation fails, initiates lazy shutdown of the
-// client & hooks.
-static void MaybeSampleAllocation(size_t size, void* addr) {
-  size_t sampled_alloc_sz = 0;
-  std::shared_ptr<perfetto::profiling::Client> client;
-  {
-    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
-    if (PERFETTO_UNLIKELY(!s.locked()))
-      AbortOnSpinlockTimeout();
-
-    if (!g_client.ref())  // no active client (most likely shutting down)
-      return;
-
-    sampled_alloc_sz = g_client.ref()->GetSampleSizeLocked(size);
-    if (sampled_alloc_sz == 0)  // not sampling
-      return;
-
-    client = g_client.ref();  // owning copy
-  }                           // unlock
-
-  if (!client->RecordMalloc(sampled_alloc_sz, size,
-                            reinterpret_cast<uint64_t>(addr))) {
-    ShutdownLazy();
-  }
-}
-
 void* HEAPPROFD_ADD_PREFIX(_malloc)(size_t size) {
   const MallocDispatch* dispatch = GetDispatch();
   void* addr = dispatch->malloc(size);
-  MaybeSampleAllocation(size, addr);
+  heapprofd_report_allocation(g_heap_id, reinterpret_cast<uint64_t>(addr),
+                              size);
   return addr;
 }
 
 void* HEAPPROFD_ADD_PREFIX(_calloc)(size_t nmemb, size_t size) {
   const MallocDispatch* dispatch = GetDispatch();
   void* addr = dispatch->calloc(nmemb, size);
-  MaybeSampleAllocation(nmemb * size, addr);
+  heapprofd_report_allocation(g_heap_id, reinterpret_cast<uint64_t>(addr),
+                              nmemb * size);
   return addr;
 }
 
 void* HEAPPROFD_ADD_PREFIX(_aligned_alloc)(size_t alignment, size_t size) {
   const MallocDispatch* dispatch = GetDispatch();
   void* addr = dispatch->aligned_alloc(alignment, size);
-  MaybeSampleAllocation(size, addr);
+  heapprofd_report_allocation(g_heap_id, reinterpret_cast<uint64_t>(addr),
+                              size);
   return addr;
 }
 
 void* HEAPPROFD_ADD_PREFIX(_memalign)(size_t alignment, size_t size) {
   const MallocDispatch* dispatch = GetDispatch();
   void* addr = dispatch->memalign(alignment, size);
-  MaybeSampleAllocation(size, addr);
+  heapprofd_report_allocation(g_heap_id, reinterpret_cast<uint64_t>(addr),
+                              size);
   return addr;
 }
 
@@ -548,7 +160,8 @@
   if (res != 0)
     return res;
 
-  MaybeSampleAllocation(size, *memptr);
+  heapprofd_report_allocation(g_heap_id, reinterpret_cast<uint64_t>(*memptr),
+                              size);
   return 0;
 }
 
@@ -568,19 +181,7 @@
     return;
 
   const MallocDispatch* dispatch = GetDispatch();
-  std::shared_ptr<perfetto::profiling::Client> client;
-  {
-    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
-    if (PERFETTO_UNLIKELY(!s.locked()))
-      AbortOnSpinlockTimeout();
-
-    client = g_client.ref();  // owning copy (or empty)
-  }
-
-  if (client) {
-    if (!client->RecordFree(reinterpret_cast<uint64_t>(pointer)))
-      ShutdownLazy();
-  }
+  heapprofd_report_free(g_heap_id, reinterpret_cast<uint64_t>(pointer));
   return dispatch->free(pointer);
 }
 
@@ -594,39 +195,11 @@
 // processing it.
 void* HEAPPROFD_ADD_PREFIX(_realloc)(void* pointer, size_t size) {
   const MallocDispatch* dispatch = GetDispatch();
-
-  size_t sampled_alloc_sz = 0;
-  std::shared_ptr<perfetto::profiling::Client> client;
-  {
-    ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Try);
-    if (PERFETTO_UNLIKELY(!s.locked()))
-      AbortOnSpinlockTimeout();
-
-    // If there is no active client, we still want to reach the backing realloc,
-    // so keep going.
-    if (g_client.ref()) {
-      client = g_client.ref();  // owning copy
-      sampled_alloc_sz = g_client.ref()->GetSampleSizeLocked(size);
-    }
-  }  // unlock
-
-  if (client && pointer) {
-    if (!client->RecordFree(reinterpret_cast<uint64_t>(pointer)))
-      ShutdownLazy();
-  }
+  if (pointer)
+    heapprofd_report_free(g_heap_id, reinterpret_cast<uint64_t>(pointer));
   void* addr = dispatch->realloc(pointer, size);
-
-  if (size == 0 || sampled_alloc_sz == 0)
-    return addr;
-
-  // We do not reach this point without a valid client, because in that case
-  // sampled_alloc_sz == 0.
-  PERFETTO_DCHECK(client);
-
-  if (!client->RecordMalloc(sampled_alloc_sz, size,
-                            reinterpret_cast<uint64_t>(addr))) {
-    ShutdownLazy();
-  }
+  heapprofd_report_allocation(g_heap_id, reinterpret_cast<uint64_t>(addr),
+                              size);
   return addr;
 }
 
diff --git a/src/profiling/memory/unwinding.cc b/src/profiling/memory/unwinding.cc
index c9e43fe..00a3cf1 100644
--- a/src/profiling/memory/unwinding.cc
+++ b/src/profiling/memory/unwinding.cc
@@ -70,7 +70,8 @@
 // We do not care about deterministic destructor order.
 #pragma GCC diagnostic ignored "-Wglobal-constructors"
 #pragma GCC diagnostic ignored "-Wexit-time-destructors"
-static std::vector<std::string> kSkipMaps{"heapprofd_client.so"};
+static std::vector<std::string> kSkipMaps{"heapprofd_client.so",
+                                          "heapprofd_client_api.so"};
 #pragma GCC diagnostic pop
 
 size_t GetRegsSize(unwindstack::Regs* regs) {
diff --git a/tools/gen_android_bp b/tools/gen_android_bp
index 099c384..676b7dd 100755
--- a/tools/gen_android_bp
+++ b/tools/gen_android_bp
@@ -57,6 +57,7 @@
     '//src/perfetto_cmd:perfetto',
     '//src/perfetto_cmd:trigger_perfetto',
     '//src/profiling/memory:heapprofd_client',
+    '//src/profiling/memory:heapprofd_client_api',
     '//src/profiling/memory:heapprofd',
     '//src/profiling/perf:traced_perf',
     '//src/traced/probes:traced_probes',
@@ -160,6 +161,11 @@
 
 # Additional arguments to apply to Android.bp rules.
 additional_args = {
+    'heapprofd_client_api': [
+        ('include_dirs', {'bionic/libc'}),
+        ('static_libs', {'libasync_safe'}),
+        ('header_libs', {'bionic_libc_platform_headers'}),
+    ],
     'heapprofd_client': [
         ('include_dirs', {'bionic/libc'}),
         ('static_libs', {'libasync_safe'}),