base: Reland RTFutex for android (#2865)

Bug: 443178555
diff --git a/gn/perfetto.gni b/gn/perfetto.gni
index a79d0be..2497421 100644
--- a/gn/perfetto.gni
+++ b/gn/perfetto.gni
@@ -267,16 +267,15 @@
       perfetto_build_standalone && !is_perfetto_build_generator &&
       defined(use_custom_libcxx) && use_custom_libcxx
 
-  # Enables the use of priority-inheritance mutexes via PTHREAD_PRIO_INHERIT.
+  # Enables the use of priority-inheritance mutexes via PTHREAD_PRIO_INHERIT
+  # or a wrapper around PI futexes (dependent on the OS).
   # Note that on Android platform (non-standalone) builds this flag is ignored
-  # and the Android flag "use_rt_mutex" is used instead (perfetto_flags.aconfig)
+  # and the Android flags "use_rt_mutex" and "use_rt_futex" are used instead (perfetto_flags.aconfig).
   # This is disabled in chromium, because the BPF-sandbox allows PI-futex only
   # when a field-trial is enabled, which is incompatible with a build time flag.
   enable_perfetto_rt_mutex =
-      (!is_wasm &&
-       (((build_with_chromium && is_android) || perfetto_build_standalone) &&
-        (current_cpu == "x64" || current_cpu == "arm64"))) ||
-      is_perfetto_build_generator
+      !is_wasm && (perfetto_build_standalone || is_perfetto_build_generator) &&
+      !build_with_chromium
 
   # This flag is used for the migration of UnixTaskRunner -> LockFreeTaskRunner.
   # It determines whether MaybeLockFreeTaskRunner is backed by UnixTaskRunner
diff --git a/include/perfetto/ext/base/flags.h b/include/perfetto/ext/base/flags.h
index ca71497..ffb36b1 100644
--- a/include/perfetto/ext/base/flags.h
+++ b/include/perfetto/ext/base/flags.h
@@ -32,17 +32,19 @@
 // in `perfetto_flags.aconfig`.
 // The second argument is the default value of the flag in non-Android platform
 // contexts.
+//
+// Note: For rt_mutex and rt_futex, the source of truth for non-Android-platform
+// builds is rt_mutex.h.
 #define PERFETTO_READ_ONLY_FLAGS(X)                                    \
   X(test_read_only_flag, NonAndroidPlatformDefault_FALSE)              \
   X(use_murmur_hash_for_flat_hash_map, NonAndroidPlatformDefault_TRUE) \
   X(ftrace_clear_offline_cpus_only, NonAndroidPlatformDefault_TRUE)    \
-  X(use_rt_mutex, PERFETTO_BUILDFLAG(PERFETTO_ENABLE_RT_MUTEX)         \
-                      ? NonAndroidPlatformDefault_TRUE                 \
-                      : NonAndroidPlatformDefault_FALSE)               \
   X(use_lockfree_taskrunner,                                           \
     PERFETTO_BUILDFLAG(PERFETTO_ENABLE_LOCKFREE_TASKRUNNER)            \
         ? NonAndroidPlatformDefault_TRUE                               \
-        : NonAndroidPlatformDefault_FALSE)
+        : NonAndroidPlatformDefault_FALSE)                             \
+  X(use_rt_mutex, NonAndroidPlatformDefault_FALSE)                     \
+  X(use_rt_futex, NonAndroidPlatformDefault_FALSE)
 
 ////////////////////////////////////////////////////////////////////////////////
 //                                                                            //
diff --git a/include/perfetto/ext/base/rt_mutex.h b/include/perfetto/ext/base/rt_mutex.h
index fa65e50..248f6ad 100644
--- a/include/perfetto/ext/base/rt_mutex.h
+++ b/include/perfetto/ext/base/rt_mutex.h
@@ -23,8 +23,8 @@
 // In the contended case RtMutex is generally slower than a std::mutex (or any
 // non-RT implementation).
 // Under the hoods this class does the following:
-// - Linux/Android: it uses PI futexes.
-// - MacOS/iOS: it uses pthread_mutex with PTHREAD_PRIO_INHERIT.
+// - Android: it uses PI futexes.
+// - Linux/MacOS/iOS: it uses pthread_mutex with PTHREAD_PRIO_INHERIT.
 // - Other platforms: falls back on a standard std::mutex. On Windows 11+
 //   std::mutex has effectively PI semantics due to AutoBoost
 //   https://github.com/MicrosoftDocs/win32/commit/a43cb3b5039c5cfc53642bfcea174003a2f1168f
@@ -34,13 +34,49 @@
 #include "perfetto/ext/base/flags.h"
 #include "perfetto/public/compiler.h"
 
-#if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) || \
-    PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) ||   \
-    PERFETTO_BUILDFLAG(PERFETTO_OS_APPLE)
-#define PERFETTO_HAS_POSIX_RT_MUTEX() true
-#else
-#define PERFETTO_HAS_POSIX_RT_MUTEX() false
+#define _PERFETTO_MUTEX_MODE_STD 0
+#define _PERFETTO_MUTEX_MODE_RT_FUTEX 1
+#define _PERFETTO_MUTEX_MODE_RT_MUTEX 2
+
+// The logic below determines which mutex implementation to use.
+// For Android platform builds, the choice is controlled by aconfig flags.
+// For other builds, it's determined by OS support and GN build arguments.
+//
+// Rationale for platform-specific choices:
+// 1. `RtFutex` is enabled only on Android because it relies on `gettid()` being
+//    a cheap thread-local storage access provided by Bionic. On Linux with
+//    glibc, `gettid()` is a full syscall, making the pthread-based
+//    implementation faster.
+// 2. The pthread-based `RtPosixMutex` is not viable on all Android versions, as
+//    `pthread_mutexattr_setprotocol` was introduced in API level 28. Using
+//    `dlsym` to backport it can lead to deadlocks with the loader lock if
+//    tracing is initialized from a static constructor (see b/443178555).
+#if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) && \
+    PERFETTO_BUILDFLAG(PERFETTO_ANDROID_BUILD)
+#if PERFETTO_FLAGS_USE_RT_FUTEX
+#define _PERFETTO_MUTEX_MODE _PERFETTO_MUTEX_MODE_RT_FUTEX
+#elif PERFETTO_FLAGS_USE_RT_MUTEX
+#define _PERFETTO_MUTEX_MODE _PERFETTO_MUTEX_MODE_RT_MUTEX
 #endif
+#elif PERFETTO_BUILDFLAG(PERFETTO_ENABLE_RT_MUTEX)
+#if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID)
+#define _PERFETTO_MUTEX_MODE _PERFETTO_MUTEX_MODE_RT_FUTEX
+#elif PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \
+    PERFETTO_BUILDFLAG(PERFETTO_OS_APPLE)
+#define _PERFETTO_MUTEX_MODE _PERFETTO_MUTEX_MODE_RT_MUTEX
+#endif
+#endif
+
+// If no RT implementation was selected, default to std::mutex.
+#ifndef _PERFETTO_MUTEX_MODE
+#define _PERFETTO_MUTEX_MODE _PERFETTO_MUTEX_MODE_STD
+#endif
+
+// Public macros for conditional compilation based on the selected mutex type.
+#define PERFETTO_HAS_POSIX_RT_MUTEX() \
+  (_PERFETTO_MUTEX_MODE == _PERFETTO_MUTEX_MODE_RT_MUTEX)
+#define PERFETTO_HAS_RT_FUTEX() \
+  (_PERFETTO_MUTEX_MODE == _PERFETTO_MUTEX_MODE_RT_FUTEX)
 
 #include <atomic>
 #include <mutex>
@@ -50,10 +86,95 @@
 #include <pthread.h>
 #endif
 
+#if !PERFETTO_BUILDFLAG(PERFETTO_OS_WIN)
+#include <unistd.h>  // For gettid().
+#endif
+
 namespace perfetto::base {
 
 namespace internal {
 
+#if PERFETTO_HAS_RT_FUTEX()
+// A wrapper around PI Futexes. A futex is a wrapper around an atomic integer
+// with an ABI shared with the kernel to handle the slowpath in the cases when
+// the mutex is held, or we find out that there are waiters queued when we
+// unlock. The operating principle is the following:
+// - In the no-contention case, a futex boils down to an atomic
+//   compare-and-exchange, without involving the kernel.
+// - If a lock is contended at acquire time, we have to enter the kernel to
+//   suspend our execution and join a wait chain.
+// - It could still happen that we acquire the mutex via the fastpath (without
+//   involving the kernel) but other waiters might queue up while we hold the
+//   mutex. In that case the kernel will add a bit to the atomic int. That bit
+//   will cause the unlock() compare-and-exchange to fail (because it no longer
+//   matches our tid) which in turn will signal us to do a syscall to notify the
+//   waiters.
+class PERFETTO_LOCKABLE RtFutex {
+ public:
+  RtFutex() { PERFETTO_TSAN_MUTEX_CREATE(this, __tsan_mutex_not_static); }
+  ~RtFutex() { PERFETTO_TSAN_MUTEX_DESTROY(this, __tsan_mutex_not_static); }
+
+  // Disable copy or move. Copy doesn't make sense. Move isn't feasible because
+  // the pointer to the atomic integer is the handle used by the kernel to setup
+  // the wait chain. A movable futex would require the atomic integer to be heap
+  // allocated, but that would create an indirection layer that is not needed in
+  // most cases. If you really need a movable RtFutex, wrap it in a unique_ptr.
+  RtFutex(const RtFutex&) = delete;
+  RtFutex& operator=(const RtFutex&) = delete;
+  RtFutex(RtFutex&&) = delete;
+  RtFutex& operator=(RtFutex&&) = delete;
+
+  inline bool TryLockFastpath() noexcept {
+    int expected = 0;
+    return lock_.compare_exchange_strong(expected, ::gettid(),
+                                         std::memory_order_acquire,
+                                         std::memory_order_relaxed);
+  }
+
+  bool try_lock() noexcept PERFETTO_EXCLUSIVE_TRYLOCK_FUNCTION(true) {
+    PERFETTO_TSAN_MUTEX_PRE_LOCK(this, __tsan_mutex_try_lock);
+    if (PERFETTO_LIKELY(TryLockFastpath()) || TryLockSlowpath()) {
+      PERFETTO_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0);
+      return true;
+    }
+    PERFETTO_TSAN_MUTEX_POST_LOCK(
+        this, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0);
+    return false;
+  }
+
+  void lock() PERFETTO_EXCLUSIVE_LOCK_FUNCTION() {
+    PERFETTO_TSAN_MUTEX_PRE_LOCK(this, 0);
+    if (!PERFETTO_LIKELY(TryLockFastpath())) {
+      LockSlowpath();
+    }
+    PERFETTO_TSAN_MUTEX_POST_LOCK(this, 0, 0);
+  }
+
+  void unlock() noexcept PERFETTO_UNLOCK_FUNCTION() {
+    PERFETTO_TSAN_MUTEX_PRE_UNLOCK(this, 0);
+    int expected = ::gettid();
+    // If the current value is our tid, we can unlock without a syscall since
+    // there are no current waiters.
+    if (!PERFETTO_LIKELY(lock_.compare_exchange_strong(
+            expected, 0, std::memory_order_release,
+            std::memory_order_relaxed))) {
+      // The tid doesn't match because the kernel appended the FUTEX_WAITERS
+      // bit. There are waiters, tell the kernel to notify them and unlock.
+      UnlockSlowpath();
+    }
+    PERFETTO_TSAN_MUTEX_POST_UNLOCK(this, 0);
+  }
+
+ private:
+  std::atomic<int> lock_{};
+
+  void LockSlowpath();
+  bool TryLockSlowpath();
+  void UnlockSlowpath();
+};
+
+#endif  // PERFETTO_HAS_RT_FUTEX
+
 #if PERFETTO_HAS_POSIX_RT_MUTEX()
 class PERFETTO_LOCKABLE RtPosixMutex {
  public:
@@ -76,17 +197,16 @@
 #endif  // PERFETTO_HAS_POSIX_RT_MUTEX
 }  // namespace internal
 
-// Pick the best implementation for the target platform.
-// See comments in the top of the doc.
-#if PERFETTO_HAS_POSIX_RT_MUTEX()
-using RtMutex = internal::RtPosixMutex;
+// Select the best real-time mutex implementation for the target platform, or
+// fall back to std::mutex if none is available.
+#if PERFETTO_HAS_RT_FUTEX()
+using MaybeRtMutex = internal::RtFutex;
+#elif PERFETTO_HAS_POSIX_RT_MUTEX()
+using MaybeRtMutex = internal::RtPosixMutex;
 #else
-using RtMutex = std::mutex;
+using MaybeRtMutex = std::mutex;
 #endif
 
-using MaybeRtMutex =
-    std::conditional_t<base::flags::use_rt_mutex, RtMutex, std::mutex>;
-
 }  // namespace perfetto::base
 
 #endif  // INCLUDE_PERFETTO_EXT_BASE_RT_MUTEX_H_
diff --git a/perfetto_flags.aconfig b/perfetto_flags.aconfig
index 9f0af1d..45c5e65 100644
--- a/perfetto_flags.aconfig
+++ b/perfetto_flags.aconfig
@@ -42,3 +42,10 @@
   bug: "441118768"
   is_fixed_read_only: true
 }
+flag {
+  name: "use_rt_futex"
+  namespace: "perfetto"
+  description: "Controls whether base::MaybeRtMutex will use base::RtFutex or resolved type of base::MaybeRtMutex for android."
+  bug: "443948543"
+  is_fixed_read_only: true
+}
diff --git a/src/base/rt_mutex.cc b/src/base/rt_mutex.cc
index cc84a55..5551fb5 100644
--- a/src/base/rt_mutex.cc
+++ b/src/base/rt_mutex.cc
@@ -21,37 +21,53 @@
 #include "perfetto/base/logging.h"
 #include "perfetto/ext/base/utils.h"
 
-#if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID)
-#include <dlfcn.h>
+#if PERFETTO_HAS_RT_FUTEX()
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <unistd.h>
 #endif
 
 namespace perfetto::base {
 
 namespace internal {
 
+#if PERFETTO_HAS_RT_FUTEX()
+
+void RtFutex::LockSlowpath() {
+  auto res = PERFETTO_EINTR(
+      syscall(SYS_futex, &lock_, FUTEX_LOCK_PI_PRIVATE, 0, nullptr));
+  PERFETTO_CHECK(res == 0);
+}
+
+bool RtFutex::TryLockSlowpath() {  // Non-blocking kernel-assisted acquire.
+  auto res = PERFETTO_EINTR(
+      syscall(SYS_futex, &lock_, FUTEX_TRYLOCK_PI_PRIVATE, 0, nullptr));
+  if (res == 0)
+    return true;
+  if (errno == EAGAIN || errno == EBUSY || errno == EDEADLK)
+    return false;  // Lock is held (EAGAIN == EWOULDBLOCK is the usual case).
+  PERFETTO_FATAL("FUTEX_TRYLOCK_PI_PRIVATE failed");
+}
+
+void RtFutex::UnlockSlowpath() {
+  auto res = PERFETTO_EINTR(
+      syscall(SYS_futex, &lock_, FUTEX_UNLOCK_PI_PRIVATE, 0, nullptr));
+  PERFETTO_CHECK(res == 0);
+}
+
+#endif  // PERFETTO_HAS_RT_FUTEX
+
 #if PERFETTO_HAS_POSIX_RT_MUTEX()
 
 RtPosixMutex::RtPosixMutex() noexcept {
-  pthread_mutexattr_t at{};
-  PERFETTO_CHECK(pthread_mutexattr_init(&at) == 0);
 #if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) && __ANDROID_API__ < 28
   // pthread_mutexattr_setprotocol is only available on API 28.
-  using SetprotocolFuncT = int (*)(pthread_mutexattr_t*, int);
-  static auto setprotocol_func = reinterpret_cast<SetprotocolFuncT>(
-      dlsym(RTLD_DEFAULT, "pthread_mutexattr_setprotocol"));
-  if (setprotocol_func) {
-    PERFETTO_CHECK(setprotocol_func(&at, PTHREAD_PRIO_INHERIT) == 0);
-  } else {
-    static uint64_t log_once = 0;
-    if (log_once++ == 0) {
-      PERFETTO_LOG(
-          "Priority-inheritance RtMutex is not available in this version of "
-          "Android.");
-    }
-  }
-#else  // Not Android (but POSIX RT)
-  PERFETTO_CHECK(pthread_mutexattr_setprotocol(&at, PTHREAD_PRIO_INHERIT) == 0);
+#error \
+    "Priority-inheritance RtMutex is not available in this version of Android."
 #endif
+  pthread_mutexattr_t at{};
+  PERFETTO_CHECK(pthread_mutexattr_init(&at) == 0);
+  PERFETTO_CHECK(pthread_mutexattr_setprotocol(&at, PTHREAD_PRIO_INHERIT) == 0);
   PERFETTO_CHECK(pthread_mutex_init(&mutex_, &at) == 0);
 }
 
diff --git a/src/base/rt_mutex_benchmark.cc b/src/base/rt_mutex_benchmark.cc
index 473d339..453fcbe 100644
--- a/src/base/rt_mutex_benchmark.cc
+++ b/src/base/rt_mutex_benchmark.cc
@@ -102,6 +102,12 @@
 BENCHMARK_TEMPLATE(BM_RtMutex_NoContention, std::mutex)->Apply(BenchmarkArgs);
 BENCHMARK_TEMPLATE(BM_RtMutex_Contention, std::mutex)->Apply(BenchmarkArgs);
 
+#if PERFETTO_HAS_RT_FUTEX()
+using perfetto::base::internal::RtFutex;
+BENCHMARK_TEMPLATE(BM_RtMutex_NoContention, RtFutex)->Apply(BenchmarkArgs);
+BENCHMARK_TEMPLATE(BM_RtMutex_Contention, RtFutex)->Apply(BenchmarkArgs);
+#endif
+
 #if PERFETTO_HAS_POSIX_RT_MUTEX()
 using perfetto::base::internal::RtPosixMutex;
 BENCHMARK_TEMPLATE(BM_RtMutex_NoContention, RtPosixMutex)->Apply(BenchmarkArgs);
diff --git a/src/base/rt_mutex_unittest.cc b/src/base/rt_mutex_unittest.cc
index cd4afdb..05a8b08 100644
--- a/src/base/rt_mutex_unittest.cc
+++ b/src/base/rt_mutex_unittest.cc
@@ -15,6 +15,7 @@
  */
 
 #include "perfetto/ext/base/rt_mutex.h"
+#include "perfetto/ext/base/flags.h"
 
 #include "test/gtest_and_gmock.h"
 
@@ -35,6 +36,10 @@
                                         ,
                                         internal::RtPosixMutex
 #endif
+#if PERFETTO_HAS_RT_FUTEX()
+                                        ,
+                                        internal::RtFutex
+#endif
                                         >;
 
 class NameGenerator {
@@ -47,6 +52,10 @@
     if constexpr (std::is_same_v<T, internal::RtPosixMutex>)
       return "RtPosix";
 #endif
+#if PERFETTO_HAS_RT_FUTEX()
+    if constexpr (std::is_same_v<T, internal::RtFutex>)
+      return "RtFutex";
+#endif
   }
 };