Enable -msse4.2, -mavx, -mpopcnt on x86_64 builds
This allows the compiler to optimize various memory-access
operations, like the ones in TraceProcessor's bit_vector.h.
Also add a run-time check that the host CPU actually has the
features we are assuming, and exit the process gracefully
with an error message if it does not.
Bug: 205302474
Change-Id: I65bc93487aea5202c947686de577a74d3261f14e
diff --git a/src/base/utils.cc b/src/base/utils.cc
index 3f637b1..732117a 100644
--- a/src/base/utils.cc
+++ b/src/base/utils.cc
@@ -27,6 +27,7 @@
PERFETTO_BUILDFLAG(PERFETTO_OS_APPLE) || \
PERFETTO_BUILDFLAG(PERFETTO_OS_FUCHSIA)
#include <limits.h>
+#include <stdlib.h> // For _exit()
#include <unistd.h> // For getpagesize() and geteuid() & fork()
#endif
@@ -58,6 +59,57 @@
} // namespace
#endif // OS_ANDROID
+namespace {
+
+#if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT)
+
+// CPUID with EAX=a_inp, ECX=c_inp; EAX/EBX/ECX/EDX are stored in a/b/c/d.
+// %rbx is preserved via %rdi (which also carries EBX out) because of clang bug
+// https://bugs.llvm.org/show_bug.cgi?id=17907 (%rbx output is not a clobber).
+#define PERFETTO_GETCPUID(a, b, c, d, a_inp, c_inp) \
+ asm("mov %%rbx, %%rdi\n" \
+ "cpuid\n" \
+ "xchg %%rdi, %%rbx\n" \
+ : "=a"(a), "=D"(b), "=c"(c), "=d"(d) \
+ : "a"(a_inp), "2"(c_inp))
+
+uint32_t GetXCR0EAX() {  // Low 32 bits (EAX) of XGETBV with ECX=0, i.e. XCR0.
+ uint32_t eax = 0, edx = 0;
+ asm("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));  // #UD unless OSXSAVE is on.
+ return eax;  // The EDX output is captured but not used.
+}
+
+// If we are building with -msse4.2/-mavx/-mpopcnt, check at load time that the
+// CPU actually supports them. Keep in sync with gn/standalone/BUILD.gn.
+void PERFETTO_EXPORT __attribute__((constructor)) CheckCpuOptimizations() {
+  uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
+  PERFETTO_GETCPUID(eax, ebx, ecx, edx, 1, 0);
+
+  static constexpr uint64_t xcr0_xmm_mask = 0x2;
+  static constexpr uint64_t xcr0_ymm_mask = 0x4;
+  static constexpr uint64_t xcr0_avx_mask = xcr0_xmm_mask | xcr0_ymm_mask;
+
+  const bool have_popcnt = ecx & (1u << 23);
+  const bool have_sse4_2 = ecx & (1u << 20);
+  const bool have_avx =
+      // XGETBV faults (#UD) unless OSXSAVE is set, so test the CPUID bits first.
+      (ecx & (1u << 27)) &&  // OS support XGETBV.
+      (ecx & (1u << 28)) &&  // AVX supported in hardware
+      ((GetXCR0EAX() & xcr0_avx_mask) == xcr0_avx_mask);  // OS saves XMM+YMM.
+
+  if (!have_sse4_2 || !have_popcnt || !have_avx) {
+    fprintf(
+        stderr,
+        "This executable requires a cpu that supports SSE4.2, POPCNT and AVX.\n"
+        "Rebuild with enable_perfetto_x64_cpu_opt=false (ebx=%x, ecx=%x).\n",
+        ebx, ecx);
+    _exit(126);
+  }
+}
+#endif
+
+} // namespace
+
namespace perfetto {
namespace base {