Enable -msse4.2, -mavx, -mpopcnt on x86_64 builds

This allows the compiler to optimize various memory-access
operations, like the ones in TraceProcessor's bit_vector.h.
Also add a run-time check that the host CPU actually has the
features we are assuming, exiting the process gracefully if
it does not.

Bug: 205302474
Change-Id: I65bc93487aea5202c947686de577a74d3261f14e
diff --git a/src/base/utils.cc b/src/base/utils.cc
index 3f637b1..732117a 100644
--- a/src/base/utils.cc
+++ b/src/base/utils.cc
@@ -27,6 +27,7 @@
     PERFETTO_BUILDFLAG(PERFETTO_OS_APPLE) ||   \
     PERFETTO_BUILDFLAG(PERFETTO_OS_FUCHSIA)
 #include <limits.h>
+#include <stdlib.h>  // For _exit()
 #include <unistd.h>  // For getpagesize() and geteuid() & fork()
 #endif
 
@@ -58,6 +59,57 @@
 }  // namespace
 #endif  // OS_ANDROID
 
+namespace {
+
+#if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT)
+
+// Runs CPUID(eax=a_inp, ecx=c_inp). %rbx is preserved via %rdi (|b| is read
+// from %rdi) to work around https://bugs.llvm.org/show_bug.cgi?id=17907
+// (clang does not consider %rbx in an output constraint as clobbered).
+#define PERFETTO_GETCPUID(a, b, c, d, a_inp, c_inp) \
+  asm("mov %%rbx, %%rdi\n"                          \
+      "cpuid\n"                                     \
+      "xchg %%rdi, %%rbx\n"                         \
+      : "=a"(a), "=D"(b), "=c"(c), "=d"(d)          \
+      : "a"(a_inp), "2"(c_inp))
+
+uint32_t GetXCR0EAX() {  // Requires CPUID.1:ECX.OSXSAVE, else XGETBV #UDs.
+  uint32_t eax = 0, edx = 0;  // XGETBV writes the register to edx:eax.
+  asm("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));  // ecx=0 selects XCR0.
+  return eax;  // Low 32 bits only; edx (the upper half) is discarded.
+}
+
+// Load-time check that the CPU and OS support what we build with (-msse4.2,
+// -mavx, -mpopcnt). This must be kept in sync with gn/standalone/BUILD.gn.
+void PERFETTO_EXPORT __attribute__((constructor)) CheckCpuOptimizations() {
+  uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
+  PERFETTO_GETCPUID(eax, ebx, ecx, edx, 1, 0);  // Leaf 1: features in ecx.
+
+  static constexpr uint64_t xcr0_xmm_mask = 0x2;
+  static constexpr uint64_t xcr0_ymm_mask = 0x4;
+  static constexpr uint64_t xcr0_avx_mask = xcr0_xmm_mask | xcr0_ymm_mask;
+
+  const bool have_popcnt = ecx & (1u << 23);
+  const bool have_sse4_2 = ecx & (1u << 20);
+  const bool have_avx =
+      // Test OSXSAVE first: XGETBV raises #UD if the OS has not enabled it.
+      (ecx & (1u << 27)) &&  // OSXSAVE: OS supports XGETBV.
+      (ecx & (1u << 28)) &&  // AVX supported in hardware.
+      ((GetXCR0EAX() & xcr0_avx_mask) == xcr0_avx_mask);  // OS saves XMM/YMM.
+
+  if (!have_sse4_2 || !have_popcnt || !have_avx) {
+    fprintf(
+        stderr,
+        "This executable requires a cpu that supports SSE4.2, POPCNT and AVX.\n"
+        "Rebuild with enable_perfetto_x64_cpu_opt=false (ebx=%x, ecx=%x).\n",
+        ebx, ecx);
+    _exit(126);
+  }
+}
+#endif
+
+}  // namespace
+
 namespace perfetto {
 namespace base {