TraceProcessor: optimize std::sort<TimestampedTracePiece>

TraceSorter::Sort() is a very hot path in trace processor.
That does a std::sort on a CircularQueue<TTP>.
std::sort uses std::swap, which falls back on 2 std::moves
when not implemented.
TTP happens to be precisely 512-bits wide. This allows to
implement a very efficient swap which leverages XMM registers
on x86_64.
This saves 7-10% of trace ingestion time on a large trace.

Bug: 205302474
Change-Id: Ie57fc26e79599b2e27945dc5cef176c1d03d95cc
diff --git a/include/perfetto/ext/base/circular_queue.h b/include/perfetto/ext/base/circular_queue.h
index 8d1f1b7..80574c2 100644
--- a/include/perfetto/ext/base/circular_queue.h
+++ b/include/perfetto/ext/base/circular_queue.h
@@ -18,6 +18,8 @@
 #define INCLUDE_PERFETTO_EXT_BASE_CIRCULAR_QUEUE_H_
 
 #include <stdint.h>
+#include <stdlib.h>
+
 #include <iterator>
 
 #include "perfetto/base/logging.h"
@@ -258,7 +260,12 @@
     PERFETTO_CHECK(new_capacity > capacity_);
     size_t malloc_size = new_capacity * sizeof(T);
     PERFETTO_CHECK(malloc_size > new_capacity);
-    auto* new_vec = static_cast<T*>(malloc(malloc_size));
+
+    void* new_mem = nullptr;
+    // posix_memalign() wants at least void* alignment.
+    static constexpr size_t alignment = AlignUp<sizeof(void*)>(alignof(T));
+    PERFETTO_CHECK(posix_memalign(&new_mem, alignment, malloc_size) == 0);
+    T* new_vec = static_cast<T*>(new_mem);
 
     // Move all elements in the expanded array.
     size_t new_size = 0;
diff --git a/src/trace_processor/timestamped_trace_piece.h b/src/trace_processor/timestamped_trace_piece.h
index 3cd27cd..a1c7112 100644
--- a/src/trace_processor/timestamped_trace_piece.h
+++ b/src/trace_processor/timestamped_trace_piece.h
@@ -78,7 +78,7 @@
 
 // A TimestampedTracePiece is (usually a reference to) a piece of a trace that
 // is sorted by TraceSorter.
-struct TimestampedTracePiece {
+struct alignas(64) TimestampedTracePiece {
   enum class Type {
     kInvalid = 0,
     kFtraceEvent,
@@ -242,6 +242,22 @@
            (timestamp == o.timestamp && packet_idx < o.packet_idx);
   }
 
+  // For std::sort(). Without this the compiler will fall back on invoking
+  // move operators on temporary objects.
+  friend void swap(TimestampedTracePiece& a, TimestampedTracePiece& b) {
+    // We know that TimestampedTracePiece is 64-byte aligned (because of the
+    // alignas(64) in the declaration above). We also know that swapping it is
+    // trivial and we can just swap the memory without invoking move operators.
+    // The cast to aligned_storage below allows the compiler to turn this into
+    // a bunch of movaps with large XMM registers (128/256/512 bit depending on
+    // -mavx).
+    using AS =
+        typename std::aligned_storage<sizeof(TimestampedTracePiece),
+                                      alignof(TimestampedTracePiece)>::type;
+    using std::swap;
+    swap(reinterpret_cast<AS&>(a), reinterpret_cast<AS&>(b));
+  }
+
   // Fields ordered for packing.
 
   // Data for different types of TimestampedTracePiece.
@@ -261,6 +277,14 @@
   Type type;
 };
 
+// std::sort<TTS> is an extremely hot path in TraceProcessor (in trace_sorter.h)
+// When TTS is 512-bit wide, we can leverage SIMD instructions to swap it by
+// declaring it aligned at its own size, without losing any space in the
+// CircularQueue due to fragmentation. This makes a 6% difference in the
+// ingestion time of a large trace. See the comments above in the swap() above.
+static_assert(sizeof(TimestampedTracePiece) <= 64,
+              "TimestampedTracePiece cannot grow beyond 64 bytes");
+
 }  // namespace trace_processor
 }  // namespace perfetto