TraceProcessor: optimize std::sort<TimestampedTracePiece>
TraceSorter::Sort() is a very hot path in trace processor.
That does a std::sort on a CircularQueue<TTP>.
std::sort uses std::swap, which falls back on 2 std::moves
when not implemented.
TTP happens to be precisely 512-bits wide. This allows to
implement a very efficient swap which leverages XMM registers
on x86_64.
This saves 7-10% of trace ingestion time on a large trace.
Bug: 205302474
Change-Id: Ie57fc26e79599b2e27945dc5cef176c1d03d95cc
diff --git a/include/perfetto/ext/base/circular_queue.h b/include/perfetto/ext/base/circular_queue.h
index 8d1f1b7..80574c2 100644
--- a/include/perfetto/ext/base/circular_queue.h
+++ b/include/perfetto/ext/base/circular_queue.h
@@ -18,6 +18,8 @@
#define INCLUDE_PERFETTO_EXT_BASE_CIRCULAR_QUEUE_H_
#include <stdint.h>
+#include <stdlib.h>
+
#include <iterator>
#include "perfetto/base/logging.h"
@@ -258,7 +260,12 @@
PERFETTO_CHECK(new_capacity > capacity_);
size_t malloc_size = new_capacity * sizeof(T);
PERFETTO_CHECK(malloc_size > new_capacity);
- auto* new_vec = static_cast<T*>(malloc(malloc_size));
+
+ void* new_mem = nullptr;
+ // posix_memalign() wants at least void* alignment.
+ static constexpr size_t alignment = AlignUp<sizeof(void*)>(alignof(T));
+ PERFETTO_CHECK(posix_memalign(&new_mem, alignment, malloc_size) == 0);
+ T* new_vec = static_cast<T*>(new_mem);
// Move all elements in the expanded array.
size_t new_size = 0;
diff --git a/src/trace_processor/timestamped_trace_piece.h b/src/trace_processor/timestamped_trace_piece.h
index 3cd27cd..a1c7112 100644
--- a/src/trace_processor/timestamped_trace_piece.h
+++ b/src/trace_processor/timestamped_trace_piece.h
@@ -78,7 +78,7 @@
// A TimestampedTracePiece is (usually a reference to) a piece of a trace that
// is sorted by TraceSorter.
-struct TimestampedTracePiece {
+struct alignas(64) TimestampedTracePiece {
enum class Type {
kInvalid = 0,
kFtraceEvent,
@@ -242,6 +242,22 @@
(timestamp == o.timestamp && packet_idx < o.packet_idx);
}
+ // For std::sort(). Without this the compiler will fall back on invoking
+ // move operators on temporary objects.
+ friend void swap(TimestampedTracePiece& a, TimestampedTracePiece& b) {
+ // We know that TimestampedTracePiece is 64-byte aligned (because of the
+ // alignas(64) in the declaration above). We also know that swapping it is
+ // trivial and we can just swap the memory without invoking move operators.
+ // The cast to aligned_storage below allows the compiler to turn this into
+ // a bunch of movaps with large XMM registers (128/256/512 bit depending on
+ // -mavx).
+ using AS =
+ typename std::aligned_storage<sizeof(TimestampedTracePiece),
+ alignof(TimestampedTracePiece)>::type;
+ using std::swap;
+ swap(reinterpret_cast<AS&>(a), reinterpret_cast<AS&>(b));
+ }
+
// Fields ordered for packing.
// Data for different types of TimestampedTracePiece.
@@ -261,6 +277,14 @@
Type type;
};
+// std::sort<TTS> is an extremely hot path in TraceProcessor (in trace_sorter.h)
+// When TTS is 512-bit wide, we can leverage SIMD instructions to swap it by
+// declaring it aligned at its own size, without losing any space in the
+// CircularQueue due to fragmentation. This makes a 6% difference in the
+// ingestion time of a large trace. See the comments above in the swap() above.
+static_assert(sizeof(TimestampedTracePiece) <= 64,
+ "TimestampedTracePiece cannot grow beyond 64 bytes");
+
} // namespace trace_processor
} // namespace perfetto