/*
* Copyright (C) 2019 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/profiling/perf/event_reader.h"
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include "perfetto/ext/base/utils.h"
#include "src/profiling/perf/regs_parsing.h"
namespace perfetto {
namespace profiling {
namespace {
template <typename T>
const char* ReadValue(T* value_out, const char* ptr) {
memcpy(value_out, reinterpret_cast<const void*>(ptr), sizeof(T));
return ptr + sizeof(T);
}
template <typename T>
const char* ReadValues(T* out, const char* ptr, size_t num_values) {
size_t sz = sizeof(T) * num_values;
memcpy(out, reinterpret_cast<const void*>(ptr), sz);
return ptr + sz;
}
bool IsPowerOfTwo(size_t v) {
return (v != 0 && ((v & (v - 1)) == 0));
}
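// Thin wrapper around the perf_event_open syscall, for which glibc provides
// no wrapper of its own.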
static int perf_event_open(perf_event_attr* attr,
pid_t pid,
int cpu,
int group_fd,
unsigned long flags) {
return static_cast<int>(
syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags));
}
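// Opens the event for all processes on the given |cpu| (pid = -1 with a
// non-negative cpu, per perf_event_open(2)). With |group_fd| == -1 the event
// becomes its own group leader; passing an existing leader's fd instead adds
// the new event to that group.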
base::ScopedFile PerfEventOpen(uint32_t cpu,
perf_event_attr* perf_attr,
int group_fd = -1) {
base::ScopedFile perf_fd{perf_event_open(perf_attr, /*pid=*/-1,
static_cast<int>(cpu), group_fd,
PERF_FLAG_FD_CLOEXEC)};
return perf_fd;
}
// If the event is a tracepoint, apply the requested event filter (if any).
bool MaybeApplyTracepointFilter(int fd, const PerfCounter& event) {
if (event.type != PerfCounter::Type::kTracepoint ||
event.tracepoint_filter.empty()) {
return true;
}
PERFETTO_DCHECK(event.attr_type == PERF_TYPE_TRACEPOINT);
if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, event.tracepoint_filter.c_str())) {
PERFETTO_PLOG("Failed ioctl to set event filter");
return false;
}
return true;
}
} // namespace
PerfRingBuffer::PerfRingBuffer(PerfRingBuffer&& other) noexcept
: metadata_page_(other.metadata_page_),
mmap_sz_(other.mmap_sz_),
data_buf_(other.data_buf_),
data_buf_sz_(other.data_buf_sz_) {
other.metadata_page_ = nullptr;
other.mmap_sz_ = 0;
other.data_buf_ = nullptr;
other.data_buf_sz_ = 0;
}
PerfRingBuffer& PerfRingBuffer::operator=(PerfRingBuffer&& other) noexcept {
if (this == &other)
return *this;
this->~PerfRingBuffer();
new (this) PerfRingBuffer(std::move(other));
return *this;
}
PerfRingBuffer::~PerfRingBuffer() {
if (!valid())
return;
if (munmap(reinterpret_cast<void*>(metadata_page_), mmap_sz_) != 0)
PERFETTO_PLOG("failed munmap");
}
std::optional<PerfRingBuffer> PerfRingBuffer::Allocate(int perf_fd,
size_t data_page_count) {
// perf_event_open requires the ring buffer to be a power of two in size.
PERFETTO_DCHECK(IsPowerOfTwo(data_page_count));
PerfRingBuffer ret;
// mmap request is one page larger than the buffer size (for the metadata).
ret.data_buf_sz_ = data_page_count * base::GetSysPageSize();
ret.mmap_sz_ = ret.data_buf_sz_ + base::GetSysPageSize();
// Mapping with PROT_WRITE keeps the buffer in non-overwrite mode: the kernel
// won't overwrite samples that haven't been marked as read via |data_tail|.
void* mmap_addr = mmap(nullptr, ret.mmap_sz_, PROT_READ | PROT_WRITE,
MAP_SHARED, perf_fd, 0);
if (mmap_addr == MAP_FAILED) {
PERFETTO_PLOG("failed mmap");
return std::nullopt;
}
// Expected layout is [ metadata page ] [ data pages ... ]
ret.metadata_page_ = reinterpret_cast<perf_event_mmap_page*>(mmap_addr);
ret.data_buf_ = reinterpret_cast<char*>(mmap_addr) + base::GetSysPageSize();
PERFETTO_CHECK(ret.metadata_page_->data_offset == base::GetSysPageSize());
PERFETTO_CHECK(ret.metadata_page_->data_size == ret.data_buf_sz_);
PERFETTO_DCHECK(IsPowerOfTwo(ret.data_buf_sz_));
return std::make_optional(std::move(ret));
}
// See |perf_output_put_handle| for the necessary synchronization between the
// kernel and this userspace thread (which are using the same shared memory, but
// might be on different cores).
// TODO(rsavitski): is there false sharing between |data_tail| and |data_head|?
// Is there an argument for maintaining our own copy of |data_tail| instead of
// reloading it?
char* PerfRingBuffer::ReadRecordNonconsuming() {
static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t), "");
PERFETTO_DCHECK(valid());
// |data_tail| is written only by this userspace thread, so we can safely read
// it without any synchronization.
uint64_t read_offset = metadata_page_->data_tail;
// |data_head| is written by the kernel; load it with acquire semantics so
// that the payload reads below are ordered after this load.
uint64_t write_offset =
reinterpret_cast<std::atomic<uint64_t>*>(&metadata_page_->data_head)
->load(std::memory_order_acquire);
PERFETTO_DCHECK(read_offset <= write_offset);
if (write_offset == read_offset)
return nullptr; // no new data
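// |data_head| and |data_tail| are free-running byte counters; because the
// data buffer size is a power of two, masking with (size - 1) maps them to
// positions within the buffer.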
size_t read_pos = static_cast<size_t>(read_offset & (data_buf_sz_ - 1));
// event header (64 bits) guaranteed to be contiguous
PERFETTO_DCHECK(read_pos <= data_buf_sz_ - sizeof(perf_event_header));
PERFETTO_DCHECK(0 == reinterpret_cast<size_t>(data_buf_ + read_pos) %
alignof(perf_event_header));
perf_event_header* evt_header =
reinterpret_cast<perf_event_header*>(data_buf_ + read_pos);
uint16_t evt_size = evt_header->size;
// Event wraps around the end of the ring buffer - reconstruct it contiguously
// in |reconstructed_record_| and return a pointer into that copy.
if (read_pos + evt_size > data_buf_sz_) {
PERFETTO_DLOG("PerfRingBuffer: returning reconstructed event");
size_t prefix_sz = data_buf_sz_ - read_pos;
memcpy(&reconstructed_record_[0], data_buf_ + read_pos, prefix_sz);
memcpy(&reconstructed_record_[0] + prefix_sz, data_buf_,
evt_size - prefix_sz);
return &reconstructed_record_[0];
} else {
// usual case - contiguous sample
return data_buf_ + read_pos;
}
}
void PerfRingBuffer::Consume(size_t bytes) {
PERFETTO_DCHECK(valid());
// Advance |data_tail|, which is written only by this thread. The store of the
// updated value needs to have release semantics such that the preceding
// payload reads are ordered before it. The reader in this case is the kernel,
// which reads |data_tail| to calculate the available ring buffer capacity
// before trying to store a new record.
uint64_t updated_tail = metadata_page_->data_tail + bytes;
reinterpret_cast<std::atomic<uint64_t>*>(&metadata_page_->data_tail)
->store(updated_tail, std::memory_order_release);
}
EventReader::EventReader(uint32_t cpu,
perf_event_attr event_attr,
base::ScopedFile perf_fd,
std::vector<base::ScopedFile> followers_fds,
PerfRingBuffer ring_buffer)
: cpu_(cpu),
event_attr_(event_attr),
perf_fd_(std::move(perf_fd)),
follower_fds_(std::move(followers_fds)),
ring_buffer_(std::move(ring_buffer)) {}
EventReader& EventReader::operator=(EventReader&& other) noexcept {
if (this == &other)
return *this;
this->~EventReader();
new (this) EventReader(std::move(other));
return *this;
}
std::optional<EventReader> EventReader::ConfigureEvents(
uint32_t cpu,
const EventConfig& event_cfg) {
auto timebase_fd = PerfEventOpen(cpu, event_cfg.perf_attr());
if (!timebase_fd) {
PERFETTO_PLOG("Failed perf_event_open");
return std::nullopt;
}
// Open followers.
std::vector<base::ScopedFile> follower_fds;
for (auto follower_attr : event_cfg.perf_attr_followers()) {
auto follower_fd = PerfEventOpen(cpu, &follower_attr, timebase_fd.get());
if (!follower_fd) {
PERFETTO_PLOG("Failed perf_event_open (follower)");
return std::nullopt;
}
follower_fds.push_back(std::move(follower_fd));
}
// Apply the tracepoint filter (if any) to the timebase event.
if (!MaybeApplyTracepointFilter(timebase_fd.get(),
event_cfg.timebase_event()))
return std::nullopt;
// Apply tracepoint filters (if any) to the follower events.
if (follower_fds.size() != event_cfg.follower_events().size()) {
return std::nullopt;
}
for (size_t i = 0; i < follower_fds.size(); ++i) {
if (!MaybeApplyTracepointFilter(follower_fds[i].get(),
event_cfg.follower_events()[i]))
return std::nullopt;
}
auto ring_buffer = PerfRingBuffer::Allocate(timebase_fd.get(),
event_cfg.ring_buffer_pages());
if (!ring_buffer.has_value()) {
return std::nullopt;
}
return EventReader(cpu, *event_cfg.perf_attr(), std::move(timebase_fd),
std::move(follower_fds), std::move(ring_buffer.value()));
}
std::optional<ParsedSample> EventReader::ReadUntilSample(
std::function<void(uint64_t)> records_lost_callback) {
for (;;) {
char* event = ring_buffer_.ReadRecordNonconsuming();
if (!event)
return std::nullopt; // caught up with the writer
auto* event_hdr = reinterpret_cast<const perf_event_header*>(event);
if (event_hdr->type == PERF_RECORD_SAMPLE) {
ParsedSample sample = ParseSampleRecord(cpu_, event);
ring_buffer_.Consume(event_hdr->size);
return std::make_optional(std::move(sample));
}
if (event_hdr->type == PERF_RECORD_LOST) {
/*
* struct {
* struct perf_event_header header;
* u64 id;
* u64 lost;
* struct sample_id sample_id;
* };
*/
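// Skip over |header| and |id| to read the |lost| count.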
uint64_t records_lost = *reinterpret_cast<const uint64_t*>(
event + sizeof(perf_event_header) + sizeof(uint64_t));
records_lost_callback(records_lost);
ring_buffer_.Consume(event_hdr->size);
continue; // keep looking for a sample
}
// Kernel throttled (or unthrottled) sampling for this event.
if (event_hdr->type == PERF_RECORD_THROTTLE ||
event_hdr->type == PERF_RECORD_UNTHROTTLE) {
ring_buffer_.Consume(event_hdr->size);
continue; // keep looking for a sample
}
PERFETTO_DFATAL_OR_ELOG("Unsupported event type [%zu]",
static_cast<size_t>(event_hdr->type));
ring_buffer_.Consume(event_hdr->size);
}
}
// Generally, samples can belong to any cpu (which can be recorded with
// PERF_SAMPLE_CPU). However, this producer uses only cpu-scoped events, so
// the cpu is already known.
ParsedSample EventReader::ParseSampleRecord(uint32_t cpu,
const char* record_start) {
if (event_attr_.sample_type &
(~uint64_t(PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_STACK_USER |
PERF_SAMPLE_REGS_USER | PERF_SAMPLE_CALLCHAIN |
PERF_SAMPLE_READ))) {
PERFETTO_FATAL("Unsupported sampling option");
}
auto* event_hdr = reinterpret_cast<const perf_event_header*>(record_start);
size_t sample_size = event_hdr->size;
ParsedSample sample = {};
sample.common.cpu = cpu;
sample.common.cpu_mode = event_hdr->misc & PERF_RECORD_MISC_CPUMODE_MASK;
// Parse the payload, which consists of concatenated data for each
// |attr.sample_type| flag.
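// For reference, the relevant slices of the PERF_RECORD_SAMPLE layout (as
// documented in linux/perf_event.h), in the order they are parsed below:
//   { u32 pid, tid; }                             if PERF_SAMPLE_TID
//   { u64 time; }                                 if PERF_SAMPLE_TIME
//   { struct read_format values; }                if PERF_SAMPLE_READ
//   { u64 nr; u64 ips[nr]; }                      if PERF_SAMPLE_CALLCHAIN
//   { u64 abi; u64 regs[weight(mask)]; }          if PERF_SAMPLE_REGS_USER
//   { u64 size; char data[size]; u64 dyn_size; }  if PERF_SAMPLE_STACK_USER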
const char* parse_pos = record_start + sizeof(perf_event_header);
if (event_attr_.sample_type & PERF_SAMPLE_TID) {
uint32_t pid = 0;
uint32_t tid = 0;
parse_pos = ReadValue(&pid, parse_pos);
parse_pos = ReadValue(&tid, parse_pos);
sample.common.pid = static_cast<pid_t>(pid);
sample.common.tid = static_cast<pid_t>(tid);
}
if (event_attr_.sample_type & PERF_SAMPLE_TIME) {
parse_pos = ReadValue(&sample.common.timestamp, parse_pos);
}
if (event_attr_.sample_type & PERF_SAMPLE_READ) {
if (event_attr_.read_format & PERF_FORMAT_GROUP) {
// When PERF_FORMAT_GROUP is specified, the record starts with the number
// of events it contains, followed by one value per event. The event list
// always starts with the value of the timebase.
// In a ParsedSample, the value of the timebase goes into timebase_count
// and the values of the follower events go into follower_counts.
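// Assumed read_format layout (no PERF_FORMAT_ID or PERF_FORMAT_TOTAL_TIME_*
// flags set, matching what the parsing below expects):
//   { u64 nr; u64 values[nr]; }
// where values[0] belongs to the group leader (the timebase).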
uint64_t nr = 0;
parse_pos = ReadValue(&nr, parse_pos);
PERFETTO_CHECK(nr != 0);
parse_pos = ReadValue(&sample.common.timebase_count, parse_pos);
sample.common.follower_counts.resize(nr - 1);
for (size_t i = 0; i < nr - 1; ++i) {
parse_pos = ReadValue(&sample.common.follower_counts[i], parse_pos);
}
} else {
parse_pos = ReadValue(&sample.common.timebase_count, parse_pos);
}
}
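// PERF_SAMPLE_CALLCHAIN payload: { u64 nr; u64 ips[nr]; }. The destination is
// named |kernel_ips| since user-space frames are unwound separately, from the
// sampled registers and stack.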
if (event_attr_.sample_type & PERF_SAMPLE_CALLCHAIN) {
uint64_t chain_len = 0;
parse_pos = ReadValue(&chain_len, parse_pos);
sample.kernel_ips.resize(static_cast<size_t>(chain_len));
parse_pos = ReadValues<uint64_t>(sample.kernel_ips.data(), parse_pos,
static_cast<size_t>(chain_len));
}
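// PERF_SAMPLE_REGS_USER payload: { u64 abi; u64 regs[weight(mask)]; }. If
// |abi| is PERF_SAMPLE_REGS_ABI_NONE (0), no register values follow.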
if (event_attr_.sample_type & PERF_SAMPLE_REGS_USER) {
// Can be empty, e.g. if we sampled a kernel thread.
sample.regs = ReadPerfUserRegsData(&parse_pos);
}
if (event_attr_.sample_type & PERF_SAMPLE_STACK_USER) {
// Maximum possible sampled stack size for this sample. Can be lower than
// the requested size if there wasn't enough room in the sample (which is
// limited to 64k).
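// On-record layout: { u64 size; char data[size]; u64 dyn_size; }, where
// |dyn_size| (the size of the filled prefix) is present only if |size| is
// non-zero.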
uint64_t max_stack_size;
parse_pos = ReadValue(&max_stack_size, parse_pos);
const char* stack_start = parse_pos;
parse_pos += max_stack_size; // skip to dyn_size
// Payload written conditionally, e.g. kernel threads don't have a
// user stack.
if (max_stack_size > 0) {
uint64_t filled_stack_size;
parse_pos = ReadValue(&filled_stack_size, parse_pos);
// copy stack bytes into a vector
size_t payload_sz = static_cast<size_t>(filled_stack_size);
sample.stack.resize(payload_sz);
memcpy(sample.stack.data(), stack_start, payload_sz);
// remember whether the stack sample is (most likely) truncated
sample.stack_maxed = (filled_stack_size == max_stack_size);
}
}
PERFETTO_CHECK(parse_pos == record_start + sample_size);
return sample;
}
void EventReader::EnableEvents() {
int ret = ioctl(perf_fd_.get(), PERF_EVENT_IOC_ENABLE);
PERFETTO_CHECK(ret == 0);
}
void EventReader::DisableEvents() {
int ret = ioctl(perf_fd_.get(), PERF_EVENT_IOC_DISABLE);
PERFETTO_CHECK(ret == 0);
}
} // namespace profiling
} // namespace perfetto