traced_perf: allow collection and symbolization of kernel frames

If a config asks for kernel frames, we include PERF_SAMPLE_CALLCHAIN in
the event config. The kernel unwinds its own frames, and we only need to
symbolize them in the unwinder, using the kallsyms symbolizer that was
added for traced_probes.

The kernel frames are output as part of the normal callstack, and can
be further identified by having a magical "kernel" string for their
mapping name. This is similar to what we do for synthetic error frames.

Tested with aosp/1496216 for the SELinux changes around /proc/kallsyms
access on Android.

Bug: 173124818
Change-Id: I05ef78621d00c60e05bb4833c63505c910a0928a
diff --git a/Android.bp b/Android.bp
index 1756f7b..65c9f37 100644
--- a/Android.bp
+++ b/Android.bp
@@ -8844,6 +8844,7 @@
     ":perfetto_src_profiling_perf_traced_perf_main",
     ":perfetto_src_profiling_perf_unwinding",
     ":perfetto_src_protozero_protozero",
+    ":perfetto_src_traced_probes_ftrace_kallsyms_kallsyms",
     ":perfetto_src_tracing_common",
     ":perfetto_src_tracing_core_core",
     ":perfetto_src_tracing_core_service",
diff --git a/src/profiling/perf/BUILD.gn b/src/profiling/perf/BUILD.gn
index b715de1..c6c48ae 100644
--- a/src/profiling/perf/BUILD.gn
+++ b/src/profiling/perf/BUILD.gn
@@ -92,6 +92,7 @@
     ":common_types",
     "../../../gn:default_deps",
     "../../../include/perfetto/ext/tracing/core",
+    "../../../src/traced/probes/ftrace/kallsyms",
     "../../../src/base",
     "../common:unwind_support",
   ]
diff --git a/src/profiling/perf/common_types.h b/src/profiling/perf/common_types.h
index 109ff35..8577503 100644
--- a/src/profiling/perf/common_types.h
+++ b/src/profiling/perf/common_types.h
@@ -49,6 +49,7 @@
   std::unique_ptr<unwindstack::Regs> regs;
   std::vector<char> stack;
   bool stack_maxed = false;
+  std::vector<uint64_t> kernel_ips;
 };
 
 // Entry in an unwinding queue. Either a sample that requires unwinding, or a
diff --git a/src/profiling/perf/event_config.cc b/src/profiling/perf/event_config.cc
index 896fa70..2e8e58d 100644
--- a/src/profiling/perf/event_config.cc
+++ b/src/profiling/perf/event_config.cc
@@ -164,7 +164,8 @@
       samples_per_tick_limit_(samples_per_tick_limit),
       target_filter_(std::move(target_filter)),
       remote_descriptor_timeout_ms_(remote_descriptor_timeout_ms),
-      unwind_state_clear_period_ms_(cfg.unwind_state_clear_period_ms()) {
+      unwind_state_clear_period_ms_(cfg.unwind_state_clear_period_ms()),
+      kernel_frames_(cfg.kernel_frames()) {
   auto& pe = perf_event_attr_;
   pe.size = sizeof(perf_event_attr);
 
@@ -190,6 +191,12 @@
   // PERF_SAMPLE_REGS_USER:
   pe.sample_regs_user =
       PerfUserRegsMaskForArch(unwindstack::Regs::CurrentArch());
+
+  // Optional kernel call frames (unwound by the kernel itself):
+  if (kernel_frames_) {
+    pe.sample_type |= PERF_SAMPLE_CALLCHAIN;
+    pe.exclude_callchain_user = true;
+  }
 }
 
 }  // namespace profiling
diff --git a/src/profiling/perf/event_config.h b/src/profiling/perf/event_config.h
index 7a9ca00..da77075 100644
--- a/src/profiling/perf/event_config.h
+++ b/src/profiling/perf/event_config.h
@@ -59,8 +59,8 @@
   uint32_t unwind_state_clear_period_ms() const {
     return unwind_state_clear_period_ms_;
   }
-
   const TargetFilter& filter() const { return target_filter_; }
+  bool kernel_frames() const { return kernel_frames_; }
 
   perf_event_attr* perf_attr() const {
     return const_cast<perf_event_attr*>(&perf_event_attr_);
@@ -100,6 +100,9 @@
 
   // Optional period for clearing cached unwinder state. Skipped if zero.
   const uint32_t unwind_state_clear_period_ms_;
+
+  // If true, include kernel frames in the callstacks.
+  const bool kernel_frames_;
 };
 
 }  // namespace profiling
diff --git a/src/profiling/perf/event_reader.cc b/src/profiling/perf/event_reader.cc
index 35054f8..c6ddf3d 100644
--- a/src/profiling/perf/event_reader.cc
+++ b/src/profiling/perf/event_reader.cc
@@ -33,10 +33,17 @@
 
 template <typename T>
 const char* ReadValue(T* value_out, const char* ptr) {
-  memcpy(value_out, reinterpret_cast<const void*>(ptr), sizeof(T));
+  memcpy(value_out, ptr, sizeof(T));  // copies exactly one T from |ptr|
   return ptr + sizeof(T);
 }
 
+template <typename T>
+const char* ReadValues(T* out, const char* ptr, size_t num_values) {
+  size_t sz = sizeof(T) * num_values;  // total byte count of the array
+  memcpy(out, ptr, sz);  // caller must have sized |out| for num_values
+  return ptr + sz;  // read position just past the copied values
+}
+
 bool IsPowerOfTwo(size_t v) {
   return (v != 0 && ((v & (v - 1)) == 0));
 }
@@ -281,7 +288,7 @@
                                             const char* record_start) {
   if (event_attr_.sample_type &
       (~uint64_t(PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_STACK_USER |
-                 PERF_SAMPLE_REGS_USER))) {
+                 PERF_SAMPLE_REGS_USER | PERF_SAMPLE_CALLCHAIN))) {
     PERFETTO_FATAL("Unsupported sampling option");
   }
 
@@ -309,6 +316,14 @@
     parse_pos = ReadValue(&sample.timestamp, parse_pos);
   }
 
+  if (event_attr_.sample_type & PERF_SAMPLE_CALLCHAIN) {
+    uint64_t chain_len = 0;
+    parse_pos = ReadValue(&chain_len, parse_pos);
+    sample.kernel_ips.resize(static_cast<size_t>(chain_len));
+    parse_pos = ReadValues<uint64_t>(sample.kernel_ips.data(), parse_pos,
+                                     static_cast<size_t>(chain_len));
+  }
+
   if (event_attr_.sample_type & PERF_SAMPLE_REGS_USER) {
     // Can be empty, e.g. if we sampled a kernel thread.
     sample.regs = ReadPerfUserRegsData(&parse_pos);
diff --git a/src/profiling/perf/perf_producer.cc b/src/profiling/perf/perf_producer.cc
index 32bc20a..b99b1f5 100644
--- a/src/profiling/perf/perf_producer.cc
+++ b/src/profiling/perf/perf_producer.cc
@@ -257,7 +257,8 @@
 
   // Inform unwinder of the new data source instance, and optionally start a
   // periodic task to clear its cached state.
-  unwinding_worker_->PostStartDataSource(instance_id);
+  unwinding_worker_->PostStartDataSource(instance_id,
+                                         ds.event_config.kernel_frames());
   if (ds.event_config.unwind_state_clear_period_ms()) {
     unwinding_worker_->PostClearCachedStatePeriodic(
         instance_id, ds.event_config.unwind_state_clear_period_ms());
diff --git a/src/profiling/perf/unwinding.cc b/src/profiling/perf/unwinding.cc
index fb310cc..1dfb005 100644
--- a/src/profiling/perf/unwinding.cc
+++ b/src/profiling/perf/unwinding.cc
@@ -20,6 +20,8 @@
 
 #include <inttypes.h>
 
+#include <unwindstack/Unwinder.h>
+
 #include "perfetto/ext/base/metatrace.h"
 #include "perfetto/ext/base/thread_utils.h"
 #include "perfetto/ext/base/utils.h"
@@ -40,18 +42,24 @@
   base::MaybeSetThreadName("stack-unwinding");
 }
 
-void Unwinder::PostStartDataSource(DataSourceInstanceID ds_id) {
+void Unwinder::PostStartDataSource(DataSourceInstanceID ds_id,
+                                   bool kernel_frames) {
   // No need for a weak pointer as the associated task runner quits (stops
   // running tasks) strictly before the Unwinder's destruction.
-  task_runner_->PostTask([this, ds_id] { StartDataSource(ds_id); });
+  task_runner_->PostTask(  // hands kernel_frames on to StartDataSource
+      [this, ds_id, kernel_frames] { StartDataSource(ds_id, kernel_frames); });
 }
 
-void Unwinder::StartDataSource(DataSourceInstanceID ds_id) {
+void Unwinder::StartDataSource(DataSourceInstanceID ds_id, bool kernel_frames) {
   PERFETTO_DCHECK_THREAD(thread_checker_);
   PERFETTO_DLOG("Unwinder::StartDataSource(%zu)", static_cast<size_t>(ds_id));
 
   auto it_and_inserted = data_sources_.emplace(ds_id, DataSourceState{});
   PERFETTO_DCHECK(it_and_inserted.second);
+
+  if (kernel_frames) {  // kernel frames were requested
+    kernel_symbolizer_.GetOrCreateKernelSymbolMap();  // result unused here
+  }
 }
 
 // c++11: use shared_ptr to transfer resource handles, so that the resources get
@@ -358,7 +366,12 @@
     unwind = attempt_unwind();
   }
 
-  ret.frames.reserve(unwind.frames.size());
+  // Symbolize kernel-unwound kernel frames (if any).
+  std::vector<FrameData> kernel_frames = SymbolizeKernelCallchain(sample);
+
+  // Concatenate the kernel and userspace frames.
+  ret.frames = std::move(kernel_frames);
+  ret.frames.reserve(ret.frames.size() + unwind.frames.size());
   for (unwindstack::FrameData& frame : unwind.frames) {
     ret.frames.emplace_back(unwind_state->AnnotateFrame(std::move(frame)));
   }
@@ -379,6 +392,38 @@
   return ret;
 }
 
+std::vector<FrameData> Unwinder::SymbolizeKernelCallchain(
+    const ParsedSample& sample) {
+  std::vector<FrameData> ret;
+  if (sample.kernel_ips.empty())  // nothing to symbolize
+    return ret;
+
+  // The list of addresses contains special context marker values (inserted by
+  // the kernel's unwinding) to indicate which section of the callchain belongs
+  // to the kernel/user mode (if the kernel can successfully unwind user
+  // stacks). In our case, we request only the kernel frames.
+  if (sample.kernel_ips[0] != PERF_CONTEXT_KERNEL) {
+    PERFETTO_DFATAL_OR_ELOG(
+        "Unexpected: 0th frame of callchain is not PERF_CONTEXT_KERNEL.");
+    return ret;  // bail out with no kernel frames
+  }
+
+  auto* kernel_map = kernel_symbolizer_.GetOrCreateKernelSymbolMap();
+  ret.reserve(sample.kernel_ips.size());
+  for (size_t i = 1; i < sample.kernel_ips.size(); i++) {  // [0] is the marker
+    std::string function_name = kernel_map->Lookup(sample.kernel_ips[i]);
+
+    // Synthesise a partially-valid libunwindstack frame struct for the kernel
+    // frame. We reuse the type for convenience. The kernel frames are marked by
+    // a magical "kernel" string as their containing mapping.
+    unwindstack::FrameData frame{};  // remaining fields keep their defaults
+    frame.function_name = std::move(function_name);
+    frame.map_name = "kernel";
+    ret.emplace_back(FrameData{std::move(frame), /*build_id=*/""});
+  }
+  return ret;
+}
+
 void Unwinder::PostInitiateDataSourceStop(DataSourceInstanceID ds_id) {
   task_runner_->PostTask([this, ds_id] { InitiateDataSourceStop(ds_id); });
 }
@@ -414,8 +459,10 @@
   data_sources_.erase(it);
 
   // Clean up state if there are no more active sources.
-  if (data_sources_.empty())
+  if (data_sources_.empty()) {
+    kernel_symbolizer_.Destroy();
     ResetAndEnableUnwindstackCache();
+  }
 
   // Inform service thread that the unwinder is done with the source.
   delegate_->PostFinishDataSourceStop(ds_id);
diff --git a/src/profiling/perf/unwinding.h b/src/profiling/perf/unwinding.h
index 4b7b091..274d485 100644
--- a/src/profiling/perf/unwinding.h
+++ b/src/profiling/perf/unwinding.h
@@ -35,6 +35,10 @@
 #include "src/profiling/perf/common_types.h"
 #include "src/profiling/perf/unwind_queue.h"
 
+// TODO(rsavitski): move kallsyms code to a common location.
+#include "src/traced/probes/ftrace/kallsyms/kernel_symbol_map.h"
+#include "src/traced/probes/ftrace/kallsyms/lazy_kernel_symbolizer.h"
+
 namespace perfetto {
 namespace profiling {
 
@@ -85,7 +89,7 @@
 
   ~Unwinder() { PERFETTO_DCHECK_THREAD(thread_checker_); }
 
-  void PostStartDataSource(DataSourceInstanceID ds_id);
+  void PostStartDataSource(DataSourceInstanceID ds_id, bool kernel_frames);
   void PostAdoptProcDescriptors(DataSourceInstanceID ds_id,
                                 pid_t pid,
                                 base::ScopedFile maps_fd,
@@ -128,7 +132,8 @@
   Unwinder(Delegate* delegate, base::UnixTaskRunner* task_runner);
 
   // Marks the data source as valid and active at the unwinding stage.
-  void StartDataSource(DataSourceInstanceID ds_id);
+  // Initializes kernel address symbolization if needed.
+  void StartDataSource(DataSourceInstanceID ds_id, bool kernel_frames);
 
   void AdoptProcDescriptors(DataSourceInstanceID ds_id,
                             pid_t pid,
@@ -149,6 +154,9 @@
                                UnwindingMetadata* unwind_state,
                                bool pid_unwound_before);
 
+  // Returns a list of symbolized kernel frames in the sample (if any).
+  std::vector<FrameData> SymbolizeKernelCallchain(const ParsedSample& sample);
+
   // Marks the data source as shutting down at the unwinding stage. It is known
   // that no new samples for this source will be pushed into the queue, but we
   // need to delay the unwinder state teardown until all previously-enqueued
@@ -196,6 +204,7 @@
   Delegate* const delegate_;
   UnwindQueue<UnwindEntry, kUnwindQueueCapacity> unwind_queue_;
   std::map<DataSourceInstanceID, DataSourceState> data_sources_;
+  LazyKernelSymbolizer kernel_symbolizer_;
 
   PERFETTO_THREAD_CHECKER(thread_checker_)
 };