blob: ae8fff740f87804aa7e7bf1a4507aec781383691 [file] [log] [blame]
Primiano Tucci02691162020-01-21 13:30:13 +00001/*
2 * Copyright (C) 2019 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
Ryan Savitskidc218882020-11-16 18:02:02 +000017#include "src/kallsyms/kernel_symbol_map.h"
Primiano Tucci02691162020-01-21 13:30:13 +000018
Primiano Tucciab293f52020-12-08 11:46:52 +010019#include "perfetto/base/build_config.h"
Primiano Tucci02691162020-01-21 13:30:13 +000020#include "perfetto/base/logging.h"
Primiano Tucciab293f52020-12-08 11:46:52 +010021#include "perfetto/ext/base/file_utils.h"
Primiano Tucci02691162020-01-21 13:30:13 +000022#include "perfetto/ext/base/metatrace.h"
23#include "perfetto/ext/base/paged_memory.h"
24#include "perfetto/ext/base/scoped_file.h"
25#include "perfetto/ext/base/string_view.h"
26#include "perfetto/ext/base/utils.h"
27#include "perfetto/protozero/proto_utils.h"
28
Primiano Tucci02691162020-01-21 13:30:13 +000029#include <stdio.h>
30
31#include <algorithm>
Primiano Tucci58d2dc62021-06-24 16:03:24 +010032#include <cinttypes>
Primiano Tucci02691162020-01-21 13:30:13 +000033#include <functional>
34#include <map>
Daniele Di Proietto3c965d22021-10-01 17:17:13 +010035#include <unordered_map>
Primiano Tucci02691162020-01-21 13:30:13 +000036#include <utility>
37
38namespace perfetto {
39
40// On a Pixel 3 this gives an avg. lookup time of 600 ns and a memory usage
41// of 1.1 MB for 65k symbols. See go/kallsyms-parser-bench.
42size_t KernelSymbolMap::kSymIndexSampling = 16;
43size_t KernelSymbolMap::kTokenIndexSampling = 4;
44
45namespace {
46
47using TokenId = KernelSymbolMap::TokenTable::TokenId;
48constexpr size_t kSymNameMaxLen = 128;
Primiano Tuccia1221cb2020-10-08 11:00:42 +020049constexpr size_t kSymMaxSizeBytes = 1024 * 1024;
Primiano Tucci02691162020-01-21 13:30:13 +000050
51// Reads a kallsyms file in blocks of 4 pages each and decode its lines using
52// a simple FSM. Calls the passed lambda for each valid symbol.
53// It skips undefined symbols and other useless stuff.
54template <typename Lambda /* void(uint64_t, const char*) */>
55void ForEachSym(const std::string& kallsyms_path, Lambda fn) {
56 base::ScopedFile fd = base::OpenFile(kallsyms_path.c_str(), O_RDONLY);
57 if (!fd) {
58 PERFETTO_PLOG("Cannot open %s", kallsyms_path.c_str());
59 return;
60 }
61
62 // /proc/kallsyms looks as follows:
63 // 0000000000026a80 A bpf_trace_sds
64 //
65 // ffffffffc03a6000 T cpufreq_gov_powersave_init<TAB> [cpufreq_powersave]
66 // ffffffffc035d000 T cpufreq_gov_userspace_init<TAB> [cpufreq_userspace]
67 //
68 // We parse it with a state machine that has four states, one for each column.
69 // We don't care about the part in the square brackets and ignore everything
70 // after the symbol name.
71
72 static constexpr size_t kBufSize = 16 * 1024;
73 base::PagedMemory buffer = base::PagedMemory::Allocate(kBufSize);
74 enum { kSymAddr, kSymType, kSymName, kEatRestOfLine } state = kSymAddr;
75 uint64_t sym_addr = 0;
76 char sym_type = '\0';
77 char sym_name[kSymNameMaxLen + 1];
78 size_t sym_name_len = 0;
79 for (;;) {
80 char* buf = static_cast<char*>(buffer.Get());
Primiano Tucciab293f52020-12-08 11:46:52 +010081 auto rsize = base::Read(*fd, buf, kBufSize);
Primiano Tucci02691162020-01-21 13:30:13 +000082 if (rsize < 0) {
83 PERFETTO_PLOG("read(%s) failed", kallsyms_path.c_str());
84 return;
85 }
86 if (rsize == 0)
87 return; // EOF
88 for (size_t i = 0; i < static_cast<size_t>(rsize); i++) {
89 char c = buf[i];
90 const bool is_space = c == ' ' || c == '\t';
91 switch (state) {
92 case kSymAddr:
93 if (c >= '0' && c <= '9') {
94 sym_addr = (sym_addr << 4) | static_cast<uint8_t>(c - '0');
95 } else if (c >= 'a' && c <= 'f') {
96 sym_addr = (sym_addr << 4) | static_cast<uint8_t>(c - 'a' + 10);
97 } else if (is_space) {
98 state = kSymType;
99 } else if (c == '\0') {
100 return;
101 } else {
102 PERFETTO_ELOG("kallsyms parser error: chr 0x%x @ off=%zu", c, i);
103 return;
104 }
105 break;
106
107 case kSymType:
108 if (is_space)
109 break; // Eat leading spaces.
110 sym_type = c;
111 state = kSymName;
112 sym_name_len = 0;
113 break;
114
115 case kSymName:
116 if (is_space && sym_name_len == 0)
117 break; // Eat leading spaces.
118 if (c && c != '\n' && !is_space && sym_name_len < kSymNameMaxLen) {
119 sym_name[sym_name_len++] = c;
120 break;
121 }
122 sym_name[sym_name_len] = '\0';
123 fn(sym_addr, sym_type, sym_name);
124 sym_addr = 0;
125 sym_type = '\0';
126 state = c == '\n' ? kSymAddr : kEatRestOfLine;
127 break;
128
129 case kEatRestOfLine:
130 if (c == '\n')
131 state = kSymAddr;
132 break;
133 } // switch(state)
134 } // for (char in buf)
135 } // for (read chunk)
136}
137
138// Splits a symbol name into tokens using '_' as a separator, calling the passed
139// lambda for each token. It splits tokens in a way that allows the original
140// string to be rebuilt as-is by re-joining using a '_' between each token.
141// For instance:
142// _fo_a_b -> ["", fo, a, b]
143// __fo_a_b -> [_, fo, a, b]
144// __fo_a_b_ -> [_, fo, a, b, ""]
145// __fo_a_b____ -> [_, fo, a, b, ___]
146template <typename Lambda /* void(base::StringView) */>
147void Tokenize(const char* name, Lambda fn) {
148 const char* tok_start = name;
149 bool is_start_of_token = true;
150 bool tok_is_sep = false;
151 for (const char* ptr = name;; ptr++) {
152 const char c = *ptr;
153 if (is_start_of_token) {
154 tok_is_sep = *tok_start == '_'; // Deals with tokens made of '_'s.
155 is_start_of_token = false;
156 }
157 // Scan until either the end of string or the next character (which is a '_'
158 // in nominal cases, or anything != '_' for tokens made by 1+ '_').
159 if (c == '\0' || (!tok_is_sep && c == '_') || (tok_is_sep && c != '_')) {
160 size_t tok_len = static_cast<size_t>(ptr - tok_start);
161 if (tok_is_sep && c != '\0')
162 --tok_len;
163 fn(base::StringView(tok_start, tok_len));
164 if (c == '\0')
165 return;
166 tok_start = tok_is_sep ? ptr : ptr + 1;
167 is_start_of_token = true;
168 }
169 }
170}
171
172} // namespace
173
174KernelSymbolMap::TokenTable::TokenTable() {
175 // Insert a null token as id 0. We can't just add "" because the empty string
176 // is special-cased and doesn't insert an actual token. So we push a string of
177 // size one that contains only the null character instead.
178 char null_tok = 0;
179 Add(std::string(&null_tok, 1));
180}
181
182KernelSymbolMap::TokenTable::~TokenTable() = default;
183
184// Adds a new token to the db. Does not dedupe identical token (with the
185// exception of the empty string). The caller has to deal with that.
186// Supports only ASCII characters in the range [1, 127].
187// The last character of the token will have the MSB set.
188TokenId KernelSymbolMap::TokenTable::Add(const std::string& token) {
189 const size_t token_size = token.size();
190 if (token_size == 0)
191 return 0;
192 TokenId id = num_tokens_++;
193
194 const size_t buf_size_before_insertion = buf_.size();
195 if (id % kTokenIndexSampling == 0)
196 index_.emplace_back(buf_size_before_insertion);
197
198 const size_t prev_size = buf_.size();
199 buf_.resize(prev_size + token_size);
200 char* tok_wptr = &buf_[prev_size];
201 for (size_t i = 0; i < token_size - 1; i++) {
202 PERFETTO_DCHECK((token.at(i) & 0x80) == 0); // |token| must be ASCII only.
203 *(tok_wptr++) = token.at(i) & 0x7f;
204 }
Primiano Tuccie20db982020-09-23 10:05:25 +0200205 *(tok_wptr++) = static_cast<char>(token.at(token_size - 1) | 0x80);
Primiano Tuccic0ec36e2021-05-17 16:22:25 +0100206 PERFETTO_DCHECK(tok_wptr == buf_.data() + buf_.size());
Primiano Tucci02691162020-01-21 13:30:13 +0000207 return id;
208}
209
210// NOTE: the caller need to mask the returned chars with 0x7f. The last char of
211// the StringView will have its MSB set (it's used as a EOF char internally).
212base::StringView KernelSymbolMap::TokenTable::Lookup(TokenId id) {
213 if (id == 0)
214 return base::StringView();
215 if (id > num_tokens_)
216 return base::StringView("<error>");
217 // We don't know precisely where the id-th token starts in the buffer. We
218 // store only one position every kTokenIndexSampling. From there, the token
219 // can be found with a linear scan of at most kTokenIndexSampling steps.
220 size_t index_off = id / kTokenIndexSampling;
221 PERFETTO_DCHECK(index_off < index_.size());
222 TokenId cur_id = static_cast<TokenId>(index_off * kTokenIndexSampling);
223 uint32_t begin = index_[index_off];
224 PERFETTO_DCHECK(begin == 0 || buf_[begin - 1] & 0x80);
225 const size_t buf_size = buf_.size();
226 for (uint32_t off = begin; off < buf_size; ++off) {
227 // Advance |off| until the end of the token (which has the MSB set).
228 if ((buf_[off] & 0x80) == 0)
229 continue;
230 if (cur_id == id)
231 return base::StringView(&buf_[begin], off - begin + 1);
232 ++cur_id;
233 begin = off + 1;
234 }
235 return base::StringView();
236}
237
238size_t KernelSymbolMap::Parse(const std::string& kallsyms_path) {
Ryan Savitskidc218882020-11-16 18:02:02 +0000239 PERFETTO_METATRACE_SCOPED(TAG_PRODUCER, KALLSYMS_PARSE);
Primiano Tucci02691162020-01-21 13:30:13 +0000240 using SymAddr = uint64_t;
241
242 struct TokenInfo {
243 uint32_t count = 0;
244 TokenId id = 0;
245 };
246
Daniele Di Proietto3c965d22021-10-01 17:17:13 +0100247 // Note if changing the container: the code below doesn't rely on stable
248 // iterators, but relies on stable pointers.
249 using TokenMap = std::unordered_map<std::string, TokenInfo>;
Primiano Tucci02691162020-01-21 13:30:13 +0000250 using TokenMapPtr = TokenMap::value_type*;
251 TokenMap tokens;
252
253 // Keep the (ordered) list of tokens for each symbol.
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200254 struct SymAddrAndTokenPtr {
255 SymAddr addr;
256 TokenMapPtr token_map_entry;
257
258 bool operator<(const SymAddrAndTokenPtr& other) const {
259 return addr < other.addr;
260 }
261 };
262 std::vector<SymAddrAndTokenPtr> symbol_tokens;
263
264 // Based on `cat /proc/kallsyms | egrep "\b[tT]\b" | wc -l`.
265 symbol_tokens.reserve(128 * 1024);
Primiano Tucci02691162020-01-21 13:30:13 +0000266
267 ForEachSym(kallsyms_path, [&](SymAddr addr, char type, const char* name) {
268 if (addr == 0 || (type != 't' && type != 'T') || name[0] == '$') {
269 return;
270 }
271
272 // Split each symbol name in tokens, using '_' as a separator (so that
273 // "foo_bar" -> ["foo", "bar"]). For each token hash:
274 // 1. Keep track of the frequency of each token.
275 // 2. Keep track of the list of token hashes for each symbol.
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200276 Tokenize(name, [&tokens, &symbol_tokens, addr](base::StringView token) {
Primiano Tucci02691162020-01-21 13:30:13 +0000277 // Strip the .cfi part if present.
278 if (token.substr(token.size() - 4) == ".cfi")
279 token = token.substr(0, token.size() - 4);
280 auto it_and_ins = tokens.emplace(token.ToStdString(), TokenInfo{});
281 it_and_ins.first->second.count++;
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200282 symbol_tokens.emplace_back(SymAddrAndTokenPtr{addr, &*it_and_ins.first});
Primiano Tucci02691162020-01-21 13:30:13 +0000283 });
284 });
285
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200286 symbol_tokens.shrink_to_fit();
287
288 // For each symbol address, T entries are inserted into |symbol_tokens|, one
289 // for each token. These symbols are added in arbitrary address (as seen in
290 // /proc/kallsyms). Here we want to sort symbols by addresses, but at the same
291 // time preserve the order of tokens within each address.
292 // For instance, if kallsyms has: {0x41: connect_socket, 0x42: write_file}:
293 // Before sort: [(0x42, write), (0x42, file), (0x41, connect), (0x41, socket)]
294 // After sort: [(0x41, connect), (0x41, socket), (0x42, write), (0x42, file)]
295 std::stable_sort(symbol_tokens.begin(), symbol_tokens.end());
296
Primiano Tucci02691162020-01-21 13:30:13 +0000297 // At this point we have broken down each symbol into a set of token hashes.
298 // Now generate the token ids, putting high freq tokens first, so they use
299 // only one byte to varint encode.
300
301 // This block limits the lifetime of |tokens_by_freq|.
302 {
303 std::vector<TokenMapPtr> tokens_by_freq;
304 tokens_by_freq.resize(tokens.size());
305 size_t tok_idx = 0;
306 for (auto& kv : tokens)
307 tokens_by_freq[tok_idx++] = &kv;
308
309 auto comparer = [](TokenMapPtr a, TokenMapPtr b) {
310 PERFETTO_DCHECK(a && b);
311 return b->second.count < a->second.count;
312 };
313 std::sort(tokens_by_freq.begin(), tokens_by_freq.end(), comparer);
314 for (TokenMapPtr tinfo : tokens_by_freq) {
315 tinfo->second.id = tokens_.Add(tinfo->first);
316 }
317 }
318 tokens_.shrink_to_fit();
319
320 buf_.resize(2 * 1024 * 1024); // Based on real-word observations.
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200321 base_addr_ = symbol_tokens.empty() ? 0 : symbol_tokens.begin()->addr;
Primiano Tucci02691162020-01-21 13:30:13 +0000322 SymAddr prev_sym_addr = base_addr_;
323 uint8_t* wptr = buf_.data();
324
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200325 for (auto it = symbol_tokens.begin(); it != symbol_tokens.end();) {
326 const SymAddr sym_addr = it->addr;
Primiano Tucci02691162020-01-21 13:30:13 +0000327
328 // Find the iterator to the first token of the next symbol (or the end).
329 auto sym_start = it;
330 auto sym_end = it;
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200331 while (sym_end != symbol_tokens.end() && sym_end->addr == sym_addr)
Primiano Tucci02691162020-01-21 13:30:13 +0000332 ++sym_end;
333
334 // The range [sym_start, sym_end) has all the tokens for the current symbol.
335 uint32_t size_before = static_cast<uint32_t>(wptr - buf_.data());
336
337 // Make sure there is enough headroom to write the symbol.
338 if (buf_.size() - size_before < 1024) {
339 buf_.resize(buf_.size() + 32768);
340 wptr = buf_.data() + size_before;
341 }
342
343 uint32_t sym_rel_addr = static_cast<uint32_t>(sym_addr - base_addr_);
344 const size_t sym_num = num_syms_++;
345 if (sym_num % kSymIndexSampling == 0)
346 index_.emplace_back(std::make_pair(sym_rel_addr, size_before));
347 PERFETTO_DCHECK(sym_addr >= prev_sym_addr);
348 uint32_t delta = static_cast<uint32_t>(sym_addr - prev_sym_addr);
349 wptr = protozero::proto_utils::WriteVarInt(delta, wptr);
350 // Append all the token ids.
351 for (it = sym_start; it != sym_end;) {
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200352 PERFETTO_DCHECK(it->addr == sym_addr);
353 TokenMapPtr const token_map_entry = it->token_map_entry;
354 const TokenInfo& token_info = token_map_entry->second;
355 TokenId token_id = token_info.id << 1;
Primiano Tucci02691162020-01-21 13:30:13 +0000356 ++it;
357 token_id |= (it == sym_end) ? 1 : 0; // Last one has LSB set to 1.
358 wptr = protozero::proto_utils::WriteVarInt(token_id, wptr);
359 }
360 prev_sym_addr = sym_addr;
361 } // for (symbols)
362
363 buf_.resize(static_cast<size_t>(wptr - buf_.data()));
364 buf_.shrink_to_fit();
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200365 base::MaybeReleaseAllocatorMemToOS(); // For Scudo, b/170217718.
Primiano Tucci02691162020-01-21 13:30:13 +0000366
Primiano Tuccif44e2a12020-10-22 16:15:31 +0200367 if (num_syms_ == 0) {
368 PERFETTO_ELOG(
369 "Failed to parse kallsyms. Kernel functions will not be symbolized. On "
370 "Linux this requires either running traced_probes as root or manually "
371 "lowering /proc/sys/kernel/kptr_restrict");
372 } else {
373 PERFETTO_DLOG(
374 "Loaded %zu kalllsyms entries. Mem usage: %zu B (addresses) + %zu B "
375 "(tokens), total: %zu B",
376 num_syms_, addr_bytes(), tokens_.size_bytes(), size_bytes());
377 }
Primiano Tucci02691162020-01-21 13:30:13 +0000378
379 return num_syms_;
380}
381
382std::string KernelSymbolMap::Lookup(uint64_t sym_addr) {
383 if (index_.empty() || sym_addr < base_addr_)
384 return "";
385
386 // First find the highest symbol address <= sym_addr.
387 // Start with a binary search using the sparse index.
388
389 const uint32_t sym_rel_addr = static_cast<uint32_t>(sym_addr - base_addr_);
390 auto it = std::upper_bound(index_.cbegin(), index_.cend(),
391 std::make_pair(sym_rel_addr, 0u));
392 if (it != index_.cbegin())
393 --it;
394
395 // Then continue with a linear scan (of at most kSymIndexSampling steps).
396 uint32_t addr = it->first;
397 uint32_t off = it->second;
398 const uint8_t* rdptr = &buf_[off];
Primiano Tuccic0ec36e2021-05-17 16:22:25 +0100399 const uint8_t* const buf_end = buf_.data() + buf_.size();
Primiano Tucci02691162020-01-21 13:30:13 +0000400 bool parsing_addr = true;
401 const uint8_t* next_rdptr = nullptr;
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200402 uint64_t sym_start_addr = 0;
Primiano Tucci02691162020-01-21 13:30:13 +0000403 for (bool is_first_addr = true;; is_first_addr = false) {
404 uint64_t v = 0;
405 const auto* prev_rdptr = rdptr;
406 rdptr = protozero::proto_utils::ParseVarInt(rdptr, buf_end, &v);
407 if (rdptr == prev_rdptr)
408 break;
409 if (parsing_addr) {
410 addr += is_first_addr ? 0 : static_cast<uint32_t>(v);
411 parsing_addr = false;
412 if (addr > sym_rel_addr)
413 break;
414 next_rdptr = rdptr;
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200415 sym_start_addr = addr;
Primiano Tucci02691162020-01-21 13:30:13 +0000416 } else {
417 // This is a token. Wait for the EOF maker.
418 parsing_addr = (v & 1) == 1;
419 }
420 }
421
422 if (!next_rdptr)
423 return "";
424
Primiano Tuccia1221cb2020-10-08 11:00:42 +0200425 PERFETTO_DCHECK(sym_rel_addr >= sym_start_addr);
426
427 // If this address is too far from the start of the symbol, this is likely
428 // a pointer to something else (e.g. some vmalloc struct) and we just picked
429 // the very last symbol for a loader region.
430 if (sym_rel_addr - sym_start_addr > kSymMaxSizeBytes)
431 return "";
432
Primiano Tucci02691162020-01-21 13:30:13 +0000433 // The address has been found. Now rejoin the tokens to form the symbol name.
434
435 rdptr = next_rdptr;
436 std::string sym_name;
437 sym_name.reserve(kSymNameMaxLen);
438 for (bool eof = false, is_first_token = true; !eof; is_first_token = false) {
439 uint64_t v = 0;
440 const auto* old = rdptr;
441 rdptr = protozero::proto_utils::ParseVarInt(rdptr, buf_end, &v);
442 if (rdptr == old)
443 break;
444 eof = v & 1;
445 base::StringView token = tokens_.Lookup(static_cast<TokenId>(v >> 1));
446 if (!is_first_token)
447 sym_name.push_back('_');
448 for (size_t i = 0; i < token.size(); i++)
449 sym_name.push_back(token.at(i) & 0x7f);
450 }
451 return sym_name;
452}
453
454} // namespace perfetto