| // Copyright (C) 2023 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| import { |
| LONG, |
| NUM, |
| NUM_NULL, |
| STR, |
| STR_NULL, |
| } from '../../trace_processor/query_result'; |
| import type {Trace} from '../../public/trace'; |
| import type {PerfettoPlugin} from '../../public/plugin'; |
| import {TrackNode} from '../../public/workspace'; |
| import {SliceTrack} from '../../components/tracks/slice_track'; |
| import {SourceDataset} from '../../trace_processor/dataset'; |
| import {ThreadSliceDetailsPanel} from '../../components/details/thread_slice_details_tab'; |
| import {Gpu} from '../../components/gpu'; |
| import ProcessThreadGroupsPlugin from '../dev.perfetto.ProcessThreadGroups'; |
| |
| function getProcessDisplayName( |
| name: string | null, |
| pid: number | null, |
| ): string { |
| if (name != null) { |
| return name; |
| } else if (pid != null) { |
| return `Process ${pid}`; |
| } |
| return 'Unknown'; |
| } |
| |
| interface PathPart { |
| // Display name shown in the workspace tree. |
| name: string; |
| // Sort order within the immediate parent. |
| sortOrder: number; |
| // Stable key used to dedupe groups (combined with upid + ancestors). |
| key: string; |
| } |
| |
| interface LeafTrack { |
| // The owning process. |
| upid: number; |
| pid: number | null; |
| processName: string | null; |
| // Ordered groups from outermost (under the per-process "GPU" node) to |
| // innermost (parent of the leaf track). May be empty. |
| pathParts: PathPart[]; |
| // Leaf track display name. |
| leafName: string; |
| // Sort order of the leaf within its immediate parent. |
| leafSortOrder: number; |
| // Stable URI suffix appended after the per-process URI prefix. |
| uriSuffix: string; |
| // The dataset that drives the SliceTrack. Discoverers prefer |
| // src='gpu_slice' + a structured `filter` (so aggregation across tracks |
| // can merge them). When the constraint can't be expressed by the dataset |
| // Filter API (e.g. predicates on extract_arg() values), a custom |
| // subquery src is used instead. |
| dataset: SourceDataset<{ |
| id: number; |
| name: string; |
| ts: bigint; |
| dur: bigint; |
| depth: number; |
| }>; |
| } |
| |
| // CUDA / HIP: events that carry both a "device" and "stream" launch arg get |
| // nested under "Device #N -> Context #N -> Stream #N", with the leaf track |
| // holding the actual slices. Other APIs can be added by writing a similar |
| // discovery function and adding it to discoverApiTracks() below. |
| // |
| // Notes on per-sequence scoping: |
| // * device, stream and the gpu_slice.context_id (which is the |
| // InternedGraphicsContext IID) are all per-process. Two processes can |
| // reuse the same numeric values for distinct logical entities; including |
| // upid in every URI / partition key keeps them disambiguated. |
| async function discoverCudaHipTracks(ctx: Trace): Promise<LeafTrack[]> { |
| // Pick up the API name (CUDA / HIP / OPEN_CL / VULKAN / ...) from |
| // gpu_context so we can label the per-process top group with the right |
| // API. gpu_context is populated from InternedGraphicsContext.api, which |
| // both the CUDA and HIP injection producers set. The view lives in the |
| // std.gpu.context perfetto SQL module and must be included before use. |
| const result = await ctx.engine.query(` |
| INCLUDE PERFETTO MODULE std.gpu.context; |
| SELECT |
| s.upid AS upid, |
| extract_arg(s.arg_set_id, 'device') AS device, |
| s.context_id AS context, |
| extract_arg(s.arg_set_id, 'stream') AS stream, |
| gc.api AS api, |
| p.pid AS pid, |
| p.name AS process_name |
| FROM gpu_slice s |
| JOIN process p USING (upid) |
| LEFT JOIN gpu_context gc ON gc.context_id = s.context_id |
| WHERE s.upid IS NOT NULL |
| AND s.context_id IS NOT NULL |
| AND extract_arg(s.arg_set_id, 'device') IS NOT NULL |
| AND extract_arg(s.arg_set_id, 'stream') IS NOT NULL |
| GROUP BY s.upid, device, s.context_id, stream |
| ORDER BY s.upid, device, s.context_id, stream |
| `); |
| |
| const it = result.iter({ |
| upid: NUM, |
| device: NUM, |
| context: NUM, |
| stream: NUM, |
| api: STR_NULL, |
| pid: NUM_NULL, |
| process_name: STR_NULL, |
| }); |
| |
| interface Raw { |
| upid: number; |
| device: number; |
| context: number; |
| stream: number; |
| api: string | null; |
| pid: number | null; |
| processName: string | null; |
| } |
| const raws: Raw[] = []; |
| // Hierarchy collapse: skip the Device level if the process only ever |
| // touched a single device, and skip the Context level for any |
| // particular (process, device) where only a single context is used. |
| // Stream is always shown as the leaf. |
| const devicesByUpid = new Map<number, Set<number>>(); |
| const contextsByUpidDevice = new Map<string, Set<number>>(); |
| for (; it.valid(); it.next()) { |
| raws.push({ |
| upid: it.upid, |
| device: it.device, |
| context: it.context, |
| stream: it.stream, |
| api: it.api, |
| pid: it.pid, |
| processName: it.process_name, |
| }); |
| const dSet = devicesByUpid.get(it.upid) ?? new Set<number>(); |
| dSet.add(it.device); |
| devicesByUpid.set(it.upid, dSet); |
| const ctxKey = `${it.upid}#${it.device}`; |
| const cSet = contextsByUpidDevice.get(ctxKey) ?? new Set<number>(); |
| cSet.add(it.context); |
| contextsByUpidDevice.set(ctxKey, cSet); |
| } |
| |
| return raws.map((r) => { |
| // The top API group is named after the actual API on the slices' |
| // graphics context (e.g. "CUDA" for cuda-injection traces, "HIP" for |
| // hip-injection traces). Slices for which the API couldn't be |
| // resolved fall back to a generic "GPU" label. Different APIs within |
| // the same process get separate sibling groups via the path key. |
| const apiName = r.api ?? 'GPU'; |
| const apiKey = `api_${apiName.toLowerCase()}`; |
| const pathParts: PathPart[] = [{name: apiName, sortOrder: 0, key: apiKey}]; |
| if ((devicesByUpid.get(r.upid)?.size ?? 0) > 1) { |
| pathParts.push({ |
| name: `Device #${r.device}`, |
| sortOrder: r.device, |
| key: `${apiKey}_device_${r.device}`, |
| }); |
| } |
| const contextsForDevice = |
| contextsByUpidDevice.get(`${r.upid}#${r.device}`)?.size ?? 0; |
| if (contextsForDevice > 1) { |
| pathParts.push({ |
| name: `Context #${r.context}`, |
| sortOrder: r.context, |
| key: `${apiKey}_device_${r.device}_context_${r.context}`, |
| }); |
| } |
| // ORDER BY ts is required because SliceTrack's |
| // __intrinsic_slice_mipmap operator runs a galloping binary search |
| // (slice_mipmap_operator.cc) that assumes the per-depth timestamps |
| // array is sorted. Our filter unions events across multiple raw |
| // track_ids (e.g. Channel #1 + Channel #2 for the same stream), and |
| // without an explicit ORDER BY SQLite's row order is unspecified, |
| // causing the mipmap to silently skip out-of-order rows. |
| const whereClause = |
| `upid = ${r.upid}` + |
| ` AND extract_arg(arg_set_id, 'device') = ${r.device}` + |
| ` AND context_id = ${r.context}` + |
| ` AND extract_arg(arg_set_id, 'stream') = ${r.stream}`; |
| return { |
| upid: r.upid, |
| pid: r.pid, |
| processName: r.processName, |
| pathParts, |
| leafName: `Stream #${r.stream}`, |
| leafSortOrder: r.stream, |
| uriSuffix: `${apiKey}_d${r.device}_c${r.context}_s${r.stream}`, |
| dataset: new SourceDataset({ |
| src: `(SELECT id, name, ts, dur, depth FROM gpu_slice WHERE ${whereClause} ORDER BY ts)`, |
| schema: { |
| id: NUM, |
| name: STR, |
| ts: LONG, |
| dur: LONG, |
| depth: NUM, |
| }, |
| }), |
| }; |
| }); |
| } |
| |
| // Fallback: events that are not classified by any API-specific discovery |
| // (i.e. lack the device + stream launch args used by CUDA/HIP). Each |
| // (process, hw_queue_id) tuple gets one leaf track named after the global |
| // hw queue track ("Channel #1", "Channel #2", ...). When a process spans |
| // multiple GPUs, those leaves are nested under per-GPU sub-groups. |
| async function discoverFallbackTracks(ctx: Trace): Promise<LeafTrack[]> { |
| const result = await ctx.engine.query(` |
| SELECT |
| s.upid AS upid, |
| s.hw_queue_id AS hw_queue_id, |
| MIN(t.name) AS track_name, |
| extract_arg(t.dimension_arg_set_id, 'ugpu') AS ugpu, |
| extract_arg(t.dimension_arg_set_id, 'gpu') AS gpu_id, |
| t.machine_id AS machine_id, |
| g.name AS gpu_name, |
| p.pid AS pid, |
| p.name AS process_name |
| FROM gpu_slice s |
| JOIN gpu_track t ON s.track_id = t.id |
| JOIN process p USING (upid) |
| LEFT JOIN gpu g ON extract_arg(t.dimension_arg_set_id, 'ugpu') = g.id |
| WHERE s.upid IS NOT NULL AND s.hw_queue_id IS NOT NULL |
| AND (extract_arg(s.arg_set_id, 'device') IS NULL |
| OR extract_arg(s.arg_set_id, 'stream') IS NULL) |
| GROUP BY s.upid, s.hw_queue_id |
| ORDER BY s.upid, ugpu, s.hw_queue_id |
| `); |
| |
| const it = result.iter({ |
| upid: NUM, |
| hw_queue_id: NUM, |
| track_name: STR, |
| ugpu: NUM_NULL, |
| gpu_id: NUM_NULL, |
| machine_id: NUM, |
| gpu_name: STR_NULL, |
| pid: NUM_NULL, |
| process_name: STR_NULL, |
| }); |
| |
| interface FallbackRow { |
| upid: number; |
| pid: number | null; |
| processName: string | null; |
| hwqId: number; |
| trackName: string; |
| gpu: Gpu | null; |
| } |
| |
| const rows: FallbackRow[] = []; |
| const ugpusByUpid = new Map<number, Set<number>>(); |
| for (; it.valid(); it.next()) { |
| const gpu = |
| it.gpu_id !== null |
| ? new Gpu( |
| it.ugpu ?? it.gpu_id, |
| it.gpu_id, |
| it.machine_id, |
| it.gpu_name ?? undefined, |
| ) |
| : null; |
| rows.push({ |
| upid: it.upid, |
| pid: it.pid, |
| processName: it.process_name, |
| hwqId: it.hw_queue_id, |
| trackName: it.track_name, |
| gpu, |
| }); |
| if (gpu !== null) { |
| let set = ugpusByUpid.get(it.upid); |
| if (set === undefined) { |
| set = new Set<number>(); |
| ugpusByUpid.set(it.upid, set); |
| } |
| set.add(gpu.ugpu); |
| } |
| } |
| |
| return rows.map((row) => { |
| const pathParts: PathPart[] = []; |
| const distinctGpus = ugpusByUpid.get(row.upid)?.size ?? 0; |
| if (row.gpu !== null && distinctGpus > 1) { |
| pathParts.push({ |
| name: `${row.gpu.displayName}${row.gpu.maybeMachineLabel()}`, |
| sortOrder: row.gpu.sortOrder, |
| key: `gpu_${row.gpu.ugpu}`, |
| }); |
| } |
| // ORDER BY ts is required because SliceTrack's |
| // __intrinsic_slice_mipmap operator runs a galloping binary search |
| // (slice_mipmap_operator.cc) that assumes the per-depth timestamps |
| // array is sorted. Our filter unions events across multiple raw |
| // track_ids (e.g. Channel #1 + Channel #2 for the same stream), and |
| // without an explicit ORDER BY SQLite's row order is unspecified, |
| // causing the mipmap to silently skip out-of-order rows. |
| const whereClause = |
| `upid = ${row.upid}` + |
| ` AND hw_queue_id = ${row.hwqId}` + |
| ` AND (extract_arg(arg_set_id, 'device') IS NULL` + |
| ` OR extract_arg(arg_set_id, 'stream') IS NULL)`; |
| return { |
| upid: row.upid, |
| pid: row.pid, |
| processName: row.processName, |
| pathParts, |
| leafName: row.trackName, |
| leafSortOrder: row.hwqId, |
| uriSuffix: `hwq_${row.hwqId}`, |
| dataset: new SourceDataset({ |
| src: `(SELECT id, name, ts, dur, depth FROM gpu_slice WHERE ${whereClause} ORDER BY ts)`, |
| schema: { |
| id: NUM, |
| name: STR, |
| ts: LONG, |
| dur: LONG, |
| depth: NUM, |
| }, |
| }), |
| }; |
| }); |
| } |
| |
| // API-specific discoverers run before the fallback. Each emits leaf tracks |
| // for slices it claims; the fallback then handles whatever's left. To add |
| // a new API, write an async discoverer returning LeafTrack[] and append it |
| // here, plus update discoverFallbackTracks()'s WHERE clause to also |
| // exclude that API's slices. |
| async function discoverApiTracks(ctx: Trace): Promise<LeafTrack[]> { |
| const cuda = await discoverCudaHipTracks(ctx); |
| return cuda; |
| } |
| |
| export default class implements PerfettoPlugin { |
| static readonly id = 'dev.perfetto.GpuByProcess'; |
| static readonly dependencies = [ProcessThreadGroupsPlugin]; |
| |
| async onTraceLoad(ctx: Trace): Promise<void> { |
| const apiTracks = await discoverApiTracks(ctx); |
| const fallbackTracks = await discoverFallbackTracks(ctx); |
| const allTracks = [...apiTracks, ...fallbackTracks]; |
| |
| const processGroups = ctx.plugins.getPlugin(ProcessThreadGroupsPlugin); |
| const gpuGroupByUpid = new Map<number, TrackNode>(); |
| const subGroupByKey = new Map<string, TrackNode>(); |
| const processInfoByUpid = new Map< |
| number, |
| {pid: number | null; processName: string | null} |
| >(); |
| for (const t of allTracks) { |
| if (!processInfoByUpid.has(t.upid)) { |
| processInfoByUpid.set(t.upid, { |
| pid: t.pid, |
| processName: t.processName, |
| }); |
| } |
| } |
| |
| for (const t of allTracks) { |
| const uri = `dev.perfetto.GpuByProcess#${t.upid}#${t.uriSuffix}`; |
| ctx.tracks.registerTrack({ |
| uri, |
| renderer: SliceTrack.create({ |
| trace: ctx, |
| uri, |
| dataset: t.dataset, |
| detailsPanel: () => new ThreadSliceDetailsPanel(ctx), |
| }), |
| }); |
| |
| let processGroup = processGroups.getGroupForProcess(t.upid); |
| if (processGroup === undefined) { |
| const info = processInfoByUpid.get(t.upid)!; |
| const displayName = getProcessDisplayName(info.processName, info.pid); |
| processGroup = new TrackNode({ |
| uri: `/process_${t.upid}`, |
| name: `${displayName} ${info.pid ?? t.upid}`, |
| isSummary: true, |
| sortOrder: 50, |
| }); |
| ctx.defaultWorkspace.addChildInOrder(processGroup); |
| } |
| |
| let gpuGroup = gpuGroupByUpid.get(t.upid); |
| if (gpuGroup === undefined) { |
| gpuGroup = new TrackNode({ |
| uri: `dev.perfetto.GpuByProcess#${t.upid}`, |
| name: 'GPU', |
| isSummary: true, |
| sortOrder: -50, |
| }); |
| processGroup.addChildInOrder(gpuGroup); |
| gpuGroupByUpid.set(t.upid, gpuGroup); |
| } |
| |
| // Walk pathParts, lazily creating sub-groups along the way. |
| let parent = gpuGroup; |
| let cumulativeKey = `${t.upid}`; |
| for (const part of t.pathParts) { |
| cumulativeKey += `#${part.key}`; |
| let sub = subGroupByKey.get(cumulativeKey); |
| if (sub === undefined) { |
| sub = new TrackNode({ |
| uri: `dev.perfetto.GpuByProcess#${cumulativeKey}`, |
| name: part.name, |
| isSummary: true, |
| sortOrder: part.sortOrder, |
| }); |
| parent.addChildInOrder(sub); |
| subGroupByKey.set(cumulativeKey, sub); |
| } |
| parent = sub; |
| } |
| |
| parent.addChildInOrder( |
| new TrackNode({ |
| uri, |
| name: t.leafName, |
| sortOrder: t.leafSortOrder, |
| }), |
| ); |
| } |
| } |
| } |