[stdlib]: Cache critical path join with thread_states

The current implementation of span_join has a limitation: there
must be no overlaps within a partition. This is a challenge when
span joining all critical paths with all thread_states because
multiple threads can share the same critical paths.

This restriction means that the result of querying the critical_path
+ thread_state table for more than one critical path thread at a time
is undefined. This is fine because the exposed APIs don't allow you
to do that anyway.

With the introduction of macros, it would be great to expose APIs
to extract the critical path over arbitrary (overlapping) time windows.
This change makes such queries possible on the critical_path
+ thread_state table. The limitation still applies for slices, but having
this functionality for thread_states is quite powerful, for instance to
extract the critical paths over all monitor contentions in a trace.

There is a performance hit to caching this result but it should speed
up subsequent queries.

Test: tools/diff_test_trace_processor.py out/android/trace_processor_shell --name-filter '.*thread_executing_span.*'
Change-Id: I5a44f91df4d6e08e8ffae4422dc88ad7c3ea4ff8
diff --git a/src/trace_processor/perfetto_sql/stdlib/experimental/thread_executing_span.sql b/src/trace_processor/perfetto_sql/stdlib/experimental/thread_executing_span.sql
index ef4921a..b0f9310 100644
--- a/src/trace_processor/perfetto_sql/stdlib/experimental/thread_executing_span.sql
+++ b/src/trace_processor/perfetto_sql/stdlib/experimental/thread_executing_span.sql
@@ -440,19 +440,53 @@
 SELECT ts, dur, id, slice_id, slice_depth, slice_name
 FROM internal_span_graph_slice_sp;
 
--- |experimental_thread_executing_span_graph| + thread_state view span joined with critical_path information.
-CREATE VIRTUAL TABLE internal_critical_path_thread_state_sp
-USING
-  SPAN_JOIN(
-    internal_span_graph_thread_state PARTITIONED id,
-     internal_critical_path PARTITIONED id);
+-- |experimental_thread_executing_span_graph| + thread_state view joined with critical_path information: one row per positive-length time overlap between a critical path span and a thread_state row sharing the same id.
+CREATE PERFETTO TABLE internal_critical_path_thread_state AS
+WITH span AS MATERIALIZED (
+    SELECT * FROM internal_critical_path
+  ),
+  span_starts AS (
+    SELECT
+      span.id,
+      span.utid,
+      span.critical_path_id,
+      span.critical_path_blocked_dur,
+      span.critical_path_blocked_state,
+      span.critical_path_blocked_function,
+      span.critical_path_utid,
+      thread_state_id,
+      MAX(thread_state.ts, span.ts) AS ts,  -- later of the two starts: where the overlap begins
+      span.ts + span.dur AS span_end_ts,  -- end of the critical path span
+      thread_state.ts + thread_state.dur AS thread_state_end_ts,  -- end of the thread_state row
+      thread_state.state,
+      thread_state.function,
+      thread_state.cpu
+    FROM span
+    JOIN internal_span_graph_thread_state_sp thread_state USING(id)  -- pairs rows on id only; the time intersection comes from the ts/end columns above
+  )
+SELECT
+  id,
+  thread_state_id,
+  ts,
+  MIN(span_end_ts, thread_state_end_ts) - ts AS dur,  -- earlier of the two ends minus the overlap start
+  utid,
+  state,
+  function,
+  cpu,
+  critical_path_id,
+  critical_path_blocked_dur,
+  critical_path_blocked_state,
+  critical_path_blocked_function,
+  critical_path_utid
+FROM span_starts
+WHERE MIN(span_end_ts, thread_state_end_ts) - ts > 0;  -- keep only pairs whose intervals overlap for a positive duration
 
 -- |experimental_thread_executing_span_graph| + thread_state + critical_path span joined with
 -- |experimental_thread_executing_span_graph| + slice view.
 CREATE VIRTUAL TABLE internal_critical_path_sp
 USING
   SPAN_LEFT_JOIN(
-    internal_critical_path_thread_state_sp PARTITIONED id,
+    internal_critical_path_thread_state PARTITIONED id,
      internal_span_graph_slice PARTITIONED id);
 
 -- Flattened slices span joined with their thread_states. This contains the 'self' information