Return query result as a pandas dataframe. Change-Id: I64bb4ba92610190bfc24fcf58ddac2addc80e5e2

commit: f8ccdc63a188fd977953864bab7490eac9f9852f [log] [tgz]
author: Anindita Ghosh <aninditaghosh@google.com> Mon Aug 03 20:48:02 2020 +0000
committer: Anindita Ghosh <aninditaghosh@google.com> Tue Aug 04 21:00:28 2020 +0000
tree: 82934cc92cbe0a19bd0c0a636b7d7e27c9271dd5
parent: a269ff35deca7ac27b1e9e7137e2353bc3605be2 [diff]
diff --git a/src/trace_processor/python/example.py b/src/trace_processor/python/example.py
index 9f6c784..5ef4128 100644
--- a/src/trace_processor/python/example.py
+++ b/src/trace_processor/python/example.py

@@ -45,10 +45,21 @@
     tp = TraceProcessor(
         addr=args.address, file_path=args.file, bin_path=args.binary)
 
-  # Call functions on the loaded trace
+  # Iterate through QueryResultIterator
   res_it = tp.query('select * from slice limit 10')
   for row in res_it:
     print(row.name)
+
+  # Convert QueryResultIterator into a pandas dataframe + iterate. This yields
+  # the same results as the loop above.
+  try:
+    res_df = tp.query('select * from slice limit 10').as_pandas()
+    for index, row in res_df.iterrows():
+      print(row['name'])
+  except Exception:
+    pass
+
+  # Call another function on the loaded trace
   am_metrics = tp.metric(['android_mem'])
   tp.close()
 

diff --git a/src/trace_processor/python/trace_processor/api.py b/src/trace_processor/python/trace_processor/api.py
index 8143172..4217df4 100644
--- a/src/trace_processor/python/trace_processor/api.py
+++ b/src/trace_processor/python/trace_processor/api.py

@@ -32,8 +32,9 @@
 
   # Values of these constants correspond to the QueryResponse message at
   # protos/perfetto/trace_processor/trace_processor.proto
-  # Values 0 and 1 correspond to CELL_INVALID and CELL_NULL respectively,
-  # which are both represented as None in this class's response
+  # Value 0 corresponds to CELL_INVALID, which is represented as None in
+  # this class
+  QUERY_CELL_NULL_FIELD_ID = 1
   QUERY_CELL_VARINT_FIELD_ID = 2
   QUERY_CELL_FLOAT64_FIELD_ID = 3
   QUERY_CELL_STRING_FIELD_ID = 4
@@ -57,7 +58,9 @@
       self.__string_cells = batches[0].string_cells.split('\0')
 
     def get_cell_list(self, proto_index):
-      if proto_index == TraceProcessor.QUERY_CELL_VARINT_FIELD_ID:
+      if proto_index == TraceProcessor.QUERY_CELL_NULL_FIELD_ID:
+        return None
+      elif proto_index == TraceProcessor.QUERY_CELL_VARINT_FIELD_ID:
         return self.__batches[self.__batch_index].varint_cells
       elif proto_index == TraceProcessor.QUERY_CELL_FLOAT64_FIELD_ID:
         return self.__batches[self.__batch_index].float64_cells
@@ -71,6 +74,48 @@
     def cells(self):
       return self.__batches[self.__batch_index].cells
 
+    # To use the query result as a populated Pandas dataframe, this
+    # function must be called directly after calling query inside
+    # TraceProcessor.
+    def as_pandas(self):
+      # Returns every result row as a pandas DataFrame; NULL cells become NaN.
+      # Raises TraceProcessorException if numpy or pandas is not installed.
+      try:
+        import numpy as np
+        import pandas as pd
+        batch_index = 0
+        next_index = 0
+        df = pd.DataFrame(columns=self.__column_names)
+        # TODO(aninditaghosh): Revisit string cells for larger traces
+        self.__string_cells = self.__batches[0].string_cells.split('\0')
+
+        while True:
+          # Once a batch is exhausted, finish if it was the last one
+          if next_index >= len(self.__batches[batch_index].cells):
+            if self.__batches[batch_index].is_last_batch:
+              return df.reset_index(drop=True)
+            batch_index += 1
+            next_index = 0
+            # get_cell_list reads self.__batch_index, so keep it in sync
+            self.__batch_index = batch_index
+            self.__string_cells = (
+                self.__batches[batch_index].string_cells.split('\0'))
+            continue  # Re-check: the new batch may itself be empty
+          row = []
+          for num, column_name in enumerate(self.__column_names):
+            cell_list = self.get_cell_list(
+                self.__batches[batch_index].cells[next_index + num])
+            if cell_list is None:
+              row.append(np.nan)
+            else:
+              row.append(cell_list.pop(0))
+          df.loc[-1] = row
+          df.index = df.index + 1
+          next_index = next_index + len(self.__column_names)
+      except ModuleNotFoundError:
+        raise TraceProcessorException(
+            'The sufficient libraries are not installed')
+
     def __iter__(self):
       return self
 
@@ -118,7 +163,9 @@
       sql: SQL query written as a String
 
     Returns:
-      A class which can iterate through each row of the results table
+      A class which can iterate through each row of the results table. This
+      can also be converted to a pandas dataframe by calling the as_pandas()
+      function after calling query.
     """
     response = self.http.execute_query(sql)
     if response.error:
commit	f8ccdc63a188fd977953864bab7490eac9f9852f	[log] [tgz]
author	Anindita Ghosh <aninditaghosh@google.com>	Mon Aug 03 20:48:02 2020 +0000
committer	Anindita Ghosh <aninditaghosh@google.com>	Tue Aug 04 21:00:28 2020 +0000
tree	82934cc92cbe0a19bd0c0a636b7d7e27c9271dd5
parent	a269ff35deca7ac27b1e9e7137e2353bc3605be2 [diff]