tp: Cleanup of presubmit/docs generating code

Change-Id: I4a845ed35bf0e4db5acba6079c239c9a47c7662f
diff --git a/python/generators/sql_processing/docs_extractor.py b/python/generators/sql_processing/docs_extractor.py
new file mode 100644
index 0000000..52fbd66
--- /dev/null
+++ b/python/generators/sql_processing/docs_extractor.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Copyright (C) 2022 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from re import Match
+from typing import List, Optional, Tuple
+
+from python.generators.sql_processing.utils import ObjKind
+from python.generators.sql_processing.utils import extract_comment
+from python.generators.sql_processing.utils import match_pattern
+from python.generators.sql_processing.utils import PATTERN_BY_KIND
+
+
+class DocsExtractor:
+  """Extracts documentation for views/tables/functions from SQL."""
+  path: str
+  module_name: str
+  sql: str
+
+  @dataclass
+  class Annotation:
+    key: str
+    value: str
+
+  @dataclass
+  class Extract:
+    """Extracted documentation for a single view/table/function."""
+    obj_kind: ObjKind
+    obj_match: Match
+
+    description: str
+    annotations: List['DocsExtractor.Annotation']
+
+  def __init__(self, path: str, module_name: str, sql: str):
+    self.path = path
+    self.module_name = module_name
+    self.sql = sql
+
+    self.sql_lines = sql.split("\n")
+    self.errors = []
+
+  def extract(self) -> List[Extract]:
+    extracted = []
+    extracted += self._extract_for_kind(ObjKind.table_view)
+    extracted += self._extract_for_kind(ObjKind.function)
+    extracted += self._extract_for_kind(ObjKind.view_function)
+    return extracted
+
+  def _extract_for_kind(self, kind: ObjKind) -> List[Extract]:
+    line_number_to_matches = match_pattern(PATTERN_BY_KIND[kind], self.sql)
+    extracts = []
+    for line_number, match in sorted(list(line_number_to_matches.items())):
+      comment_lines = extract_comment(self.sql_lines, line_number)
+      e = self._extract_from_comment(kind, match, comment_lines)
+      if e:
+        extracts.append(e)
+    return extracts
+
+  def _extract_from_comment(self, kind: ObjKind, match: Match,
+                            comment_lines: List[str]) -> Optional[Extract]:
+    extract = DocsExtractor.Extract(kind, match, '', [])
+    for line in comment_lines:
+      assert line.startswith('--')
+
+      # Remove the comment.
+      stripped = line.lstrip('--').lstrip()
+
+      # Ignore lines which only contain '--'.
+      if not stripped:
+        continue
+
+      # Check if the line is an annotation.
+      if not stripped.startswith('@'):
+        # We are not in annotation: if we haven't seen an annotation yet, we
+        # must be still be parsing the description. Just add to that
+        if not extract.annotations:
+          extract.description += stripped + " "
+          continue
+
+        # Otherwise, add to the latest annotation.
+        extract.annotations[-1].value += " " + stripped
+        continue
+
+      # This line is an annotation: find its name and add a new entry
+      annotation, rest = stripped.split(' ', 1)
+      extract.annotations.append(DocsExtractor.Annotation(annotation, rest))
+    return extract
diff --git a/python/generators/sql_processing/docs_parse.py b/python/generators/sql_processing/docs_parse.py
new file mode 100644
index 0000000..079a600
--- /dev/null
+++ b/python/generators/sql_processing/docs_parse.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python3
+# Copyright (C) 2022 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC
+from dataclasses import dataclass
+import re
+import sys
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from python.generators.sql_processing.docs_extractor import DocsExtractor
+from python.generators.sql_processing.utils import ObjKind
+from python.generators.sql_processing.utils import ARG_ANNOTATION_PATTERN
+from python.generators.sql_processing.utils import NAME_AND_TYPE_PATTERN
+from python.generators.sql_processing.utils import FUNCTION_RETURN_PATTERN
+from python.generators.sql_processing.utils import COLUMN_ANNOTATION_PATTERN
+
+
+def is_internal(name: str) -> bool:
+  return re.match(r'^internal_.*', name, re.IGNORECASE) is not None
+
+
+def is_snake_case(s: str) -> bool:
+  """Returns true if the string is snake_case."""
+  return re.fullmatch(r'^[a-z_0-9]*$', s) is not None
+
+
+class AbstractDocParser(ABC):
+
+  @dataclass
+  class Column:
+    pass
+
+  def __init__(self, path: str, module: str):
+    self.path = path
+    self.module = module
+    self.name = None
+    self.errors = []
+
+  def _parse_name(self, upper: bool = False):
+    assert self.name
+    assert isinstance(self.name, str)
+    module_pattern = f"^{self.module}_.*"
+    if upper:
+      module_pattern = module_pattern.upper()
+    starts_with_module_name = re.match(module_pattern, self.name, re.IGNORECASE)
+    if self.module == "common":
+      if starts_with_module_name:
+        self._error('Names of tables/views/functions in the "common" module '
+                    f'should not start with {module_pattern}')
+      return self.name
+    if not starts_with_module_name:
+      self._error('Names of tables/views/functions should be prefixed with the '
+                  f'module name (i.e. should start with {module_pattern})')
+    return self.name.strip()
+
+  def _parse_desc_not_empty(self, desc: str):
+    if not desc:
+      self._error('Description of the table/view/function is missing')
+    return desc.strip()
+
+  def _validate_only_contains_annotations(self,
+                                          ans: List[DocsExtractor.Annotation],
+                                          ans_types: Set[str]):
+    used_ans_types = set(a.key for a in ans)
+    for type in used_ans_types.difference(ans_types):
+      self._error(f'Unknown documentation annotation {type}')
+
+  def _parse_columns(self, ans: List[DocsExtractor.Annotation],
+                     sql_cols_str: str) -> Dict[str, str]:
+    cols = {}
+    for t in ans:
+      if t.key != '@column':
+        continue
+      m = re.match(COLUMN_ANNOTATION_PATTERN, t.value)
+      if not m:
+        self._error(f'@column annotation value {t.value} does not match '
+                    f'pattern {COLUMN_ANNOTATION_PATTERN}')
+        continue
+      cols[m.group(1)] = m.group(2).strip()
+
+    sql_cols = self._parse_name_and_types_str(sql_cols_str)
+    if sql_cols:
+      for col in set(cols.keys()).difference(sql_cols.keys()):
+        self._error(f'@column "{col}" documented but does not exist in '
+                    'function definition')
+      for col in set(sql_cols.keys()).difference(cols):
+        self._error(f'Column "{col}" defined in SQL but is not documented with '
+                    '@column')
+    return cols
+
+  def _parse_args(self, ans: List[DocsExtractor.Annotation],
+                  sql_args_str: str) -> Dict[str, Any]:
+    args = {}
+    for an in ans:
+      if an.key != '@arg':
+        continue
+      m = re.match(ARG_ANNOTATION_PATTERN, an.value)
+      if m is None:
+        self._error(f'Expected arg documentation "{an.value}" to match pattern '
+                    f'{ARG_ANNOTATION_PATTERN}')
+        continue
+      args[m.group(1)] = {'type': m.group(2), 'desc': m.group(3).strip()}
+
+    sql_args = self._parse_name_and_types_str(sql_args_str)
+    if sql_args:
+      for col in set(args.keys()).difference(sql_args.keys()):
+        self._error(f'Arg "{col}" documented with @arg but does not exist '
+                    'in function definition')
+      for arg in set(sql_args.keys()).difference(args.keys()):
+        self._error(f'Arg "{arg}" defined in SQL but is not documented with '
+                    '@arg')
+    return args
+
+  def _parse_ret(self, ans: List[DocsExtractor.Annotation],
+                 sql_ret_type: str) -> Tuple[str, str]:
+    rets = [a.value for a in ans if a.key == '@ret']
+    if len(rets) != 1:
+      self._error('Return value is not documentated with @ret')
+      return '', ''
+
+    ret = rets[0]
+    m = re.match(FUNCTION_RETURN_PATTERN, ret)
+    if not m:
+      self._error(
+          f'@ret {ret} does not match pattern {FUNCTION_RETURN_PATTERN}')
+      return '', ''
+
+    ret_type, ret_desc = m.group(1), m.group(2)
+    if ret_type != sql_ret_type:
+      self._error(
+          f'@ret {ret_type} does not match SQL return type {sql_ret_type}')
+      return '', ''
+    return ret_type, ret_desc.strip()
+
+  def _parse_name_and_types_str(self, args_str: str) -> Dict[str, str]:
+    if not args_str:
+      return {}
+
+    args = {}
+    for arg_str in args_str.split(","):
+      m = re.match(NAME_AND_TYPE_PATTERN, arg_str)
+      if m is None:
+        self._error(f'Expected "{arg_str}" to match pattern '
+                    f'{NAME_AND_TYPE_PATTERN}')
+        continue
+      args[m.group(1)] = m.group(2).strip()
+    return args
+
+  def _error(self, error: str):
+    self.errors.append(
+        f'Error while parsing documentation for {self.name} in {self.path}: '
+        f'{error}')
+
+
+class TableOrView:
+  name: str
+  type: str
+  desc: str
+  cols: Dict[str, str]
+
+  def __init__(self, name, type, desc, cols):
+    self.name = name
+    self.type = type
+    self.desc = desc
+    self.cols = cols
+
+
+class TableViewDocParser(AbstractDocParser):
+  """Parses documentation for CREATE TABLE and CREATE VIEW statements."""
+
+  def __init__(self, path: str, module: str):
+    super().__init__(path, module)
+
+  def parse(self, doc: DocsExtractor.Extract) -> Optional[TableOrView]:
+    assert doc.obj_kind == ObjKind.table_view
+
+    self.name = doc.obj_match[1]
+    if is_internal(self.name):
+      return None
+
+    self._validate_only_contains_annotations(doc.annotations, {'@column'})
+    return TableOrView(
+        name=self._parse_name(),
+        type=doc.obj_match[0],
+        desc=self._parse_desc_not_empty(doc.description),
+        cols=self._parse_columns(doc.annotations, ''),
+    )
+
+
+class Function:
+  name: str
+  desc: str
+  args: Dict[str, Any]
+  return_type: str
+  return_desc: str
+
+  def __init__(self, name, desc, args, return_type, return_desc):
+    self.name = name
+    self.desc = desc
+    self.args = args
+    self.return_type = return_type
+    self.return_desc = return_desc
+
+
+class FunctionDocParser(AbstractDocParser):
+  """Parses documentation for CREATE_FUNCTION statements."""
+
+  def __init__(self, path: str, module: str):
+    super().__init__(path, module)
+
+  def parse(self, doc: DocsExtractor.Extract) -> Optional[Function]:
+    self.name, args, ret, _ = doc.obj_match
+
+    # Ignore internal functions.
+    if is_internal(self.name):
+      return None
+
+    self._validate_only_contains_annotations(doc.annotations, {'@arg', '@ret'})
+
+    ret_type, ret_desc = self._parse_ret(doc.annotations, ret)
+    name = self._parse_name(upper=True)
+
+    if not is_snake_case(name):
+      self._error('Function name %s is not snake_case (should be %s) ' %
+                  (name, name.casefold()))
+
+    return Function(
+        name=self._parse_name(upper=True),
+        desc=self._parse_desc_not_empty(doc.description),
+        args=self._parse_args(doc.annotations, args),
+        return_type=ret_type,
+        return_desc=ret_desc,
+    )
+
+
+class TableFunction:
+  name: str
+  desc: str
+  cols: Dict[str, str]
+  args: Dict[str, Any]
+
+  def __init__(self, name, desc, cols, args):
+    self.name = name
+    self.desc = desc
+    self.cols = cols
+    self.args = args
+
+
+class ViewFunctionDocParser(AbstractDocParser):
+  """Parses documentation for CREATE_VIEW_FUNCTION statements."""
+
+  def __init__(self, path: str, module: str):
+    super().__init__(path, module)
+
+  def parse(self, doc: DocsExtractor.Extract) -> Optional[TableFunction]:
+    self.name, args, columns, _ = doc.obj_match
+
+    # Ignore internal functions.
+    if is_internal(self.name):
+      return None
+
+    self._validate_only_contains_annotations(doc.annotations,
+                                             {'@arg', '@column'})
+    return TableFunction(
+        name=self._parse_name(upper=True),
+        desc=self._parse_desc_not_empty(doc.description),
+        cols=self._parse_columns(doc.annotations, columns),
+        args=self._parse_args(doc.annotations, args),
+    )
+
+
+class ParsedFile:
+  errors: List[str] = []
+  table_views: List[TableOrView] = []
+  functions: List[Function] = []
+  table_functions: List[TableFunction] = []
+
+  def __init__(self, errors, table_views, functions, view_functions):
+    self.errors = errors
+    self.table_views = table_views
+    self.functions = functions
+    self.table_functions = view_functions
+
+
+# Reads the provided SQL and, if possible, generates a dictionary with data
+# from documentation together with errors from validation of the schema.
+def parse_file(path: str, sql: str) -> ParsedFile:
+  if sys.platform.startswith('win'):
+    path = path.replace('\\', '/')
+
+  # Get module name
+  module_name = path.split('/stdlib/')[-1].split('/')[0]
+
+  # Extract all the docs from the SQL.
+  extractor = DocsExtractor(path, module_name, sql)
+  docs = extractor.extract()
+  if extractor.errors:
+    return ParsedFile(extractor.errors, [], [], [])
+
+  # Parse the extracted docs.
+  errors = []
+  table_views = []
+  functions = []
+  view_functions = []
+  for doc in docs:
+    if doc.obj_kind == ObjKind.table_view:
+      parser = TableViewDocParser(path, module_name)
+      res = parser.parse(doc)
+      if res:
+        table_views.append(res)
+      errors += parser.errors
+    if doc.obj_kind == ObjKind.function:
+      parser = FunctionDocParser(path, module_name)
+      res = parser.parse(doc)
+      if res:
+        functions.append(res)
+      errors += parser.errors
+    if doc.obj_kind == ObjKind.view_function:
+      parser = ViewFunctionDocParser(path, module_name)
+      res = parser.parse(doc)
+      if res:
+        view_functions.append(res)
+      errors += parser.errors
+
+  return ParsedFile(errors, table_views, functions, view_functions)
diff --git a/python/generators/sql_processing/utils.py b/python/generators/sql_processing/utils.py
new file mode 100644
index 0000000..345c800
--- /dev/null
+++ b/python/generators/sql_processing/utils.py
@@ -0,0 +1,126 @@
+# Copyright (C) 2022 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+import re
+from typing import Dict, List
+
+NAME = r'[a-zA-Z_\d\{\}]+'  # Identifier; '{' and '}' are accepted as well.
+ANY_WORDS = r'[^\s].*'  # Text starting with a non-whitespace character.
+ANY_NON_QUOTE = r'[^\']*.*'  # NOTE(review): trailing .* also matches quotes.
+TYPE = r'[A-Z]+'  # Upper-case letters only.
+SQL = r'[\s\S]*?'  # Shortest possible run of any characters, incl. newlines.
+WS = r'\s*'  # Optional whitespace.
+
+CREATE_TABLE_VIEW_PATTERN = (
+    # Match create table/view and catch the type (TABLE or VIEW) as group 1.
+    fr'^CREATE{WS}(?:VIRTUAL )?{WS}(TABLE|VIEW){WS}(?:IF NOT EXISTS)?{WS}'
+    # Catch the name as group 2.
+    fr'{WS}({NAME}){WS}(?:AS|USING)?{WS}.*')
+
+DROP_TABLE_VIEW_PATTERN = (fr'^DROP{WS}(TABLE|VIEW){WS}IF{WS}EXISTS{WS}'
+                           fr'({NAME});$')
+
+CREATE_FUNCTION_PATTERN = (
+    # Function name.
+    fr"CREATE{WS}PERFETTO{WS}FUNCTION{WS}({NAME}){WS}"
+    # Args: anything in the brackets.
+    fr"{WS}\({WS}({ANY_WORDS}){WS}\){WS}"
+    # Type: word after RETURNS.
+    fr"{WS}RETURNS{WS}({TYPE}){WS}AS{WS}"
+    # Sql: everything after AS up to the first ';' (matched non-greedily).
+    fr"{WS}({SQL});")
+
+CREATE_VIEW_FUNCTION_PATTERN = (
+    fr"SELECT{WS}CREATE_VIEW_FUNCTION\({WS}"
+    # Function name: a NAME between the opening ' and the ( of the args.
+    fr"{WS}'{WS}({NAME}){WS}\({WS}"
+    # Args: anything before closing bracket with '.
+    fr"{WS}({ANY_WORDS}){WS}\){WS}'{WS},{WS}"
+    # Return columns: anything between two '.
+    fr"'{WS}({ANY_NON_QUOTE}){WS}',{WS}"
+    # Sql: anything between ' and '); (matched non-greedily).
+    fr"{WS}'{WS}({SQL}){WS}'{WS}\){WS};")
+
+COLUMN_ANNOTATION_PATTERN = fr'^\s*({NAME})\s*({ANY_WORDS})'
+
+NAME_AND_TYPE_PATTERN = fr'\s*({NAME})\s+({TYPE})\s*'
+
+ARG_ANNOTATION_PATTERN = fr'\s*{NAME_AND_TYPE_PATTERN}\s+({ANY_WORDS})'
+
+FUNCTION_RETURN_PATTERN = fr'^\s*({TYPE})\s+({ANY_WORDS})'
+
+
+class ObjKind(str, Enum):  # Kinds of SQL objects; the keys of PATTERN_BY_KIND.
+  table_view = 'table_view'
+  function = 'function'
+  view_function = 'view_function'
+
+
+PATTERN_BY_KIND = {  # Regex matching the creation statement of each kind.
+    ObjKind.table_view: CREATE_TABLE_VIEW_PATTERN,
+    ObjKind.function: CREATE_FUNCTION_PATTERN,
+    ObjKind.view_function: CREATE_VIEW_FUNCTION_PATTERN,
+}
+
+
+# Given a regex pattern and a string to match against, returns all the
+# matching positions. Specifically, it returns a dictionary from the line
+# number of the match to the regex match object.
+def match_pattern(pattern: str, file_str: str) -> Dict[int, re.Match]:
+  line_number_to_matches = {}
+  for match in re.finditer(pattern, file_str, re.MULTILINE):
+    line_id = file_str[:match.start()].count('\n')
+    line_number_to_matches[line_id] = match.groups()
+  return line_number_to_matches
+
+
+# Given a list of lines in a text and the line number, scans backwards to find
+# all the comments.
+def extract_comment(lines: List[str], line_number: int) -> List[str]:
+  comments = []
+  for line in lines[line_number - 1::-1]:
+    # Break on empty line, as that suggests it is no longer a part of
+    # this comment.
+    if not line or not line.startswith('--'):
+      break
+    comments.append(line)
+
+  # Reverse as the above was reversed
+  comments.reverse()
+  return comments
+
+
+# Given SQL string check whether any of the words is used, and create error
+# string if needed.
+def check_banned_words(sql: str, path: str) -> List[str]:
+  lines = [l.strip() for l in sql.split('\n')]
+  errors = []
+
+  # Ban the use of LIKE in non-comment lines.
+  for line in lines:
+    if line.startswith('--'):
+      continue
+
+    if 'like' in line.casefold():
+      errors.append(
+          'LIKE is banned in trace processor metrics. Prefer GLOB instead.\n'
+          f'Offending file: {path}\n')
+      continue
+
+    if 'create_function' in line.casefold():
+      errors.append('CREATE_FUNCTION is deprecated in trace processor. '
+                    'Prefer CREATE PERFETTO FUNCTION instead.\n'
+                    f'Offending file: {path}')
+  return errors