tp: Refactor SQL module presubmit check. Bug:255535171 Change-Id: I5da63402788143e8596a7730438afb4385bcde69

commit: a6cdc364378720e8a2fd85415336c54d8a673933 [log] [tgz]
author: Anna Mayzner <mayzner@google.com> Thu Dec 15 03:08:46 2022 +0000
committer: Anna Mayzner <mayzner@google.com> Thu Dec 15 03:08:46 2022 +0000
tree: 53d191007cd290a32dc701953db4ae08c7efcc70
parent: 165c172f3a6e30aa98495eaef67d5042cb4133a8 [diff] [blame]
diff --git a/tools/check_sql_modules.py b/tools/check_sql_modules.py
index 33b1ed2..7537977 100755
--- a/tools/check_sql_modules.py
+++ b/tools/check_sql_modules.py

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This tool checks that every create (table|view) without prefix
+# This tool checks that every SQL object created without prefix
 # 'internal_' is documented with proper schema.
 
 from __future__ import absolute_import
@@ -24,264 +24,480 @@
 import re
 import sys
 from sql_modules_utils import *
+from typing import Union, List
 
 ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
 
-# Check that CREATE VIEW/TABLE has a matching schema before it.
-def check_create_table_view(path, module, sql):
-  errors = 0
-  obj_name, schema_cols, schema_desc = None, False, False
-  lines = sql.split('\n')
-  for i, line in enumerate(lines):
-    create_line = re.match(create_table_view_pattern(), line)
+# Stores documentation for CREATE {TABLE|VIEW} with comment split into
+# segments.
+class TableViewDocs:
 
-    # Ignore all lines that don't create an object
-    if create_line is None:
-      continue
+  def __init__(self, name: str, desc: List[str], columns: List[str], path: str):
+    self.name = name
+    self.desc = desc
+    self.columns = columns
+    self.path = path
 
-    obj_name = create_line.group(2)
+  # Contructs new TableViewDocs from whole comment, by splitting it on typed
+  # lines. Returns None for improperly structured schemas.
+  @staticmethod
+  def create_from_comment(path: str, comment: List[str], module: str,
+                          matches: tuple) -> tuple["TableViewDocs", List[str]]:
+    obj_type, name = matches[:2]
 
-    # Ignore 'internal_' tables|views
-    if re.match(r'^internal_.*', obj_name):
-      continue
+    # Ignore internal tables and views.
+    if re.match(r"^internal_.*", name):
+      return None, []
 
-    # Check whether the name starts with module_name
-    starts_with_module_name = re.match(f'^{module}_.*', obj_name)
-    if module == 'common':
-      if starts_with_module_name:
-        sys.stderr.write(
-            f"Invalid name in module {obj_name}. "
-            f"In module 'common' the name shouldn't start with 'common_'.\n")
-        errors += 1
-    else:
-      if not starts_with_module_name:
-        sys.stderr.write(f"Invalid name in module {obj_name}. "
-                         f"View/table name has to begin with {module}_.\n")
-        sys.stderr.write(f'{path}:\n"{line}"\n')
-        errors += 1
+    errors = validate_name(name, module)
+    col_start = None
 
-    # Validate the schema before the create line.
-    for comment_line in fetch_comment(lines[i - 1::-1]):
+    # Splits code into segments by finding beginning of column segment.
+    for i, line in enumerate(comment):
       # Ignore only '--' line.
-      if comment_line == '--':
+      if line == "--":
         continue
 
-      # Break on SQL lines (lines with words without '--' at the beginning)
-      # and empty lines.
-      if not line or not comment_line.startswith('--'):
-        break
+      m = re.match(typed_comment_pattern(), line)
 
-      # Look for '-- @column' line as a column description
-      column_line = re.match(column_pattern(), comment_line)
-      if column_line is not None:
-        if not schema_desc:
-          sys.stderr.write(f"Columns needs to be defined after description.\n")
-          sys.stderr.write(f'{path}:\n"{comment_line}"\n')
-          errors += 1
-          continue
-
-        schema_cols = True
+      # Ignore untyped lines
+      if not m:
         continue
 
-      # The only  option left is a description, but it has to be after
-      # schema columns.
-      schema_desc = True
+      line_type = m.group(1)
+      if line_type == "column" and not col_start:
+        col_start = i
+        continue
 
-    if not schema_cols:
-      sys.stderr.write((f"Missing documentation schema for {obj_name}\n"))
-      sys.stderr.write(f'{path}:\n"{line}"\n')
-      errors += 1
-    obj_name, schema_cols, schema_desc = None, False, False
+    if not col_start:
+      errors.append(f"No columns for {obj_type}.\n"
+                    f"'{name}' in {path}:\n'{line}'\n")
+      return None, errors
 
-  return errors
+    return (
+        TableViewDocs(name, comment[:col_start], comment[col_start:], path),
+        errors,
+    )
+
+  def check_comment(self) -> List[str]:
+    errors = validate_desc(self)
+    errors += validate_columns(self)
+    return errors
 
 
-def parse_args(args_str):
-  errors = 0
-  args = {}
-  for arg_str in args_str.split(","):
-    m = re.match(arg_pattern(), arg_str)
-    if m is None:
-      sys.stderr.write(f"Wrong arguments formatting for '{arg_str}'\n")
-      errors += 1
-      continue
-    args[m.group(1)] = m.group(2)
-  return errors, args
+# Stores documentation for CREATE_FUNCTION with comment split into segments.
+class FunctionDocs:
+
+  def __init__(
+      self,
+      path: str,
+      data_from_sql: dict,
+      module: str,
+      name: str,
+      desc: str,
+      args: List[str],
+      ret: List[str],
+  ):
+    self.path = path
+    self.data_from_sql = data_from_sql
+    self.module = module
+    self.name = name
+    self.desc = desc
+    self.args = args
+    self.ret = ret
+
+  # Contructs new FunctionDocs from whole comment, by splitting it on typed
+  # lines. Returns None for improperly structured schemas.
+  @staticmethod
+  def create_from_comment(path: str, comment: List[str], module: str,
+                          matches: tuple) -> tuple["FunctionDocs", List[str]]:
+    name, args, ret, sql = matches
+
+    # Ignore internal functions.
+    if re.match(r"^INTERNAL_.*", name):
+      return None, []
+
+    errors = validate_name(name, module, upper=True)
+    start_args, start_ret = None, None
+
+    # Splits code into segments by finding beginning of args and ret segments.
+    for i, line in enumerate(comment):
+      # Ignore only '--' line.
+      if line == "--":
+        continue
+
+      m = re.match(typed_comment_pattern(), line)
+
+      # Ignore untyped lines
+      if not m:
+        continue
+
+      line_type = m.group(1)
+      if line_type == "arg" and not start_args:
+        start_args = i
+        continue
+
+      if line_type == "ret" and not start_ret:
+        start_ret = i
+        continue
+
+    if not start_ret or not start_args:
+      errors.append(f"Function requires 'arg' and 'ret' comments.\n"
+                    f"'{name}' in {path}:\n'{line}'\n")
+      return None, errors
+
+    args_dict, parse_errors = parse_args(args)
+    data_from_sql = {'name': name, 'args': args_dict, 'ret': ret, 'sql': sql}
+    return (
+        FunctionDocs(
+            path,
+            data_from_sql,
+            module,
+            name,
+            comment[:start_args],
+            comment[start_args:start_ret],
+            comment[start_ret:],
+        ),
+        errors + parse_errors,
+    )
+
+  def check_comment(self) -> List[str]:
+    errors = validate_desc(self)
+    errors += validate_args(self)
+    errors += validate_ret(self)
+    return errors
 
 
-# Check that CREATE_FUNCTION has a matching schema before it.
-def match_create_functions(sql):
-  errors = 0
+# Stores documentation for CREATE_VIEW_FUNCTION with comment split into
+# segments.
+class ViewFunctionDocs:
 
-  line_to_match_dict = match_pattern(create_function_pattern(), sql)
-  if line_to_match_dict:
-    return []
+  def __init__(
+      self,
+      path: str,
+      data_from_sql: str,
+      module: str,
+      name: str,
+      desc: List[str],
+      args: List[str],
+      columns: List[str],
+  ):
+    self.path = path
+    self.data_from_sql = data_from_sql
+    self.module = module
+    self.name = name
+    self.desc = desc
+    self.args = args
+    self.columns = columns
 
-  functions = {}
-  for line_id, match_groups in line_to_match_dict.items():
-    name = match_groups[0]
-    if re.match(r'^INTERNAL_.*', name):
-      continue
+  # Contructs new ViewFunctionDocs from whole comment, by splitting it on typed
+  # lines. Returns None for improperly structured schemas.
+  @staticmethod
+  def create_from_comment(path: str, comment: List[str], module: str,
+                          matches: tuple[str]
+                         ) -> tuple["ViewFunctionDocs", List[str]]:
+    name, args, columns, sql = matches
 
-    parse_errors, args = parse_args(match_groups[1])
+    # Ignore internal functions.
+    if re.match(r"^INTERNAL_.*", name):
+      return None, []
+
+    errors = validate_name(name, module, upper=True)
+    start_args, start_cols = None, None
+
+    # Splits code into segments by finding beginning of args and cols segments.
+    for i, line in enumerate(comment):
+      # Ignore only '--' line.
+      if line == "--":
+        continue
+
+      m = re.match(typed_comment_pattern(), line)
+
+      # Ignore untyped lines
+      if not m:
+        continue
+
+      line_type = m.group(1)
+      if line_type == "arg" and not start_args:
+        start_args = i
+        continue
+
+      if line_type == "column" and not start_cols:
+        start_cols = i
+        continue
+
+    if not start_cols or not start_args:
+      errors.append(f"Function requires 'arg' and 'column' comments.\n"
+                    f"'{name}' in {path}:\n'{line}'\n")
+      return None, errors
+
+    args_dict, parse_errors = parse_args(args)
     errors += parse_errors
-    functions[line_id] = dict(
-        name=name, args=args, ret_type=match_groups[2], sql=match_groups[3])
 
-  return dict(sorted(functions.items()))
+    cols_dict, parse_errors = parse_args(columns)
+    errors += parse_errors
+
+    data_from_sql = dict(name=name, args=args_dict, columns=cols_dict, sql=sql)
+    return (
+        ViewFunctionDocs(
+            path,
+            data_from_sql,
+            module,
+            name,
+            comment[:start_args],
+            comment[start_args:start_cols],
+            comment[start_cols:],
+        ),
+        errors,
+    )
+
+  def check_comment(self) -> List[str]:
+    errors = validate_desc(self)
+    errors += validate_args(self)
+    errors += validate_columns(self, use_data_from_sql=True)
+    return errors
 
 
-def check_function_docs(path, rev_comment, fun_data):
-  errors = 0
-  has_ret, has_args, has_desc = False, False, False
+# Whether the name starts with module_name.
+def validate_name(name: str, module: str, upper: bool = False) -> List[str]:
+  module_pattern = f"^{module}_.*"
+  if upper:
+    module_pattern = module_pattern.upper()
+  starts_with_module_name = re.match(module_pattern, name)
+  if module == "common":
+    if starts_with_module_name:
+      return [(f"Invalid name in module {name}. "
+               f"In module 'common' the name shouldn't "
+               f"start with '{module_pattern}'.\n")]
+  else:
+    if not starts_with_module_name:
+      return [(f"Invalid name in module {name}. "
+               f"Name has to begin with '{module_pattern}'.\n")]
+  return []
 
-  for line in rev_comment:
-    # Break if the comment is finished
-    if not line or not line.startswith('--'):
-      break
 
-    # Ignore empty lines
+# Whether the only typed comment in provided comment segment is of type
+# `comment_type`.
+def validate_typed_comment(
+    comment_segment: str,
+    comment_type: str,
+    docs: Union["TableViewDocs", "FunctionDocs", "ViewFunctionDocs"],
+) -> List[str]:
+  for line in comment_segment:
+    # Ignore only '--' line.
     if line == "--":
       continue
 
-    if line.startswith('-- @ret'):
-      if has_ret:
-        sys.stderr.write(f"Function can only return one element: '{line}'\n")
-        sys.stderr.write(f'{path}:\n"{line}"\n')
-        errors += 1
-        continue
+    m = re.match(typed_comment_pattern(), line)
 
-      m = re.match(function_return_pattern(), line)
-      if m is None:
-        sys.stderr.write("The return docs formatting is wrong. It should be:\n"
-                         "-- @ret [A-Z]* {desc}\n")
-        sys.stderr.write(f'{path}:\n"{line}"\n')
-        errors += 1
-        continue
-
-      if fun_data['ret_type'] != m.group(1):
-        sys.stderr.write(
-            f"The code specifies {fun_data['ret_type']} as return type, "
-            f"but its {m.group(1)} in docs.\n")
-        sys.stderr.write(f'{path}:\n"{line}"\n')
-        errors += 1
-        continue
-
-      has_ret = True
+    # Ignore untyped lines
+    if not m:
       continue
 
-    if line.startswith('-- @arg'):
-      if not has_ret:
-        sys.stderr.write(
-            f"Arguments should be specified before return: '{line}'\n")
-        sys.stderr.write(f'{path}:\n"{line}"\n')
-        errors += 1
-        continue
+    line_type = m.group(1)
 
-      m = re.match(args_pattern(), line)
-      if m is None:
-        sys.stderr.write("The arg docs formatting is wrong. It should be:\n"
-                         "-- @arg [a-z_]* [A-Z]* {desc}\n")
-        sys.stderr.write(f'{path}:\n"{line}"\n')
-        errors += 1
-        continue
+    if line_type != comment_type:
+      return [(
+          f"Wrong comment type. Expected '{comment_type}', got '{line_type}'.\n"
+          f"'{docs.name}' in {docs.path}:\n'{line}'\n")]
+  return []
 
-      arg_name, arg_type = m.group(1), m.group(2)
-      if arg_name not in fun_data['args']:
-        sys.stderr.write(
-            f"There is not argument '{arg_name} specified in code.\n")
-        sys.stderr.write(f'{path}:\n"{line}"\n')
-        errors += 1
-        continue
 
-      if arg_type != fun_data['args'][arg_name]:
-        sys.stderr.write(
-            f"In the code, the type of '{arg_name} is "
-            f"{fun_data['args'][arg_name]}, but according to the docs "
-            f"it is '{arg_type}.\n")
-        sys.stderr.write(f'{path}:\n"{line}"\n')
-        errors += 1
-        continue
+# Whether comment segment with description of the object contains content.
+def validate_desc(
+    docs: Union["TableViewDocs", "FunctionDocs", "ViewFunctionDocs"]
+) -> List[str]:
+  for line in docs.desc:
+    if line == "--":
+      continue
+    return []
+  return [(f"Missing documentation for {docs.name}\n"
+           f"'{docs.name}' in {docs.path}:\n'{line}'\n")]
 
-      has_args = True
+
+# Whether comment segment about columns contain proper schema. Can be matched
+# against parsed SQL data by setting `use_data_from_sql`.
+def validate_columns(docs: Union["TableViewDocs", "ViewFunctionDocs"],
+                     use_data_from_sql=False) -> List[str]:
+  errors = validate_typed_comment(docs.columns, "column", docs)
+
+  if errors:
+    return errors
+
+  if use_data_from_sql:
+    cols_from_sql = docs.data_from_sql["columns"]
+
+  for line in docs.columns:
+    # Ignore only '--' line.
+    if line == "--" or not line.startswith("-- @column"):
       continue
 
-    if has_args:
-      has_desc = True
+    # Look for '-- @column' line as a column description
+    m = re.match(column_pattern(), line)
+    if not m:
+      errors.append(f"Wrong column description.\n"
+                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
+      continue
+
+    if not use_data_from_sql:
       return errors
 
-  if not has_ret:
-    sys.stderr.write(f"Return value was not specified in the documentation "
-                     f"of function '{fun_data['name']}'.\n")
-    sys.stderr.write(f'{path}')
-    errors += 1
+    col_name = m.group(1)
+    if col_name not in cols_from_sql:
+      errors.append(f"There is no argument '{col_name}' specified in code.\n"
+                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
+      continue
+
+    cols_from_sql.pop(col_name)
+
+  if not use_data_from_sql:
+    errors.append(f"Missing columns for {docs.name}\n{docs.path}\n")
     return errors
 
-  if not has_args:
-    sys.stderr.write(f"Arguments were not specified in the documentation "
-                     f"of function '{fun_data['name']}'.\n")
-    sys.stderr.write(f'{path}')
-    errors += 1
+  if not cols_from_sql:
     return errors
 
-  if not has_desc:
-    sys.stderr.write(f"Missing description of function '{fun_data['name']}'.\n")
-    sys.stderr.write(f'{path}')
-    errors += 1
+  errors.append(
+      f"Missing documentation of columns: {list(cols_from_sql.keys())}.\n"
+      f"'{docs.name}' in {docs.path}:\n")
+  return errors
+
+
+# Whether comment segment about columns contain proper schema. Matches against
+# parsed SQL data.
+def validate_args(docs: "FunctionDocs") -> List[str]:
+  errors = validate_typed_comment(docs.args, "arg", docs)
+
+  if errors:
     return errors
 
+  args_from_sql = docs.data_from_sql["args"]
+  for line in docs.args:
+    # Ignore only '--' line.
+    if line == "--" or not line.startswith("-- @"):
+      continue
 
-def check_create_functions(path, module, sql):
-  errors = 0
-  matched_create_functions = match_create_functions(sql)
+    m = re.match(args_pattern(), line)
+    if m is None:
+      errors.append("The arg docs formatting is wrong. It should be:\n"
+                    "-- @arg [a-z_]* [A-Z]* {desc}\n"
+                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
+      return errors
 
-  if not bool(matched_create_functions):
+    arg_name, arg_type = m.group(1), m.group(2)
+    if arg_name not in args_from_sql:
+      errors.append(f"There is not argument '{arg_name}' specified in code.\n"
+                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
+      continue
+
+    arg_type_from_sql = args_from_sql.pop(arg_name)
+    if arg_type != arg_type_from_sql:
+      errors.append(f"In the code, the type of '{arg_name}' is "
+                    f"'{arg_type_from_sql}', but according to the docs "
+                    f"it is '{arg_type}'.\n"
+                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
+
+  if not args_from_sql:
     return errors
 
-  lines = sql.split('\n')
-  for line_id, fun_data in matched_create_functions.items():
-    starts_with_module = fun_data['name'].startswith('{module}_'.upper())
-    if module == 'common' and starts_with_module:
-      sys.stderr.write(
-          f"For module 'common', function name shouldn't start with "
-          f"'COMMON_', as in {fun_data['name']}'.\n")
-      sys.stderr.write(f'{path}')
-      errors += 1
-    if module != 'common' and not starts_with_module:
-      sys.stderr.write(f"Function name ({fun_data['name']}) "
-                       f"should start with '{module.upper()}_'\n")
-      sys.stderr.write(f'{path}')
-      errors += 1
-    errors += check_function_docs(path, lines[line_id - 1::-1], fun_data)
+  errors.append(
+      f"Missing documentation of args: {list(args_from_sql.keys())}.\n"
+      f"{docs.path}\n")
+  return errors
+
+
+# Whether comment segment about return contain proper schema. Matches against
+# parsed SQL data.
+def validate_ret(docs: "FunctionDocs") -> List[str]:
+  errors = validate_typed_comment(docs.ret, "ret", docs)
+  if errors:
+    return errors
+
+  ret_type_from_sql = docs.data_from_sql["ret"]
+
+  for line in docs.ret:
+    # Ignore only '--' line.
+    if line == "--" or not line.startswith("-- @ret"):
+      continue
+
+    m = re.match(function_return_pattern(), line)
+    if m is None:
+      return [("The return docs formatting is wrong. It should be:\n"
+               "-- @ret [A-Z]* {desc}\n"
+               f"'{docs.name}' in {docs.path}:\n'{line}'\n")]
+    docs_ret_type = m.group(1)
+    if ret_type_from_sql != docs_ret_type:
+      return [(f"The return type in docs is '{docs_ret_type}', "
+               f"but it is {ret_type_from_sql} in code.\n"
+               f"'{docs.name}' in {docs.path}:\n'{line}'\n")]
+    return []
+
+
+# Parses string with multiple arguments with type separated by comma into dict.
+def parse_args(args_str: str) -> tuple[dict, List[str]]:
+  errors = []
+  args = {}
+  for arg_str in args_str.split(","):
+    m = re.match(arg_str_pattern(), arg_str)
+    if m is None:
+      errors.append(f"Wrong arguments formatting for '{arg_str}'\n")
+      continue
+    args[m.group(1)] = m.group(2)
+  return args, errors
+
+
+# After matching file to pattern, fetches and validates related documentation.
+def validate_docs_for_sql_object_type(path: str, module: str, sql: str,
+                                      pattern: str, docs_object: type):
+  errors = []
+  line_id_to_match = match_pattern(pattern, sql)
+  lines = sql.split("\n")
+  for line_id, matches in line_id_to_match.items():
+    # Fetch comment by looking at lines over beginning of match in reverse
+    # order.
+    comment = fetch_comment(lines[line_id - 1::-1])
+    docs, obj_errors = docs_object.create_from_comment(path, comment, module,
+                                                       matches)
+    errors += obj_errors
+    if docs:
+      errors += docs.check_comment()
 
   return errors
 
 
-def check(path):
-  errors = 0
+def check(path: str):
+  errors = []
 
   # Get module name
-  module_name = path.split('/stdlib/')[-1].split('/')[0]
+  module_name = path.split("/stdlib/")[-1].split("/")[0]
 
   with open(path) as f:
     sql = f.read()
 
-  errors += check_create_table_view(path, module_name, sql)
-  errors += check_create_functions(path, module_name, sql)
+  errors += validate_docs_for_sql_object_type(path, module_name, sql,
+                                              create_table_view_pattern(),
+                                              TableViewDocs)
+  errors += validate_docs_for_sql_object_type(path, module_name, sql,
+                                              create_function_pattern(),
+                                              FunctionDocs)
+  errors += validate_docs_for_sql_object_type(path, module_name, sql,
+                                              create_view_function_pattern(),
+                                              ViewFunctionDocs)
   return errors
 
 
 def main():
-  errors = 0
-  metrics_sources = os.path.join(ROOT_DIR, 'src', 'trace_processor', 'stdlib')
+  errors = []
+  metrics_sources = os.path.join(ROOT_DIR, "src", "trace_processor", "stdlib")
   for root, _, files in os.walk(metrics_sources, topdown=True):
     for f in files:
       path = os.path.join(root, f)
-      if path.endswith('.sql'):
+      if path.endswith(".sql"):
         errors += check(path)
-  return 0 if errors == 0 else 1
+  sys.stderr.write("\n\n".join(errors))
+  return 0 if not errors else 1
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
   sys.exit(main())
commit	a6cdc364378720e8a2fd85415336c54d8a673933	[log] [tgz]
author	Anna Mayzner <mayzner@google.com>	Thu Dec 15 03:08:46 2022 +0000
committer	Anna Mayzner <mayzner@google.com>	Thu Dec 15 03:08:46 2022 +0000
tree	53d191007cd290a32dc701953db4ae08c7efcc70
parent	165c172f3a6e30aa98495eaef67d5042cb4133a8 [diff] [blame]