tp: Generate stdlib documentation

Result: https://screenshot.googleplex.com/4kNXxsoX66rVMwB

Bug:255535171
Change-Id: Ia4f0dcbdcaffc07158f40c3a1b024ce5f36824bd
diff --git a/tools/check_sql_modules.py b/tools/check_sql_modules.py
index d526d37..33b1ed2 100755
--- a/tools/check_sql_modules.py
+++ b/tools/check_sql_modules.py
@@ -23,74 +23,255 @@
 import os
 import re
 import sys
+from sql_modules_utils import *
 
 ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
 
-def check(path):
-
-  # Get module name
-  module_name = path.split('/stdlib/')[-1].split('/')[0]
-
-  with open(path) as f:
-    lines = [l.strip() for l in f.readlines()]
-
-  # Check that CREATE VIEW/TABLE has a matching schema before it.
+# Check that CREATE VIEW/TABLE has a matching schema before it.
+def check_create_table_view(path, module, sql):
   errors = 0
-  obj_type, obj_name, schema_cols, schema_desc = None, None, False, False
-  for i in range(len(lines)):
-    m = re.match(
-        r'^CREATE (?:VIRTUAL )?(TABLE|VIEW)?'
-        r'(?:IF NOT EXISTS)? (.*) (?:AS|USING).*', lines[i])
+  obj_name, schema_cols, schema_desc = None, False, False
+  lines = sql.split('\n')
+  for i, line in enumerate(lines):
+    create_line = re.match(create_table_view_pattern(), line)
 
     # Ignore all lines that don't create an object
-    if m is None:
+    if create_line is None:
       continue
 
-    obj_name = m.group(2)
+    obj_name = create_line.group(2)
 
     # Ignore 'internal_' tables|views
     if re.match(r'^internal_.*', obj_name):
       continue
 
     # Check whether the name starts with module_name
-    if not re.match(f'^{module_name}_.*', obj_name):
-      sys.stderr.write(f"Invalid name in module {obj_name}. "
-                       f"View/table name has to begin with {module_name}_.\n")
-      sys.stderr.write(('%s:\n"%s"\n') % (path, lines[i]))
-      errors += 1
+    starts_with_module_name = re.match(f'^{module}_.*', obj_name)
+    if module == 'common':
+      if starts_with_module_name:
+        sys.stderr.write(
+            f"Invalid name in module {obj_name}. "
+            f"In module 'common' the name shouldn't start with 'common_'.\n")
+        errors += 1
+    else:
+      if not starts_with_module_name:
+        sys.stderr.write(f"Invalid name in module {obj_name}. "
+                         f"View/table name has to begin with {module}_.\n")
+        sys.stderr.write(f'{path}:\n"{line}"\n')
+        errors += 1
 
     # Validate the schema before the create line.
-    lines_over_create = lines[i - 1::-1]
-    for line in lines_over_create:
-      # Ignore empty lines, or only '--' line.
-      if not line or line == '--':
+    for comment_line in fetch_comment(lines[i - 1::-1]):
+      # Ignore only '--' line.
+      if comment_line == '--':
         continue
 
       # Break on SQL lines (lines with words without '--' at the beginning)
-      if not line.startswith('--'):
+      # and empty lines.
+      if not line or not comment_line.startswith('--'):
         break
 
       # Look for '-- @column' line as a column description
-      m = re.match(r'^-- @column[ \t]+(\w+)[ \t]+(.*)', line)
-      if m is not None:
+      column_line = re.match(column_pattern(), comment_line)
+      if column_line is not None:
+        if not schema_desc:
+          sys.stderr.write(f"Columns needs to be defined after description.\n")
+          sys.stderr.write(f'{path}:\n"{comment_line}"\n')
+          errors += 1
+          continue
+
         schema_cols = True
         continue
 
       # The only  option left is a description, but it has to be after
       # schema columns.
-      if schema_cols:
-        schema_desc = True
+      schema_desc = True
 
-    if not schema_cols or not schema_desc:
+    if not schema_cols:
       sys.stderr.write((f"Missing documentation schema for {obj_name}\n"))
-      sys.stderr.write(('%s:\n"%s"\n') % (path, lines[i]))
+      sys.stderr.write(f'{path}:\n"{line}"\n')
       errors += 1
-    d_type, d_name, schema_cols, schema_desc = None, None, False, False
+    obj_name, schema_cols, schema_desc = None, False, False
 
   return errors
 
 
+def parse_args(args_str):
+  errors = 0
+  args = {}
+  for arg_str in args_str.split(","):
+    m = re.match(arg_pattern(), arg_str)
+    if m is None:
+      sys.stderr.write(f"Wrong arguments formatting for '{arg_str}'\n")
+      errors += 1
+      continue
+    args[m.group(1)] = m.group(2)
+  return errors, args
+
+
+# Check that CREATE_FUNCTION has a matching schema before it.
+def match_create_functions(sql):
+  errors = 0
+
+  line_to_match_dict = match_pattern(create_function_pattern(), sql)
+  if line_to_match_dict:
+    return []
+
+  functions = {}
+  for line_id, match_groups in line_to_match_dict.items():
+    name = match_groups[0]
+    if re.match(r'^INTERNAL_.*', name):
+      continue
+
+    parse_errors, args = parse_args(match_groups[1])
+    errors += parse_errors
+    functions[line_id] = dict(
+        name=name, args=args, ret_type=match_groups[2], sql=match_groups[3])
+
+  return dict(sorted(functions.items()))
+
+
+def check_function_docs(path, rev_comment, fun_data):
+  errors = 0
+  has_ret, has_args, has_desc = False, False, False
+
+  for line in rev_comment:
+    # Break if the comment is finished
+    if not line or not line.startswith('--'):
+      break
+
+    # Ignore empty lines
+    if line == "--":
+      continue
+
+    if line.startswith('-- @ret'):
+      if has_ret:
+        sys.stderr.write(f"Function can only return one element: '{line}'\n")
+        sys.stderr.write(f'{path}:\n"{line}"\n')
+        errors += 1
+        continue
+
+      m = re.match(function_return_pattern(), line)
+      if m is None:
+        sys.stderr.write("The return docs formatting is wrong. It should be:\n"
+                         "-- @ret [A-Z]* {desc}\n")
+        sys.stderr.write(f'{path}:\n"{line}"\n')
+        errors += 1
+        continue
+
+      if fun_data['ret_type'] != m.group(1):
+        sys.stderr.write(
+            f"The code specifies {fun_data['ret_type']} as return type, "
+            f"but its {m.group(1)} in docs.\n")
+        sys.stderr.write(f'{path}:\n"{line}"\n')
+        errors += 1
+        continue
+
+      has_ret = True
+      continue
+
+    if line.startswith('-- @arg'):
+      if not has_ret:
+        sys.stderr.write(
+            f"Arguments should be specified before return: '{line}'\n")
+        sys.stderr.write(f'{path}:\n"{line}"\n')
+        errors += 1
+        continue
+
+      m = re.match(args_pattern(), line)
+      if m is None:
+        sys.stderr.write("The arg docs formatting is wrong. It should be:\n"
+                         "-- @arg [a-z_]* [A-Z]* {desc}\n")
+        sys.stderr.write(f'{path}:\n"{line}"\n')
+        errors += 1
+        continue
+
+      arg_name, arg_type = m.group(1), m.group(2)
+      if arg_name not in fun_data['args']:
+        sys.stderr.write(
+            f"There is not argument '{arg_name} specified in code.\n")
+        sys.stderr.write(f'{path}:\n"{line}"\n')
+        errors += 1
+        continue
+
+      if arg_type != fun_data['args'][arg_name]:
+        sys.stderr.write(
+            f"In the code, the type of '{arg_name} is "
+            f"{fun_data['args'][arg_name]}, but according to the docs "
+            f"it is '{arg_type}.\n")
+        sys.stderr.write(f'{path}:\n"{line}"\n')
+        errors += 1
+        continue
+
+      has_args = True
+      continue
+
+    if has_args:
+      has_desc = True
+      return errors
+
+  if not has_ret:
+    sys.stderr.write(f"Return value was not specified in the documentation "
+                     f"of function '{fun_data['name']}'.\n")
+    sys.stderr.write(f'{path}')
+    errors += 1
+    return errors
+
+  if not has_args:
+    sys.stderr.write(f"Arguments were not specified in the documentation "
+                     f"of function '{fun_data['name']}'.\n")
+    sys.stderr.write(f'{path}')
+    errors += 1
+    return errors
+
+  if not has_desc:
+    sys.stderr.write(f"Missing description of function '{fun_data['name']}'.\n")
+    sys.stderr.write(f'{path}')
+    errors += 1
+    return errors
+
+
+def check_create_functions(path, module, sql):
+  errors = 0
+  matched_create_functions = match_create_functions(sql)
+
+  if not bool(matched_create_functions):
+    return errors
+
+  lines = sql.split('\n')
+  for line_id, fun_data in matched_create_functions.items():
+    starts_with_module = fun_data['name'].startswith('{module}_'.upper())
+    if module == 'common' and starts_with_module:
+      sys.stderr.write(
+          f"For module 'common', function name shouldn't start with "
+          f"'COMMON_', as in {fun_data['name']}'.\n")
+      sys.stderr.write(f'{path}')
+      errors += 1
+    if module != 'common' and not starts_with_module:
+      sys.stderr.write(f"Function name ({fun_data['name']}) "
+                       f"should start with '{module.upper()}_'\n")
+      sys.stderr.write(f'{path}')
+      errors += 1
+    errors += check_function_docs(path, lines[line_id - 1::-1], fun_data)
+
+  return errors
+
+
+def check(path):
+  errors = 0
+
+  # Get module name
+  module_name = path.split('/stdlib/')[-1].split('/')[0]
+
+  with open(path) as f:
+    sql = f.read()
+
+  errors += check_create_table_view(path, module_name, sql)
+  errors += check_create_functions(path, module_name, sql)
+  return errors
+
+
 def main():
   errors = 0
   metrics_sources = os.path.join(ROOT_DIR, 'src', 'trace_processor', 'stdlib')