[DNS] tp: simplify stdlib docs generation and make it more robust The docs generator had some minor bugs with writing output to stderr instead of propagating errors back to the caller. As part of trying to fix this, I realised that the amount of state which is passed between functions makes it very difficult to understand what is happening. Moreover, there seemed to be quite a lot of duplication (i.e. repeated parsing using regexes, merging annotations across multiple lines), subtle logic and unnecessary complexity in how we were finding the boundaries of annotations. To address this, rework the docs generator to instead be split into two stages: a) extraction, which is responsible for extracting the comments from the SQL and "tokenizing" them into description and annotations (including merging sequential lines related to the same column) b) parsing, which is responsible for actually verifying the semantics of the extracted documentation and parsing it into the JSON dictionary format expected by the markdown generator Bug: 283524256 Change-Id: Iccad2e64a4cb02411e39914d1b6cdbaa343a3611

commit: 6b2ac6940bfdfb2c5565a3ae8775a30afa90ef86 [log] [tgz]
author: Lalit Maganti <lalitm@google.com> Tue May 23 01:24:29 2023 +0100
committer: Lalit Maganti <lalitm@google.com> Tue May 23 01:24:29 2023 +0100
tree: 95dcff67e63ed80f11c1610f8f0d1b342ec64643
parent: 24fae90b5e4ce0b2b970670e2795f9c258db5033 [diff]
diff --git a/python/generators/stdlib_docs/extractor.py b/python/generators/stdlib_docs/extractor.py
new file mode 100644
index 0000000..94db4d3
--- /dev/null
+++ b/python/generators/stdlib_docs/extractor.py

@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Copyright (C) 2022 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from re import Match
+from typing import List, Optional, Tuple
+
+from python.generators.stdlib_docs.types import ObjKind
+from python.generators.stdlib_docs.utils import extract_comment
+from python.generators.stdlib_docs.utils import match_pattern
+from python.generators.stdlib_docs.utils import PATTERN_BY_KIND
+
+
+class DocsExtractor:
+  """Extracts documentation for views/tables/functions from SQL."""
+  path: str
+  module_name: str
+  sql: str
+
+  @dataclass
+  class Annotation:
+    key: str
+    value: str
+
+  @dataclass
+  class Extract:
+    """Extracted documentation for a single view/table/function."""
+    obj_kind: ObjKind
+    obj_match: Match
+
+    description: str
+    annotations: List['DocsExtractor.Annotation']
+
+  def __init__(self, path: str, module_name: str, sql: str):
+    self.path = path
+    self.module_name = module_name
+    self.sql = sql
+
+    self.sql_lines = sql.split("\n")
+    self.errors = []
+
+  def extract(self) -> List[Extract]:
+    extracted = []
+    extracted += self._extract_for_kind(ObjKind.table_view)
+    extracted += self._extract_for_kind(ObjKind.function)
+    extracted += self._extract_for_kind(ObjKind.view_function)
+    return extracted
+
+  def _extract_for_kind(self, kind: ObjKind) -> List[Extract]:
+    line_number_to_matches = match_pattern(PATTERN_BY_KIND[kind], self.sql)
+    extracts = []
+    for line_number, match in sorted(list(line_number_to_matches.items())):
+      comment_lines = extract_comment(self.sql_lines, line_number)
+      e = self._extract_from_comment(kind, match, comment_lines)
+      if e:
+        extracts.append(e)
+    return extracts
+
+  def _extract_from_comment(self, kind: ObjKind, match: Match,
+                            comment_lines: List[str]) -> Optional[Extract]:
+    extract = DocsExtractor.Extract(kind, match, '', [])
+    for line in comment_lines:
+      assert line.startswith('--')
+
+      # Remove the comment.
+      stripped = line.lstrip('--').lstrip()
+
+      # Ignore lines which only contain '--'.
+      if not stripped:
+        continue
+
+      # Check if the line is an annotation.
+      if not stripped.startswith('@'):
+        # Not an annotation line: if we haven't seen an annotation yet, we
+        # must still be parsing the description, so just add to that.
+        if not extract.annotations:
+          extract.description += stripped + " "
+          continue
+
+        # Otherwise, add to the latest annotation.
+        extract.annotations[-1].value += " " + stripped
+        continue
+
+      # This line is an annotation: find its name and add a new entry
+      annotation, rest = stripped.split(' ', 1)
+      extract.annotations.append(DocsExtractor.Annotation(annotation, rest))
+    return extract

diff --git a/python/generators/stdlib_docs/parse.py b/python/generators/stdlib_docs/parse.py
index bdef717..a78bcca 100644
--- a/python/generators/stdlib_docs/parse.py
+++ b/python/generators/stdlib_docs/parse.py

@@ -13,102 +13,261 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from abc import ABC
+from dataclasses import dataclass
 import re
-from typing import Union, List, Tuple
+import sys
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
-from python.generators.stdlib_docs import stdlib
-from python.generators.stdlib_docs.utils import Errors, Pattern, get_text, fetch_comment, match_pattern
+from python.generators.stdlib_docs.extractor import DocsExtractor
+from python.generators.stdlib_docs.types import ObjKind
+from python.generators.stdlib_docs.utils import ARG_ANNOTATION_PATTERN
+from python.generators.stdlib_docs.utils import NAME_AND_TYPE_PATTERN
+from python.generators.stdlib_docs.utils import FUNCTION_RETURN_PATTERN
+from python.generators.stdlib_docs.utils import COLUMN_ANNOTATION_PATTERN
 
 
-def parse_desc(docs: 'stdlib.AnyDocs') -> str:
-  desc_lines = [get_text(line, False) for line in docs.desc]
-  return ' '.join(desc_lines).strip('\n').strip()
+class AbstractDocParser(ABC):
 
+  @dataclass
+  class Column:
+    pass
 
-# Whether comment segment about columns contain proper schema. Can be matched
-# against parsed SQL data by setting `use_data_from_sql`.
-def parse_columns(docs: Union['stdlib.TableViewDocs', 'stdlib.ViewFunctionDocs']
-                 ) -> dict:
-  cols = {}
-  last_col = None
-  last_desc = []
-  for line in docs.columns:
-    # Ignore only '--' line.
-    if line == "--" or not line.startswith("-- @column"):
-      last_desc.append(get_text(line))
-      continue
+  def __init__(self, path: str, module: str):
+    self.path = path
+    self.module = module
+    self.name = None
+    self.errors = []
 
-    # Look for '-- @column' line as a column description
-    m = re.match(Pattern['column'], line)
-    if last_col:
-      cols[last_col] = ' '.join(last_desc)
+  def _parse_name(self, upper: bool = False):
+    assert self.name
+    assert isinstance(self.name, str)
+    module_pattern = f"^{self.module}_.*"
+    if upper:
+      module_pattern = module_pattern.upper()
+    starts_with_module_name = re.match(module_pattern, self.name)
+    if self.module == "common":
+      if starts_with_module_name:
+        self._error('Names of tables/views/functions in the "common" module '
+                    f'should not start with {module_pattern}')
+      return self.name
+    if not starts_with_module_name:
+      self._error('Names of tables/views/functions should be prefixed with the '
+                  f'module name (i.e. should start with {module_pattern})')
+    return self.name.strip()
+
+  def _parse_desc_not_empty(self, desc: str):
+    if not desc:
+      self._error('Description of the table/view/function is missing')
+    return desc.strip()
+
+  def _validate_only_contains_annotations(self,
+                                          ans: List[DocsExtractor.Annotation],
+                                          ans_types: Set[str]):
+    used_ans_types = set(a.key for a in ans)
+    for type in used_ans_types.difference(ans_types):
+      self._error(f'Unknown documentation annotation {type}')
+
+  def _parse_columns(self, ans: List[DocsExtractor.Annotation],
+                     sql_cols_str: str) -> Dict[str, str]:
+    cols = {}
+    for t in ans:
+      if t.key != '@column':
+        continue
+      m = re.match(COLUMN_ANNOTATION_PATTERN, t.value)
+      if not m:
+        self._error(f'@column annotation value {t.value} does not match '
+                    f'pattern {COLUMN_ANNOTATION_PATTERN}')
+        continue
+      cols[m.group(1)] = m.group(2).strip()
+
+    sql_cols = self._parse_name_and_types_str(sql_cols_str)
+    if sql_cols:
+      for col in set(cols.keys()).difference(sql_cols.keys()):
+        self._error(f'@column "{col}" documented but does not exist in '
+                    'function definition')
+      for col in set(sql_cols.keys()).difference(cols):
+        self._error(f'Column "{col}" defined in SQL but is not documented with '
+                    '@column')
+    return cols
+
+  def _parse_args(self, ans: List[DocsExtractor.Annotation],
+                  sql_args_str: str) -> Dict[str, Any]:
+    args = {}
+    for an in ans:
+      if an.key != '@arg':
+        continue
+      m = re.match(ARG_ANNOTATION_PATTERN, an.value)
+      if m is None:
+        self._error(f'Expected arg documentation "{an.value}" to match pattern '
+                    f'{ARG_ANNOTATION_PATTERN}')
+        continue
+      args[m.group(1)] = {'type': m.group(2), 'desc': m.group(3).strip()}
+
+    sql_args = self._parse_name_and_types_str(sql_args_str)
+    if sql_args:
+      for col in set(args.keys()).difference(sql_args.keys()):
+        self._error(f'Arg "{col}" documented with @arg but does not exist '
+                    'in function definition')
+      for arg in set(sql_args.keys()).difference(args.keys()):
+        self._error(f'Arg "{arg}" defined in SQL but is not documented with '
+                    '@arg')
+    return args
+
+  def _parse_ret(self, ans: List[DocsExtractor.Annotation],
+                 sql_ret_type: str) -> Tuple[str, str]:
+    rets = [a.value for a in ans if a.key == '@ret']
+    if len(rets) != 1:
+      self._error('Return value is not documentated with @ret')
+      return '', ''
+
+    ret = rets[0]
+    m = re.match(FUNCTION_RETURN_PATTERN, ret)
     if not m:
-      print(f'Expected line {line} to match @column format', file=sys.stderr)
-    last_col, last_desc = m.group(1), [m.group(2)]
+      self._error(
+          f'@ret {ret} does not match pattern {FUNCTION_RETURN_PATTERN}')
+      return '', ''
 
-  cols[last_col] = ' '.join(last_desc)
-  return cols
+    ret_type, ret_desc = m.group(1), m.group(2)
+    if ret_type != sql_ret_type:
+      self._error(
+          f'@ret {ret_type} does not match SQL return type {sql_ret_type}')
+      return '', ''
+    return ret_type, ret_desc.strip()
+
+  def _parse_name_and_types_str(self, args_str: str) -> Dict[str, str]:
+    if not args_str:
+      return {}
+
+    args = {}
+    for arg_str in args_str.split(","):
+      m = re.match(NAME_AND_TYPE_PATTERN, arg_str)
+      if m is None:
+        self._error(f'Expected "{arg_str}" to match pattern '
+                    f'{NAME_AND_TYPE_PATTERN}')
+        continue
+      args[m.group(1)] = m.group(2).strip()
+    return args
+
+  def _error(self, error: str):
+    self.errors.append(
+        f'Error while parsing documentation for {self.name} in {self.path}: '
+        f'{error}')
 
 
-def parse_args(docs: "stdlib.FunctionDocs") -> dict:
-  if not docs.args:
-    return {}
+class TableViewDocParser(AbstractDocParser):
+  """Parses documentation for CREATE TABLE and CREATE VIEW statements."""
 
-  args = {}
-  last_arg, last_desc, last_type = None, [], None
-  for line in docs.args:
-    # Ignore only '--' line.
-    if line == "--" or not line.startswith("-- @arg"):
-      last_desc.append(get_text(line))
-      continue
+  def __init__(self, path: str, module: str):
+    super().__init__(path, module)
 
-    m = re.match(Pattern['args'], line)
-    if last_arg:
-      args[last_arg] = {'type': last_type, 'desc': ' '.join(last_desc)}
-    last_arg, last_type, last_desc = m.group(1), m.group(2), [m.group(3)]
+  def parse(self, doc: DocsExtractor.Extract) -> Optional[Dict[str, Any]]:
+    assert doc.obj_kind == ObjKind.table_view
 
-  args[last_arg] = {'type': last_type, 'desc': ' '.join(last_desc)}
-  return args
+    # Ignore internal tables and views.
+    self.name = doc.obj_match[1]
+    if re.match(r'^internal_.*', self.name):
+      return None
+
+    self._validate_only_contains_annotations(doc.annotations, {'@column'})
+    return {
+        'name': self._parse_name(),
+        'type': doc.obj_match[0],
+        'desc': self._parse_desc_not_empty(doc.description),
+        'cols': self._parse_columns(doc.annotations, ''),
+    }
 
 
-# Whether comment segment about return contain proper schema. Matches against
-# parsed SQL data.
-def parse_ret(docs: "stdlib.FunctionDocs") -> Tuple[str, str]:
-  desc = []
-  for line in docs.ret:
-    # Ignore only '--' line.
-    if line == "--" or not line.startswith("-- @ret"):
-      desc.append(get_text(line))
+class FunctionDocParser(AbstractDocParser):
+  """Parses documentation for CREATE_FUNCTION statements."""
 
-    m = re.match(Pattern['return_arg'], line)
-    if not m:
-      print(f'Expected line {line} to match @ret format', file=sys.stderr)
-    ret_type, desc = m.group(1), [m.group(2)]
-  return (ret_type, ' '.join(desc))
+  def __init__(self, path: str, module: str):
+    super().__init__(path, module)
+
+  def parse(self, doc: DocsExtractor.Extract) -> Optional[Dict[str, Any]]:
+    self.name, args, ret, _ = doc.obj_match
+
+    # Ignore internal functions.
+    if re.match(r'^INTERNAL_.*', self.name):
+      return None
+
+    self._validate_only_contains_annotations(doc.annotations, {'@arg', '@ret'})
+
+    ret_type, ret_desc = self._parse_ret(doc.annotations, ret)
+    return {
+        'name': self._parse_name(upper=True),
+        'desc': self._parse_desc_not_empty(doc.description),
+        'args': self._parse_args(doc.annotations, args),
+        'return_type': ret_type,
+        'return_desc': ret_desc,
+    }
 
 
-# After matching file to Pattern, fetches and validates related documentation.
-def parse_typed_docs(path: str, module: str, sql: str, Pattern: str,
-                     docs_object: type
-                    ) -> Tuple[List['stdlib.AnyDocs'], Errors]:
+class ViewFunctionDocParser(AbstractDocParser):
+  """Parses documentation for CREATE_VIEW_FUNCTION statements."""
+
+  def __init__(self, path: str, module: str):
+    super().__init__(path, module)
+
+  def parse(self, doc: DocsExtractor.Extract) -> Optional[Dict[str, Any]]:
+    self.name, args, columns, _ = doc.obj_match
+
+    # Ignore internal functions.
+    if re.match(r'^INTERNAL_.*', self.name):
+      return None
+
+    self._validate_only_contains_annotations(doc.annotations,
+                                             {'@arg', '@column'})
+    return {
+        'name': self._parse_name(upper=True),
+        'desc': self._parse_desc_not_empty(doc.description),
+        'cols': self._parse_columns(doc.annotations, columns),
+        'args': self._parse_args(doc.annotations, args),
+    }
+
+
+# Reads the provided SQL and, if possible, generates a dictionary with data
+# from documentation together with errors from validation of the schema.
+def parse_file_to_dict(path: str, sql: str) -> Union[Dict[str, Any], List[str]]:
+  if sys.platform.startswith('win'):
+    path = path.replace('\\', '/')
+
+  # Get module name
+  module_name = path.split('/stdlib/')[-1].split('/')[0]
+
+  # Extract all the docs from the SQL.
+  extractor = DocsExtractor(path, module_name, sql)
+  docs = extractor.extract()
+  if extractor.errors:
+    return extractor.errors
+
+  # Parse the extracted docs.
   errors = []
-  line_id_to_match = match_pattern(Pattern, sql)
-  lines = sql.split("\n")
-  all_typed_docs = []
-  for line_id, matches in line_id_to_match.items():
-    # Fetch comment by looking at lines over beginning of match in reverse
-    # order.
-    comment = fetch_comment(lines[line_id - 1::-1])
-    typed_docs, obj_errors = docs_object.create_from_comment(
-        path, comment, module, matches)
-    errors += obj_errors
+  table_views = []
+  functions = []
+  view_functions = []
+  for doc in docs:
+    if doc.obj_kind == ObjKind.table_view:
+      parser = TableViewDocParser(path, module_name)
+      res = parser.parse(doc)
+      if res:
+        table_views.append(res)
+      errors += parser.errors
+    if doc.obj_kind == ObjKind.function:
+      parser = FunctionDocParser(path, module_name)
+      res = parser.parse(doc)
+      if res:
+        functions.append(res)
+      errors += parser.errors
+    if doc.obj_kind == ObjKind.view_function:
+      parser = ViewFunctionDocParser(path, module_name)
+      res = parser.parse(doc)
+      if res:
+        view_functions.append(res)
+      errors += parser.errors
 
-    if not typed_docs:
-      continue
-
-    errors += typed_docs.check_comment()
-
-    if not errors:
-      all_typed_docs.append(typed_docs)
-
-  return all_typed_docs, errors
+  return errors if errors else {
+      'imports': table_views,
+      'functions': functions,
+      'view_functions': view_functions
+  }

diff --git a/python/generators/stdlib_docs/stdlib.py b/python/generators/stdlib_docs/stdlib.py
deleted file mode 100644
index bf35d55..0000000
--- a/python/generators/stdlib_docs/stdlib.py
+++ /dev/null

@@ -1,343 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (C) 2022 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This tool checks that every SQL object created without prefix
-# 'internal_' is documented with proper schema.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import re
-import sys
-from typing import Union, List, Tuple, Dict
-from dataclasses import dataclass
-
-from python.generators.stdlib_docs.utils import *
-from python.generators.stdlib_docs.validate import *
-from python.generators.stdlib_docs.parse import *
-
-CommentLines = List[str]
-AnyDocs = Union['TableViewDocs', 'FunctionDocs', 'ViewFunctionDocs']
-
-
-# Stores documentation for CREATE {TABLE|VIEW} with comment split into
-# segments.
-@dataclass
-class TableViewDocs:
-  name: str
-  obj_type: str
-  desc: CommentLines
-  columns: CommentLines
-  path: str
-
-  # Contructs new TableViewDocs from the entire comment, by splitting it on
-  # typed lines. Returns None for improperly structured schemas.
-  @staticmethod
-  def create_from_comment(path: str, comment: CommentLines, module: str,
-                          matches: Tuple) -> Tuple['TableViewDocs', Errors]:
-    obj_type, name = matches[:2]
-
-    # Ignore internal tables and views.
-    if re.match(r"^internal_.*", name):
-      return None, []
-
-    errors = validate_name(name, module)
-    col_start = None
-    has_desc = False
-
-    # Splits code into segments by finding beginning of column segment.
-    for i, line in enumerate(comment):
-      # Ignore only '--' line.
-      if line == "--":
-        continue
-
-      m = re.match(Pattern['typed_line'], line)
-
-      # Ignore untyped lines
-      if not m:
-        if not col_start:
-          has_desc = True
-        continue
-
-      line_type = m.group(1)
-      if line_type == "column" and not col_start:
-        col_start = i
-        continue
-
-    if not has_desc:
-      errors.append(f"No description for {obj_type}: '{name}' in {path}'\n")
-      return None, errors
-
-    if not col_start:
-      errors.append(f"No columns for {obj_type}: '{name}' in {path}'\n")
-      return None, errors
-
-    return (
-        TableViewDocs(name, obj_type, comment[:col_start], comment[col_start:],
-                      path),
-        errors,
-    )
-
-  def check_comment(self) -> Errors:
-    return validate_columns(self)
-
-  def parse_comment(self) -> dict:
-    return {
-        'name': self.name,
-        'type': self.obj_type,
-        'desc': parse_desc(self),
-        'cols': parse_columns(self)
-    }
-
-
-# Stores documentation for create_function with comment split into segments.
-class FunctionDocs:
-
-  def __init__(
-      self,
-      path: str,
-      data_from_sql: dict,
-      module: str,
-      name: str,
-      desc: str,
-      args: CommentLines,
-      ret: CommentLines,
-  ):
-    self.path = path
-    self.data_from_sql = data_from_sql
-    self.module = module
-    self.name = name
-    self.desc = desc
-    self.args = args
-    self.ret = ret
-
-  # Contructs new FunctionDocs from whole comment, by splitting it on typed
-  # lines. Returns None for improperly structured schemas.
-  @staticmethod
-  def create_from_comment(path: str, comment: CommentLines, module: str,
-                          matches: Tuple) -> Tuple['FunctionDocs', Errors]:
-    name, args, ret, sql = matches
-
-    # Ignore internal functions.
-    if re.match(r"^INTERNAL_.*", name):
-      return None, []
-
-    errors = validate_name(name, module, upper=True)
-    has_desc, start_args, start_ret = False, None, None
-
-    args_dict, parse_errors = parse_args_str(args)
-    errors += parse_errors
-
-    # Splits code into segments by finding beginning of args and ret segments.
-    for i, line in enumerate(comment):
-      # Ignore only '--' line.
-      if line == "--":
-        continue
-
-      m = re.match(Pattern['typed_line'], line)
-
-      # Ignore untyped lines
-      if not m:
-        if not start_args:
-          has_desc = True
-        continue
-
-      line_type = m.group(1)
-      if line_type == "arg" and not start_args:
-        start_args = i
-        continue
-
-      if line_type == "ret" and not start_ret:
-        start_ret = i
-        continue
-
-    if not has_desc:
-      errors.append(f"No description for '{name}' in {path}'\n")
-      return None, errors
-
-    if not start_ret or (args_dict and not start_args):
-      errors.append(f"Function requires 'arg' and 'ret' comments.\n"
-                    f"'{name}' in {path}\n")
-      return None, errors
-
-    if not args_dict:
-      start_args = start_ret
-
-    data_from_sql = {'name': name, 'args': args_dict, 'ret': ret, 'sql': sql}
-    return (
-        FunctionDocs(
-            path,
-            data_from_sql,
-            module,
-            name,
-            comment[:start_args],
-            comment[start_args:start_ret] if args_dict else None,
-            comment[start_ret:],
-        ),
-        errors,
-    )
-
-  def check_comment(self) -> Errors:
-    errors = validate_args(self)
-    errors += validate_ret(self)
-    return errors
-
-  def parse_comment(self) -> dict:
-    ret_type, ret_desc = parse_ret(self)
-    return {
-        'name': self.name,
-        'desc': parse_desc(self),
-        'args': parse_args(self),
-        'return_type': ret_type,
-        'return_desc': ret_desc
-    }
-
-
-# Stores documentation for create_view_function with comment split into
-# segments.
-class ViewFunctionDocs:
-
-  def __init__(
-      self,
-      path: str,
-      data_from_sql: str,
-      module: str,
-      name: str,
-      desc: CommentLines,
-      args: CommentLines,
-      columns: CommentLines,
-  ):
-    self.path = path
-    self.data_from_sql = data_from_sql
-    self.module = module
-    self.name = name
-    self.desc = desc
-    self.args = args
-    self.columns = columns
-
-  # Contructs new ViewFunctionDocs from whole comment, by splitting it on typed
-  # lines. Returns None for improperly structured schemas.
-  @staticmethod
-  def create_from_comment(path: str, comment: CommentLines, module: str,
-                          matches: Tuple) -> Tuple['ViewFunctionDocs', Errors]:
-    name, args, columns, sql = matches
-
-    # Ignore internal functions.
-    if re.match(r"^INTERNAL_.*", name):
-      return None, []
-
-    errors = validate_name(name, module, upper=True)
-    args_dict, parse_errors = parse_args_str(args)
-    errors += parse_errors
-    has_desc, start_args, start_cols = False, None, None
-
-    # Splits code into segments by finding beginning of args and cols segments.
-    for i, line in enumerate(comment):
-      # Ignore only '--' line.
-      if line == "--":
-        continue
-
-      m = re.match(Pattern['typed_line'], line)
-
-      # Ignore untyped lines
-      if not m:
-        if not start_args:
-          has_desc = True
-        continue
-
-      line_type = m.group(1)
-      if line_type == "arg" and not start_args:
-        start_args = i
-        continue
-
-      if line_type == "column" and not start_cols:
-        start_cols = i
-        continue
-
-    if not has_desc:
-      errors.append(f"No description for '{name}' in {path}'\n")
-      return None, errors
-
-    if not start_cols or (args_dict and not start_args):
-      errors.append(f"Function requires 'arg' and 'column' comments.\n"
-                    f"'{name}' in {path}\n")
-      return None, errors
-
-    if not args_dict:
-      start_args = start_cols
-
-    cols_dict, parse_errors = parse_args_str(columns)
-    errors += parse_errors
-
-    data_from_sql = dict(name=name, args=args_dict, columns=cols_dict, sql=sql)
-    return (
-        ViewFunctionDocs(
-            path,
-            data_from_sql,
-            module,
-            name,
-            comment[:start_args],
-            comment[start_args:start_cols] if args_dict else None,
-            comment[start_cols:],
-        ),
-        errors,
-    )
-
-  def check_comment(self) -> Errors:
-    errors = validate_args(self)
-    errors += validate_columns(self, use_data_from_sql=True)
-    return errors
-
-  def parse_comment(self) -> dict:
-    return {
-        'name': self.name,
-        'desc': parse_desc(self),
-        'args': parse_args(self),
-        'cols': parse_columns(self)
-    }
-
-
-# Reads the provided SQL and, if possible, generates a dictionary with data
-# from documentation together with errors from validation of the schema.
-def parse_file_to_dict(path: str, sql: str) -> Tuple[Dict[str, any], Errors]:
-  if sys.platform.startswith('win'):
-    path = path.replace("\\", "/")
-
-  # Get module name
-  module_name = path.split("/stdlib/")[-1].split("/")[0]
-
-  imports, import_errors = parse_typed_docs(path, module_name, sql,
-                                            Pattern['create_table_view'],
-                                            TableViewDocs)
-  functions, function_errors = parse_typed_docs(path, module_name, sql,
-                                                Pattern['create_function'],
-                                                FunctionDocs)
-  view_functions, view_function_errors = parse_typed_docs(
-      path, module_name, sql, Pattern['create_view_function'], ViewFunctionDocs)
-
-  errors = import_errors + function_errors + view_function_errors
-
-  if errors:
-    sys.stderr.write("\n\n".join(errors))
-
-  return ({
-      'imports': [imp.parse_comment() for imp in imports if imp],
-      'functions': [fun.parse_comment() for fun in functions if fun],
-      'view_functions': [
-          view_fun.parse_comment() for view_fun in view_functions if view_fun
-      ]
-  }, errors)

diff --git a/python/generators/stdlib_docs/types.py b/python/generators/stdlib_docs/types.py
new file mode 100644
index 0000000..a9baad3
--- /dev/null
+++ b/python/generators/stdlib_docs/types.py

@@ -0,0 +1,21 @@
+# Copyright (C) 2023 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+
+
+class ObjKind(str, Enum):
+  table_view = 'table_view'
+  function = 'function'
+  view_function = 'view_function'

diff --git a/python/generators/stdlib_docs/utils.py b/python/generators/stdlib_docs/utils.py
index d23b414..eeccc01 100644
--- a/python/generators/stdlib_docs/utils.py
+++ b/python/generators/stdlib_docs/utils.py

@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (C) 2022 The Android Open Source Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,116 +12,82 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import re
-from typing import List, Tuple
+from typing import Dict, List
 
-Errors = List[str]
-CommentLines = List[str]
+from python.generators.stdlib_docs.types import ObjKind
 
-LOWER_NAME = r'[a-z_\d]*'
-UPPER_NAME = r'[A-Z_\d]*'
-ANY_WORDS = r'[A-Za-z_\d, \n]*'
-TYPE = r'[A-Z]*'
+LOWER_NAME = r'[a-z_\d]+'
+UPPER_NAME = r'[A-Z_\d]+'
+ANY_WORDS = r'[^\s].*'
+TYPE = r'[A-Z]+'
 SQL = r'[\s\S]*?'
 
-Pattern = {
-    'create_table_view': (
-        # Match create table/view and catch type
-        r'CREATE (?:VIRTUAL )?(TABLE|VIEW)?(?:IF NOT EXISTS)?\s*'
-        # Catch the name
-        fr'({LOWER_NAME})\s*(?:AS|USING)?.*'),
-    'create_function': (
-        r"SELECT\s*CREATE_FUNCTION\(\s*"
-        # Function name: we are matching everything [A-Z]* between ' and ).
-        fr"'\s*({UPPER_NAME})\s*\("
-        # Args: anything before closing bracket with '.
-        fr"({ANY_WORDS})\)',\s*"
-        # Type: [A-Z]* between two '.
-        fr"'({TYPE})',\s*"
-        # Sql: Anything between ' and ');. We are catching \'.
-        fr"'({SQL})'\s*\);"),
-    'create_view_function': (
-        r"SELECT\s*CREATE_VIEW_FUNCTION\(\s*"
-        # Function name: we are matching everything [A-Z]* between ' and ).
-        fr"'({UPPER_NAME})\s*\("
-        # Args: anything before closing bracket with '.
-        fr"({ANY_WORDS})\)',\s*"
-        # Return columns: anything between two '.
-        fr"'\s*({ANY_WORDS})',\s*"
-        # Sql: Anything between ' and ');. We are catching \'.
-        fr"'({SQL})'\s*\);"),
-    'column': fr'^-- @column\s*({LOWER_NAME})\s*({ANY_WORDS})',
-    'arg_str': fr"\s*({LOWER_NAME})\s*({TYPE})\s*",
-    'args': fr'^-- @arg\s*({LOWER_NAME})\s*({TYPE})\s*(.*)',
-    'return_arg': fr"^-- @ret ({TYPE})\s*(.*)",
-    'typed_line': fr'^-- @([a-z]*)'
+CREATE_TABLE_VIEW_PATTERN = (
+    # Match create table/view and catch type
+    r'CREATE (?:VIRTUAL )?(TABLE|VIEW)?(?:IF NOT EXISTS)?\s*'
+    # Catch the name
+    fr'({LOWER_NAME})\s*(?:AS|USING)?.*')
+
+CREATE_FUNCTION_PATTERN = (
+    r"SELECT\s*CREATE_FUNCTION\(\s*"
+    # Function name: UPPER_NAME ([A-Z_\d]+) between the opening ' and (.
+    fr"'\s*({UPPER_NAME})\s*\("
+    # Args: anything before closing bracket with '.
+    fr"({ANY_WORDS})\)',\s*"
+    # Type: [A-Z]* between two '.
+    fr"'({TYPE})',\s*"
+    # Sql: Anything between ' and ');. We are catching \'.
+    fr"'({SQL})'\s*\);")
+
+CREATE_VIEW_FUNCTION_PATTERN = (
+    r"SELECT\s*CREATE_VIEW_FUNCTION\(\s*"
+    # Function name: UPPER_NAME ([A-Z_\d]+) between the opening ' and (.
+    fr"'({UPPER_NAME})\s*\("
+    # Args: anything before closing bracket with '.
+    fr"({ANY_WORDS})\)',\s*"
+    # Return columns: anything between two '.
+    fr"'\s*({ANY_WORDS})',\s*"
+    # Sql: Anything between ' and ');. We are catching \'.
+    fr"'({SQL})'\s*\);")
+
+PATTERN_BY_KIND = {
+    ObjKind.table_view: CREATE_TABLE_VIEW_PATTERN,
+    ObjKind.function: CREATE_FUNCTION_PATTERN,
+    ObjKind.view_function: CREATE_VIEW_FUNCTION_PATTERN,
 }
 
+COLUMN_ANNOTATION_PATTERN = fr'^\s*({LOWER_NAME})\s*({ANY_WORDS})'
 
-def fetch_comment(lines_reversed: CommentLines) -> CommentLines:
-  comment_reversed = []
-  for line in lines_reversed:
+NAME_AND_TYPE_PATTERN = fr'\s*({LOWER_NAME})\s+({TYPE})\s*'
+
+ARG_ANNOTATION_PATTERN = fr'\s*{NAME_AND_TYPE_PATTERN}\s+({ANY_WORDS})'
+
+FUNCTION_RETURN_PATTERN = fr'^\s*({TYPE})\s+({ANY_WORDS})'
+
+
+# Given a list of lines in a text and a line number, scans backwards from the
+# line before it to collect the contiguous '--' comment lines preceding it.
+def extract_comment(lines: List[str], line_number: int) -> List[str]:
+  comments = []
+  for line in lines[line_number - 1::-1]:
     # Break on empty line, as that suggests it is no longer a part of
     # this comment.
     if not line or not line.startswith('--'):
       break
+    comments.append(line)
 
-    # The only  option left is a description, but it has to be after
-    # schema columns.
-    comment_reversed.append(line)
-
-  comment_reversed.reverse()
-  return comment_reversed
+  # Reverse as the above was reversed
+  comments.reverse()
+  return comments
 
 
-def match_pattern(pattern: str, file_str: str) -> dict:
-  objects = {}
+# Given a regex pattern and a string to match against, returns all the
+# matching positions. Specifically, it returns a dictionary from the 0-based
+# line number where each match starts to that match's tuple of groups.
+def match_pattern(pattern: str, file_str: str) -> Dict[int, re.Match]:
+  line_number_to_matches = {}
   for match in re.finditer(pattern, file_str):
     line_id = file_str[:match.start()].count('\n')
-    objects[line_id] = match.groups()
-  return dict(sorted(objects.items()))
-
-
-# Whether the name starts with module_name.
-def validate_name(name: str, module: str, upper: bool = False) -> Errors:
-  module_pattern = f"^{module}_.*"
-  if upper:
-    module_pattern = module_pattern.upper()
-  starts_with_module_name = re.match(module_pattern, name)
-  if module == "common":
-    if starts_with_module_name:
-      return [(f"Invalid name in module {name}. "
-               f"In module 'common' the name shouldn't "
-               f"start with '{module_pattern}'.\n")]
-  else:
-    if not starts_with_module_name:
-      return [(f"Invalid name in module {name}. "
-               f"Name has to begin with '{module_pattern}'.\n")]
-  return []
-
-
-# Parses string with multiple arguments with type separated by comma into dict.
-def parse_args_str(args_str: str) -> Tuple[dict, Errors]:
-  if not args_str.strip():
-    return None, []
-
-  errors = []
-  args = {}
-  for arg_str in args_str.split(","):
-    m = re.match(Pattern['arg_str'], arg_str)
-    if m is None:
-      errors.append(f"Wrong arguments formatting for '{arg_str}'\n")
-      continue
-    args[m.group(1)] = m.group(2)
-  return args, errors
-
-
-def get_text(line: str, no_break_line: bool = True) -> str:
-  line = line.lstrip('--').strip()
-  if not line:
-    return '' if no_break_line else '\n'
-  return line
+    line_number_to_matches[line_id] = match.groups()
+  return line_number_to_matches

diff --git a/python/generators/stdlib_docs/validate.py b/python/generators/stdlib_docs/validate.py
deleted file mode 100644
index fb419e2..0000000
--- a/python/generators/stdlib_docs/validate.py
+++ /dev/null

@@ -1,175 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (C) 2022 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-from typing import Union, List
-
-from python.generators.stdlib_docs import stdlib
-from python.generators.stdlib_docs.utils import Pattern, Errors
-
-
-# Whether the only typed comment in provided comment segment is of type
-# `comment_type`.
-def validate_typed_comment(
-    comment_segment: str,
-    comment_type: str,
-    docs: 'stdlib.AnyDocs',
-) -> Errors:
-  for line in comment_segment:
-    # Ignore only '--' line.
-    if line == "--":
-      continue
-
-    m = re.match(Pattern['typed_line'], line)
-
-    # Ignore untyped lines
-    if not m:
-      continue
-
-    line_type = m.group(1)
-
-    if line_type != comment_type:
-      return [(
-          f"Wrong comment type. Expected '{comment_type}', got '{line_type}'.\n"
-          f"'{docs.name}' in {docs.path}:\n'{line}'\n")]
-  return []
-
-
-# Whether comment segment about columns contain proper schema. Can be matched
-# against parsed SQL data by setting `use_data_from_sql`.
-def validate_columns(
-    docs: Union['stdlib.TableViewDocs', 'stdlib.ViewFunctionDocs'],
-    use_data_from_sql=False) -> Errors:
-  errors = validate_typed_comment(docs.columns, "column", docs)
-
-  if errors:
-    return errors
-
-  if use_data_from_sql:
-    cols_from_sql = docs.data_from_sql["columns"]
-
-  for line in docs.columns:
-    # Ignore only '--' line.
-    if line == "--" or not line.startswith("-- @column"):
-      continue
-
-    # Look for '-- @column' line as a column description
-    m = re.match(Pattern['column'], line)
-    if not m:
-      errors.append(f"Wrong column description.\n"
-                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
-      continue
-
-    if not m.group(2).strip():
-      errors.append(f"No description for column '{m.group(1)}'.\n"
-                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
-      continue
-
-    if not use_data_from_sql:
-      return errors
-
-    col_name = m.group(1)
-    if col_name not in cols_from_sql:
-      errors.append(f"There is no argument '{col_name}' specified in code.\n"
-                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
-      continue
-
-    cols_from_sql.pop(col_name)
-
-  if not use_data_from_sql:
-    errors.append(f"Missing columns for {docs.name}\n{docs.path}\n")
-    return errors
-
-  if not cols_from_sql:
-    return errors
-
-  errors.append(
-      f"Missing documentation of columns: {list(cols_from_sql.keys())}.\n"
-      f"'{docs.name}' in {docs.path}:\n")
-  return errors
-
-
-# Whether comment segment about columns contain proper schema. Matches against
-# parsed SQL data.
-def validate_args(docs: Union['stdlib.FunctionDocs', 'stdlib.ViewFunctionDocs']
-                 ) -> Errors:
-  if not docs.args:
-    return []
-
-  errors = validate_typed_comment(docs.args, "arg", docs)
-
-  if errors:
-    return errors
-
-  args_from_sql = docs.data_from_sql["args"]
-  for line in docs.args:
-    # Ignore only '--' line.
-    if line == "--" or not line.startswith("-- @"):
-      continue
-
-    m = re.match(Pattern['args'], line)
-    if m is None:
-      errors.append("The arg docs formatting is wrong. It should be:\n"
-                    "-- @arg [a-z_]* [A-Z]* {desc}\n"
-                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
-      return errors
-
-    arg_name, arg_type = m.group(1), m.group(2)
-    if arg_name not in args_from_sql:
-      errors.append(f"There is not argument '{arg_name}' specified in code.\n"
-                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
-      continue
-
-    arg_type_from_sql = args_from_sql.pop(arg_name)
-    if arg_type != arg_type_from_sql:
-      errors.append(f"In the code, the type of '{arg_name}' is "
-                    f"'{arg_type_from_sql}', but according to the docs "
-                    f"it is '{arg_type}'.\n"
-                    f"'{docs.name}' in {docs.path}:\n'{line}'\n")
-
-  if not args_from_sql:
-    return errors
-
-  errors.append(
-      f"Missing documentation of args: {list(args_from_sql.keys())}.\n"
-      f"'{docs.name}' in {docs.path}\n")
-  return errors
-
-
-# Whether comment segment about return contain proper schema. Matches against
-# parsed SQL data.
-def validate_ret(docs: "stdlib.FunctionDocs") -> Errors:
-  errors = validate_typed_comment(docs.ret, "ret", docs)
-  if errors:
-    return errors
-
-  ret_type_from_sql = docs.data_from_sql["ret"]
-
-  for line in docs.ret:
-    # Ignore only '--' line.
-    if line == "--" or not line.startswith("-- @ret"):
-      continue
-
-    m = re.match(Pattern['return_arg'], line)
-    if m is None:
-      return [("The return docs formatting is wrong. It should be:\n"
-               "-- @ret [A-Z]* {desc}\n"
-               f"'{docs.name}' in {docs.path}:\n'{line}'\n")]
-    docs_ret_type = m.group(1)
-    if ret_type_from_sql != docs_ret_type:
-      return [(f"The return type in docs is '{docs_ret_type}', "
-               f"but it is {ret_type_from_sql} in code.\n"
-               f"'{docs.name}' in {docs.path}:\n'{line}'\n")]
-    return []
commit	6b2ac6940bfdfb2c5565a3ae8775a30afa90ef86	[log] [tgz]
author	Lalit Maganti <lalitm@google.com>	Tue May 23 01:24:29 2023 +0100
committer	Lalit Maganti <lalitm@google.com>	Tue May 23 01:24:29 2023 +0100
tree	95dcff67e63ed80f11c1610f8f0d1b342ec64643
parent	24fae90b5e4ce0b2b970670e2795f9c258db5033 [diff]