[DNS] tp: simplify stdlib docs generation and make it more robust The docs generator had some minor bugs with writing output to stderr instead of propagating errors back to the caller. As part of trying to fix this, I realised that the amount of state which is passed between functions makes it very difficult to understand what is happening. Moreover, there seemed to be quite a lot of duplication (i.e. repeated parsing using regexes, merging annotations across multiple lines) of subtle logic and unnecessary complexity in how we were finding the boundaries of annotations. To address this, rework the docs generator to instead be split into two stages: a) extraction which is responsible for extracting the comments from the SQL and "tokenizing" into description and annotations (including merging sequential lines related to the same column) b) parsing which is responsible for actually verifying the semantics of the extracted documentation and parsing into the JSON dictionary format expected by the markdown generator Bug: 283524256 Change-Id: Iccad2e64a4cb02411e39914d1b6cdbaa343a3611

commit: 6b2ac6940bfdfb2c5565a3ae8775a30afa90ef86 [log] [tgz]
author: Lalit Maganti <lalitm@google.com> Tue May 23 01:24:29 2023 +0100
committer: Lalit Maganti <lalitm@google.com> Tue May 23 01:24:29 2023 +0100
tree: 95dcff67e63ed80f11c1610f8f0d1b342ec64643
parent: 24fae90b5e4ce0b2b970670e2795f9c258db5033 [diff] [blame]
diff --git a/python/generators/stdlib_docs/extractor.py b/python/generators/stdlib_docs/extractor.py
new file mode 100644
index 0000000..94db4d3
--- /dev/null
+++ b/python/generators/stdlib_docs/extractor.py

@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Copyright (C) 2022 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from re import Match
+from typing import List, Optional, Tuple
+
+from python.generators.stdlib_docs.types import ObjKind
+from python.generators.stdlib_docs.utils import extract_comment
+from python.generators.stdlib_docs.utils import match_pattern
+from python.generators.stdlib_docs.utils import PATTERN_BY_KIND
+
+
+class DocsExtractor:
+  """Extracts documentation for views/tables/functions from SQL.
+
+  Extraction only "tokenizes" the comment lines associated with each matched
+  object into a free-text description followed by a list of '@'-prefixed
+  annotations. It does not check what the annotations mean.
+  """
+  path: str
+  module_name: str
+  sql: str
+
+  @dataclass
+  class Annotation:
+    """A single '@'-prefixed annotation found in a comment."""
+    # First space-delimited token of the annotation line ('@' included).
+    key: str
+    # Rest of that line, plus any following non-annotation lines joined with
+    # a single space. Empty if the annotation line had no value.
+    value: str
+
+  @dataclass
+  class Extract:
+    """Extracted documentation for a single view/table/function."""
+    obj_kind: ObjKind
+    obj_match: Match
+
+    # Non-annotation lines seen before the first annotation; each line is
+    # followed by a single space (so a non-empty description ends in ' ').
+    description: str
+    # Annotations in the order they appear in the comment.
+    annotations: List['DocsExtractor.Annotation']
+
+  def __init__(self, path: str, module_name: str, sql: str):
+    """Stores the inputs and pre-splits `sql` into lines.
+
+    Args:
+      path: Stored as-is; not otherwise used by this class.
+      module_name: Stored as-is; not otherwise used by this class.
+      sql: Full SQL source to extract documentation from.
+    """
+    self.path = path
+    self.module_name = module_name
+    self.sql = sql
+
+    self.sql_lines: List[str] = sql.split("\n")
+    # Initialised empty; nothing in this class appends to it.
+    self.errors: list = []
+
+  def extract(self) -> List[Extract]:
+    """Returns the extracts for every supported kind of object.
+
+    Results are grouped by kind, in the order: table/view, function, view
+    function. Within a kind they are ordered by line number.
+    """
+    extracted = []
+    for kind in (ObjKind.table_view, ObjKind.function, ObjKind.view_function):
+      extracted += self._extract_for_kind(kind)
+    return extracted
+
+  def _extract_for_kind(self, kind: ObjKind) -> List[Extract]:
+    """Returns one extract per match of `kind`'s pattern, by line number."""
+    line_number_to_matches = match_pattern(PATTERN_BY_KIND[kind], self.sql)
+    extracts = []
+    for line_number, match in sorted(line_number_to_matches.items()):
+      # NOTE(review): extract_comment presumably returns the '--' comment
+      # lines attached to the object at `line_number` -- confirm in utils.
+      comment_lines = extract_comment(self.sql_lines, line_number)
+      e = self._extract_from_comment(kind, match, comment_lines)
+      if e:
+        extracts.append(e)
+    return extracts
+
+  def _extract_from_comment(self, kind: ObjKind, match: Match,
+                            comment_lines: List[str]) -> Optional[Extract]:
+    """Splits comment lines into a description and a list of annotations.
+
+    Args:
+      kind: Kind of the documented object; stored on the result.
+      match: Regex match for the object; stored on the result.
+      comment_lines: Lines of the comment; each must start with '--'.
+
+    Returns:
+      The populated extract. An annotation line without a value (e.g.
+      '-- @ret') produces an annotation whose value is the empty string.
+    """
+    extract = DocsExtractor.Extract(kind, match, '', [])
+    for line in comment_lines:
+      assert line.startswith('--')
+
+      # Remove the comment marker. str.lstrip takes a *set* of characters,
+      # so every leading '-' is dropped, not just the first two.
+      stripped = line.lstrip('-').lstrip()
+
+      # Ignore lines which only contain '--'.
+      if not stripped:
+        continue
+
+      if stripped.startswith('@'):
+        # New annotation: the key is everything up to the first space.
+        # partition() (unlike unpacking split(' ', 1)) does not raise when
+        # the line has no space, i.e. when the annotation has no value.
+        key, _, value = stripped.partition(' ')
+        extract.annotations.append(DocsExtractor.Annotation(key, value))
+      elif extract.annotations:
+        # Continuation line: append to the most recent annotation.
+        extract.annotations[-1].value += " " + stripped
+      else:
+        # No annotation seen yet, so this is still part of the description.
+        extract.description += stripped + " "
+    return extract
commit	6b2ac6940bfdfb2c5565a3ae8775a30afa90ef86	[log] [tgz]
author	Lalit Maganti <lalitm@google.com>	Tue May 23 01:24:29 2023 +0100
committer	Lalit Maganti <lalitm@google.com>	Tue May 23 01:24:29 2023 +0100
tree	95dcff67e63ed80f11c1610f8f0d1b342ec64643
parent	24fae90b5e4ce0b2b970670e2795f9c258db5033 [diff] [blame]