[DNS] tp: simplify stdlib docs generation and make it more robust
The docs generator had some minor bugs with writing output to stderr
instead of propagating errors back to the caller. As part of trying to
fix this, I realised that the amount of state which is passed between
functions makes it very difficult to understand what is happening.
Moreover, there seemed to be quite a lot of duplication (i.e. repeated
parsing using regexes, merging annotations across multiple lines) or
subtle logic and unnecessary complexity in how we were finding the
boundaries of annotations.
To address this, rework the docs generator to instead be split into two
stages: a) extraction, which is responsible for extracting the comments from
the SQL and "tokenizing" them into a description and annotations (including
merging sequential lines related to the same column); b) parsing, which
is responsible for actually verifying the semantics of the extracted
documentation and parsing it into the JSON dictionary format expected by
the markdown generator.
Bug: 283524256
Change-Id: Iccad2e64a4cb02411e39914d1b6cdbaa343a3611
diff --git a/python/generators/stdlib_docs/extractor.py b/python/generators/stdlib_docs/extractor.py
new file mode 100644
index 0000000..94db4d3
--- /dev/null
+++ b/python/generators/stdlib_docs/extractor.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Copyright (C) 2022 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from re import Match
+from typing import List, Optional, Tuple
+
+from python.generators.stdlib_docs.types import ObjKind
+from python.generators.stdlib_docs.utils import extract_comment
+from python.generators.stdlib_docs.utils import match_pattern
+from python.generators.stdlib_docs.utils import PATTERN_BY_KIND
+
+
+class DocsExtractor:
+ """Extracts documentation for views/tables/functions from SQL."""
+ path: str
+ module_name: str
+ sql: str
+
+ @dataclass
+ class Annotation:
+ key: str
+ value: str
+
+ @dataclass
+ class Extract:
+ """Extracted documentation for a single view/table/function."""
+ obj_kind: ObjKind
+ obj_match: Match
+
+ description: str
+ annotations: List['DocsExtractor.Annotation']
+
+ def __init__(self, path: str, module_name: str, sql: str):
+ self.path = path
+ self.module_name = module_name
+ self.sql = sql
+
+ self.sql_lines = sql.split("\n")
+ self.errors = []
+
+ def extract(self) -> List[Extract]:
+ extracted = []
+ extracted += self._extract_for_kind(ObjKind.table_view)
+ extracted += self._extract_for_kind(ObjKind.function)
+ extracted += self._extract_for_kind(ObjKind.view_function)
+ return extracted
+
+ def _extract_for_kind(self, kind: ObjKind) -> List[Extract]:
+ line_number_to_matches = match_pattern(PATTERN_BY_KIND[kind], self.sql)
+ extracts = []
+ for line_number, match in sorted(list(line_number_to_matches.items())):
+ comment_lines = extract_comment(self.sql_lines, line_number)
+ e = self._extract_from_comment(kind, match, comment_lines)
+ if e:
+ extracts.append(e)
+ return extracts
+
+ def _extract_from_comment(self, kind: ObjKind, match: Match,
+ comment_lines: List[str]) -> Optional[Extract]:
+ extract = DocsExtractor.Extract(kind, match, '', [])
+ for line in comment_lines:
+ assert line.startswith('--')
+
+ # Remove the comment.
+ stripped = line.lstrip('--').lstrip()
+
+ # Ignore lines which only contain '--'.
+ if not stripped:
+ continue
+
+ # Check if the line is an annotation.
+ if not stripped.startswith('@'):
+ # We are not in annotation: if we haven't seen an annotation yet, we
+ # must be still be parsing the description. Just add to that
+ if not extract.annotations:
+ extract.description += stripped + " "
+ continue
+
+ # Otherwise, add to the latest annotation.
+ extract.annotations[-1].value += " " + stripped
+ continue
+
+ # This line is an annotation: find its name and add a new entry
+ annotation, rest = stripped.split(' ', 1)
+ extract.annotations.append(DocsExtractor.Annotation(annotation, rest))
+ return extract