Start splitting output back into modules.

The difference between this split, and the old, is that everything is now clearly under output.

Start splitting output back into modules.
4892081b · Hugo Hörnquist · 030b736f · 4892081b · 4892081b · 4892081b
Commit 4892081b authored Sep 25, 2023 by Hugo Hörnquist
--- a/doc/muppet.rst
+++ b/doc/muppet.rst
@@ -7,6 +7,7 @@ Subpackages
 .. toctree::
   :maxdepth: 4

+   muppet.output
   muppet.puppet
   muppet.syntax_highlight

@@ -21,11 +22,9 @@ Submodules
   muppet.intersperse
   muppet.lookup
   muppet.markdown
-   muppet.output
   muppet.parser_combinator
   muppet.symbols
   muppet.tabs
-   muppet.templates
   muppet.util

 Module contents

--- a/muppet/output.py
+++ b/muppet/output.py
--- a/muppet/output/docstring.py
+++ b/muppet/output/docstring.py
+"""
+Generate output for Puppet Docstrings.
+
+Docstrings are the comments preceding any top level puppet
+declaration (such as classes, resource definitions, ...). These have a
+number of "magic" tags for attaching metadata, and are usually
+Markdown formatted. This module assumes that they all are
+Markdown formatted, which unfortunately leads to some (minor) errors.
+
+(The final output also contains the original source, allowing these
+errors to be overlooked).
+"""
+
+from dataclasses import dataclass, field
+import html
+import re
+from typing import cast
+from muppet.markdown import markdown
+from muppet.puppet.strings import (
+    DocString,
+    DocStringApiTag,
+    DocStringAuthorTag,
+    DocStringExampleTag,
+    DocStringOptionTag,
+    DocStringOverloadTag,
+    DocStringParamTag,
+    DocStringRaiseTag,
+    DocStringReturnTag,
+    DocStringSeeTag,
+    DocStringSinceTag,
+    DocStringSummaryTag,
+    DocStringTag,
+)
+
+
+# TODO what even is this for?
+param_doc: dict[str, str] = {}
+
+
+@dataclass
+class GroupedTags:
+    """
+    The tags of one docstring, sorted into one bucket per tag type.
+
+    Every bucket except ``option_`` is a plain list of tags. All field
+    names end in an underscore, since some of the tag names (such as
+    ``raise`` and ``return``) are reserved words in python.
+    """
+
+    param_:     list[DocStringParamTag]             = field(default_factory=list)  # noqa: E221
+    example_:   list[DocStringExampleTag]           = field(default_factory=list)  # noqa: E221
+    overload_:  list[DocStringOverloadTag]          = field(default_factory=list)  # noqa: E221
+    option_:    dict[str, list[DocStringOptionTag]] = field(default_factory=dict)  # noqa: E221
+    """
+    Options document Hash parameters valid values.
+
+    Each key is the corresponding parameter, and the value is the list
+    of registered options for that hash.
+    """
+
+    author_:    list[DocStringAuthorTag]            = field(default_factory=list)  # noqa: E221
+    api_:       list[DocStringApiTag]               = field(default_factory=list)  # noqa: E221
+    raise_:     list[DocStringRaiseTag]             = field(default_factory=list)  # noqa: E221
+    return_:    list[DocStringReturnTag]            = field(default_factory=list)  # noqa: E221
+    since_:     list[DocStringSinceTag]             = field(default_factory=list)  # noqa: E221
+    summary_:   list[DocStringSummaryTag]           = field(default_factory=list)  # noqa: E221
+    see_:       list[DocStringSeeTag]               = field(default_factory=list)  # noqa: E221
+    other_:     list[DocStringTag]                  = field(default_factory=list)  # noqa: E221
+    """All tags of unknown type."""
+
+    @classmethod
+    def from_taglist(cls, tags: list[DocStringTag]) -> 'GroupedTags':
+        """
+        Sort a flat list of tags into a new :class:`GroupedTags`.
+
+        Option tags are collected per parent parameter, tags of a
+        known type end up in the list named after their type, and
+        everything else is placed in ``other_``.
+        """
+        known_lists = frozenset(('param', 'example', 'overload', 'author', 'api',
+                                 'raise', 'return', 'since', 'summary', 'see'))
+        result = cls()
+        for tag in tags:
+            kind = tag.tag_name
+            if kind == 'option':
+                option = cast(DocStringOptionTag, tag)
+                result.option_.setdefault(option.parent, []).append(option)
+                continue
+            # Known tag types map directly onto the field "<type>_".
+            if kind in known_lists:
+                bucket = getattr(result, f'{kind}_')
+            else:
+                bucket = result.other_
+            bucket.append(tag)
+        return result
+
+
+def parse_author(author: str) -> str:
+    """
+    Format author tags' content.
+
+    :param author:
+        The contents of the author tag. If the string starts with the
+        regular "author" format of ``"Firstname Lastname
+        <first.last@example.com>"`` then the email will be formatted
+        and hyperlinked. Otherwise the string is returned verbatim
+        (but HTML escaped).
+    :return:
+        An HTML safe string, possibly including tags.
+    """
+    # Both parts must be non-empty for the string to count as being
+    # on the "Name <email>" form; anything else is plain text.
+    m = re.match(r'(?P<author>.+) <(?P<email>.+)>', author)
+    if m is None:
+        return html.escape(author)
+
+    name = html.escape(m['author'])
+    email = html.escape(m['email'])
+    # The closing entity must be written "&gt;", with the semicolon
+    # inside the anchor.
+    return f'{name} <a class="email" href="mailto:{email}">&lt;{email}&gt;</a>'
+
+
+def _mark_keywords(rendered: str) -> str:
+    """Wrap every ``NOTE`` and ``TODO`` of an HTML fragment in ``<mark>`` tags."""
+    return re.sub(r'(NOTE|TODO)', r'<mark>\1</mark>', rendered)
+
+
+def format_docstring(name: str, docstring: DocString) -> tuple[str, str]:
+    """
+    Format docstrings as they appear in some puppet types.
+
+    Those types being:
+
+    * puppet_classes,
+    * puppet_type_aliases, and
+    * defined_types
+
+    As a side effect, the module level ``param_doc`` is replaced with
+    a mapping from parameter name to that parameter's documentation.
+
+    :param name:
+        Name of the documented object. It is returned unchanged as
+        the first element of the result.
+    :param docstring:
+        The parsed docstring to render.
+    :return:
+        A tuple of ``name`` and the rendered HTML fragment.
+    :raises ValueError:
+        If a parameter or an option carries more than one type.
+    """
+    global param_doc
+
+    # The api tag is ignored, since it instead is shown from context
+
+    out = ''
+
+    param_doc = {tag.name: tag.text or ''
+                 for tag in docstring.tags
+                 if isinstance(tag, DocStringParamTag)}
+
+    grouped_tags = GroupedTags.from_taglist(docstring.tags)
+
+    # --------------------------------------------------
+
+    out += '<a href="#code">Jump to Code</a><br/>'
+
+    if tags := grouped_tags.summary_:
+        out += '<em class="summary">'
+        for tag in tags:
+            out += html.escape(tag.text)
+        out += '</em>'
+
+    out += '<div class="description">'
+    # TODO "TODO" highlighting
+    out += markdown(docstring.text)
+    out += '</div>'
+
+    # TODO proper handling of multiple @see tags
+    if sees := grouped_tags.see_:
+        out += '<b>See</b> '
+        for see in sees:
+            # The reference ends up both in attributes and in element
+            # content, so it must be escaped.
+            target = html.escape(see.name)
+            if re.match(r'https?://', see.name):
+                out += f'<a href="{target}">{target}</a>'
+            elif man := re.fullmatch(r'(?P<page>.*)\((?P<section>[0-9])\)', see.name):
+                # A man page reference on the form "page(section)".
+                # TODO man providers
+                url = html.escape(f"https://manned.org/man/{man['page']}.{man['section']}")
+                out += f'<a href="{url}">{target}</a>'
+            else:
+                # TODO hyperlink references to other puppet objects
+                # (names containing '::'), and anything else.
+                out += target
+            # NOTE(review): other code in this function treats tag
+            # text as possibly empty, so guard against None here too.
+            out += ' ' + (see.text or '')
+
+    if authors := grouped_tags.author_:
+        out += '<div class="author">'
+        out += "<em>Written by </em>"
+        if len(authors) == 1:
+            out += parse_author(authors[0].text)
+        else:
+            out += '<ul>'
+            for author in authors:
+                out += f'<li>{parse_author(author.text)}</li>'
+            out += '</ul>'
+        out += '</div>'
+
+    out += '<hr/>'
+
+    # The loops below use their own local names, so that the ``name``
+    # argument is still intact when it is returned.
+    for example in grouped_tags.example_:
+        out += '<div class="code-example">'
+
+        if title := example.name:
+            # TODO markup for title
+            out += f'<div class="code-example-header">{html.escape(title)}</div>\n'
+        # TODO highlight?
+        # Problem is that we don't know what language the example
+        # is in. Pygments however seems to do a reasonable job
+        # treating anything as puppet code
+        code = html.escape(example.text)
+        out += f'<pre><code class="puppet">{code}</code></pre>\n'
+        out += '</div>'
+
+    out += '<hr/>'
+
+    out += '<dl>'
+    for param in grouped_tags.param_:
+        param_name = html.escape(param.name)
+        out += f'<dt><span id="{param_name}" class="variable">{param_name}</span>'
+        match param.types:
+            case [x]:
+                # TODO highlight type?
+                out += f': <code>{html.escape(x)}</code>'
+            case [_, *_]:
+                raise ValueError("How did you get multiple types onto a parameter?")
+
+        # TODO Fetch default values from puppet strings output
+        # Then in javascript query Hiera to get the true "default"
+        # values for a given machine (somewhere have a setting for
+        # selecting machine).
+        out += '</dt>'
+
+        if param.text:
+            body = _mark_keywords(markdown(param.text))
+
+            # Options are only rendered for documented parameters.
+            if options := grouped_tags.option_.get(param.name):
+                body += '<dl>'
+                for option in options:
+                    body += '<dt>'
+                    body += html.escape(option.opt_name)
+                    match option.opt_types:
+                        case [x]:
+                            body += f' [<code>{html.escape(x)}</code>]'
+                        case [_, *_]:
+                            raise ValueError("How did you get multiple types onto an option?")
+                    body += '</dt>'
+                    body += '<dd>'
+                    if option.opt_text:
+                        body += _mark_keywords(markdown(option.opt_text))
+                    body += '</dd>'
+                body += '</dl>'
+
+            out += f"<dd>{body}</dd>"
+        else:
+            out += '<dd><em>Undocumented</em></dd>'
+    out += '</dl>'
+
+    # TODO remaining tags
+    # "overload"
+    # raise
+    # return
+    # since
+    # _other
+
+    return (name, out)
--- a/muppet/output/puppet_source.py
+++ b/muppet/output/puppet_source.py
+"""Generate output for Puppet Source code."""
+
+import html
+import logging
+from typing import Sequence
+
+from muppet.parser_combinator import (
+    ParserCombinator,
+    MatchCompound,
+    MatchObject,
+)
+from muppet.puppet.ast import build_ast
+from muppet.puppet.parser import puppet_parser
+from muppet.puppet.format.parser import ParserFormatter
+
+from .util import inner_text
+
+
+logger = logging.getLogger(__name__)
+
+
+_puppet_doc_base = 'https://www.puppet.com/docs/puppet/7'
+_lang_facts_builtin_variables = (f'{_puppet_doc_base}/lang_facts_builtin_variables'
+                                 '#lang_facts_builtin_variables')
+_server_variables = f'{_lang_facts_builtin_variables}-server-variables'
+_compiler_variables = f'{_lang_facts_builtin_variables}-compiler-variables'
+_trusted_facts = f'{_lang_facts_builtin_variables}-trusted-facts'
+_server_facts = f'{_lang_facts_builtin_variables}-server-facts'
+
+# Maps the name of each built-in variable to the documentation URL
+# that occurrences of that variable are hyperlinked to.
+_built_in_variables = {
+    # NOTE(review): 'https://google.com' looks like a placeholder
+    # rather than a real documentation target -- confirm the intended
+    # URL for $facts.
+    'facts': 'https://google.com',
+    # clientcert, clientversion, puppetversion, clientnoop,
+    # agent_specified_environment:
+    # https://www.puppet.com/docs/puppet/7/lang_facts_builtin_variables#lang_facts_builtin_variables-agent-facts
+    'trusted': _trusted_facts,
+    'server_facts': _server_facts,
+    'environment': _server_variables,
+    'servername': _server_variables,
+    'serverip': _server_variables,
+    'serverversion': _server_variables,
+    'module_name': _compiler_variables,
+    'caller_module_name': _compiler_variables,
+
+    # Also note the special variable $title and $name
+    # https://www.puppet.com/docs/puppet/7/lang_defined_types#lang_defined_types-title-and-name
+}
+
+
+# https://www.puppet.com/docs/puppet/7/cheatsheet_core_types.html
+# https://www.puppet.com/docs/puppet/7/types/file.html
+# ...
+_built_in_types = {
+    'package',
+    'file',
+    'service',
+    'notify',
+    'exec',
+    'user',
+    'group',
+}
+
+# https://www.puppet.com/docs/puppet/7/function.html#{}
+_built_in_functions = {
+    'abs',
+    'alert',
+    'all',
+    'annotate',
+    'any',
+    'assert_type',
+    'binary_file',
+    'break',
+    'call',
+    'camelcase',
+    'capitalize',
+    'ceiling',
+    'chomp',
+    'chop',
+    'compare',
+    'contain',
+    'convert_to',
+    'create_resources',
+    'crit',
+    'debug',
+    'defined',
+    'dig',
+    'digest',
+    'downcase',
+    'each',
+    'emerg',
+    'empty',
+    'epp',
+    'err',
+    'eyaml_lookup_key',
+    'fail',
+    'file',
+    'filter',
+    'find_file',
+    'find_template',
+    'flatten',
+    'floor',
+    'fqdn_rand',
+    'generate',
+    'get',
+    'getvar',
+    'group_by',
+    'hiera',
+    'hiera_array',
+    'hiera_hash',
+    'hiera_include',
+    'hocon_data',
+    'import',
+    'include',
+    'index',
+    'info',
+    'inline_epp',
+    'inline_template',
+    'join',
+    'json_data',
+    'keys',
+    'length',
+    'lest',
+    'lookup',
+    'lstrip',
+    'map',
+    'match',
+    'max',
+    'md5',
+    'min',
+    'module_directory',
+    'new',
+    'next',
+    'notice',
+    'partition',
+    'realize',
+    'reduce',
+    'regsubst',
+    'require',
+    'return',
+    'reverse_each',
+    'round',
+    'rstrip',
+    'scanf',
+    'sha1',
+    'sha256',
+    'shellquote',
+    'size',
+    'slice',
+    'sort',
+    'split',
+    'sprintf',
+    'step',
+    'strftime',
+    'strip',
+    'tag',
+    'tagged',
+    'template',
+    'then',
+    'tree_each',
+    'type',
+    'unique',
+    'unwrap',
+    'upcase',
+    'values',
+    'versioncmp',
+    'warning',
+    'with',
+    'yaml_data',
+}
+
+
+def _find_declarations(objs: list[MatchObject]) -> list[str]:
+    """
+    Collect the names of all variables declared at the top of ``objs``.
+
+    Only the given objects themselves are inspected for declarations,
+    and for each declaration only its direct children are searched
+    for the variable being declared.
+
+    The result is a list rather than a set, so a variable which is
+    declared more than once is also returned more than once:
+
+    .. code-block:: puppet
+        :caption: The same variable being declared twice
+
+        if $something {
+            $x = 10
+        } else {
+            $x = 20
+        }
+
+    :param objs:
+        Match objects to search.
+    :return:
+        The name of each declared variable, in order of appearance.
+    """
+    def declared_in(obj: MatchObject):  # type: ignore[no-untyped-def]
+        """Yield the variable names declared directly by ``obj``."""
+        match obj:
+            case MatchCompound(type='declaration', matched=children):
+                for child in children:
+                    match child:
+                        case MatchCompound(type='var', matched=parts):
+                            yield inner_text(parts)
+
+    return [name for obj in objs for name in declared_in(obj)]
+
+
+class _PuppetReserializer:
+    """
+    Reserializes parsed puppet code back into puppet code.
+
+    This allows syntax highlighting, and hyperlinking to be added to the code.
+
+    :param local_vars:
+        Variables declared within this file. Used when resolving
+        hyperlinks.
+    """
+
+    def __init__(self, local_vars: list[str]):
+        self.local_vars: list[str] = local_vars
+
+    def reserialize(self, obj: MatchObject | Sequence[MatchObject]) -> str:
+        """
+        Reconstruct puppet code after parsing it.
+
+        After building the parser, and parsing the puppet code into a tree
+        of MatchObjects; this procedure returns it into puppet code.
+        Difference being that we now have metadata, meaning that syntax
+        highlighting and variable hyperlinks can be inserted.
+
+        :param obj:
+            Should be assumed to be a list of MatchObject's, or something similar.
+
+            MatchCompound objects are in general serialized as
+
+            .. code-block:: html
+
+                <span class="{type}">{body}</span>
+
+            Strings are HTML escaped, and lists have reserialize mapped over them.
+
+            Compounds of type ``resource-name``, ``invoke``,
+            ``declaration`` and ``var`` are handled specially, in
+            order to attach hyperlinks and link targets.
+
+        :return:
+            The reconstructed code, as an HTML string. Objects of an
+            unknown kind are logged as errors and contribute nothing.
+        """
+        out: list[str] = []
+        # logger.info("obj = %a", obj)
+
+        # TODO hyperlink functions.
+        # The problem is that a function can either be implemented in
+        # Puppet, or in Ruby. And Ruby functions' names aren't bound
+        # by the directory layout.
+        #
+        # NOTE the order of the cases below matters: the specific
+        # compound types must be tried before the generic MatchCompound
+        # case.
+        match obj:
+            case str(s):
+                out.append(html.escape(s))
+
+            case MatchCompound(type='resource-name', matched=xs):
+                name = inner_text(xs)
+                url, cls = name_to_url(name)
+                if url:
+                    out.append(f'<a href="{url}" class="resource-name {cls}">{name}</a>')
+                else:
+                    # TODO this is class, but the class name should
+                    # also be hyperlinked
+                    out.append(f'<span class="resource-name {cls}">{name}</span>')
+
+            case MatchCompound(type='invoke', matched=xs):
+                # The first qualified name of an invocation is taken
+                # to be the name of the function; any later ones are
+                # treated as belonging to its arguments.
+                function = None
+                for x in xs:
+                    match x:
+                        case MatchCompound(type='qn', matched=ys):
+                            if function is None:
+                                function = inner_text(ys)
+                                if function in _built_in_functions:
+                                    # class="qn"
+                                    url = f"https://www.puppet.com/docs/puppet/7/function.html#{function}"  # noqa: E501
+                                    tag = f'<a href="{url}" class="puppet-doc">{self.reserialize(ys)}</a>'  # noqa: E501
+                                    out.append(tag)
+                                else:
+                                    # TODO function to url
+                                    out.append(f'<span class="qn">{self.reserialize(ys)}</span>')
+                            else:
+                                # Names following ``include`` are
+                                # linked through name_to_url.
+                                if function == 'include':
+                                    url, cls = name_to_url(inner_text(ys))
+                                    # class="qn"
+                                    tag = f'<a href="{url}" class="{cls}">{self.reserialize(ys)}</a>'  # noqa: E501
+                                    out.append(tag)
+                                else:
+                                    out.append(self.reserialize(ys))
+                        case _:
+                            out.append(self.reserialize(x))
+
+            case MatchCompound(type='declaration', matched=xs):
+                for x in xs:
+                    match x:
+                        case MatchCompound(type='var', matched=ys):
+                            # The declared variable gets an id, making
+                            # it the target of the "#name" links
+                            # generated by var_to_url.
+                            inner = ''.join(self.reserialize(y) for y in ys)
+                            out.append(f'<span id="{inner_text(ys)}">{inner}</span>')
+                        case _:
+                            out.append(self.reserialize(x))
+
+            case MatchCompound(type='var', matched=xs):
+                out.append(self.var_to_url(inner_text(xs)))
+
+            case MatchCompound(type=type, matched=xs):
+                body = ''.join(self.reserialize(x) for x in xs)
+                out.append(f'<span class="{type}">{body}</span>')
+
+            case [*xs]:
+                out.extend(self.reserialize(x) for x in xs)
+
+            case rest:
+                logger.error("Unknown type: %a", rest)
+
+        return ''.join(out)
+
+    def var_to_url(self, var: str) -> str:
+        """
+        Format variable, adding hyperlink to its definition.
+
+        TODO these can refer to both defined types (`manifests/*.pp`),
+        as well as resource types (`lib/puppet/provider/*/*.rb` /
+        `lib/puppet/type/*.rb`)
+
+        Same goes for functions (`functions/*.pp`),
+        (`lib/puppet/functions.rb`).
+
+        :param var:
+            Name of the variable.
+
+        :return:
+            An HTML anchor element, or a span element when no link
+            target is known for the variable.
+        """
+        match var.split('::'):
+            case [name]:
+                # Either a local or global variable
+                # https://www.puppet.com/docs/puppet/7/lang_facts_and_builtin_vars.html
+
+                href = None
+                cls = ''
+                if name in self.local_vars:
+                    href = f'#{html.escape(var)}'
+                elif name in _built_in_variables:
+                    href = html.escape(_built_in_variables[name])
+                    cls = 'puppet-doc'
+
+                if href:
+                    return f'<a class="var {cls}" href="{href}">{var}</a>'
+                else:
+                    # `name` refers to a global fact.
+                    return f'<span class="var">{var}</span>'
+
+            case ['', name]:
+                # A global variable
+                if name in _built_in_variables:
+                    href = html.escape(_built_in_variables[name])
+                    img = '<img src="/code/muppet-strings/output/static/favicon.ico" />'
+                    return f'<a class="var" href="{href}">{var}{img}</a>'
+                else:
+                    return f'<span class="var">{var}</span>'
+
+            # Note the "special module" 'settings',
+            # https://www.puppet.com/docs/puppet/7/lang_facts_builtin_variables#lang_facts_builtin_variables-server-variables
+            case ['', module, *items, name]:
+                # Fully qualified name with a leading '::'.
+                s = '/code/muppet-strings/output/' \
+                    + '/'.join([module, 'manifests', *(items if items else ['init'])])
+                s += f'#{name}'
+                return f'<a class="var" href="{s}">{var}</a>'
+            case [module, *items, name]:
+                # Qualified name without a leading '::'; linked the
+                # same way as the case above.
+                s = '/code/muppet-strings/output/' \
+                    + '/'.join([module, 'manifests', *(items if items else ['init'])])
+                s += f'#{name}'
+                return f'<a class="var" href="{s}">{var}</a>'
+            case _:
+                # str.split always returns at least one element, so
+                # this should not be reachable.
+                raise ValueError()
+
+
+def hyperlink_puppet_source(source: str, file: str, in_parameters: list[str]) -> str:
+    """
+    Parse and syntax highlight the given puppet source.
+
+    :param source:
+        The puppet source code to process.
+    :param file:
+        Handed to the parser combinator together with ``source``
+        (presumably the name of the file the source came from --
+        TODO confirm).
+    :param in_parameters:
+        Names which, together with the variables declared in the
+        source, are treated as local variables when hyperlinking.
+    :returns: An HTML string
+    """
+    # 1. Let the upstream puppet parser handle the source, and massage
+    #    its output into a usable tree.
+    syntax_tree = build_ast(puppet_parser(source))
+
+    # 2. Turn the tree into a parser combinator parser, which attaches
+    #    the metadata needed for highlighting and hyperlinking.
+    generated_parser = ParserFormatter().serialize(syntax_tree)
+
+    # 3. Run that parser over the same source, yielding match objects.
+    matches = ParserCombinator(source, file).get(generated_parser)
+
+    # 4. Write the match objects back out as puppet code, now with
+    #    highlighting and hyperlinks realized.
+    known_locals = _find_declarations(matches) + (in_parameters)
+    serializer = _PuppetReserializer(known_locals)
+    return serializer.reserialize(matches)
+
+
+def name_to_url(name: str) -> tuple[str | None, str]:
+    """
+    Resolve a class or resource name into an url.
+
+    :param name:
+        The name of a class or resource, such as "example::resource".
+    :return:
+        A pair of
+
+        - the url, which is one of
+          - a link to the official puppet documentation, for the
+            built in types,
+          - ``None``, if `name` is "class",
+          - an internal link to the definition of that type,
+            otherwise.
+        - A string of extra HTML classes for the url. This is mostly
+          so that external references can be marked properly.
+    """
+    if name == 'class':
+        return (None, '')
+
+    if name in _built_in_types:
+        return (f'https://www.puppet.com/docs/puppet/7/types/{name}.html', 'puppet-doc')
+
+    # TODO special cases for puppet's built in types.
+    # https://www.puppet.com/docs/puppet/7/cheatsheet_core_types.html
+    module, *items = name.lstrip(':').split('::')
+    # A bare module name refers to that module's init manifest.
+    segments = [module, 'manifests']
+    segments.extend(items or ['init'])
+    # TODO get prefix from the command line/config file
+    return ('/code/muppet-strings/output/' + '/'.join(segments), '')
--- a/muppet/output/util.py
+++ b/muppet/output/util.py
+"""
+Misc utilities for the final output.
+
+These don't really belong to any sub-system, even though some are more
+useful than others.
+
+The aim is to only have pure functions here.
+"""
+
+from muppet.parser_combinator import (
+    MatchCompound,
+    MatchObject,
+)
+
+
+def inner_text(obj: MatchObject | list[MatchObject]) -> str:
+    """
+    Return the plain text held by one or more MatchObjects.
+
+    This is really similar to HTML's inner_text.
+
+    Whitespace compounds are collapsed: one without any content
+    becomes the empty string, and any other becomes a single space
+    (note that this discards comments).
+
+    This only works properly if no function was mapped over the parser
+    return values in tree, see :func:`muppet.parser_combinator.fmap`.
+
+    :param obj:
+        Match Objects to search.
+    :raises ValueError:
+        If ``obj`` is neither a string, a MatchCompound, nor a
+        sequence.
+    """
+    match obj:
+        case str(text):
+            return text
+        case MatchCompound(type='ws') as whitespace:
+            match whitespace:
+                case MatchCompound(matched=[]):
+                    return ''
+                case _:
+                    return ' '
+        # Compounds and plain sequences are both flattened recursively.
+        case MatchCompound(matched=children) | [*children]:
+            return ''.join(map(inner_text, children))
+        case _:
+            raise ValueError('How did we get here')