Source code for xdoctest.docstr.docscrape_google

"""
Handles parsing of information out of google style docstrings

It is not clear which of these `GoogleStyleDocs1`_ `GoogleStyleDocs2`_ is *the*
standard or if there is one.

This code has been exported to a standalone package

    * https://github.com/Erotemic/googledoc

This is similar to:

    * https://pypi.org/project/docstring-parser/
    * https://pypi.org/project/numpydoc/

It hasn't been decided if this will remain vendored in xdoctest or pulled in as
a dependency.

References:
    .. [GoogleStyleDocs1] https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html#example-google
    .. [GoogleStyleDocs2] http://www.sphinx-doc.org/en/stable/ext/example_google.html#example-google
"""
import re
import textwrap
import collections
from xdoctest import exceptions
from xdoctest.utils.util_str import ensure_unicode

# Container for one parsed docstring section: ``text`` holds the section's
# text and ``offset`` is the line offset of the section within the docstring
# it was split from (see ``split_google_docblocks``).
DocBlock = collections.namedtuple('DocBlock', ['text', 'offset'])



[docs]
def split_google_docblocks(docstr):
    """
    Breaks a docstring into parts defined by google style

    Args:
        docstr (str): a docstring

    Returns:
        List[Tuple[str, DocBlock]]:
            list of 2-tuples where the first item is a google style docstring
            tag and the second item is the block corresponding to that tag. The
            block itself is a 2-tuple where the first item is the unindented
            text and the second item is the line offset indicating that blocks
            location in the docstring.

    Raises:
        TypeError: if ``docstr`` is not a string

    Note:
        Unknown or "freeform" sections are given a generic "__DOC__" tag.
        A section tag may be specified multiple times.
        Aliased tags (e.g. "Arguments") are reported under their canonical
        name (e.g. "Args"), regardless of the whitespace or number of colons
        that follow the tag.

    CommandLine:
        xdoctest xdoctest.docstr.docscrape_google split_google_docblocks:2

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> from xdoctest import utils
        >>> docstr = utils.codeblock(
        ...     '''
        ...     one line description
        ...
        ...     multiline
        ...     description
        ...
        ...     Args:
        ...         foo: bar
        ...
        ...     Returns:
        ...         None
        ...
        ...     Example:
        ...         >>> print('eg1')
        ...         eg1
        ...
        ...     Example:
        ...         >>> print('eg2')
        ...         eg2
        ...     ''')
        >>> groups = split_google_docblocks(docstr)
        >>> assert len(groups) == 5
        >>> [g[0] for g in groups]
        ['__DOC__', 'Args', 'Returns', 'Example', 'Example']

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> docstr = split_google_docblocks.__doc__
        >>> groups = split_google_docblocks(docstr)

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> from xdoctest import utils
        >>> docstr = utils.codeblock(
        ...     '''
        ...      a description with a leading space
        ...
        ...     Example:
        ...         >>> foobar
        ...     ''')
        >>> groups = split_google_docblocks(docstr)
        >>> print('groups = {!r}'.format(groups))

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> from xdoctest import utils
        >>> docstr = utils.codeblock(
        ...     '''
        ...     Example:
        ...         >>> foobar
        ...     ''')
        >>> # Check that line offsets are valid if the first line is not blank
        >>> groups = split_google_docblocks(docstr)
        >>> offset = groups[0][1][1]
        >>> print('offset = {!r}'.format(offset))
        >>> assert offset == 0
        >>> # Check that line offsets are valid if the first line is blank
        >>> groups = split_google_docblocks(chr(10) + docstr)
        >>> offset = groups[0][1][1]
        >>> print('offset = {!r}'.format(offset))
        >>> assert offset == 1

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> # Whitespace before the colon must not leak into the tag name
        >>> docstr = chr(10).join(['', 'Args :', '    foo: bar'])
        >>> groups = split_google_docblocks(docstr)
        >>> assert [g[0] for g in groups] == ['Args']
    """
    if not isinstance(docstr, str):
        raise TypeError('Input docstr must be a string. Got {} instead'.format(
            type(docstr)))

    def get_indentation(line_):
        """ returns number of preceding whitespace characters """
        return len(line_) - len(line_.lstrip())

    # Parse out initial documentation lines
    # Then parse out the blocked lines.
    docstr = ensure_unicode(docstr)

    docstr = textwrap.dedent(docstr)
    docstr_lines = docstr.split('\n')
    line_indent = [get_indentation(line) for line in docstr_lines]
    line_len = [len(line) for line in docstr_lines]

    # The first line may not have the correct indentation if it starts
    # right after the triple quotes. Adjust it in this case to ensure that
    # base indent is always 0
    if len(docstr_lines) >= 2 and line_len[0] != 0:
        indents = [x for x, len_ in zip(line_indent, line_len) if len_ > 0]
        if len(indents) >= 2:
            # Give the first line the smallest indentation found on the
            # remaining non-empty lines, then redo the preprocessing on the
            # rectified input.
            indent_adjust = min(indents[1:])
            docstr_lines[0] = (' ' * indent_adjust) + docstr_lines[0]
            docstr = textwrap.dedent('\n'.join(docstr_lines))
            docstr_lines = docstr.split('\n')
            line_indent = [get_indentation(line) for line in docstr_lines]
            line_len = [len(line) for line in docstr_lines]

    # NOTE: no validation is done here; a docstring whose first non-empty line
    # is still indented after dedenting is parsed as-is.

    base_indent = 0
    # We will group lines by their indentation.
    # Rectify empty lines by giving them their parent's indentation.
    # (Leading empty lines have no parent and are recorded as None.)
    true_indent = []
    prev_indent = None
    for indent_, len_ in zip(line_indent, line_len):
        if len_ == 0:
            # Empty lines take on their parents indentation
            indent_ = prev_indent
        true_indent.append(indent_)
        prev_indent = indent_

    # List of google style tags grouped by alias
    tag_groups = [
        ['Args', 'Arguments', 'Parameters', 'Other Parameters'],
        ['Kwargs', 'Keyword Args', 'Keyword Arguments'],
        ['Warns', 'Warning', 'Warnings'],
        ['Returns', 'Return'],
        ['Example', 'Examples'],
        ['Doctest'],
        ['Note', 'Notes'],
        ['Yields', 'Yield'],
        ['Attributes'],
        ['Methods'],
        ['Raises'],
        ['References'],
        ['See Also'],
        ['Todo'],
    ]
    # Map aliased tags to a canonical name (the first item in the group).
    tag_aliases = {item: group[0] for group in tag_groups for item in group}
    # Allow for single or double colon (support for pytorch)
    tag_pattern = '^' + '(' + '|'.join(tag_aliases.keys()) + ') *::? *$'
    tag_regex = re.compile(tag_pattern)

    # Label lines by their group-id
    group_id = 0
    prev_indent = 0
    group_list = []
    in_tag = False
    for line_num, (line, indent_) in enumerate(zip(docstr_lines, true_indent)):
        if tag_regex.match(line):
            # Check if we can look ahead
            if line_num + 1 < len(docstr_lines):
                # A tag is only valid if its next line is properly indented,
                # empty, or is a tag itself.
                indent_increase = true_indent[line_num + 1] > base_indent
                indent_zero = line_len[line_num + 1] == 0
                matches_tag = tag_regex.match(docstr_lines[line_num + 1])
                if (indent_increase or indent_zero or matches_tag):
                    group_id += 1
                    in_tag = True
            else:
                # A tag on the very last line starts an (empty) group
                group_id += 1
                in_tag = True
        # If the indentation goes back to the base, then we have left the tag
        elif in_tag and indent_ != prev_indent and indent_ == base_indent:
            group_id += 1
            in_tag = False
        group_list.append(group_id)
        prev_indent = indent_

    assert len(docstr_lines) == len(group_list)

    # Group docstr lines by group list (insertion order follows line order)
    groups_ = collections.defaultdict(list)
    for groupid, line in zip(group_list, docstr_lines):
        groups_[groupid].append(line)

    groups = []
    line_offset = 0
    for lines in groups_.values():
        if not lines or (len(lines) == 1 and not lines[0]):
            # Skip groups that only contain a single blank line, but still
            # count their lines so later offsets remain correct.
            line_offset += len(lines)
            continue

        tag_match = tag_regex.match(lines[0])
        if tag_match:
            # An encoded google sub-block. The tag name is taken from the
            # regex capture so that spaces before the colon (e.g. "Args :")
            # or a double colon never end up in the key.
            key = tag_aliases[tag_match.group(1)]
            subblock = textwrap.dedent('\n'.join(lines[1:]))
        else:
            # A top level text documentation block
            key = '__DOC__'
            subblock = '\n'.join(lines)

        block = DocBlock(subblock, line_offset)
        groups.append((key, block))
        line_offset += len(lines)
    return groups




[docs]
def parse_google_args(docstr):
    r"""
    Yield one hint dictionary per argument documented in a google docstring

    The docstring is split into sections and every section tagged "Args" is
    handed to :func:`parse_google_argblock`; all other sections are ignored.

    Args:
        docstr (str): a google-style docstring

    Yields:
        Dict[str, str]: dictionaries of parameter hints

    Example:
        >>> docstr = parse_google_args.__doc__
        >>> argdict_list = list(parse_google_args(docstr))
        >>> print([sorted(d.items()) for d in argdict_list])
        [[('desc', 'a google-style docstring'), ('name', 'docstr'), ('type', 'str')]]
    """
    for tag, docblock in split_google_docblocks(docstr):
        if tag != 'Args':
            # Only argument sections contribute parameter hints
            continue
        section_text = docblock[0]
        for hint in parse_google_argblock(section_text):
            yield hint




[docs]
def parse_google_returns(docstr, return_annot=None):
    r"""
    Yield return-value hint dictionaries found in a google docstring

    The docstring is split into sections and every section tagged "Returns"
    or "Yields" is handed to :func:`parse_google_retblock`, in the order the
    sections appear.

    Args:
        docstr (str): a google-style docstring

        return_annot (str | None):
            the return type annotation (if one exists)

    Yields:
        Dict[str, str]: dictionaries of return value hints

    Example:
        >>> docstr = parse_google_returns.__doc__
        >>> retdict_list = list(parse_google_returns(docstr))
        >>> print([sorted(d.items()) for d in retdict_list])
        [[('desc', 'dictionaries of return value hints'), ('type', 'Dict[str, str]')]]

    Example:
        >>> docstr = split_google_docblocks.__doc__
        >>> retdict_list = list(parse_google_returns(docstr))
        >>> print([sorted(d.items())[1] for d in retdict_list])
        [('type', 'List[Tuple[str, DocBlock]]')]
    """
    # Both kinds of section are parsed in exactly the same way
    return_like_tags = ('Returns', 'Yields')
    for tag, docblock in split_google_docblocks(docstr):
        if tag in return_like_tags:
            section_text = docblock[0]
            for hint in parse_google_retblock(section_text, return_annot):
                yield hint




[docs]
def parse_google_retblock(lines, return_annot=None):
    r"""
    Parse information out of a returns or yields block.

    A returns or yields block should be formatted as one or more
    ``'{type}:{description}'`` strings. The description can occupy multiple
    lines, but the indentation should increase.

    The type is separated from the description by the first colon that is not
    nested inside parentheses, square brackets, or curly braces, so a type
    such as ``{str: int}`` is kept intact. If every colon on the line is
    nested, the first colon is used as the separator.

    Args:
        lines (str):
            unindented lines from a Returns or Yields section

        return_annot (str | None):
            the return type annotation (if one exists)

    Yields:
        Dict[str, str]: each dict specifies the return type and its description

    Example:
        >>> # Test various ways that retlines can be written
        >>> assert len(list(parse_google_retblock('list: a desc'))) == 1
        >>> # ---
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     'entire line can be desc',
        ...     ' ',
        ...     ' if a return type annotation is given',
        ... ]), return_annot='int'))
        >>> assert len(hints) == 1
        >>> # ---
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     'bool: a description',
        ...     ' with a newline',
        ... ])))
        >>> assert len(hints) == 1
        >>> # ---
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     'int or bool: a description',
        ...     ' ',
        ...     ' with a separated newline',
        ...     ' ',
        ... ])))
        >>> assert len(hints) == 1
        >>> # ---
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     # Multiple types can be specified
        ...     'threading.Thread: a description',
        ...     '(int, str): a tuple of int and str',
        ...     'tuple: a tuple of int and str',
        ...     'Tuple[int, str]: a tuple of int and str',
        ... ])))
        >>> assert len(hints) == 4
        >>> # ---
        >>> # If the colon is not specified nothing will be parsed
        >>> # according to the "official" spec, but lets try and do it anyway
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     'list',
        ...     'Tuple[int, str]',
        ... ])))
        >>> assert len(hints) == 2
        >>> assert len(list(parse_google_retblock('no type, just desc'))) == 1
        >>> # ---
        >>> # A colon nested inside brackets belongs to the type
        >>> hints = list(parse_google_retblock('{str: int}: a mapping'))
        >>> assert hints[0]['type'] == '{str: int}'
        >>> assert hints[0]['desc'] == 'a mapping'
    """
    if return_annot is not None:
        # If the function has a return type annotation then the return block
        # should only be interpreted as a description. The formatting of the
        # lines is not modified in this case.
        yield {'type': return_annot, 'desc': lines}
        return

    # Otherwise, this examines each line without any extra indentation (wrt
    # the returns block), splits each line at the type/description colon, and
    # interprets anything to the left of the colon as the type hint. The rest
    # is the description. Extra whitespace is removed from the descriptions.

    def finalize(retdict):
        """ joins the non-empty description fragments with single spaces """
        retdict['desc'] = ' '.join([p for p in retdict['desc'] if p])
        return retdict

    def find_separator(line):
        """
        returns the index of the colon separating type from description, or
        -1 if the line contains no colon
        """
        depth = 0
        for idx, char in enumerate(line):
            if char in '([{':
                depth += 1
            elif char in ')]}':
                # Never go negative so stray closers cannot hide later colons
                depth = max(depth - 1, 0)
            elif char == ':' and depth == 0:
                return idx
        # Every colon (if any) is nested: fall back to the first one
        return line.find(':')

    def looks_like_type(text):
        """ heuristic: text that parses as Python code is treated as a type """
        import ast
        try:
            ast.parse(text)
        except Exception:
            return False
        return True

    retdict = None
    noindent_pat = re.compile(r'^[^\s]')
    for line in lines.split('\n'):
        if not noindent_pat.match(line):
            # Lines with indentation (and blank lines) extend the previous
            # description; they are dropped if no hint has been started yet.
            if retdict is not None:
                retdict['desc'].append(line.strip())
            continue

        # Lines without indentation declare new type hints
        if retdict is not None:
            # Finalize and return any previously constructed type hint
            yield finalize(retdict)
            retdict = None

        sep_idx = find_separator(line)
        if sep_idx >= 0:
            retdict = {
                'type': line[:sep_idx].strip(),
                'desc': [line[sep_idx + 1:].strip()],
            }
        else:
            # warning (malformatted google docstring): there is no colon, so
            # guess whether the line is a bare type or a bare description.
            stripped = line.strip()
            if looks_like_type(stripped):
                retdict = {'type': stripped, 'desc': []}
            else:
                retdict = {'type': None, 'desc': [stripped]}

    if retdict is not None:
        yield finalize(retdict)




[docs]
def parse_google_argblock(lines, clean_desc=True):
    r"""
    Extract the individual argument entries from a google-style Args block.

    An entry starts on an unindented line of the form ``name (type): desc``
    where the parenthesized type is optional and the name may be prefixed
    with one or two ``*``. The description runs until the next unindented
    line or the end of the text. Text that does not match this form is
    skipped.

    Args:
        lines (str): the unindented lines from an Args docstring section

        clean_desc (bool):
            if True, will strip the description of newlines and indents.
            Defaults to True.

    Yields:
        Dict[str, str | None]:
            A dictionary containing keys, "name", "type", and "desc"
            corresponding to an argument in the Args block.

    Example:
        >>> # Test various ways that arglines can be written
        >>> line_list = [
        ...     '',
        ...     'foo1 (int): a description',
        ...     'foo2: a description\n    with a newline',
        ...     'foo3 (int or str): a description',
        ...     'foo4 (int or threading.Thread): a description',
        ...     #
        ...     # this is sphynx-like typing style
        ...     'param1 (:obj:`str`, optional): ',
        ...     'param2 (:obj:`list` of :obj:`str`):',
        ...     #
        ...     # the Type[type] syntax is defined by the python typeing module
        ...     'attr1 (Optional[int]): Description of `attr1`.',
        ...     'attr2 (List[str]): Description of `attr2`.',
        ...     'attr3 (Dict[str, str]): Description of `attr3`.',
        ...     '*args : variable positional args description',
        ...     '**kwargs : keyword arguments description',
        ...     'malformed and unparseable',
        ...     'param_no_desc1',  # todo: this should be parseable
        ...     'param_no_desc2:',
        ...     'param_no_desc3 ()',  # todo: this should be parseable
        ...     'param_no_desc4 ():',
        ...     'param_no_desc5 (str)',  # todo: this should be parseable
        ...     'param_no_desc6 (str):',
        ... ]
        >>> lines = '\n'.join(line_list)
        >>> argdict_list = list(parse_google_argblock(lines))
        >>> # The empty line and the four entries without a colon are skipped
        >>> assert len(argdict_list) == len(line_list) - 5
        >>> assert argdict_list[1]['desc'] == 'a description with a newline'
    """
    # A variable name, optionally prefixed with `*` or `**` to indicate
    # *args or **kwargs
    name_part = r'(?P<name>\*?\*?[A-Za-z_][A-Za-z0-9_]*)'
    # Types are optional, and must be enclosed in parens
    type_part = r'(\(\s*(?P<type>[^)]*?)\s*\))?'
    # The description is everything after the colon ...
    desc_part = r'(?P<desc>.*?)'
    # ... until either the next line without any indentation or the end of
    # the string
    stop_part = r'(^(?=[^\s])|\Z)'

    # Each arg hint must be defined on a new line without any indentation
    full_pattern = ''.join([
        '^', name_part, r'\s*', type_part, r'\s*', ':', desc_part, stop_part,
    ])
    argline_pat = re.compile(full_pattern, flags=re.MULTILINE | re.DOTALL)

    for found in argline_pat.finditer(lines):
        hint = found.groupdict()
        if clean_desc:
            # Collapse the description onto a single line
            fragments = (piece.strip() for piece in hint['desc'].split('\n'))
            hint['desc'] = ' '.join(piece for piece in fragments if piece)
        yield hint