"""
Handles parsing of information out of google style docstrings
It is not clear which of these `GoogleStyleDocs1`_ `GoogleStyleDocs2`_ is *the*
standard or if there is one.
This code has been exported to a standalone package
* https://github.com/Erotemic/googledoc
This is similar to:
* https://pypi.org/project/docstring-parser/
* https://pypi.org/project/numpydoc/
It hasn't been decided if this will remain vendored in xdoctest or pulled in as
a dependency.
References:
.. [GoogleStyleDocs1] https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html#example-google
.. [GoogleStyleDocs2] http://www.sphinx-doc.org/en/stable/ext/example_google.html#example-google
"""
import re
import textwrap
import collections
from xdoctest import exceptions
from xdoctest.utils.util_str import ensure_unicode
# Record describing one docstring section: ``text`` is the section body and
# ``offset`` is the line offset of that section within the docstring.
DocBlock = collections.namedtuple(
    typename='DocBlock',
    field_names=('text', 'offset'),
)
def split_google_docblocks(docstr):
    """
    Breaks a docstring into parts defined by google style

    Args:
        docstr (str): a docstring

    Returns:
        List[Tuple[str, DocBlock]]:
            list of 2-tuples where the first item is a google style docstring
            tag and the second item is the block corresponding to that tag. The
            block itself is a 2-tuple where the first item is the unindented
            text and the second item is the line offset indicating that block's
            location in the docstring.

    Raises:
        TypeError: if ``docstr`` is not a string

    Note:
        Unknown or "freeform" sections are given a generic "__DOC__" tag.
        A section tag may be specified multiple times.
        Aliased tags are reported under their canonical name, which is the
        first name in each alias group (e.g. "Arguments" becomes "Args").
        A tag may end in one or two colons, optionally surrounded by spaces.

    CommandLine:
        xdoctest xdoctest.docstr.docscrape_google split_google_docblocks:2

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> from xdoctest import utils
        >>> docstr = utils.codeblock(
        ...     '''
        ...     one line description
        ...
        ...     multiline
        ...     description
        ...
        ...     Args:
        ...         foo: bar
        ...
        ...     Returns:
        ...         None
        ...
        ...     Example:
        ...         >>> print('eg1')
        ...         eg1
        ...
        ...     Example:
        ...         >>> print('eg2')
        ...         eg2
        ...     ''')
        >>> groups = split_google_docblocks(docstr)
        >>> assert len(groups) == 5
        >>> [g[0] for g in groups]
        ['__DOC__', 'Args', 'Returns', 'Example', 'Example']

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> docstr = split_google_docblocks.__doc__
        >>> groups = split_google_docblocks(docstr)

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> from xdoctest import utils
        >>> docstr = utils.codeblock(
        ...     '''
        ...      a description with a leading space
        ...
        ...     Example:
        ...         >>> foobar
        ...     ''')
        >>> groups = split_google_docblocks(docstr)
        >>> print('groups = {!r}'.format(groups))

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> from xdoctest import utils
        >>> docstr = utils.codeblock(
        ...     '''
        ...     Example:
        ...         >>> foobar
        ...     ''')
        >>> # Check that line offsets are valid if the first line is not blank
        >>> groups = split_google_docblocks(docstr)
        >>> offset = groups[0][1][1]
        >>> print('offset = {!r}'.format(offset))
        >>> assert offset == 0
        >>> # Check that line offsets are valid if the first line is blank
        >>> groups = split_google_docblocks(chr(10) + docstr)
        >>> offset = groups[0][1][1]
        >>> print('offset = {!r}'.format(offset))
        >>> assert offset == 1

    Example:
        >>> from xdoctest.docstr.docscrape_google import *  # NOQA
        >>> from xdoctest import utils
        >>> docstr = utils.codeblock(
        ...     '''
        ...     summary
        ...
        ...     Arguments :
        ...         foo: bar
        ...     ''')
        >>> # Whitespace before the colon must not leak into the tag name
        >>> groups = split_google_docblocks(docstr)
        >>> [g[0] for g in groups]
        ['__DOC__', 'Args']
    """
    if not isinstance(docstr, str):
        raise TypeError('Input docstr must be a string. Got {} instead'.format(
            type(docstr)))

    def get_indentation(line_):
        """ returns number of preceding whitespace characters """
        return len(line_) - len(line_.lstrip())

    # Parse out initial documentation lines
    # Then parse out the blocked lines.
    docstr = ensure_unicode(docstr)
    docstr = textwrap.dedent(docstr)

    docstr_lines = docstr.split('\n')
    line_indent = [get_indentation(line) for line in docstr_lines]
    line_len = [len(line) for line in docstr_lines]

    # The first line may not have the correct indentation if it starts
    # right after the triple quotes. Adjust it in this case to ensure that
    # base indent is always 0
    if len(docstr_lines) >= 2 and line_len[0] != 0:
        # Indentation of every non-empty line (the first entry is line 0)
        indents = [x for x, n in zip(line_indent, line_len) if n > 0]
        if len(indents) >= 2:
            # Push the first line out to the smallest indentation used by the
            # remaining non-empty lines, then dedent everything together.
            indent_adjust = min(indents[1:])
            docstr_lines[0] = (' ' * indent_adjust) + docstr_lines[0]
            # Redo preprocessing, but this time on a rectified input
            docstr = textwrap.dedent('\n'.join(docstr_lines))
            docstr_lines = docstr.split('\n')
            line_indent = [get_indentation(line) for line in docstr_lines]
            line_len = [len(line) for line in docstr_lines]

    base_indent = 0
    # We will group lines by their indentation.
    # Rectify empty lines by giving them their parent's indentation.
    # (Leading empty lines have no parent and are recorded as None.)
    true_indent = []
    prev_indent = None
    for indent_, len_ in zip(line_indent, line_len):
        if len_ == 0:
            # Empty lines take on their parents indentation
            indent_ = prev_indent
        true_indent.append(indent_)
        prev_indent = indent_

    # List of google style tags grouped by alias
    tag_groups = [
        ['Args', 'Arguments', 'Parameters', 'Other Parameters'],
        ['Kwargs', 'Keyword Args', 'Keyword Arguments'],
        ['Warns', 'Warning', 'Warnings'],
        ['Returns', 'Return'],
        ['Example', 'Examples'],
        ['Doctest'],
        ['Note', 'Notes'],
        ['Yields', 'Yield'],
        ['Attributes'],
        ['Methods'],
        ['Raises'],
        ['References'],
        ['See Also'],
        ['Todo'],
    ]
    # Map aliased tags to a canonical name (the first item in the group).
    tag_aliases = {alias: group[0] for group in tag_groups for alias in group}
    # Allow for single or double colon (support for pytorch).
    # Group 1 captures the tag name without the colon or padding.
    tag_regex = re.compile('^' + '(' + '|'.join(tag_aliases) + ') *::? *$')

    # Label lines by their group-id
    num_lines = len(docstr_lines)
    group_id = 0
    prev_indent = 0
    group_list = []
    in_tag = False
    for line_num, (line, indent_) in enumerate(zip(docstr_lines, true_indent)):
        if tag_regex.match(line):
            next_num = line_num + 1
            # Check if we can look ahead
            if next_num < num_lines:
                # A tag is only valid if its next line is properly indented,
                # empty, or is a tag itself.
                indent_increase = true_indent[next_num] > base_indent
                indent_zero = line_len[next_num] == 0
                matches_tag = tag_regex.match(docstr_lines[next_num])
                if indent_increase or indent_zero or matches_tag:
                    group_id += 1
                    in_tag = True
            else:
                # A tag on the very last line starts an (empty) section
                group_id += 1
                in_tag = True
        # If the indentation goes back to the base, then we have left the tag
        elif in_tag and indent_ != prev_indent and indent_ == base_indent:
            group_id += 1
            in_tag = False
        group_list.append(group_id)
        prev_indent = indent_

    # Group docstr lines by group list. Group ids never decrease, so the
    # insertion order of the mapping is the order of the docstring.
    groups_ = collections.defaultdict(list)
    for groupid, line in zip(group_list, docstr_lines):
        groups_[groupid].append(line)

    groups = []
    line_offset = 0
    for lines in groups_.values():
        # A group holding a single empty line carries no content; it only
        # advances the running line offset.
        is_blank = len(lines) == 1 and len(lines[0]) == 0
        if not is_blank:
            tag_match = tag_regex.match(lines[0])
            if tag_match:
                # An encoded google sub-block. Use the captured tag name so
                # that padding around the colon(s) cannot defeat the alias
                # lookup (e.g. "Args :" must still resolve to "Args").
                key = tag_aliases[tag_match.group(1)]
                subblock = textwrap.dedent('\n'.join(lines[1:]))
            else:
                # A top level text documentation block
                key = '__DOC__'
                subblock = '\n'.join(lines)
            groups.append((key, DocBlock(subblock, line_offset)))
        line_offset += len(lines)
    return groups
def parse_google_args(docstr):
    r"""
    Yield a hint dictionary for each argument documented in a google docstring

    The docstring is split into tagged sections and every section whose tag
    is the canonical argument tag is parsed for argument hints.

    Args:
        docstr (str): a google-style docstring

    Yields:
        Dict[str, str]: dictionaries of parameter hints

    Example:
        >>> docstr = parse_google_args.__doc__
        >>> argdict_list = list(parse_google_args(docstr))
        >>> print([sorted(d.items()) for d in argdict_list])
        [[('desc', 'a google-style docstring'), ('name', 'docstr'), ('type', 'str')]]
    """
    for tag, section in split_google_docblocks(docstr):
        if tag != 'Args':
            # Only argument sections contribute hints
            continue
        # The first item of a section is its unindented text
        section_text = section[0]
        for hint in parse_google_argblock(section_text):
            yield hint
def parse_google_returns(docstr, return_annot=None):
    r"""
    Yield a hint dictionary for each return value documented in a google
    docstring. Sections describing returned values and sections describing
    yielded values are both inspected, in the order they appear.

    Args:
        docstr (str): a google-style docstring

        return_annot (str | None):
            the return type annotation (if one exists)

    Yields:
        Dict[str, str]: dictionaries of return value hints

    Example:
        >>> docstr = parse_google_returns.__doc__
        >>> retdict_list = list(parse_google_returns(docstr))
        >>> print([sorted(d.items()) for d in retdict_list])
        [[('desc', 'dictionaries of return value hints'), ('type', 'Dict[str, str]')]]

    Example:
        >>> docstr = split_google_docblocks.__doc__
        >>> retdict_list = list(parse_google_returns(docstr))
        >>> print([sorted(d.items())[1] for d in retdict_list])
        [('type', 'List[Tuple[str, DocBlock]]')]
    """
    wanted_tags = ('Returns', 'Yields')
    for tag, section in split_google_docblocks(docstr):
        if tag in wanted_tags:
            # The first item of a section is its unindented text
            section_text = section[0]
            for hint in parse_google_retblock(section_text, return_annot):
                yield hint
def parse_google_retblock(lines, return_annot=None):
    r"""
    Parse information out of a returns or yields block.

    A returns or yields block should be formatted as one or more
    ``'{type}:{description}'`` strings. The description can occupy multiple
    lines, but the indentation should increase.

    Args:
        lines (str):
            unindented lines from a Returns or Yields section

        return_annot (str | None):
            the return type annotation (if one exists)

    Yields:
        Dict[str, str]: each dict specifies the return type and its description

    Example:
        >>> # Test various ways that retlines can be written
        >>> assert len(list(parse_google_retblock('list: a desc'))) == 1
        >>> # ---
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     'entire line can be desc',
        ...     ' ',
        ...     ' if a return type annotation is given',
        ... ]), return_annot='int'))
        >>> assert len(hints) == 1
        >>> # ---
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     'bool: a description',
        ...     ' with a newline',
        ... ])))
        >>> assert len(hints) == 1
        >>> # ---
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     'int or bool: a description',
        ...     ' ',
        ...     ' with a separated newline',
        ...     ' ',
        ... ])))
        >>> assert len(hints) == 1
        >>> # ---
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     # Multiple types can be specified
        ...     'threading.Thread: a description',
        ...     '(int, str): a tuple of int and str',
        ...     'tuple: a tuple of int and str',
        ...     'Tuple[int, str]: a tuple of int and str',
        ... ])))
        >>> assert len(hints) == 4
        >>> # ---
        >>> # If the colon is not specified nothing will be parsed
        >>> # according to the "official" spec, but lets try and do it anyway
        >>> hints = list(parse_google_retblock('\n'.join([
        ...     'list',
        ...     'Tuple[int, str]',
        ... ])))
        >>> assert len(hints) == 2
        >>> assert len(list(parse_google_retblock('no type, just desc'))) == 1
    """
    if return_annot is not None:
        # When the function carries a return annotation, the whole block is
        # taken verbatim as the description of that annotated type.
        yield {'type': return_annot, 'desc': lines}
        return

    # Without an annotation, every unindented line opens a new hint. Text left
    # of the first colon is the type, the remainder is the description, and
    # indented lines extend the description of the hint currently being built.
    import ast

    def _seal(hint):
        # Collapse the collected description fragments into a single string,
        # discarding fragments that are empty.
        hint['desc'] = ' '.join(filter(None, hint['desc']))
        return hint

    opens_hint = re.compile(r'^[^\s]').match
    pending = None
    for raw_line in lines.split('\n'):
        content = raw_line.strip()

        if not opens_hint(raw_line):
            # Indented (or empty) line: continuation of the current hint.
            # Continuations that precede any hint are dropped.
            if pending is not None:
                pending['desc'].append(content)
            continue

        if pending is not None:
            # A new hint begins, so emit the one built so far
            yield _seal(pending)
            pending = None

        # FIXME:
        # Splitting on the first colon is wrong if ":" can be part of the
        # type definition. Better parsing is needed to be sure the ":" is
        # really the separator between type and description.
        type_part, colon, desc_part = raw_line.partition(':')
        if colon:
            pending = {
                'type': type_part.strip(),
                'desc': [desc_part.strip()],
            }
            continue

        # No colon (malformatted google docstring). Guess whether the line is
        # a bare type or a bare description by checking if it parses as
        # Python source.
        try:
            ast.parse(content)
        except Exception:
            # Not parseable, assume this is a description.
            pending = {'type': None, 'desc': [content]}
        else:
            # Parseable, assume this is a type
            pending = {'type': content, 'desc': []}

    if pending is not None:
        yield _seal(pending)
def parse_google_argblock(lines, clean_desc=True):
    r"""
    Parse out individual items from google-style args blocks.

    Args:
        lines (str): the unindented lines from an Args docstring section

        clean_desc (bool):
            if True, will strip the description of newlines and indents.
            Defaults to True.

    Yields:
        Dict[str, str | None]:
            A dictionary containing keys, "name", "type", and "desc"
            corresponding to an argument in the Args block.

    Example:
        >>> # Test various ways that arglines can be written
        >>> line_list = [
        ...     '',
        ...     'foo1 (int): a description',
        ...     'foo2: a description\n    with a newline',
        ...     'foo3 (int or str): a description',
        ...     'foo4 (int or threading.Thread): a description',
        ...     #
        ...     # this is sphinx-like typing style
        ...     'param1 (:obj:`str`, optional): ',
        ...     'param2 (:obj:`list` of :obj:`str`):',
        ...     #
        ...     # the Type[type] syntax is defined by the python typing module
        ...     'attr1 (Optional[int]): Description of `attr1`.',
        ...     'attr2 (List[str]): Description of `attr2`.',
        ...     'attr3 (Dict[str, str]): Description of `attr3`.',
        ...     '*args : variable positional args description',
        ...     '**kwargs : keyword arguments description',
        ...     'malformed and unparseable',
        ...     'param_no_desc1',  # todo: this should be parseable
        ...     'param_no_desc2:',
        ...     'param_no_desc3 ()',  # todo: this should be parseable
        ...     'param_no_desc4 ():',
        ...     'param_no_desc5 (str)',  # todo: this should be parseable
        ...     'param_no_desc6 (str):',
        ... ]
        >>> lines = '\n'.join(line_list)
        >>> argdict_list = list(parse_google_argblock(lines))
        >>> # The empty line, the malformed line, and the three lines that
        >>> # lack a colon are not accepted
        >>> assert len(argdict_list) == len(line_list) - 5
        >>> assert argdict_list[1]['desc'] == 'a description with a newline'
    """
    argline_pat = re.compile(
        # Each arg hint must be defined on a new line without any indentation
        r'^'
        # The name may be prefixed with 0, 1, or 2 `*` (for *args / **kwargs)
        r'(?P<name>\*?\*?[A-Za-z_][A-Za-z0-9_]*)'
        r'\s*'
        # Types are optional, and must be enclosed in parens
        r'(\(\s*(?P<type>[^)]*?)\s*\))?'
        r'\s*'
        r':'
        # The description is everything after the colon ...
        r'(?P<desc>.*?)'
        # ... until the next line without indentation or the end of the string
        r'(^(?=[^\s])|\Z)',
        flags=re.MULTILINE | re.DOTALL,
    )

    for found in argline_pat.finditer(lines):
        hint = found.groupdict()
        if clean_desc:
            # Flatten the description: trim every physical line and join the
            # non-empty ones with single spaces.
            pieces = (piece.strip() for piece in hint['desc'].split('\n'))
            hint['desc'] = ' '.join(piece for piece in pieces if piece)
        yield hint