"""
The XDoctest Parser
-------------------
This parses a docstring into one or more "doctest part" *after* the docstrings
have been extracted from the source code by either static or dynamic means.
Terms and definitions:

    logical block:
        a snippet of code that can be executed by itself if given the correct
        global / local variable context.

    PS1:
        The original meaning is "Prompt String 1". For details see:
        [SE32096]_ [BashPS1]_ [CustomPrompt]_ [GeekPrompt]_. In the context of
        xdoctest, instead of referring to the prompt prefix, we use PS1 to
        refer to a line that starts a "logical block" of code. In the original
        doctest module these all had to be prefixed with ">>>". In xdoctest the
        prefix is used to simply denote the code is part of a doctest. It does
        not necessarily mean a new "logical block" is starting.

    PS2:
        The original meaning is "Prompt String 2". In the context of xdoctest,
        instead of referring to the prompt prefix, we use PS2 to refer to a
        line that continues a "logical block" of code. In the original doctest
        module these all had to be prefixed with "...". However, xdoctest uses
        parsing to automatically determine this.

    want statement:
        Lines directly after a logical block of code in a doctest indicating
        the desired result of executing the previous block.

While I do believe this AST-based code is a significant improvement over the
RE-based builtin doctest parser, I acknowledge that I'm not an AST expert and
there is room for improvement here.
References:
.. [SE32096] https://unix.stackexchange.com/questions/32096/why-is-bashs-prompt-variable-called-ps1
.. [BashPS1] https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#index-PS1
.. [CustomPrompt] https://wiki.archlinux.org/title/Bash/Prompt_customization
.. [GeekPrompt] https://web.archive.org/web/20230824025647/https://www.thegeekstuff.com/2008/09/bash-shell-take-control-of-ps1-ps2-ps3-ps4-and-prompt_command/
"""
from __future__ import annotations
import ast
import re
import sys
import tokenize
import typing
from xdoctest import directive, doctest_part, exceptions, global_state, utils
from xdoctest import static_analysis as static
# Matches at the start of a line (MULTILINE) and captures the run of leading
# space characters that is immediately followed by a non-whitespace character.
# Lines that are empty or contain only spaces therefore produce no match.
INDENT_RE = re.compile(r'^([ ]*)(?=\S)', re.MULTILINE)
[docs]
class DoctestParser:
    r"""
    Breaks docstrings into parts using the `parse` method.

    Example:
        >>> from xdoctest.parser import *  # NOQA
        >>> parser = DoctestParser()
        >>> doctest_parts = parser.parse(
        >>>     '''
        >>>     >>> j = 0
        >>>     >>> for i in range(10):
        >>>     >>>     j += 1
        >>>     >>> print(j)
        >>>     10
        >>>     '''.lstrip('\n'))
        >>> print('\n'.join(list(map(str, doctest_parts))))
        <DoctestPart(ln 0, src="j = 0...", want=None)>
        <DoctestPart(ln 3, src="print(j)...", want="10...")>

    Example:
        >>> # Having multiline strings in doctests can be nice
        >>> string = utils.codeblock(
                '''
                >>> name = 'name'
                'anything'
                ''')
        >>> self = DoctestParser()
        >>> doctest_parts = self.parse(string)
        >>> print('\n'.join(list(map(str, doctest_parts))))
    """

    def __init__(self, simulate_repl: bool = False) -> None:
        """
        Store the parser configuration.

        Args:
            simulate_repl (bool): if True each line will be treated as its
                own doctest. This more closely mimics the original doctest
                module. Defaults to False.
        """
        # Only stored here; it is consulted when source chunks are packaged
        # into parts.
        self.simulate_repl: bool = simulate_repl
[docs]
def parse(
    self, string: str, info: dict | None = None
) -> list[doctest_part.DoctestPart | str]:
    r"""
    Divide the given string into examples and interleaving text.

    Args:
        string (str): The docstring that may contain one or more doctests.

        info (dict | None): info about where the string came from in case of an
            error

    Returns:
        List[xdoctest.doctest_part.DoctestPart | str]:
            a list of `DoctestPart` objects and intervening text in the
            input docstring.

    Raises:
        TypeError: if ``string`` is not a ``str``.
        xdoctest.exceptions.DoctestParseError: if labeling, grouping, or
            packaging fails. The underlying exception is passed as
            ``orig_ex`` and is also set as the explicit cause.

    CommandLine:
        python -m xdoctest.parser DoctestParser.parse

    Example:
        >>> docstr = '''
        >>>     A simple docstring contains text followed by an example.
        >>>     >>> numbers = [1, 2, 3, 4]
        >>>     >>> thirds = [x / 3 for x in numbers]
        >>>     >>> print(thirds)
        >>>     [0.33 0.66 1 1.33]
        >>> '''
        >>> from xdoctest import parser
        >>> self = parser.DoctestParser()
        >>> results = self.parse(docstr)
        >>> assert len(results) == 3
        >>> for index, result in enumerate(results):
        >>>     print(f'results[{index}] = {result!r}')
        results[0] = '\nA simple docstring contains text followed by an example.'
        results[1] = <DoctestPart(ln 2, src="numbers ...", want=None) at ...>
        results[2] = <DoctestPart(ln 4, src="print(th...", want="[0.33 0...") at ...>

    Example:
        >>> s = 'I am a dummy example with two parts'
        >>> x = 10
        >>> print(s)
        I am a dummy example with two parts
        >>> s = 'My purpose it so demonstrate how wants work here'
        >>> print('The new want applies ONLY to stdout')
        >>> print('given before the last want')
        >>> '''
            this wont hurt the test at all
            even though its multiline '''
        >>> y = 20
        The new want applies ONLY to stdout
        given before the last want
        >>> # Parts from previous examples are executed in the same context
        >>> print(x + y)
        30

        this is simply text, and doesnt apply to the previous doctest the
        <BLANKLINE> directive is still in effect.

    Example:
        >>> from xdoctest.parser import *  # NOQA
        >>> from xdoctest import parser
        >>> from xdoctest.docstr import docscrape_google
        >>> from xdoctest import core
        >>> self = parser.DoctestParser()
        >>> docstr = self.parse.__doc__
        >>> blocks = docscrape_google.split_google_docblocks(docstr)
        >>> doclineno = self.parse.__func__.__code__.co_firstlineno
        >>> key, (string, offset) = blocks[-2]
        >>> self._label_docsrc_lines(string)
        >>> doctest_parts = self.parse(string)
        >>> # each part with a want-string needs to be broken in two
        >>> assert len(doctest_parts) == 6
        >>> len(doctest_parts)
    """
    if global_state.DEBUG_PARSER > 1:
        print('\n===== PARSE ====')

    # NOTE: the former ``sys.version_info.major == 2`` branch was removed; this
    # module already requires Python 3 syntax (f-strings), so it was dead code.
    if not isinstance(string, str):
        raise TypeError('Expected string but got {!r}'.format(string))

    # If all lines begin with the same indentation, then strip it.
    min_indent = _min_indentation(string)
    if min_indent > 0:
        string = '\n'.join([ln[min_indent:] for ln in string.splitlines()])

    # Each stage result starts as None so a failure can be attributed to the
    # first stage that did not produce a value.
    labeled_lines = None
    grouped_lines = None
    all_parts = None
    try:
        labeled_lines = self._label_docsrc_lines(string)
        grouped_lines = self._group_labeled_lines(labeled_lines)
        all_parts = list(self._package_groups(grouped_lines))
    except Exception as orig_ex:
        if labeled_lines is None:
            failpoint = '_label_docsrc_lines'
        elif grouped_lines is None:
            failpoint = '_group_labeled_lines'
        else:
            # all_parts is only assigned once packaging fully succeeds, so
            # reaching here means packaging is what failed.
            failpoint = '_package_groups'

        if global_state.DEBUG_PARSER:
            print('<FAILPOINT>')
            print('!!! FAILED !!!')
            print('failpoint = {!r}'.format(failpoint))

            import traceback
            import ubelt as ub

            tb_text = traceback.format_exc()
            tb_text = ub.highlight_code(tb_text)
            tb_text = ub.indent(tb_text)
            print(tb_text)

            print('Failed to parse string = <{[<{[<{[ # xdoc debug')
            print(string)
            print(']}>]}>]}> # xdoc debug end string')

            print('info = {}'.format(ub.repr2(info)))
            print('-----')
            print('orig_ex = {}'.format(orig_ex))
            print('labeled_lines = {}'.format(ub.repr2(labeled_lines)))
            print(
                'grouped_lines = {}'.format(ub.repr2(grouped_lines, nl=3))
            )
            print('all_parts = {}'.format(ub.repr2(all_parts)))
            print('</FAILPOINT>')

        # Chain explicitly so the traceback reports the original error as the
        # direct cause of the parse failure.
        raise exceptions.DoctestParseError(
            'Failed to parse doctest in {}'.format(failpoint),
            string=string,
            info=info,
            orig_ex=orig_ex,
        ) from orig_ex

    if global_state.DEBUG_PARSER > 1:
        print('\n===== FINISHED PARSE ====')
    return all_parts
[docs]
def _package_groups(self, grouped_lines):
    """
    Turn grouped lines into a stream of doctest parts and text strings.

    Args:
        grouped_lines (list): items are either a ``(source_lines, want_lines)``
            tuple or a list of plain text lines.

    Yields:
        the parts produced by :func:`_package_chunk` for each tuple item, and
        a newline-joined string for each text item.
    """
    if global_state.DEBUG_PARSER > 1:
        import ubelt as ub

        print('<PACKAGE LABEL GROUPS>')
        print('grouped_lines = {}'.format(ub.repr2(grouped_lines, nl=2)))

    # Running count of the lines consumed so far; passed along so each part
    # knows its offset.
    consumed = 0
    for item in grouped_lines:
        if not isinstance(item, tuple):
            # Plain text: emit it as one string.
            yield '\n'.join(item)
            consumed += len(item)
            continue

        src_lines, wnt_lines = item
        yield from self._package_chunk(src_lines, wnt_lines, consumed)
        consumed += len(src_lines) + len(wnt_lines)

    if global_state.DEBUG_PARSER > 1:
        print('</PACKAGE LABEL GROUPS>')
[docs]
def _package_chunk(self, raw_source_lines, raw_want_lines, lineno=0):
    """
    Package one chunk of source / want lines into doctest parts.

    if `self.simulate_repl` is True, then each statement is broken into its
    own part. Otherwise, statements are grouped by the closest `want`
    statement.

    Args:
        raw_source_lines (List[str]): prompt-prefixed source lines.
        raw_want_lines (List[str]): lines following the source (may be empty).
        lineno (int): offset of the chunk; added to each part's line offset.

    Yields:
        doctest_part.DoctestPart: the final part carries the want lines.

    TODO:
        - [ ] EXCEPT IN CASES OF EXPLICIT CONTINUATION

    Example:
        >>> from xdoctest.parser import *
        >>> raw_source_lines = ['>>> "string"']
        >>> raw_want_lines = ['string']
        >>> self = DoctestParser()
        >>> part, = self._package_chunk(raw_source_lines, raw_want_lines)
        >>> part.source
        '"string"'
        >>> part.want
        'string'
    """
    if global_state.DEBUG_PARSER > 1:
        print('<PACKAGE CHUNK>')

    # The indentation of the first source line is removed from every line.
    first_match = INDENT_RE.search(raw_source_lines[0])
    if first_match is None:
        line_indent = 0
    else:
        line_indent = first_match.end() - first_match.start()

    source_lines = [raw[line_indent:] for raw in raw_source_lines]
    want_lines = [raw[line_indent:] for raw in raw_want_lines]

    # TODO:
    # - [ ] Fix pytorch indentation issue here
    # Drop the 4-character prompt prefix to get the executable text.
    exec_source_lines = [src[4:] for src in source_lines]

    if global_state.DEBUG_PARSER > 1:
        print(' * locate ps1 lines')

    # Find the line number of each standalone statement
    ps1_linenos, mode_hint = self._locate_ps1_linenos(source_lines)

    if global_state.DEBUG_PARSER > 1:
        print('mode_hint = {!r}'.format(mode_hint))
        print(' * located ps1 lines')
        print(f'ps1_linenos={ps1_linenos}')

    # Find all directives here:
    # A directive necessarily will split a doctest into multiple parts
    # There are two types: block directives and inline-directives
    # * Block directives must exist on their own PS1 line
    # * Block directives insert a breakpoint before
    # * Inline directives may be on a PS1 or PS2 line
    # * Inline directives inserts a breakpoint before and after
    break_linenos = []
    ps1_to_directive = {}
    for start, stop in zip(ps1_linenos, ps1_linenos[1:] + [None]):
        statement_text = '\n'.join(exec_source_lines[start:stop])
        found = list(directive.Directive.extract(statement_text))
        if not found:
            continue
        ps1_to_directive[start] = found
        break_linenos.append(start)
        # An inline directive also forces a break after its statement.
        if found[0].inline and stop is not None:
            break_linenos.append(stop)

    if global_state.DEBUG_PARSER > 3:
        print(f'break_linenos={break_linenos}')

    def slice_example(s1, s2, want_lines=None) -> doctest_part.DoctestPart:
        # Build the part covering source lines [s1, s2).
        return doctest_part.DoctestPart(
            exec_source_lines[s1:s2],
            want_lines=want_lines,
            orig_lines=source_lines[s1:s2],
            line_offset=lineno + s1,
            directives=ps1_to_directive.get(s1, None),
        )

    s1 = 0
    s2 = 0
    if self.simulate_repl:
        # Break down first parts which dont have any want
        for s1, s2 in zip(ps1_linenos, ps1_linenos[1:]):
            yield slice_example(s1, s2)
        s1 = s2
    else:
        if break_linenos:
            break_linenos = sorted(set([0] + break_linenos))
            # directives are forcing us to further breakup the parts
            for s1, s2 in zip(break_linenos, break_linenos[1:]):
                yield slice_example(s1, s2)
            s1 = s2
        if want_lines and mode_hint in {'eval', 'single'}:
            # Whenever the evaluation of the final line needs to be tested
            # against want, that line must be separated into its own part.
            # We break the last line off so we can eval its value, but keep
            # previous groupings.
            s2 = ps1_linenos[-1]
            if s2 != s1:  # make sure the last line is not the only line
                yield slice_example(s1, s2)
                s1 = s2
    s2 = None

    # The final part runs to the end of the source and owns the want lines.
    example = slice_example(s1, s2, want_lines)
    if want_lines:
        assert mode_hint in {'eval', 'exec', 'single'}
        example.compile_mode = mode_hint
    else:
        example.compile_mode = 'exec'

    if global_state.DEBUG_PARSER > 1:
        print('example.compile_mode = {!r}'.format(example.compile_mode))
        print('<YIELD CHUNK>')
    yield example
[docs]
def _group_labeled_lines(self, labeled_lines) -> list[list | tuple | str]:
    """
    Group labeled lines into logical parts to be executed together

    Args:
        labeled_lines (List[Tuple[str, str]]): ``(label, line)`` pairs where
            the label is one of 'text', 'dsrc', 'dcnt', or 'want'.

    Returns:
        List[List[str] | Tuple[List[str], str]]:
            A list of parts. Text parts are just returned as a list of
            lines. Executable parts are returned as a tuple of source
            lines and an optional "want" statement.
    """
    if global_state.DEBUG_PARSER > 1:
        print('<GROUP LABEL LINES>')

    if global_state.DEBUG_PARSER > 4:
        print('labeled_lines = {!r}'.format(labeled_lines))

    pad = (None, None)

    # WORKON_BACKWARDS_COMPAT_CONTINUE_EVAL
    # Pass 1: split into runs of equal labels. Old-style continuations with
    # want statements must be placed in their own group, so they can be
    # executed as "single".
    groups: list[tuple] = []
    run: list = []
    run_label = None
    for before, here, after in _iterthree(labeled_lines, pad_value=pad):
        label_changed = before[0] != here[0]
        continuation_follows = here[0] == 'dsrc' and after[0] == 'dcnt'
        is_continuation = before[0] == 'dsrc' and here[0] == 'dcnt'
        if (label_changed or continuation_follows) and not is_continuation:
            # Start a new group
            if run_label is not None:
                groups.append((run_label, run))
            run_label = here[0]
            run = []
        run.append(here)
    if run:
        groups.append((run_label, run))

    if global_state.DEBUG_PARSER > 4:
        print('groups = {!r}'.format(groups))

    # Pass 2: merge consecutive groups with the same label, unless the
    # group is followed by a want.
    merged_groups: list[tuple] = []
    pending: list = []
    pending_label = None
    for before, here, after in _iterthree(groups, pad_value=pad):
        if before[0] == here[0] and after[0] != 'want':
            # extend the previous group
            pending.extend(here[1])
            continue
        # start a new group
        if pending_label is not None:
            merged_groups.append((before[0], pending))
        pending_label = here[0]
        pending = list(here[1])
    if pending:
        merged_groups.append((pending_label, pending))

    # Pass 3: pair each source block with the want block that follows it.
    grouped_lines: list[list | tuple | str] = []
    waiting_source = None
    for label, members in merged_groups:
        block: list[str] = [pair[1] for pair in members]
        if label == 'want':
            assert waiting_source is not None, 'impossible'
            grouped_lines.append((waiting_source, block))
            waiting_source = None
        elif label == 'text' or label in {'dsrc', 'dcnt'}:
            if waiting_source is not None:
                # accept a source block without a want block
                grouped_lines.append((waiting_source, ''))
                waiting_source = None
            if label == 'text':
                # accept the text
                grouped_lines.append(block)
            else:
                # need to check if there is a want after us
                waiting_source = block

    # Case where last block is source
    if waiting_source:
        grouped_lines.append((waiting_source, ''))

    if global_state.DEBUG_PARSER > 1:  # nocover
        print('</GROUP LABEL LINES>')
    return grouped_lines
[docs]
def _locate_ps1_linenos(
    self, source_lines: list[str]
) -> tuple[list[int], str]:
    """
    Determines which lines in the source begin a "logical block" of code.

    Args:
        source_lines (List[str]): lines belonging only to the doctest src
            these will be unindented, prefixed, and without any want.

    Returns:
        Tuple[List[int], str]:
            linenos is the first value a list of indices indicating which
            lines are considered "PS1" and
            mode_hint, the second value, is one of 'exec', 'eval', or
            'single', suggesting how the final statement should be
            compiled for a got/want assertion.

    Raises:
        SyntaxError: if the de-prefixed source cannot be parsed by ``ast``.
        xdoctest.exceptions.IncompleteParseError: if no balanced interval of
            lines can be found.

    Example:
        >>> self = DoctestParser()
        >>> source_lines = ['>>> def foo():', '>>>     return 0', '>>> 3']
        >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
        >>> assert linenos == [0, 2]
        >>> assert mode_hint == 'eval'

    Example:
        >>> from xdoctest.parser import *  # NOQA
        >>> self = DoctestParser()
        >>> source_lines = ['>>> x = [1, 2, ', '>>> 3, 4]', '>>> print(len(x))']
        >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
        >>> assert linenos == [0, 2]
        >>> assert mode_hint == 'eval'

    Example:
        >>> from xdoctest.parser import *  # NOQA
        >>> self = DoctestParser()
        >>> source_lines = [
        >>>     '>>> x = 1',
        >>>     '>>> try: raise Exception',
        >>>     '>>> except Exception: pass',
        >>>     '...',
        >>> ]
        >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
        >>> assert linenos == [0, 1]
        >>> assert mode_hint == 'exec'

    Example:
        >>> from xdoctest.parser import *  # NOQA
        >>> self = DoctestParser()
        >>> source_lines = [
        >>>     '>>> import os; print(os)',
        >>>     '...',
        >>> ]
        >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
        >>> assert linenos == [0]
        >>> assert mode_hint == 'single'

    Example:
        >>> # We should ensure that decorators are PS1 lines
        >>> from xdoctest.parser import *  # NOQA
        >>> self = DoctestParser()
        >>> source_lines = [
        >>>     '>>> # foo',
        >>>     '>>> @foo',
        >>>     '... def bar():',
        >>>     '...     ...',
        >>> ]
        >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
        >>> print(f'linenos={linenos}')
        >>> assert linenos == [0, 1]
    """
    # Strip indentation (and PS1 / PS2 from source)
    exec_source_lines = [p[4:] for p in source_lines]

    def _hack_comment_statements(lines) -> typing.Iterable[str]:
        # Hack to make comments appear like executable statements
        # note, this hack never leaves this function because we only are
        # returning line numbers.
        # FIXME: there is probably a better way to do this.

        def balanced_intervals(lines: list[str]) -> list[tuple[int, int]]:
            """
            Finds intervals of balanced nesting syntax

            Args:
                lines (List[str]): lines of source code
            """
            intervals = []
            a = len(lines) - 1
            b = len(lines)
            while b > 0:
                # move the head pointer up until we become balanced.
                # The bounds check comes first so an out-of-range slice is
                # never handed to the balance check.
                while a >= 0 and not static.is_balanced_statement(
                    lines[a:b], only_tokens=True
                ):
                    a -= 1
                if a < 0:
                    raise exceptions.IncompleteParseError(
                        'ill-formed doctest: cannot find balanced ps1 lines.'
                    )
                # we found a balanced interval
                intervals.append((a, b))
                b = a
                a = a - 1
            # Intervals were collected bottom-up; return them top-down.
            return intervals[::-1]

        intervals = balanced_intervals(lines)
        interval_starts = {t[0] for t in intervals}

        def _indent(line: str) -> str:
            # Leading whitespace of the line.
            return line[: len(line) - len(line.lstrip())]

        def _infer_comment_indent(idx: int, line: str) -> str:
            """Infer the indentation a placeholder comment statement
            should have.

            In regular Python, comments may ignore indentation rules,
            but once we replace the comment with a statement we must honor
            the expected indentation to avoid `IndentationError`. When a
            comment is missing indentation, try to infer it from the next
            relevant line in the doctest.
            """
            indent = _indent(line)
            stripped = line.lstrip()
            if indent or not stripped.startswith('#'):
                return indent

            # Look ahead for the next non-comment line to determine the
            # indentation level this block expects. Only adopt it if it is
            # more indented than the current comment, which prevents
            # top-level comments from being modified.
            base_len = len(indent)
            for look_ahead in range(idx + 1, len(lines)):
                look_line = lines[look_ahead]
                look_stripped = look_line.strip()
                if not look_stripped:
                    continue
                if look_stripped.startswith('#'):
                    continue
                look_indent = _indent(look_line)
                if len(look_indent) > base_len:
                    return look_indent
                # Only the first non-blank, non-comment line is consulted.
                break
            return indent

        for i, line in enumerate(lines):
            stripped = line.lstrip()
            if i in interval_starts and stripped.startswith('#'):
                indent = _infer_comment_indent(i, line)
                # Replace any comment that is not within an interval with a
                # statement, so ast.parse will record its line number
                yield indent + '_._ = None'
            else:
                yield line

    exec_source_lines = list(_hack_comment_statements(exec_source_lines))

    source_block = '\n'.join(exec_source_lines)
    try:
        pt = ast.parse(source_block, filename='<source_block>')
    except SyntaxError as syn_ex:
        # Assign missing information to the syntax error: the filename does
        # not point to a real file, so the offending line text may be absent.
        if syn_ex.text is None and syn_ex.lineno is not None:
            line = source_block.split('\n')[syn_ex.lineno - 1]
            syn_ex.text = line + '\n'
        raise

    statement_nodes = pt.body

    # Get PS1 line numbers of statements accounting for decorators: a
    # decorated definition starts on its first decorator line.
    ps1_linenos = []
    for node in statement_nodes:
        decorators = getattr(node, 'decorator_list', None)
        if decorators:
            lineno = decorators[0].lineno - 1  # type: ignore
        else:
            lineno = node.lineno - 1
        ps1_linenos.append(lineno)

    # Respect any line explicitly defined as PS2 (via its prefix)
    ps2_linenos = {x for x, p in enumerate(source_lines) if p[:4] != '>>> '}
    ps1_linenos = sorted(set(ps1_linenos).difference(ps2_linenos))

    # There are 3 ways to compile python code
    # exec, eval, and single.
    # We almost always want to exec, but if we want to match the return
    # value of the function, we will need to run it in eval or single mode.
    mode_hint = 'exec'
    # Is the last statement evaluate-able?
    if statement_nodes and isinstance(statement_nodes[-1], ast.Expr):
        mode_hint = 'eval'

    # WORKON_BACKWARDS_COMPAT_CONTINUE_EVAL:
    # Force doctests parts to evaluate in backwards compatible "single"
    # mode when using old style doctest syntax.
    if len(source_lines) > 1:
        if source_lines[0].startswith('>>> '):
            if all(_hasprefix(s, ('...',)) for s in source_lines[1:]):
                mode_hint = 'single'

    if mode_hint == 'eval':
        # Also check the tokens in the source lines to look for semicolons
        # to fix #108
        # Only iterate through non-empty lines otherwise tokenize will stop short
        # TODO: we probably could just save the tokens if we got them earlier?
        iterable = (line for line in exec_source_lines if line)

        def _readline() -> str:
            return next(iterable)

        # We cannot eval a statement with a semicolon in it
        # Single should work.
        if any(
            t.type == tokenize.OP and t.string == ';'
            for t in tokenize.generate_tokens(_readline)
        ):
            mode_hint = 'single'

    return ps1_linenos, mode_hint
[docs]
def _label_docsrc_lines(self, string: str) -> list[tuple[str, str]]:
    """
    Give each line in the docstring a label so we can distinguish
    what parts are text, what parts are code, and what parts are "want"
    string.

    Args:
        string (str): doctest source

    Returns:
        List[Tuple[str, str]]: labeled_lines - the above source broken
            up by lines, each with a label indicating its type for later
            use in parsing.

    TODO:
        - [ ] Sphinx does not parse this doctest properly

    Example:
        >>> from xdoctest.parser import *
        >>> # Having multiline strings in doctests can be nice
        >>> string = utils.codeblock(
                '''
                text
                >>> items = ['also', 'nice', 'to', 'not', 'worry',
                >>>          'about', '...', 'vs', '>>>']
                ... print('but its still allowed')
                but its still allowed

                more text
                ''')
        >>> self = DoctestParser()
        >>> labeled = self._label_docsrc_lines(string)
        >>> expected = [
        >>>     ('text', 'text'),
        >>>     ('dsrc', ">>> items = ['also', 'nice', 'to', 'not', 'worry',"),
        >>>     ('dsrc', ">>>          'about', '...', 'vs', '>>>']"),
        >>>     ('dcnt', "... print('but its still allowed')"),
        >>>     ('want', 'but its still allowed'),
        >>>     ('text', ''),
        >>>     ('text', 'more text')
        >>> ]
        >>> assert labeled == expected
    """
    # line states
    TEXT = 'text'
    DSRC = 'dsrc'
    DCNT = 'dcnt'  # explicit continuation **new in 0.10.0**
    WANT = 'want'
    source_states = {DSRC, DCNT}

    # parse and differentiate between doctest source and want statements.
    labeled_lines: list[tuple[str, str]] = []
    state_indent = 0

    # Move through states, keeping track of points where states change
    #     text -> [text, dsrc]
    #     dsrc -> [dsrc, dcnt, want, text]
    #     dcnt -> [dsrc, dcnt, want, text]
    #     want -> [want, text, dsrc]
    prev_state = TEXT
    curr_state = None
    # The iterator is shared with _complete_source, which may consume
    # additional lines to finish a multi-line statement.
    line_iter = enumerate(string.splitlines())
    for line_idx, line in line_iter:
        match = INDENT_RE.search(line)
        if match is None:
            line_indent = 0
        else:
            line_indent = match.end() - match.start()

        if global_state.DEBUG_PARSER:  # nocover
            print('Next line {}: {}'.format(line_idx, line))
            print('state_indent = {!r}'.format(state_indent))
            print('match = {!r}'.format(match))
            print('line_indent = {!r}'.format(line_indent))

        norm_line = line[state_indent:]  # Normalize line indentation
        strip_line = line.strip()
        is_blank = len(strip_line) == 0

        # Check prev_state transitions
        if prev_state == TEXT:
            # text transitions to source whenever a PS1 line is encountered
            # the PS1(>>>) can be at an arbitrary indentation
            curr_state = DSRC if _hasprefix(strip_line, ('>>>',)) else TEXT
        elif prev_state == WANT:
            if is_blank:
                # blank lines terminate wants
                curr_state = TEXT
            elif _hasprefix(strip_line, ('>>>',)):
                curr_state = DSRC
            elif line_indent < state_indent:
                # source-inconsistent indentation terminates want
                curr_state = TEXT
            else:
                curr_state = WANT
        elif prev_state in source_states:  # pragma: nobranch
            if is_blank or line_indent < state_indent:
                curr_state = TEXT
            elif not _hasprefix(norm_line, ('>>>', '...')):
                curr_state = WANT
            elif strip_line == '...':
                # TODO: add mechanism for checking next line.
                # if the next line is also a continuation
                # then dont treat this as an ellipses
                # Hack to fix continuation issue: a bare "..." only
                # continues source when already inside a continuation.
                curr_state = DCNT if prev_state == DCNT else WANT
            elif _hasprefix(norm_line, ('...',)):
                # allow source to continue with PS2
                curr_state = DCNT
            else:
                # allow source to continue with PS1
                curr_state = DSRC
        else:  # nocover
            # This should never happen
            raise AssertionError(
                'Unknown state prev_state={}'.format(prev_state)
            )

        # Handle transitions
        if prev_state != curr_state:
            # Handle start of new states
            if curr_state == TEXT:
                state_indent = 0
            if curr_state in source_states:
                # Start a new source
                state_indent = line_indent
                # renormalize line when indentation changes
                norm_line = line[state_indent:]

        # continue current state
        if curr_state in source_states:
            # source parts may consume more than one line
            try:
                if global_state.DEBUG_PARSER:  # nocover
                    print('completing source')
                completion = _complete_source(line, state_indent, line_iter)
                for part, norm_line in completion:
                    if global_state.DEBUG_PARSER > 4:  # nocover
                        print('Append Completion Line:')
                        print('part = {!r}'.format(part))
                        print('norm_line = {!r}'.format(norm_line))
                        print('curr_state = {!r}'.format(curr_state))
                    # Once a "..." line is seen, the remaining completed
                    # lines are labeled as continuations.
                    if _hasprefix(norm_line, ('...',)):
                        curr_state = DCNT
                    labeled_lines.append((curr_state, part))
            except exceptions.IncompleteParseError:
                raise
            except SyntaxError:
                if global_state.DEBUG_PARSER:  # nocover
                    print('<LABEL FAIL>')
                    print('state_indent = {!r}'.format(state_indent))
                    print('line = {!r}'.format(line))
                    print('Failed to label source lines')
                    print('Labeled lines so far: <[[[[[[[[[[')
                    for _line in labeled_lines:
                        print(_line)
                    print(']]]]]]]]]]>')
                    print('</LABEL FAIL>')
                raise
        elif curr_state in {WANT, TEXT}:
            labeled_lines.append((curr_state, line))
        prev_state = curr_state

    if global_state.DEBUG_PARSER > 1:  # nocover
        import ubelt as ub

        print('<FINISH LABELED LINES')
        print('labeled_lines = {}'.format(ub.repr2(labeled_lines, nl=1)))
        print('</FINISH LABELED LINES>')

    return labeled_lines
[docs]
def _min_indentation(s):
    """
    Return the minimum indentation of any non-blank line in `s`

    Returns 0 when no line of `s` matches ``INDENT_RE``.
    """
    widths = (len(leading) for leading in INDENT_RE.findall(s))
    return min(widths, default=0)
[docs]
def _complete_source(line, state_indent, line_iter):
    """
    helper

    remove lines from the iterator if they are needed to complete source

    This uses :func:`static.is_balanced_statement` to do the heavy lifting

    Args:
        line (str): the first line of the statement (with its prompt prefix)
        state_indent (int): number of leading characters to strip from lines
        line_iter (Iterator[Tuple[int, str]]): remaining ``(index, line)``
            pairs; items are consumed until the statement is balanced.

    Yields:
        Tuple[str, str]: the (possibly patched) full line and its normalized
        form with ``state_indent`` characters removed.

    Raises:
        SyntaxError: if a needed line has an unexpected prefix.
        xdoctest.exceptions.IncompleteParseError: if ``line_iter`` runs out
            before the statement is balanced.

    Example:
        >>> from xdoctest.parser import *  # NOQA
        >>> from xdoctest.parser import _complete_source
        >>> state_indent = 0
        >>> line = '>>> x = { # The line is not finished'
        >>> remain_lines = ['>>> 1:2,', '>>> 3:4,', '>>> 5:6}', '>>> y = 7']
        >>> line_iter = enumerate(remain_lines, start=1)
        >>> finished = list(_complete_source(line, state_indent, line_iter))
        >>> final = chr(10).join([t[1] for t in finished])
        >>> print(final)
    """
    norm_line = line[state_indent:]  # Normalize line indentation
    prefix, suffix = norm_line[:4], norm_line[4:]
    assert prefix.strip() in {'>>>', '...'}, 'unexpected prefix: {!r}'.format(
        prefix
    )
    yield line, norm_line

    source_parts = [suffix]

    # These hacks actually modify the input doctest slightly
    HACK_TRIPLE_QUOTE_FIX = True

    try:
        while not static.is_balanced_statement(source_parts, only_tokens=True):
            line_idx, next_line = next(line_iter)
            norm_line = next_line[state_indent:]
            prefix, suffix = norm_line[:4], norm_line[4:]

            if prefix.strip() not in {'>>>', '...', ''}:  # nocover
                # TODO: make a more robust patch
                # An unprefixed line is tolerated only when a triple quote
                # was seen in the source collected so far.
                patchable = HACK_TRIPLE_QUOTE_FIX and any(
                    "'''" in s or '"""' in s for s in source_parts
                )
                if not patchable:
                    if global_state.DEBUG_PARSER:
                        print(' * !!!ERROR!!!')
                        print(' * source_parts = {!r}'.format(source_parts))
                        print(' * prefix = {!r}'.format(prefix))
                        print(' * norm_line = {!r}'.format(norm_line))
                        print(' * !!!!!!!!!!!!!')
                    raise SyntaxError(
                        'Bad indentation in doctest on line {}: {!r}'.format(
                            line_idx, next_line
                        )
                    )
                # Give the line an explicit continuation prefix.
                next_line = next_line[:state_indent] + '... ' + norm_line
                norm_line = '... ' + norm_line
                prefix = ''
                suffix = norm_line

            source_parts.append(suffix)
            yield next_line, norm_line
    except StopIteration:
        # The iterator ran dry before the statement became balanced.
        if global_state.DEBUG_PARSER:
            import ubelt as ub

            print('<FAIL DID NOT COMPLETE SOURCE>')
            import traceback

            tb_text = traceback.format_exc()
            tb_text = ub.highlight_code(tb_text)
            tb_text = ub.indent(tb_text)
            print(tb_text)
            print(' * state_indent = {!r}'.format(state_indent))
            print(' * line = {!r}'.format(line))
            print('# Ensure that the following line should actually fail')

            print('source_parts = {}'.format(ub.repr2(source_parts, nl=2)))
            print(
                ub.codeblock(
                    r"""
                    from xdoctest import static_analysis as static
                    import ast
                    static.is_balanced_statement(source_parts, only_tokens=False)
                    static.is_balanced_statement(source_parts, only_tokens=True)
                    text = '\n'.join(source_parts)
                    print(text)
                    ast.parse(text, filename='<source_block>')
                    """
                )
            )
            print('</FAIL DID NOT COMPLETE SOURCE>')
        # TODO: use AST to reparse all doctest parts to discover where the
        # syntax error in the doctest is and then raise it.
        raise exceptions.IncompleteParseError(
            'ill-formed doctest: all parts have been processed '
            'but the doctest source is not balanced'
        )
    else:
        if global_state.DEBUG_PARSER > 1:
            import ubelt as ub

            print('<SUCCESS COMPLETED SOURCE>')
            print('source_parts = {}'.format(ub.repr2(source_parts, nl=2)))
            print('</SUCCESS COMPLETED SOURCE>')
[docs]
def _iterthree(items, pad_value=None):
"""
Iterate over a sliding window of size 3 with None padding on
both sides.
Example:
>>> from xdoctest.parser import *
>>> print(list(_iterthree([])))
>>> print(list(_iterthree(range(1))))
>>> print(list(_iterthree([1, 2])))
>>> print(list(_iterthree([1, 2, 3])))
>>> print(list(_iterthree(range(4))))
>>> print(list(_iterthree(range(7))))
"""
# Initialize the return window to pad values
left = mid = right = pad_value
# Create an iterator
item_iter = iter(items)
# Check the first item, if we dont have it, then dont return anything
try:
mid = next(item_iter)
except StopIteration:
return
else:
# Check the second item, if we dont have it, we have to return
# the values we've seen so far.
try:
right = next(item_iter)
except StopIteration:
yield left, mid, right
return
else:
# If we have both mid and right, then yield both
yield left, mid, right
left, mid = mid, right
# If there is still data
for right in item_iter:
yield left, mid, right
left, mid = mid, right
right = pad_value
yield left, mid, right
[docs]
def _hasprefix(line, prefixes) -> bool:
"""helper prefix test"""
# if not isinstance(prefixes, tuple):
# prefixes = [prefixes]
return any(line == p or line.startswith(p + ' ') for p in prefixes)
if __name__ == '__main__':
    """
    CommandLine:
        python -m xdoctest.core
        python -m xdoctest.parser all
    """
    # Script entry point: hand control to xdoctest's module-level runner.
    # NOTE(review): presumably this runs the doctests of this module based on
    # the command line arguments -- confirm against xdoctest.doctest_module.
    import xdoctest as xdoc

    xdoc.doctest_module()