Source code for xdoctest.parser

"""
The XDoctest Parser
-------------------
This parses a docstring into one or more "doctest part" *after* the docstrings
have been extracted from the source code by either static or dynamic means.

Terms and definitions:

    logical block:
        a snippet of code that can be executed by itself if given the correct
        global / local variable context.

    PS1:
        The original meaning is "Prompt String 1". For details see:
        [SE32096]_ [BashPS1]_ [CustomPrompt]_ [GeekPrompt]_.  In the context of
        xdoctest, instead of referring to the prompt prefix, we use PS1 to
        refer to a line that starts a "logical block" of code. In the original
        doctest module these all had to be prefixed with ">>>". In xdoctest the
        prefix is used to simply denote the code is part of a doctest. It does
        not necessarily mean a new "logical block" is starting.

    PS2:
        The original meaning is "Prompt String 2". In the context of xdoctest,
        instead of referring to the prompt prefix, we use PS2 to refer to a
        line that continues a "logical block" of code. In the original doctest
        module these all had to be prefixed with "...". However, xdoctest uses
        parsing to automatically determine this.

    want statement:
        Lines directly after a logical block of code in a doctest indicating
        the desired result of executing the previous block.

While I do believe this AST-based code is a significant improvement over the
RE-based builtin doctest parser, I acknowledge that I'm not an AST expert and
there is room for improvement here.


References:
    .. [SE32096] https://unix.stackexchange.com/questions/32096/why-is-bashs-prompt-variable-called-ps1
    .. [BashPS1] https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#index-PS1
    .. [CustomPrompt] https://wiki.archlinux.org/title/Bash/Prompt_customization
    .. [GeekPrompt] https://web.archive.org/web/20230824025647/https://www.thegeekstuff.com/2008/09/bash-shell-take-control-of-ps1-ps2-ps3-ps4-and-prompt_command/
"""

from __future__ import annotations

import ast
import re
import sys
import tokenize
import typing

from xdoctest import directive, doctest_part, exceptions, global_state, utils
from xdoctest import static_analysis as static

# Captures the run of leading space characters on a line, provided those
# spaces are immediately followed by a non-whitespace character (so blank or
# whitespace-only lines do not match). Callers in this module use
# ``match.end() - match.start()`` as the indentation width of a line.
INDENT_RE = re.compile(r'^([ ]*)(?=\S)', re.MULTILINE)



[docs]
class DoctestParser:
    r"""
    Breaks docstrings into parts using the `parse` method.

    Example:
        >>> from xdoctest.parser import *  # NOQA
        >>> parser = DoctestParser()
        >>> doctest_parts = parser.parse(
        >>>     '''
        >>>     >>> j = 0
        >>>     >>> for i in range(10):
        >>>     >>>     j += 1
        >>>     >>> print(j)
        >>>     10
        >>>     '''.lstrip('\n'))
        >>> print('\n'.join(list(map(str, doctest_parts))))
        <DoctestPart(ln 0, src="j = 0...", want=None)>
        <DoctestPart(ln 3, src="print(j)...", want="10...")>

    Example:
        >>> # Having multiline strings in doctests can be nice
        >>> string = utils.codeblock(
                '''
                >>> name = 'name'
                'anything'
                ''')
        >>> self = DoctestParser()
        >>> doctest_parts = self.parse(string)
        >>> print('\n'.join(list(map(str, doctest_parts))))
    """

    def __init__(self, simulate_repl: bool = False) -> None:
        """
        Args:
            simulate_repl (bool): if True each line will be treated as its
                own doctest. This more closely mimics the original doctest
                module.  Defaults to False.
        """
        self.simulate_repl = simulate_repl


[docs]
    def parse(
        self, string: str, info: dict | None = None
    ) -> list[doctest_part.DoctestPart | str]:
        r"""
        Divide the given string into examples and interleaving text.

        Args:
            string (str): The docstring that may contain one or more doctests.
            info (dict | None): info about where the string came from in case of an
                error

        Returns:
            List[xdoctest.doctest_part.DoctestPart | str]:
                a list of `DoctestPart` objects and intervening text in the
                input docstring.

        Raises:
            TypeError: if ``string`` is not a ``str``.
            xdoctest.exceptions.DoctestParseError: if labeling, grouping, or
                packaging the lines fails. The message names the stage that
                failed and the original exception is passed as ``orig_ex``
                and chained as the cause.

        CommandLine:
            python -m xdoctest.parser DoctestParser.parse

        Example:
            >>> docstr = '''
            >>>     A simple docstring contains text followed by an example.
            >>>     >>> numbers = [1, 2, 3, 4]
            >>>     >>> thirds = [x / 3 for x in numbers]
            >>>     >>> print(thirds)
            >>>     [0.33  0.66  1  1.33]
            >>> '''
            >>> from xdoctest import parser
            >>> self = parser.DoctestParser()
            >>> results = self.parse(docstr)
            >>> assert len(results) == 3
            >>> for index, result in enumerate(results):
            >>>     print(f'results[{index}] = {result!r}')
            results[0] = '\nA simple docstring contains text followed by an example.'
            results[1] = <DoctestPart(ln 2, src="numbers ...", want=None) at ...>
            results[2] = <DoctestPart(ln 4, src="print(th...", want="[0.33  0...") at ...>

        Example:
            >>> s = 'I am a dummy example with two parts'
            >>> x = 10
            >>> print(s)
            I am a dummy example with two parts
            >>> s = 'My purpose it so demonstrate how wants work here'
            >>> print('The new want applies ONLY to stdout')
            >>> print('given before the last want')
            >>> '''
                this wont hurt the test at all
                even though its multiline '''
            >>> y = 20
            The new want applies ONLY to stdout
            given before the last want
            >>> # Parts from previous examples are executed in the same context
            >>> print(x + y)
            30

            this is simply text, and doesnt apply to the previous doctest the
            <BLANKLINE> directive is still in effect.

        Example:
            >>> from xdoctest.parser import *  # NOQA
            >>> from xdoctest import parser
            >>> from xdoctest.docstr import docscrape_google
            >>> from xdoctest import core
            >>> self = parser.DoctestParser()
            >>> docstr = self.parse.__doc__
            >>> blocks = docscrape_google.split_google_docblocks(docstr)
            >>> doclineno = self.parse.__func__.__code__.co_firstlineno
            >>> key, (string, offset) = blocks[-2]
            >>> self._label_docsrc_lines(string)
            >>> doctest_parts = self.parse(string)
            >>> # each part with a want-string needs to be broken in two
            >>> assert len(doctest_parts) == 6
            >>> len(doctest_parts)
        """
        if global_state.DEBUG_PARSER > 1:
            print('\n===== PARSE ====')

        if not isinstance(string, str):
            raise TypeError('Expected string but got {!r}'.format(string))

        # If all lines begin with the same indentation, then strip it.
        min_indent = _min_indentation(string)
        if min_indent > 0:
            string = '\n'.join([ln[min_indent:] for ln in string.splitlines()])

        # Each stage result stays None until that stage completes, which lets
        # the error handler below work out which stage raised.
        labeled_lines = None
        grouped_lines = None
        all_parts = None
        try:
            labeled_lines = self._label_docsrc_lines(string)
            grouped_lines = self._group_labeled_lines(labeled_lines)
            all_parts = list(self._package_groups(grouped_lines))
        except Exception as orig_ex:
            if labeled_lines is None:
                failpoint = '_label_docsrc_lines'
            elif grouped_lines is None:
                failpoint = '_group_labeled_lines'
            else:
                # Both earlier stages produced a result, so the failure can
                # only have happened while packaging the groups.
                failpoint = '_package_groups'
            if global_state.DEBUG_PARSER:
                print('<FAILPOINT>')
                print('!!! FAILED !!!')
                print('failpoint = {!r}'.format(failpoint))

                import traceback

                import ubelt as ub

                tb_text = traceback.format_exc()
                tb_text = ub.highlight_code(tb_text)
                tb_text = ub.indent(tb_text)
                print(tb_text)

                print('Failed to parse string = <{[<{[<{[  # xdoc debug')
                print(string)
                print(']}>]}>]}>  # xdoc debug end string')

                print('info = {}'.format(ub.repr2(info)))
                print('-----')
                print('orig_ex = {}'.format(orig_ex))
                print('labeled_lines = {}'.format(ub.repr2(labeled_lines)))
                print(
                    'grouped_lines = {}'.format(ub.repr2(grouped_lines, nl=3))
                )
                print('all_parts = {}'.format(ub.repr2(all_parts)))
                print('</FAILPOINT>')
                # sys.exit(1)
            # Chain explicitly so the traceback reports the original error as
            # the direct cause of the parse error.
            raise exceptions.DoctestParseError(
                'Failed to parse doctest in {}'.format(failpoint),
                string=string,
                info=info,
                orig_ex=orig_ex,
            ) from orig_ex
        if global_state.DEBUG_PARSER > 1:
            print('\n===== FINISHED PARSE ====')
        return all_parts



[docs]
    def _package_groups(self, grouped_lines):
        """
        Turn grouped lines into a stream of doctest parts and text blocks.

        Args:
            grouped_lines (list): items are either a tuple of
                ``(source_lines, want_lines)`` or a plain list of text lines.

        Yields:
            the parts produced by :func:`_package_chunk` for each tuple item,
            and a single newline-joined string for each text item.
        """
        if global_state.DEBUG_PARSER > 1:
            import ubelt as ub

            print('<PACKAGE LABEL GROUPS>')
            print('grouped_lines = {}'.format(ub.repr2(grouped_lines, nl=2)))

        # Running count of the lines consumed so far; used as the line offset
        # handed to each packaged chunk.
        offset = 0
        for group in grouped_lines:
            if not isinstance(group, tuple):
                # Plain text: emit it verbatim as one string.
                yield '\n'.join(group)
                offset += len(group)
                continue
            src_lines, wnt_lines = group
            yield from self._package_chunk(src_lines, wnt_lines, offset)
            offset += len(src_lines) + len(wnt_lines)

        if global_state.DEBUG_PARSER > 1:
            print('</PACKAGE LABEL GROUPS>')



[docs]
    def _package_chunk(self, raw_source_lines, raw_want_lines, lineno=0):
        """
        Split one group of source lines (plus its optional want lines) into
        one or more doctest parts.

        if `self.simulate_repl` is True, then each statement is broken into its
        own part.  Otherwise, statements are grouped by the closest `want`
        statement.

        Args:
            raw_source_lines (List[str]): prefixed source lines of the chunk.
                The indentation of the first line is stripped from every
                source and want line.
            raw_want_lines (List[str] | str): lines of the want statement
                that follows the source; may be empty.
            lineno (int): offset added to each part's starting index to form
                its ``line_offset``.

        Yields:
            xdoctest.doctest_part.DoctestPart: the parts of this chunk in
            order. Only the final part is given the want lines.

        TODO:
            - [ ] EXCEPT IN CASES OF EXPLICIT CONTINUATION

        Example:
            >>> from xdoctest.parser import *
            >>> raw_source_lines = ['>>> "string"']
            >>> raw_want_lines = ['string']
            >>> self = DoctestParser()
            >>> part, = self._package_chunk(raw_source_lines, raw_want_lines)
            >>> part.source
            '"string"'
            >>> part.want
            'string'

        """
        if global_state.DEBUG_PARSER > 1:
            print('<PACKAGE CHUNK>')
        # The indentation of the first source line defines how much leading
        # whitespace is removed from every source and want line.
        match = INDENT_RE.search(raw_source_lines[0])
        line_indent = 0 if match is None else (match.end() - match.start())

        source_lines = [p[line_indent:] for p in raw_source_lines]
        want_lines = [p[line_indent:] for p in raw_want_lines]

        # TODO:
        # - [ ] Fix pytorch indentation issue here

        # Drop the first four characters of each line (the prompt prefix) to
        # obtain the text that will actually be executed.
        exec_source_lines = [p[4:] for p in source_lines]

        if global_state.DEBUG_PARSER > 1:
            print(' * locate ps1 lines')
        # Find the line number of each standalone statement
        ps1_linenos, mode_hint = self._locate_ps1_linenos(source_lines)
        if global_state.DEBUG_PARSER > 1:
            print('mode_hint = {!r}'.format(mode_hint))
            print(' * located ps1 lines')
            print(f'ps1_linenos={ps1_linenos}')

        # Find all directives here:
        # A directive necessarily will split a doctest into multiple parts
        # There are two types: block directives and inline-directives
        # * Block directives must exist on their own PS1 line
        # * Block directives insert a breakpoint before
        # * Inline directives may be on a PS1 or PS2 line
        # * Inline directives inserts a breakpoint before and after
        # First find block directives which must exist on their own PS1 line
        break_linenos = []
        ps1_to_directive = {}
        # Walk each statement as the half-open range [s1, s2); the final
        # statement is paired with None so its slice runs to the end.
        for s1, s2 in zip(ps1_linenos, ps1_linenos[1:] + [None]):
            lines = exec_source_lines[s1:s2]
            directives = list(directive.Directive.extract('\n'.join(lines)))
            if directives:
                ps1_to_directive[s1] = directives
                # always break before a statement that carries a directive
                break_linenos.append(s1)
                if directives[0].inline:
                    # inline directives also break after the statement,
                    # unless it is the last one in the chunk
                    if s2 is not None:
                        break_linenos.append(s2)

        if global_state.DEBUG_PARSER > 3:
            print(f'break_linenos={break_linenos}')

        def slice_example(s1, s2, want_lines=None) -> doctest_part.DoctestPart:
            # Build a DoctestPart from source lines [s1, s2), attaching any
            # directives that were recorded for the statement starting at s1.
            exec_lines = exec_source_lines[s1:s2]
            orig_lines = source_lines[s1:s2]
            directives = ps1_to_directive.get(s1, None)
            example = doctest_part.DoctestPart(
                exec_lines,
                want_lines=want_lines,
                orig_lines=orig_lines,
                line_offset=lineno + s1,
                directives=directives,
            )
            return example

        # NOTE: s1 / s2 are deliberately reused as loop variables below; the
        # values left over after each loop determine where the final part
        # (the one that receives the want) begins.
        s1 = 0
        s2 = 0
        if self.simulate_repl:
            # Break down first parts which dont have any want
            for s1, s2 in zip(ps1_linenos, ps1_linenos[1:]):
                example = slice_example(s1, s2)
                yield example
            # continue from the start of the last statement
            s1 = s2
        else:
            if break_linenos:
                # Ensure the first segment starts at line 0 and that the
                # breakpoints are unique and ordered.
                break_linenos = sorted(set([0] + break_linenos))
                # directives are forcing us to further breakup the parts
                for s1, s2 in zip(break_linenos, break_linenos[1:]):
                    example = slice_example(s1, s2)
                    yield example
                # continue from the last breakpoint
                s1 = s2
            if want_lines and mode_hint in {'eval', 'single'}:
                # Whenever the evaluation of the final line needs to be tested
                # against want, that line must be separated into its own part.
                # We break the last line off so we can eval its value, but keep
                # previous groupings.
                s2 = ps1_linenos[-1]
                if s2 != s1:  # make sure the last line is not the only line
                    example = slice_example(s1, s2)
                    yield example
                    s1 = s2
        # The final part runs from s1 to the end of the chunk.
        s2 = None

        example = slice_example(s1, s2, want_lines)

        # if mode_hint is False:
        #     mode_hint = 'exec'
        # if mode_hint is True:
        #     mode_hint = 'eval'

        # Without a want there is nothing to compare against, so the final
        # part is always compiled in exec mode; otherwise use the hinted mode.
        if not bool(want_lines):
            example.compile_mode = 'exec'
        else:
            assert mode_hint in {'eval', 'exec', 'single'}
            example.compile_mode = mode_hint

        if global_state.DEBUG_PARSER > 1:
            print('example.compile_mode = {!r}'.format(example.compile_mode))
            print('<YIELD CHUNK>')
        yield example



[docs]
    def _group_labeled_lines(self, labeled_lines) -> list[list | tuple | str]:
        """
        Collect labeled lines into the units that are executed together.

        The work is done in three passes: (1) split the labeled lines into
        runs, (2) merge neighbouring runs with the same label unless a want
        follows, and (3) pair each source run with the want run that follows
        it, if any.

        Args:
            labeled_lines (List[Tuple[str, str]]): ``(label, line)`` pairs.

        Returns:
            List[List[str] | Tuple[List[str], str]]:
                A list of parts. Text parts are just returned as a list of
                lines.  Executable parts are returned as a tuple of source
                lines and an optional "want" statement.
        """
        if global_state.DEBUG_PARSER > 1:
            print('<GROUP LABEL LINES>')

        # WORKON_BACKWARDS_COMPAT_CONTINUE_EVAL
        # Pass 1: split into runs. Old-style explicit continuations that have
        # a want statement must end up in a run of their own so they can be
        # executed as "single".
        groups: list[tuple] = []
        run_label = None
        run_items: list = []
        if global_state.DEBUG_PARSER > 4:
            print('labeled_lines = {!r}'.format(labeled_lines))

        for prev, item, nxt in _iterthree(
            labeled_lines, pad_value=(None, None)
        ):
            prev_label = prev[0]
            label = item[0]
            label_changes = prev_label != label
            opens_continuation = label == 'dsrc' and nxt[0] == 'dcnt'
            joins_continuation = prev_label == 'dsrc' and label == 'dcnt'
            if (label_changes or opens_continuation) and not joins_continuation:
                # Close the run in progress (if any) and open a new one
                if run_label is not None:
                    groups.append((run_label, run_items))
                run_label = label
                run_items = []
            run_items.append(item)
        if run_items:
            groups.append((run_label, run_items))

        if global_state.DEBUG_PARSER > 4:
            print('groups = {!r}'.format(groups))

        # Pass 2: merge consecutive runs that share a label, except when the
        # run is directly followed by a want.
        merged: list[tuple] = []
        pending_label = None
        pending: list = []
        for prev, (label, items), nxt in _iterthree(
            groups, pad_value=(None, None)
        ):
            if prev[0] == label and nxt[0] != 'want':
                pending.extend(items)
                continue
            if pending_label is not None:
                merged.append((prev[0], pending))
            pending_label = label
            pending = list(items)
        if pending:
            merged.append((pending_label, pending))

        # Pass 3: attach wants to the source block that precedes them.
        result: list[list | tuple | str] = []
        waiting_source = None
        for label, items in merged:
            lines: list[str] = [entry[1] for entry in items]
            if label == 'want':
                assert waiting_source is not None, 'impossible'
                result.append((waiting_source, lines))
                waiting_source = None
            elif label in ('text', 'dsrc', 'dcnt'):
                if waiting_source is not None:
                    # the earlier source block had no want block
                    result.append((waiting_source, ''))
                    waiting_source = None
                if label == 'text':
                    result.append(lines)
                else:
                    # hold on to the source until we know whether a want
                    # comes next
                    waiting_source = lines
        # A trailing source block never sees a want
        if waiting_source:
            result.append((waiting_source, ''))

        if global_state.DEBUG_PARSER > 1:  # nocover
            print('</GROUP LABEL LINES>')
        return result



[docs]
    def _locate_ps1_linenos(
        self, source_lines: list[str]
    ) -> tuple[list[int], str]:
        """
        Determines which lines in the source begin a "logical block" of code.

        Args:
            source_lines (List[str]): lines belonging only to the doctest src
                these will be unindented, prefixed, and without any want.

        Returns:
            Tuple[List[int], str]:
                linenos is the first value a list of indices indicating which
                lines are considered "PS1" and
                mode_hint, the second value, is one of ``'exec'``, ``'eval'``
                or ``'single'`` and suggests how the final statement should be
                compiled when it is checked against a want.

        Raises:
            SyntaxError: if the source (with prefixes removed) cannot be
                parsed by :func:`ast.parse`.
            xdoctest.exceptions.IncompleteParseError: if no balanced interval
                of lines can be found.

        Example:
            >>> self = DoctestParser()
            >>> source_lines = ['>>> def foo():', '>>>     return 0', '>>> 3']
            >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
            >>> assert linenos == [0, 2]
            >>> assert mode_hint == 'eval'

        Example:
            >>> from xdoctest.parser import *  # NOQA
            >>> self = DoctestParser()
            >>> source_lines = ['>>> x = [1, 2, ', '>>> 3, 4]', '>>> print(len(x))']
            >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
            >>> assert linenos == [0, 2]
            >>> assert mode_hint == 'eval'

        Example:
            >>> from xdoctest.parser import *  # NOQA
            >>> self = DoctestParser()
            >>> source_lines = [
            >>>    '>>> x = 1',
            >>>    '>>> try: raise Exception',
            >>>    '>>> except Exception: pass',
            >>>    '...',
            >>> ]
            >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
            >>> assert linenos == [0, 1]
            >>> assert mode_hint == 'exec'

        Example:
            >>> from xdoctest.parser import *  # NOQA
            >>> self = DoctestParser()
            >>> source_lines = [
            >>>    '>>> import os; print(os)',
            >>>    '...',
            >>> ]
            >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
            >>> assert linenos == [0]
            >>> assert mode_hint == 'single'

        Example:
            >>> # We should ensure that decorators are PS1 lines
            >>> from xdoctest.parser import *  # NOQA
            >>> self = DoctestParser()
            >>> source_lines = [
            >>>    '>>> # foo',
            >>>    '>>> @foo',
            >>>    '... def bar():',
            >>>    '...     ...',
            >>> ]
            >>> linenos, mode_hint = self._locate_ps1_linenos(source_lines)
            >>> print(f'linenos={linenos}')
            >>> assert linenos == [0, 1]
        """
        # Strip indentation (and PS1 / PS2 from source)
        exec_source_lines = [p[4:] for p in source_lines]

        def _hack_comment_statements(lines) -> typing.Iterable[str]:
            # Hack to make comments appear like executable statements
            # note, this hack never leaves this function because we only are
            # returning line numbers.
            # FIXME: there is probably a better way to do this.
            def balanced_intervals(lines: list[str]) -> list[tuple[int, int]]:
                """
                Finds intervals of balanced nesting syntax

                Args:
                    lines (List[str]): lines of source code

                Returns:
                    List[Tuple[int, int]]: half-open ``(start, stop)`` index
                    pairs in ascending order.
                """
                intervals = []
                a = len(lines) - 1
                b = len(lines)
                while b > 0:
                    # move the head pointer up until we become balanced.
                    # The bounds check comes first so a negative index is
                    # never used to slice the lines.
                    while a >= 0 and not static.is_balanced_statement(
                        lines[a:b], only_tokens=True
                    ):
                        a -= 1
                    if a < 0:
                        raise exceptions.IncompleteParseError(
                            'ill-formed doctest: cannot find balanced ps1 lines.'
                        )
                    # we found a balanced interval
                    intervals.append((a, b))
                    b = a
                    a = a - 1

                # Intervals were discovered bottom-up; report them top-down.
                return intervals[::-1]

            intervals = balanced_intervals(lines)
            interval_starts = {t[0] for t in intervals}

            def _indent(line: str) -> str:
                return line[: len(line) - len(line.lstrip())]

            def _infer_comment_indent(idx: int, line: str) -> str:
                """Infer the indentation a placeholder comment statement
                should have.

                In regular Python, comments may ignore indentation rules,
                but once we replace the comment with a statement we must honor
                the expected indentation to avoid `IndentationError`.  When a
                comment is missing indentation, try to infer it from the next
                relevant line in the doctest.
                """
                indent = _indent(line)
                stripped = line.lstrip()
                if indent or not stripped.startswith('#'):
                    return indent

                # Look ahead for the next non-comment line to determine the
                # indentation level this block expects.  Only adopt it if it is
                # more indented than the current comment, which prevents
                # top-level comments from being modified.
                base_len = len(indent)
                for look_ahead in range(idx + 1, len(lines)):
                    look_line = lines[look_ahead]
                    look_stripped = look_line.strip()
                    if not look_stripped:
                        continue
                    if look_stripped.startswith('#'):
                        continue
                    look_indent = _indent(look_line)
                    if len(look_indent) > base_len:
                        return look_indent
                    break
                return indent

            for i, line in enumerate(lines):
                stripped = line.lstrip()
                if i in interval_starts and stripped.startswith('#'):
                    indent = _infer_comment_indent(i, line)
                    # Replace any comment that is not within an interval with a
                    # statement, so ast.parse will record its line number
                    yield indent + '_._ = None'
                else:
                    yield line

        exec_source_lines = list(_hack_comment_statements(exec_source_lines))

        source_block = '\n'.join(exec_source_lines)
        try:
            pt = ast.parse(source_block, filename='<source_block>')
        except SyntaxError as syn_ex:
            # Assign missing information to the syntax error.
            if syn_ex.text is None and syn_ex.lineno is not None:
                # Grab the line where the error occurs
                # (why is this not populated in SyntaxError by default?)
                # (because filename does not point to a valid loc)
                block_lines = source_block.split('\n')
                # Only index when the reported line exists, so an
                # out-of-range lineno cannot mask the SyntaxError with an
                # IndexError.
                if 1 <= syn_ex.lineno <= len(block_lines):
                    syn_ex.text = block_lines[syn_ex.lineno - 1] + '\n'
            raise

        statement_nodes = pt.body

        # Get PS1 line numbers of statements accounting for decorators: a
        # decorated definition starts at its first decorator.
        ps1_linenos = []
        for node in statement_nodes:
            decorators = getattr(node, 'decorator_list', None)
            if decorators:
                lineno = decorators[0].lineno - 1
            else:
                lineno = node.lineno - 1
            ps1_linenos.append(lineno)

        # Respect any line explicitly defined as PS2 (via its prefix)
        ps2_linenos = {x for x, p in enumerate(source_lines) if p[:4] != '>>> '}
        ps1_linenos = sorted(set(ps1_linenos).difference(ps2_linenos))

        # There are 3 ways to compile python code
        # exec, eval, and single.

        # We almost always want to exec, but if we want to match the return
        # value of the function, we will need to run it in eval or single mode.
        mode_hint = 'exec'
        # Is the last statement evaluate-able?
        if statement_nodes and isinstance(statement_nodes[-1], ast.Expr):
            mode_hint = 'eval'

        # WORKON_BACKWARDS_COMPAT_CONTINUE_EVAL:
        # Force doctests parts to evaluate in backwards compatible "single"
        # mode when using old style doctest syntax.
        if (
            len(source_lines) > 1
            and source_lines[0].startswith('>>> ')
            and all(_hasprefix(s, ('...',)) for s in source_lines[1:])
        ):
            mode_hint = 'single'

        if mode_hint == 'eval':
            # Also check the tokens in the source lines to look for semicolons
            # to fix #108
            # Only iterate through non-empty lines otherwise tokenize will stop short
            # TODO: we probably could just save the tokens if we got them earlier?
            iterable = (line for line in exec_source_lines if line)

            def _readline() -> str:
                return next(iterable)

            # We cannot eval a statement with a semicolon in it
            # Single should work.
            if any(
                t.type == tokenize.OP and t.string == ';'
                for t in tokenize.generate_tokens(_readline)
            ):
                mode_hint = 'single'

        return ps1_linenos, mode_hint



[docs]
    def _label_docsrc_lines(self, string: str) -> list[tuple[str, str]]:
        """
        Give each line in the docstring a label so we can distinguish
        what parts are text, what parts are code, and what parts are "want"
        string.

        Args:
            string (str): doctest source

        Returns:
            List[Tuple[str, str]]: labeled_lines - the above source broken
                up by lines, each with a label indicating its type for later
                use in parsing.

        TODO:
            - [ ] Sphinx does not parse this doctest properly

        Example:
            >>> from xdoctest.parser import *
            >>> # Having multiline strings in doctests can be nice
            >>> string = utils.codeblock(
                    '''
                    text
                    >>> items = ['also', 'nice', 'to', 'not', 'worry',
                    >>>          'about', '...', 'vs', '>>>']
                    ... print('but its still allowed')
                    but its still allowed

                    more text
                    ''')
            >>> self = DoctestParser()
            >>> labeled = self._label_docsrc_lines(string)
            >>> expected = [
            >>>     ('text', 'text'),
            >>>     ('dsrc', ">>> items = ['also', 'nice', 'to', 'not', 'worry',"),
            >>>     ('dsrc', ">>>          'about', '...', 'vs', '>>>']"),
            >>>     ('dcnt', "... print('but its still allowed')"),
            >>>     ('want', 'but its still allowed'),
            >>>     ('text', ''),
            >>>     ('text', 'more text')
            >>> ]
            >>> assert labeled == expected
        """

        # parse and differentiate between doctest source and want statements.
        labeled_lines: list[tuple[str, str]] = []
        state_indent = 0

        # line states
        TEXT = 'text'
        DSRC = 'dsrc'
        DCNT = 'dcnt'  # explicit continuation  **new in 0.10.0**
        WANT = 'want'

        # Move through states, keeping track of points where states change
        #     text -> [text, dsrc]
        #     dsrc -> [dsrc, dcnt, want, text]
        #     dcnt -> [dsrc, dcnt, want, text]
        #     want -> [want, text, dsrc]
        prev_state = TEXT
        curr_state = None
        line_iter = enumerate(string.splitlines())

        for line_idx, line in line_iter:
            match = INDENT_RE.search(line)
            line_indent = 0 if match is None else (match.end() - match.start())
            if global_state.DEBUG_PARSER:  # nocover
                print('Next line {}: {}'.format(line_idx, line))
                print('state_indent = {!r}'.format(state_indent))
                print('match = {!r}'.format(match))
                print('line_indent = {!r}'.format(line_indent))

            norm_line = line[state_indent:]  # Normalize line indentation
            strip_line = line.strip()

            # Check prev_state transitions
            if prev_state == TEXT:
                # text transitions to source whenever a PS1 line is encountered
                # the PS1(>>>) can be at an arbitrary indentation
                if _hasprefix(strip_line, ('>>>',)):
                    curr_state = DSRC
                else:
                    curr_state = TEXT
            elif prev_state == WANT:
                # blank lines terminate wants
                if len(strip_line) == 0:
                    curr_state = TEXT
                # source-inconsistent indentation terminates want
                elif _hasprefix(line.strip(), ('>>>',)):
                    curr_state = DSRC
                elif line_indent < state_indent:
                    curr_state = TEXT
                else:
                    curr_state = WANT
            elif prev_state in {DSRC, DCNT}:  # pragma: nobranch
                if len(strip_line) == 0 or line_indent < state_indent:
                    curr_state = TEXT
                # allow source to continue with either PS1 or PS2
                elif _hasprefix(norm_line, ('>>>', '...')):
                    if strip_line == '...':
                        # TODO: add mechanism for checking next line.
                        # if the next line is also a continuation
                        # then dont treat this as an ellipses
                        if prev_state == DCNT:
                            # Hack to fix continuation issue
                            curr_state = DCNT
                        else:
                            curr_state = WANT
                    else:
                        if _hasprefix(norm_line, ('...',)):
                            curr_state = DCNT
                        else:
                            curr_state = DSRC
                else:
                    curr_state = WANT
            else:  # nocover
                # This should never happen
                raise AssertionError(
                    'Unknown state prev_state={}'.format(prev_state)
                )

            # Handle transitions
            if prev_state != curr_state:
                # Handle start of new states
                if curr_state == TEXT:
                    state_indent = 0
                if curr_state in {DSRC, DCNT}:
                    # Start a new source
                    state_indent = line_indent
                    # renormalize line when indentation changes
                    norm_line = line[state_indent:]

            # continue current state
            if curr_state in {DSRC, DCNT}:
                # source parts may consume more than one line
                try:
                    if global_state.DEBUG_PARSER:  # nocover
                        print('completing source')
                    for part, norm_line in _complete_source(
                        line, state_indent, line_iter
                    ):
                        if global_state.DEBUG_PARSER > 4:  # nocover
                            print('Append Completion Line:')
                            print('part = {!r}'.format(part))
                            print('norm_line = {!r}'.format(norm_line))
                            print('curr_state = {!r}'.format(curr_state))
                        if _hasprefix(norm_line, ('...',)):
                            curr_state = DCNT
                        labeled_lines.append((curr_state, part))

                except exceptions.IncompleteParseError:
                    raise
                except SyntaxError:
                    if global_state.DEBUG_PARSER:  # nocover
                        print('<LABEL FAIL>')
                        # print('next(line_iter) = {!r}'.format(line_iter))
                        print('state_indent = {!r}'.format(state_indent))
                        print('line = {!r}'.format(line))
                        print('Failed to label source lines')
                        print('Labeled lines so far: <[[[[[[[[[[')
                        for _line in labeled_lines:
                            print(_line)
                        print(']]]]]]]]]]>')
                        print('</LABEL FAIL>')
                    raise
            elif curr_state == WANT:
                labeled_lines.append((curr_state, line))
            elif curr_state == TEXT:
                labeled_lines.append((curr_state, line))
            prev_state = curr_state

        if global_state.DEBUG_PARSER > 1:  # nocover
            import ubelt as ub

            # if global_state.DEBUG_PARSER > 3:
            #     print('string = {!r}'.format(string))
            print('<FINISH LABELED LINES')
            print('labeled_lines = {}'.format(ub.repr2(labeled_lines, nl=1)))
            print('</FINISH LABELED LINES>')

        return labeled_lines





[docs]
def _min_indentation(s):
    """
    Return the minimum indentation of any non-blank line in ``s``.

    The indentation runs are whatever ``INDENT_RE.findall`` reports for
    ``s``; the width of the shortest one is returned.

    Args:
        s (str): text to inspect

    Returns:
        int: width of the smallest indentation found, or 0 when
        ``INDENT_RE`` finds nothing at all in ``s``.
    """
    return min((len(ws) for ws in INDENT_RE.findall(s)), default=0)




[docs]
def _complete_source(line, state_indent, line_iter):
    """
    Generate the lines needed to turn ``line`` into a complete statement.

    The given ``line`` is always yielded first. After that, additional lines
    are pulled from ``line_iter`` (and yielded) for as long as the code
    collected so far is not a balanced statement. Because ``line_iter`` is
    advanced in place, any line consumed here will not be seen again by
    other code that iterates the same object.

    This uses :func:`static.is_balanced_statement` to do the heavy lifting

    Args:
        line (str): the raw doctest line that starts the source block. After
            removing ``state_indent`` leading characters its first four
            characters must contain a ``>>>`` or ``...`` prompt.

        state_indent (int): number of leading characters to slice off each
            raw line to normalize its indentation.

        line_iter (Iterator[Tuple[int, str]]): iterator of
            ``(line_index, raw_line)`` pairs for the lines after ``line``.

    Yields:
        Tuple[str, str]: ``(raw_line, norm_line)`` where ``norm_line`` is the
        raw line with the first ``state_indent`` characters removed. For
        lines rewritten by the triple-quote hack, both items carry the
        injected ``'... '`` prompt.

    Raises:
        AssertionError: if the first line does not carry a recognized prompt.
        SyntaxError: if a continuation line has an unexpected prefix and the
            triple-quote hack does not apply.
        exceptions.IncompleteParseError: if ``line_iter`` is exhausted before
            the collected source becomes balanced.

    Example:
        >>> from xdoctest.parser import *  # NOQA
        >>> from xdoctest.parser import _complete_source
        >>> state_indent = 0
        >>> line = '>>> x = { # The line is not finished'
        >>> remain_lines = ['>>> 1:2,', '>>> 3:4,', '>>> 5:6}', '>>> y = 7']
        >>> line_iter = enumerate(remain_lines, start=1)
        >>> finished = list(_complete_source(line, state_indent, line_iter))
        >>> final = chr(10).join([t[1] for t in finished])
        >>> print(final)
    """
    norm_line = line[state_indent:]  # Normalize line indentation
    # The first four columns hold the prompt marker; the remainder is code.
    prefix = norm_line[:4]
    suffix = norm_line[4:]
    assert prefix.strip() in {'>>>', '...'}, 'unexpected prefix: {!r}'.format(
        prefix
    )
    # The starting line is emitted unconditionally, before any balance check.
    yield line, norm_line

    # Prompt-stripped code collected so far; this is what gets checked for
    # being a balanced statement.
    source_parts = [suffix]

    # These hacks actually modify the input doctest slightly
    HACK_TRIPLE_QUOTE_FIX = True

    try:
        # Keep consuming lines until the accumulated code is balanced.
        while not static.is_balanced_statement(source_parts, only_tokens=True):
            # Raises StopIteration when no lines remain (handled below).
            line_idx, next_line = next(line_iter)
            norm_line = next_line[state_indent:]
            prefix = norm_line[:4]
            suffix = norm_line[4:]

            # A continuation line must either carry a prompt or have only
            # whitespace (or nothing) in its first four columns.
            if prefix.strip() not in {'>>>', '...', ''}:  # nocover
                error = True
                if HACK_TRIPLE_QUOTE_FIX:
                    # TODO: make a more robust patch
                    # If any collected part contains a triple quote, the
                    # unprefixed line is accepted instead of rejected
                    # (presumably it is the interior of a multi-line string).
                    if any("'''" in s or '"""' in s for s in source_parts):
                        # print('HACK FIXING TRIPLE QUOTE')
                        # Inject an explicit continuation prompt right after
                        # the indentation of the raw line.
                        next_line = (
                            next_line[:state_indent] + '... ' + norm_line
                        )
                        norm_line = '... ' + norm_line
                        prefix = ''
                        # NOTE: the suffix is the whole rewritten line, so the
                        # injected '... ' text is also added to source_parts.
                        suffix = norm_line
                        error = False

                if error:
                    if global_state.DEBUG_PARSER:
                        print(' * !!!ERROR!!!')
                        print(' * source_parts = {!r}'.format(source_parts))
                        print(' * prefix = {!r}'.format(prefix))
                        print(' * norm_line = {!r}'.format(norm_line))
                        print(' * !!!!!!!!!!!!!')

                    raise SyntaxError(
                        'Bad indentation in doctest on line {}: {!r}'.format(
                            line_idx, next_line
                        )
                    )
            source_parts.append(suffix)
            yield next_line, norm_line
    except StopIteration:
        # The iterator ran out while the collected source was still
        # unbalanced. Optionally dump diagnostics, then report the failure.
        if global_state.DEBUG_PARSER:
            import ubelt as ub

            print('<FAIL DID NOT COMPLETE SOURCE>')
            import traceback

            tb_text = traceback.format_exc()
            tb_text = ub.highlight_code(tb_text)
            tb_text = ub.indent(tb_text)
            print(tb_text)
            # print(' * line_iter = {!r}'.format(line_iter))
            print(' * state_indent = {!r}'.format(state_indent))
            print(' * line = {!r}'.format(line))
            # print('source =\n{}'.format('\n'.join(source_parts)))
            print('# Ensure that the following line should actually fail')
            print('source_parts = {}'.format(ub.repr2(source_parts, nl=2)))
            # Print a snippet the developer can paste to reproduce the
            # balance check on the collected parts by hand.
            print(
                ub.codeblock(
                    r"""
                from xdoctest import static_analysis as static
                import ast
                static.is_balanced_statement(source_parts, only_tokens=False)
                static.is_balanced_statement(source_parts, only_tokens=True)
                text = '\n'.join(source_parts)
                print(text)
                ast.parse(text, filename='<source_block>')
                """
                )
            )
            print('</FAIL DID NOT COMPLETE SOURCE>')
            # sys.exit(1)
        # TODO: use AST to reparse all doctest parts to discover where the
        # syntax error in the doctest is and then raise it.
        raise exceptions.IncompleteParseError(
            'ill-formed doctest: all parts have been processed '
            'but the doctest source is not balanced'
        )
    else:
        # Reached only when the while loop ended normally, i.e. the collected
        # source was reported as balanced.
        if global_state.DEBUG_PARSER > 1:
            import ubelt as ub

            print('<SUCCESS COMPLETED SOURCE>')
            # print(' * line_iter = {!r}'.format(line_iter))
            print('source_parts = {}'.format(ub.repr2(source_parts, nl=2)))
            print('</SUCCESS COMPLETED SOURCE>')




[docs]
def _iterthree(items, pad_value=None):
    """
    Slide a window of width three over ``items``.

    Every item appears exactly once in the middle slot. The slots for the
    missing neighbours of the first and last items are filled with
    ``pad_value``. An empty input produces no windows.

    Args:
        items (Iterable): values to iterate over
        pad_value (object): filler used for the missing neighbours at
            both ends. Defaults to None.

    Yields:
        tuple: ``(left, mid, right)`` windows, in input order

    Example:
        >>> from xdoctest.parser import _iterthree
        >>> list(_iterthree([]))
        []
        >>> list(_iterthree([1]))
        [(None, 1, None)]
        >>> list(_iterthree([1, 2, 3]))
        [(None, 1, 2), (1, 2, 3), (2, 3, None)]
    """
    stream = iter(items)
    # A private sentinel distinguishes "no first item" from any real value.
    nothing = object()
    center = next(stream, nothing)
    if center is nothing:
        return

    before = pad_value
    # Each newly fetched item completes the window around the previous one.
    for after in stream:
        yield before, center, after
        before, center = center, after

    # The last item has no right-hand neighbour.
    yield before, center, pad_value




[docs]
def _hasprefix(line, prefixes) -> bool:
    """
    Check whether ``line`` begins with one of the given prompt prefixes.

    A prefix matches when the line is exactly that prefix, or when the line
    starts with the prefix followed by a single space.

    Args:
        line (str): text to test
        prefixes (Iterable[str]): candidate prefixes

    Returns:
        bool: True if any prefix matches, otherwise False
    """
    for candidate in prefixes:
        if line == candidate:
            return True
        if line.startswith(candidate + ' '):
            return True
    return False



# Script entry point: this branch only runs when the file is executed
# directly rather than imported.
if __name__ == '__main__':
    # The bare string below is an expression statement with no runtime
    # effect; it only records how this entry point is meant to be invoked.
    # NOTE(review): the first command names ``xdoctest.core`` rather than this
    # module -- presumably a copy-paste leftover; confirm.
    """
    CommandLine:
        python -m xdoctest.core
        python -m xdoctest.parser all
    """
    import xdoctest as xdoc

    # Hand control to xdoctest's module-level runner (presumably it runs the
    # doctests of this module based on the command line -- verify).
    xdoc.doctest_module()