diff --git a/.github/workflows/diff_shades.yml b/.github/workflows/diff_shades.yml index 51a448a12a5..038408b94c9 100644 --- a/.github/workflows/diff_shades.yml +++ b/.github/workflows/diff_shades.yml @@ -110,19 +110,19 @@ jobs: ${{ matrix.baseline-analysis }} ${{ matrix.target-analysis }} - name: Upload diff report - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.mode }}-diff.html path: diff.html - name: Upload baseline analysis - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.baseline-analysis }} path: ${{ matrix.baseline-analysis }} - name: Upload target analysis - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.target-analysis }} path: ${{ matrix.target-analysis }} @@ -137,7 +137,7 @@ jobs: - name: Upload summary file (PR only) if: github.event_name == 'pull_request' && matrix.mode == 'preview-changes' - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: .pr-comment.json path: .pr-comment.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2f1bae939aa..ca1ce60ad72 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,6 +50,7 @@ repos: - click >= 8.1.0, != 8.1.4, != 8.1.5 - packaging >= 22.0 - platformdirs >= 2.1.0 + - pytokens >= 0.1.5 - pytest - hypothesis - aiohttp >= 3.7.4 diff --git a/CHANGES.md b/CHANGES.md index 51374fbe7f8..3837e70e432 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -40,6 +40,8 @@ +- Rewrite tokenizer to improve performance and compliance (#4536) + ### Performance diff --git a/pyproject.toml b/pyproject.toml index 30d2962248c..fbff7de3a4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ dependencies = [ "packaging>=22.0", "pathspec>=0.9.0", "platformdirs>=2", + "pytokens>=0.1.9", "tomli>=1.1.0; python_version < '3.11'", "typing_extensions>=4.0.1; python_version < '3.11'", ] diff --git a/src/blib2to3/pgen2/driver.py b/src/blib2to3/pgen2/driver.py index d17fd1d7bfb..056fab2127b 100644 --- a/src/blib2to3/pgen2/driver.py +++ b/src/blib2to3/pgen2/driver.py @@ -28,7 +28,7 @@ from typing import IO, Any, Optional, Union, cast from blib2to3.pgen2.grammar import Grammar -from blib2to3.pgen2.tokenize import GoodTokenInfo +from blib2to3.pgen2.tokenize import TokenInfo from blib2to3.pytree import NL # Pgen imports @@ -112,7 +112,7 @@ def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None: logger = logging.getLogger(__name__) self.logger = logger - def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL: + def parse_tokens(self, tokens: Iterable[TokenInfo], debug: bool = False) -> NL: """Parse a series of tokens and return the syntax tree.""" # XXX Move the prefix computation into a wrapper around tokenize. proxy = TokenProxy(tokens) @@ -180,27 +180,17 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> assert p.rootnode is not None return p.rootnode - def parse_stream_raw(self, stream: IO[str], debug: bool = False) -> NL: - """Parse a stream and return the syntax tree.""" - tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar) - return self.parse_tokens(tokens, debug) - - def parse_stream(self, stream: IO[str], debug: bool = False) -> NL: - """Parse a stream and return the syntax tree.""" - return self.parse_stream_raw(stream, debug) - def parse_file( self, filename: Path, encoding: Optional[str] = None, debug: bool = False ) -> NL: """Parse a file and return the syntax tree.""" with open(filename, encoding=encoding) as stream: - return self.parse_stream(stream, debug) + text = stream.read() + return self.parse_string(text, debug) def parse_string(self, text: str, debug: bool = False) -> NL: """Parse a string and return the syntax tree.""" - tokens = tokenize.generate_tokens( - io.StringIO(text).readline, grammar=self.grammar - ) + tokens = tokenize.tokenize(text, grammar=self.grammar) return self.parse_tokens(tokens, debug) def _partially_consume_prefix(self, prefix: str, column: int) -> tuple[str, str]: diff --git a/src/blib2to3/pgen2/pgen.py b/src/blib2to3/pgen2/pgen.py index ea6d8cc19a5..6599c1f226c 100644 --- a/src/blib2to3/pgen2/pgen.py +++ b/src/blib2to3/pgen2/pgen.py @@ -6,7 +6,7 @@ from typing import IO, Any, NoReturn, Optional, Union from blib2to3.pgen2 import grammar, token, tokenize -from blib2to3.pgen2.tokenize import GoodTokenInfo +from blib2to3.pgen2.tokenize import TokenInfo Path = Union[str, "os.PathLike[str]"] @@ -18,7 +18,7 @@ class PgenGrammar(grammar.Grammar): class ParserGenerator: filename: Path stream: IO[str] - generator: Iterator[GoodTokenInfo] + generator: Iterator[TokenInfo] first: dict[str, Optional[dict[str, int]]] def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None: @@ -27,8 +27,7 @@ def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None: stream = open(filename, encoding="utf-8") close_stream = stream.close self.filename = filename - self.stream = stream - self.generator = tokenize.generate_tokens(stream.readline) + self.generator = tokenize.tokenize(stream.read()) self.gettoken() # Initialize lookahead self.dfas, self.startsymbol = self.parse() if close_stream is not None: diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 407c184dd74..5cbfd5148d8 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -27,11 +27,9 @@ function to which the 5 fields described above are passed as 5 arguments, each time a new token is found.""" -import builtins import sys -from collections.abc import Callable, Iterable, Iterator -from re import Pattern -from typing import Final, Optional, Union +from collections.abc import Iterator +from typing import Optional from blib2to3.pgen2.grammar import Grammar from blib2to3.pgen2.token import ( @@ -45,13 +43,11 @@ FSTRING_MIDDLE, FSTRING_START, INDENT, - LBRACE, NAME, NEWLINE, NL, NUMBER, OP, - RBRACE, STRING, tok_name, ) @@ -59,1056 +55,206 @@ __author__ = "Ka-Ping Yee " __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro" -import re -from codecs import BOM_UTF8, lookup +import pytokens +from pytokens import TokenType -from . import token +from . import token as _token -__all__ = [x for x in dir(token) if x[0] != "_"] + [ +__all__ = [x for x in dir(_token) if x[0] != "_"] + [ "tokenize", "generate_tokens", "untokenize", ] -del token - - -def group(*choices: str) -> str: - return "(" + "|".join(choices) + ")" - - -def any(*choices: str) -> str: - return group(*choices) + "*" - - -def maybe(*choices: str) -> str: - return group(*choices) + "?" - - -def _combinations(*l: str) -> set[str]: - return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()} - - -Whitespace = r"[ \f\t]*" -Comment = r"#[^\r\n]*" -Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment) -Name = ( # this is invalid but it's fine because Name comes after Number in all groups - r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+" -) - -Binnumber = r"0[bB]_?[01]+(?:_[01]+)*" -Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?" -Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?" -Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?") -Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber) -Exponent = r"[eE][-+]?\d+(?:_\d+)*" -Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe( - Exponent -) -Expfloat = r"\d+(?:_\d+)*" + Exponent -Floatnumber = group(Pointfloat, Expfloat) -Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]") -Number = group(Imagnumber, Floatnumber, Intnumber) - -# Tail end of ' string. -Single = r"(?:\\.|[^'\\])*'" -# Tail end of " string. -Double = r'(?:\\.|[^"\\])*"' -# Tail end of ''' string. -Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''" -# Tail end of """ string. -Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""' -_litprefix = r"(?:[uUrRbB]|[rR][bB]|[bBuU][rR])?" -_fstringlitprefix = r"(?:rF|FR|Fr|fr|RF|F|rf|f|Rf|fR)" -Triple = group( - _litprefix + "'''", - _litprefix + '"""', - _fstringlitprefix + '"""', - _fstringlitprefix + "'''", -) - -# beginning of a single quoted f-string. must not end with `{{` or `\N{` -SingleLbrace = r"(?:\\N{|{{|\\'|[^\n'{])*(?>=?", - r"<<=?", - r"<>", - r"!=", - r"//=?", - r"->", - r"[+\-*/%&@|^=<>:]=?", - r"~", -) - -Bracket = "[][(){}]" -Special = group(r"\r?\n", r"[:;.,`@]") -Funny = group(Operator, Bracket, Special) - -_string_middle_single = r"(?:[^\n'\\]|\\.)*" -_string_middle_double = r'(?:[^\n"\\]|\\.)*' - -# FSTRING_MIDDLE and LBRACE, must not end with a `{{` or `\N{` -_fstring_middle_single = SingleLbrace -_fstring_middle_double = DoubleLbrace - -# First (or only) line of ' or " string. -ContStr = group( - _litprefix + "'" + _string_middle_single + group("'", r"\\\r?\n"), - _litprefix + '"' + _string_middle_double + group('"', r"\\\r?\n"), - group(_fstringlitprefix + "'") + _fstring_middle_single, - group(_fstringlitprefix + '"') + _fstring_middle_double, - group(_fstringlitprefix + "'") + _string_middle_single + group("'", r"\\\r?\n"), - group(_fstringlitprefix + '"') + _string_middle_double + group('"', r"\\\r?\n"), -) -PseudoExtras = group(r"\\\r?\n", Comment, Triple) -PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) - -pseudoprog: Final = re.compile(PseudoToken, re.UNICODE) - -singleprog = re.compile(Single) -singleprog_plus_lbrace = re.compile(group(SingleLbrace, Single)) -doubleprog = re.compile(Double) -doubleprog_plus_lbrace = re.compile(group(DoubleLbrace, Double)) - -single3prog = re.compile(Single3) -single3prog_plus_lbrace = re.compile(group(Single3Lbrace, Single3)) -double3prog = re.compile(Double3) -double3prog_plus_lbrace = re.compile(group(Double3Lbrace, Double3)) - -_strprefixes = _combinations("r", "R", "b", "B") | {"u", "U", "ur", "uR", "Ur", "UR"} -_fstring_prefixes = _combinations("r", "R", "f", "F") - {"r", "R"} - -endprogs: Final = { - "'": singleprog, - '"': doubleprog, - "'''": single3prog, - '"""': double3prog, - **{f"{prefix}'": singleprog for prefix in _strprefixes}, - **{f'{prefix}"': doubleprog for prefix in _strprefixes}, - **{f"{prefix}'": singleprog_plus_lbrace for prefix in _fstring_prefixes}, - **{f'{prefix}"': doubleprog_plus_lbrace for prefix in _fstring_prefixes}, - **{f"{prefix}'''": single3prog for prefix in _strprefixes}, - **{f'{prefix}"""': double3prog for prefix in _strprefixes}, - **{f"{prefix}'''": single3prog_plus_lbrace for prefix in _fstring_prefixes}, - **{f'{prefix}"""': double3prog_plus_lbrace for prefix in _fstring_prefixes}, -} - -triple_quoted: Final = ( - {"'''", '"""'} - | {f"{prefix}'''" for prefix in _strprefixes | _fstring_prefixes} - | {f'{prefix}"""' for prefix in _strprefixes | _fstring_prefixes} -) -single_quoted: Final = ( - {"'", '"'} - | {f"{prefix}'" for prefix in _strprefixes | _fstring_prefixes} - | {f'{prefix}"' for prefix in _strprefixes | _fstring_prefixes} -) -fstring_prefix: Final = tuple( - {f"{prefix}'" for prefix in _fstring_prefixes} - | {f'{prefix}"' for prefix in _fstring_prefixes} - | {f"{prefix}'''" for prefix in _fstring_prefixes} - | {f'{prefix}"""' for prefix in _fstring_prefixes} -) - -tabsize = 8 - - -class TokenError(Exception): - pass - - -class StopTokenizing(Exception): - pass - +del _token Coord = tuple[int, int] +TokenInfo = tuple[int, str, Coord, Coord, str] + +TOKEN_TYPE_MAP = { + TokenType.indent: INDENT, + TokenType.dedent: DEDENT, + TokenType.newline: NEWLINE, + TokenType.nl: NL, + TokenType.comment: COMMENT, + TokenType.semicolon: OP, + TokenType.lparen: OP, + TokenType.rparen: OP, + TokenType.lbracket: OP, + TokenType.rbracket: OP, + TokenType.lbrace: OP, + TokenType.rbrace: OP, + TokenType.colon: OP, + TokenType.op: OP, + TokenType.identifier: NAME, + TokenType.number: NUMBER, + TokenType.string: STRING, + TokenType.fstring_start: FSTRING_START, + TokenType.fstring_middle: FSTRING_MIDDLE, + TokenType.fstring_end: FSTRING_END, + TokenType.endmarker: ENDMARKER, +} -def printtoken( - type: int, token: str, srow_col: Coord, erow_col: Coord, line: str -) -> None: # for testing - (srow, scol) = srow_col - (erow, ecol) = erow_col - print( - "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token)) - ) - - -TokenEater = Callable[[int, str, Coord, Coord, str], None] - - -def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None: - """ - The tokenize() function accepts two parameters: one representing the - input stream, and one providing an output mechanism for tokenize(). +class TokenError(Exception): ... - The first parameter, readline, must be a callable object which provides - the same interface as the readline() method of built-in file objects. - Each call to the function should return one line of input as a string. - The second parameter, tokeneater, must also be a callable object. It is - called once for each token, with five arguments, corresponding to the - tuples generated by generate_tokens(). +def transform_whitespace( + token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token] +) -> pytokens.Token: + r""" + Black treats `\\\n` at the end of a line as a 'NL' token, while it + is ignored as whitespace in the regular Python parser. + But, only the first one. If there's a `\\\n` following it + (as in, a \ just by itself on a line), that is not made into NL. """ - try: - tokenize_loop(readline, tokeneater) - except StopTokenizing: - pass - - -# backwards compatible interface -def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None: - for token_info in generate_tokens(readline): - tokeneater(*token_info) - - -GoodTokenInfo = tuple[int, str, Coord, Coord, str] -TokenInfo = Union[tuple[int, str], GoodTokenInfo] - - -class Untokenizer: - tokens: list[str] - prev_row: int - prev_col: int - - def __init__(self) -> None: - self.tokens = [] - self.prev_row = 1 - self.prev_col = 0 - - def add_whitespace(self, start: Coord) -> None: - row, col = start - assert row <= self.prev_row - col_offset = col - self.prev_col - if col_offset: - self.tokens.append(" " * col_offset) - - def untokenize(self, iterable: Iterable[TokenInfo]) -> str: - for t in iterable: - if len(t) == 2: - self.compat(t, iterable) - break - tok_type, token, start, end, line = t - self.add_whitespace(start) - self.tokens.append(token) - self.prev_row, self.prev_col = end - if tok_type in (NEWLINE, NL): - self.prev_row += 1 - self.prev_col = 0 - return "".join(self.tokens) - - def compat(self, token: tuple[int, str], iterable: Iterable[TokenInfo]) -> None: - startline = False - indents = [] - toks_append = self.tokens.append - toknum, tokval = token - if toknum in (NAME, NUMBER): - tokval += " " - if toknum in (NEWLINE, NL): - startline = True - for tok in iterable: - toknum, tokval = tok[:2] - - if toknum in (NAME, NUMBER, ASYNC, AWAIT): - tokval += " " - - if toknum == INDENT: - indents.append(tokval) - continue - elif toknum == DEDENT: - indents.pop() - continue - elif toknum in (NEWLINE, NL): - startline = True - elif startline and indents: - toks_append(indents[-1]) - startline = False - toks_append(tokval) - - -cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII) -blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII) - - -def _get_normal_name(orig_enc: str) -> str: - """Imitates get_normal_name in tokenizer.c.""" - # Only care about the first 12 characters. - enc = orig_enc[:12].lower().replace("_", "-") - if enc == "utf-8" or enc.startswith("utf-8-"): - return "utf-8" - if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith( - ("latin-1-", "iso-8859-1-", "iso-latin-1-") + if ( + token.type == TokenType.whitespace + and prev_token is not None + and prev_token.type not in (TokenType.nl, TokenType.newline) ): - return "iso-8859-1" - return orig_enc - - -def detect_encoding(readline: Callable[[], bytes]) -> tuple[str, list[bytes]]: - """ - The detect_encoding() function is used to detect the encoding that should - be used to decode a Python source file. It requires one argument, readline, - in the same way as the tokenize() generator. - - It will call readline a maximum of twice, and return the encoding used - (as a string) and a list of any lines (left as bytes) it has read - in. - - It detects the encoding from the presence of a utf-8 bom or an encoding - cookie as specified in pep-0263. If both a bom and a cookie are present, but - disagree, a SyntaxError will be raised. If the encoding cookie is an invalid - charset, raise a SyntaxError. Note that if a utf-8 bom is found, - 'utf-8-sig' is returned. - - If no encoding is specified, then the default of 'utf-8' will be returned. - """ - bom_found = False - encoding = None - default = "utf-8" - - def read_or_stop() -> bytes: - try: - return readline() - except StopIteration: - return b"" - - def find_cookie(line: bytes) -> Optional[str]: - try: - line_string = line.decode("ascii") - except UnicodeDecodeError: - return None - match = cookie_re.match(line_string) - if not match: - return None - encoding = _get_normal_name(match.group(1)) - try: - codec = lookup(encoding) - except LookupError: - # This behaviour mimics the Python interpreter - raise SyntaxError("unknown encoding: " + encoding) - - if bom_found: - if codec.name != "utf-8": - # This behaviour mimics the Python interpreter - raise SyntaxError("encoding problem: utf-8") - encoding += "-sig" - return encoding - - first = read_or_stop() - if first.startswith(BOM_UTF8): - bom_found = True - first = first[3:] - default = "utf-8-sig" - if not first: - return default, [] - - encoding = find_cookie(first) - if encoding: - return encoding, [first] - if not blank_re.match(first): - return default, [first] - - second = read_or_stop() - if not second: - return default, [first] - - encoding = find_cookie(second) - if encoding: - return encoding, [first, second] - - return default, [first, second] - - -def untokenize(iterable: Iterable[TokenInfo]) -> str: - """Transform tokens back into Python source code. - - Each element returned by the iterable must be a token sequence - with at least two elements, a token number and token value. If - only two tokens are passed, the resulting output is poor. - - Round-trip invariant for full input: - Untokenized source will match input source exactly - - Round-trip invariant for limited input: - # Output text will tokenize the back to the input - t1 = [tok[:2] for tok in generate_tokens(f.readline)] - newcode = untokenize(t1) - readline = iter(newcode.splitlines(1)).next - t2 = [tok[:2] for tokin generate_tokens(readline)] - assert t1 == t2 - """ - ut = Untokenizer() - return ut.untokenize(iterable) - - -def is_fstring_start(token: str) -> bool: - return token.startswith(fstring_prefix) - - -def _split_fstring_start_and_middle(token: str) -> tuple[str, str]: - for prefix in fstring_prefix: - _, prefix, rest = token.partition(prefix) - if prefix != "": - return prefix, rest - - raise ValueError(f"Token {token!r} is not a valid f-string start") - - -STATE_NOT_FSTRING: Final = 0 # not in an f-string -STATE_MIDDLE: Final = 1 # in the string portion of an f-string (outside braces) -STATE_IN_BRACES: Final = 2 # between braces in an f-string -# in the format specifier (between the colon and the closing brace) -STATE_IN_COLON: Final = 3 - - -class FStringState: - """Keeps track of state around f-strings. - - The tokenizer should call the appropriate method on this class when - it transitions to a different part of an f-string. This is needed - because the tokenization depends on knowing where exactly we are in - the f-string. - - For example, consider the following f-string: - - f"a{1:b{2}c}d" - - The following is the tokenization of this string and the states - tracked by this class: - - 1,0-1,2: FSTRING_START 'f"' # [STATE_NOT_FSTRING, STATE_MIDDLE] - 1,2-1,3: FSTRING_MIDDLE 'a' - 1,3-1,4: LBRACE '{' # [STATE_NOT_FSTRING, STATE_IN_BRACES] - 1,4-1,5: NUMBER '1' - 1,5-1,6: OP ':' # [STATE_NOT_FSTRING, STATE_IN_COLON] - 1,6-1,7: FSTRING_MIDDLE 'b' - 1,7-1,8: LBRACE '{' # [STATE_NOT_FSTRING, STATE_IN_COLON, STATE_IN_BRACES] - 1,8-1,9: NUMBER '2' - 1,9-1,10: RBRACE '}' # [STATE_NOT_FSTRING, STATE_IN_COLON] - 1,10-1,11: FSTRING_MIDDLE 'c' - 1,11-1,12: RBRACE '}' # [STATE_NOT_FSTRING, STATE_MIDDLE] - 1,12-1,13: FSTRING_MIDDLE 'd' - 1,13-1,14: FSTRING_END '"' # [STATE_NOT_FSTRING] - 1,14-1,15: NEWLINE '\n' - 2,0-2,0: ENDMARKER '' - - Notice that the nested braces in the format specifier are represented - by adding a STATE_IN_BRACES entry to the state stack. The stack is - also used if there are nested f-strings. - - """ - - def __init__(self) -> None: - self.stack: list[int] = [STATE_NOT_FSTRING] - - def is_in_fstring_expression(self) -> bool: - return self.stack[-1] not in (STATE_MIDDLE, STATE_NOT_FSTRING) - - def current(self) -> int: - return self.stack[-1] - - def enter_fstring(self) -> None: - self.stack.append(STATE_MIDDLE) - - def leave_fstring(self) -> None: - state = self.stack.pop() - assert state == STATE_MIDDLE - - def consume_lbrace(self) -> None: - current_state = self.stack[-1] - if current_state == STATE_MIDDLE: - self.stack[-1] = STATE_IN_BRACES - elif current_state == STATE_IN_COLON: - self.stack.append(STATE_IN_BRACES) - else: - assert False, current_state - - def consume_rbrace(self) -> None: - current_state = self.stack[-1] - assert current_state in (STATE_IN_BRACES, STATE_IN_COLON) - if len(self.stack) > 1 and self.stack[-2] == STATE_IN_COLON: - self.stack.pop() - else: - self.stack[-1] = STATE_MIDDLE - - def consume_colon(self) -> None: - assert self.stack[-1] == STATE_IN_BRACES, self.stack - self.stack[-1] = STATE_IN_COLON - - -def generate_tokens( - readline: Callable[[], str], grammar: Optional[Grammar] = None -) -> Iterator[GoodTokenInfo]: - """ - The generate_tokens() generator requires one argument, readline, which - must be a callable object which provides the same interface as the - readline() method of built-in file objects. Each call to the function - should return one line of input as a string. Alternately, readline - can be a callable function terminating with StopIteration: - readline = open(myfile).next # Example of alternate readline - - The generator produces 5-tuples with these members: the token type; the - token string; a 2-tuple (srow, scol) of ints specifying the row and - column where the token begins in the source; a 2-tuple (erow, ecol) of - ints specifying the row and column where the token ends in the source; - and the line on which the token was found. The line passed is the - logical line; continuation lines are included. - """ - lnum = parenlev = continued = 0 - parenlev_stack: list[int] = [] - fstring_state = FStringState() - formatspec = "" - numchars: Final[str] = "0123456789" - contstr, needcont = "", 0 - contline: Optional[str] = None - indents = [0] - - # If we know we're parsing 3.7+, we can unconditionally parse `async` and - # `await` as keywords. + token_str = source[token.start_index : token.end_index] + if token_str.startswith("\\\n"): + return pytokens.Token( + TokenType.nl, + token.start_index, + token.start_index + 2, + token.start_line, + token.start_col, + token.start_line, + token.start_col + 2, + ) + + return token + + +def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]: async_keywords = False if grammar is None else grammar.async_keywords - # 'stashed' and 'async_*' are used for async/await parsing - stashed: Optional[GoodTokenInfo] = None - async_def = False - async_def_indent = 0 - async_def_nl = False - strstart: tuple[int, int] - endprog_stack: list[Pattern[str]] = [] - formatspec_start: tuple[int, int] + lines = source.split("\n") + lines += [""] # For newline tokens in files that don't end in a newline + line, column = 1, 0 - while 1: # loop over lines in stream - try: - line = readline() - except StopIteration: - line = "" - lnum += 1 + token_iterator = pytokens.tokenize(source) + is_async = False + current_indent = 0 + async_indent = 0 - # skip lines that are just indent characters ending with a slash - # to avoid storing that line's indent information. - if not contstr and line.rstrip("\n").strip(" \t\f") == "\\": - continue + prev_token: Optional[pytokens.Token] = None + try: + for token in token_iterator: + token = transform_whitespace(token, source, prev_token) - pos, max = 0, len(line) + line, column = token.start_line, token.start_col + if token.type == TokenType.whitespace: + continue - if contstr: # continued string - assert contline is not None - if not line: - raise TokenError("EOF in multi-line string", strstart) - endprog = endprog_stack[-1] - endmatch = endprog.match(line) - if endmatch: - end = endmatch.end(0) - token = contstr + line[:end] - spos = strstart - epos = (lnum, end) - tokenline = contline + line - if fstring_state.current() in ( - STATE_NOT_FSTRING, - STATE_IN_BRACES, - ) and not is_fstring_start(token): - yield (STRING, token, spos, epos, tokenline) - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - if is_fstring_start(token): - fstring_start, token = _split_fstring_start_and_middle(token) - fstring_start_epos = (spos[0], spos[1] + len(fstring_start)) - yield ( - FSTRING_START, - fstring_start, - spos, - fstring_start_epos, - tokenline, - ) - fstring_state.enter_fstring() - # increase spos to the end of the fstring start - spos = fstring_start_epos + token_str = source[token.start_index : token.end_index] - if token.endswith("{"): - fstring_middle, lbrace = token[:-1], token[-1] - fstring_middle_epos = lbrace_spos = (lnum, end - 1) - yield ( - FSTRING_MIDDLE, - fstring_middle, - spos, - fstring_middle_epos, - line, - ) - yield (LBRACE, lbrace, lbrace_spos, epos, line) - fstring_state.consume_lbrace() - else: - if token.endswith(('"""', "'''")): - fstring_middle, fstring_end = token[:-3], token[-3:] - fstring_middle_epos = end_spos = (lnum, end - 3) - else: - fstring_middle, fstring_end = token[:-1], token[-1] - fstring_middle_epos = end_spos = (lnum, end - 1) - yield ( - FSTRING_MIDDLE, - fstring_middle, - spos, - fstring_middle_epos, - line, - ) - yield ( - FSTRING_END, - fstring_end, - end_spos, - epos, - line, - ) - fstring_state.leave_fstring() - endprog_stack.pop() - parenlev = parenlev_stack.pop() - pos = end - contstr, needcont = "", 0 - contline = None - elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n": - yield ( - ERRORTOKEN, - contstr + line, - strstart, - (lnum, len(line)), - contline, - ) - contstr = "" - contline = None - continue - else: - contstr = contstr + line - contline = contline + line + if token.type == TokenType.newline and token_str == "": + # Black doesn't yield empty newline tokens at the end of a file + # if there's no newline at the end of a file. + prev_token = token continue - # new statement - elif ( - parenlev == 0 - and not continued - and not fstring_state.is_in_fstring_expression() - ): - if not line: - break - column = 0 - while pos < max: # measure leading whitespace - if line[pos] == " ": - column += 1 - elif line[pos] == "\t": - column = (column // tabsize + 1) * tabsize - elif line[pos] == "\f": - column = 0 - else: + if token.type == TokenType.indent: + current_indent += 1 + if token.type == TokenType.dedent: + current_indent -= 1 + if is_async and current_indent < async_indent: + is_async = False + + source_line = lines[token.start_line - 1] + + if token.type == TokenType.identifier and token_str in ("async", "await"): + # Black uses `async` and `await` token types just for those two keywords + while True: + next_token = next(token_iterator) + next_str = source[next_token.start_index : next_token.end_index] + next_token = transform_whitespace(next_token, next_str, token) + if next_token.type == TokenType.whitespace: + continue break - pos += 1 - if pos == max: - break - if stashed: - yield stashed - stashed = None + next_token_type = TOKEN_TYPE_MAP[next_token.type] + next_line = lines[next_token.start_line - 1] - if line[pos] in "\r\n": # skip blank lines - yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line) - continue + if token_str == "async" and ( + async_keywords + or (next_token_type == NAME and next_str in ("def", "for")) + ): + is_async = True + async_indent = current_indent + 1 + current_token_type = ASYNC + elif token_str == "await" and (async_keywords or is_async): + current_token_type = AWAIT + else: + current_token_type = TOKEN_TYPE_MAP[token.type] - if line[pos] == "#": # skip comments - comment_token = line[pos:].rstrip("\r\n") - nl_pos = pos + len(comment_token) yield ( - COMMENT, - comment_token, - (lnum, pos), - (lnum, nl_pos), - line, + current_token_type, + token_str, + (token.start_line, token.start_col), + (token.end_line, token.end_col), + source_line, ) - yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line) + yield ( + next_token_type, + next_str, + (next_token.start_line, next_token.start_col), + (next_token.end_line, next_token.end_col), + next_line, + ) + prev_token = token continue - if column > indents[-1]: # count indents - indents.append(column) - yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - - while column < indents[-1]: # count dedents - if column not in indents: - raise IndentationError( - "unindent does not match any outer indentation level", - ("", lnum, pos, line), - ) - indents = indents[:-1] - - if async_def and async_def_indent >= indents[-1]: - async_def = False - async_def_nl = False - async_def_indent = 0 - - yield (DEDENT, "", (lnum, pos), (lnum, pos), line) + if token.type == TokenType.op and token_str == "...": + # Black doesn't have an ellipsis token yet, yield 3 DOTs instead + assert token.start_line == token.end_line + assert token.end_col == token.start_col + 3 - if async_def and async_def_nl and async_def_indent >= indents[-1]: - async_def = False - async_def_nl = False - async_def_indent = 0 - - else: # continued statement - if not line: - raise TokenError("EOF in multi-line statement", (lnum, 0)) - continued = 0 - - while pos < max: - if fstring_state.current() == STATE_MIDDLE: - endprog = endprog_stack[-1] - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - start, end = endmatch.span(0) - token = line[start:end] - if token.endswith(('"""', "'''")): - middle_token, end_token = token[:-3], token[-3:] - middle_epos = end_spos = (lnum, end - 3) - else: - middle_token, end_token = token[:-1], token[-1] - middle_epos = end_spos = (lnum, end - 1) - # TODO: unsure if this can be safely removed - if stashed: - yield stashed - stashed = None + token_str = "." + for start_col in range(token.start_col, token.start_col + 3): + end_col = start_col + 1 yield ( - FSTRING_MIDDLE, - middle_token, - (lnum, pos), - middle_epos, - line, + TOKEN_TYPE_MAP[token.type], + token_str, + (token.start_line, start_col), + (token.end_line, end_col), + source_line, ) - if not token.endswith("{"): - yield ( - FSTRING_END, - end_token, - end_spos, - (lnum, end), - line, - ) - fstring_state.leave_fstring() - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line) - fstring_state.consume_lbrace() - pos = end - continue - else: # multiple lines - strstart = (lnum, end) - contstr = line[end:] - contline = line - break - - if fstring_state.current() == STATE_IN_COLON: - match = fstring_middle_after_colon.match(line, pos) - if match is None: - formatspec += line[pos:] - pos = max - continue - - start, end = match.span(1) - token = line[start:end] - formatspec += token - - brace_start, brace_end = match.span(2) - brace_or_nl = line[brace_start:brace_end] - if brace_or_nl == "\n": - pos = brace_end - - yield (FSTRING_MIDDLE, formatspec, formatspec_start, (lnum, end), line) - formatspec = "" - - if brace_or_nl == "{": - yield (LBRACE, "{", (lnum, brace_start), (lnum, brace_end), line) - fstring_state.consume_lbrace() - end = brace_end - elif brace_or_nl == "}": - yield (RBRACE, "}", (lnum, brace_start), (lnum, brace_end), line) - fstring_state.consume_rbrace() - end = brace_end - formatspec_start = (lnum, brace_end) - - pos = end + prev_token = token continue - if fstring_state.current() == STATE_IN_BRACES and parenlev == 0: - match = bang.match(line, pos) - if match: - start, end = match.span(1) - yield (OP, "!", (lnum, start), (lnum, end), line) - pos = end - continue + yield ( + TOKEN_TYPE_MAP[token.type], + token_str, + (token.start_line, token.start_col), + (token.end_line, token.end_col), + source_line, + ) + prev_token = token - match = colon.match(line, pos) - if match: - start, end = match.span(1) - yield (OP, ":", (lnum, start), (lnum, end), line) - fstring_state.consume_colon() - formatspec_start = (lnum, end) - pos = end - continue + except pytokens.UnexpectedEOF: + raise TokenError("Unexpected EOF in multi-line statement", (line, column)) + except pytokens.TokenizeError as exc: + raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column)) - pseudomatch = pseudoprog.match(line, pos) - if pseudomatch: # scan for tokens - start, end = pseudomatch.span(1) - spos, epos, pos = (lnum, start), (lnum, end), end - token, initial = line[start:end], line[start] - - if initial in numchars or ( - initial == "." and token != "." - ): # ordinary number - yield (NUMBER, token, spos, epos, line) - elif initial in "\r\n": - newline = NEWLINE - if parenlev > 0 or fstring_state.is_in_fstring_expression(): - newline = NL - elif async_def: - async_def_nl = True - if stashed: - yield stashed - stashed = None - yield (newline, token, spos, epos, line) - - elif initial == "#": - assert not token.endswith("\n") - if stashed: - yield stashed - stashed = None - yield (COMMENT, token, spos, epos, line) - elif token in triple_quoted: - endprog = endprogs[token] - endprog_stack.append(endprog) - parenlev_stack.append(parenlev) - parenlev = 0 - if is_fstring_start(token): - yield (FSTRING_START, token, spos, epos, line) - fstring_state.enter_fstring() - - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - if stashed: - yield stashed - stashed = None - if not is_fstring_start(token): - pos = endmatch.end(0) - token = line[start:pos] - epos = (lnum, pos) - yield (STRING, token, spos, epos, line) - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - end = endmatch.end(0) - token = line[pos:end] - spos, epos = (lnum, pos), (lnum, end) - if not token.endswith("{"): - fstring_middle, fstring_end = token[:-3], token[-3:] - fstring_middle_epos = fstring_end_spos = (lnum, end - 3) - yield ( - FSTRING_MIDDLE, - fstring_middle, - spos, - fstring_middle_epos, - line, - ) - yield ( - FSTRING_END, - fstring_end, - fstring_end_spos, - epos, - line, - ) - fstring_state.leave_fstring() - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - fstring_middle, lbrace = token[:-1], token[-1] - fstring_middle_epos = lbrace_spos = (lnum, end - 1) - yield ( - FSTRING_MIDDLE, - fstring_middle, - spos, - fstring_middle_epos, - line, - ) - yield (LBRACE, lbrace, lbrace_spos, epos, line) - fstring_state.consume_lbrace() - pos = end - else: - # multiple lines - if is_fstring_start(token): - strstart = (lnum, pos) - contstr = line[pos:] - else: - strstart = (lnum, start) - contstr = line[start:] - contline = line - break - elif ( - initial in single_quoted - or token[:2] in single_quoted - or token[:3] in single_quoted - ): - maybe_endprog = ( - endprogs.get(initial) - or endprogs.get(token[:2]) - or endprogs.get(token[:3]) - ) - assert maybe_endprog is not None, f"endprog not found for {token}" - endprog = maybe_endprog - if token[-1] == "\n": # continued string - endprog_stack.append(endprog) - parenlev_stack.append(parenlev) - parenlev = 0 - strstart = (lnum, start) - contstr, needcont = line[start:], 1 - contline = line - break - else: # ordinary string - if stashed: - yield stashed - stashed = None - if not is_fstring_start(token): - yield (STRING, token, spos, epos, line) - else: - if pseudomatch[20] is not None: - fstring_start = pseudomatch[20] - offset = pseudomatch.end(20) - pseudomatch.start(1) - elif pseudomatch[22] is not None: - fstring_start = pseudomatch[22] - offset = pseudomatch.end(22) - pseudomatch.start(1) - elif pseudomatch[24] is not None: - fstring_start = pseudomatch[24] - offset = pseudomatch.end(24) - pseudomatch.start(1) - else: - fstring_start = pseudomatch[26] - offset = pseudomatch.end(26) - pseudomatch.start(1) - - start_epos = (lnum, start + offset) - yield (FSTRING_START, fstring_start, spos, start_epos, line) - fstring_state.enter_fstring() - endprog = endprogs[fstring_start] - endprog_stack.append(endprog) - parenlev_stack.append(parenlev) - parenlev = 0 - - end_offset = pseudomatch.end(1) - 1 - fstring_middle = line[start + offset : end_offset] - middle_spos = (lnum, start + offset) - middle_epos = (lnum, end_offset) - yield ( - FSTRING_MIDDLE, - fstring_middle, - middle_spos, - middle_epos, - line, - ) - if not token.endswith("{"): - end_spos = (lnum, end_offset) - end_epos = (lnum, end_offset + 1) - yield (FSTRING_END, token[-1], end_spos, end_epos, line) - fstring_state.leave_fstring() - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - end_spos = (lnum, end_offset) - end_epos = (lnum, end_offset + 1) - yield (LBRACE, "{", end_spos, end_epos, line) - fstring_state.consume_lbrace() - - elif initial.isidentifier(): # ordinary name - if token in ("async", "await"): - if async_keywords or async_def: - yield ( - ASYNC if token == "async" else AWAIT, - token, - spos, - epos, - line, - ) - continue - - tok = (NAME, token, spos, epos, line) - if token == "async" and not stashed: - stashed = tok - continue - - if token in ("def", "for"): - if stashed and stashed[0] == NAME and stashed[1] == "async": - if token == "def": - async_def = True - async_def_indent = indents[-1] - - yield ( - ASYNC, - stashed[1], - stashed[2], - stashed[3], - stashed[4], - ) - stashed = None - - if stashed: - yield stashed - stashed = None - - yield tok - elif initial == "\\": # continued stmt - # This yield is new; needed for better idempotency: - if stashed: - yield stashed - stashed = None - yield (NL, token, spos, (lnum, pos), line) - continued = 1 - elif ( - initial == "}" - and parenlev == 0 - and fstring_state.is_in_fstring_expression() - ): - yield (RBRACE, token, spos, epos, line) - fstring_state.consume_rbrace() - formatspec_start = epos - else: - if initial in "([{": - parenlev += 1 - elif initial in ")]}": - parenlev -= 1 - if stashed: - yield stashed - stashed = None - yield (OP, token, spos, epos, line) - else: - yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line) - pos += 1 - - if stashed: - yield stashed - stashed = None - - for _indent in indents[1:]: # pop remaining indent levels - yield (DEDENT, "", (lnum, 0), (lnum, 0), "") - yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "") - assert len(endprog_stack) == 0 - assert len(parenlev_stack) == 0 +def printtoken( + type: int, token: str, srow_col: Coord, erow_col: Coord, line: str +) -> None: # for testing + (srow, scol) = srow_col + (erow, ecol) = erow_col + print( + "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token)) + ) if __name__ == "__main__": # testing if len(sys.argv) > 1: - tokenize(open(sys.argv[1]).readline) + token_iterator = tokenize(open(sys.argv[1]).read()) else: - tokenize(sys.stdin.readline) + token_iterator = tokenize(sys.stdin.read()) + + for tok in token_iterator: + printtoken(*tok) diff --git a/tests/data/miscellaneous/debug_visitor.out b/tests/data/miscellaneous/debug_visitor.out index 24d7ed82472..a243ab72734 100644 --- a/tests/data/miscellaneous/debug_visitor.out +++ b/tests/data/miscellaneous/debug_visitor.out @@ -232,8 +232,6 @@ file_input fstring FSTRING_START "f'" - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -242,8 +240,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -252,8 +248,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring @@ -399,8 +393,6 @@ file_input fstring FSTRING_START "f'" - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -419,8 +411,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring @@ -549,8 +539,6 @@ file_input fstring FSTRING_START "f'" - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -559,8 +547,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -569,8 +555,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring @@ -660,8 +644,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring @@ -744,8 +726,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring diff --git a/tests/test_black.py b/tests/test_black.py index 98d8ff886d7..0292de91eb6 100644 --- a/tests/test_black.py +++ b/tests/test_black.py @@ -458,17 +458,6 @@ def test_tab_comment_indentation(self) -> None: self.assertFormatEqual(contents_spc, fs(contents_spc)) self.assertFormatEqual(contents_spc, fs(contents_tab)) - # mixed tabs and spaces (valid Python 2 code) - contents_tab = "if 1:\n if 2:\n\t\tpass\n\t# comment\n pass\n" - contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n" - self.assertFormatEqual(contents_spc, fs(contents_spc)) - self.assertFormatEqual(contents_spc, fs(contents_tab)) - - contents_tab = "if 1:\n if 2:\n\t\tpass\n\t\t# comment\n pass\n" - contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n" - self.assertFormatEqual(contents_spc, fs(contents_spc)) - self.assertFormatEqual(contents_spc, fs(contents_tab)) - def test_false_positive_symlink_output_issue_3384(self) -> None: # Emulate the behavior when using the CLI (`black ./child --verbose`), which # involves patching some `pathlib.Path` methods. In particular, `is_dir` is @@ -1975,7 +1964,7 @@ def test_for_handled_unexpected_eof_error(self) -> None: with pytest.raises(black.parsing.InvalidInput) as exc_info: black.lib2to3_parse("print(", {}) - exc_info.match("Cannot parse: 2:0: EOF in multi-line statement") + exc_info.match("Cannot parse: 1:6: Unexpected EOF in multi-line statement") def test_line_ranges_with_code_option(self) -> None: code = textwrap.dedent("""\ diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 71773069546..efa7ad5e80d 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -1,6 +1,5 @@ """Tests for the blib2to3 tokenizer.""" -import io import sys import textwrap from dataclasses import dataclass @@ -19,16 +18,10 @@ class Token: def get_tokens(text: str) -> list[Token]: """Return the tokens produced by the tokenizer.""" - readline = io.StringIO(text).readline - tokens: list[Token] = [] - - def tokeneater( - type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str - ) -> None: - tokens.append(Token(token.tok_name[type], string, start, end)) - - tokenize.tokenize(readline, tokeneater) - return tokens + return [ + Token(token.tok_name[tok_type], string, start, end) + for tok_type, string, start, end, _ in tokenize.tokenize(text) + ] def assert_tokenizes(text: str, tokens: list[Token]) -> None: @@ -69,11 +62,9 @@ def test_fstring() -> None: 'f"{x}"', [ Token("FSTRING_START", 'f"', (1, 0), (1, 2)), - Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)), - Token("LBRACE", "{", (1, 2), (1, 3)), + Token("OP", "{", (1, 2), (1, 3)), Token("NAME", "x", (1, 3), (1, 4)), - Token("RBRACE", "}", (1, 4), (1, 5)), - Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)), + Token("OP", "}", (1, 4), (1, 5)), Token("FSTRING_END", '"', (1, 5), (1, 6)), Token("ENDMARKER", "", (2, 0), (2, 0)), ], @@ -82,13 +73,11 @@ def test_fstring() -> None: 'f"{x:y}"\n', [ Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)), - Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)), - Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)), + Token(type="OP", string="{", start=(1, 2), end=(1, 3)), Token(type="NAME", string="x", start=(1, 3), end=(1, 4)), Token(type="OP", string=":", start=(1, 4), end=(1, 5)), Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)), - Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)), - Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)), + Token(type="OP", string="}", start=(1, 6), end=(1, 7)), Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 8)), Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)), Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)), @@ -99,10 +88,9 @@ def test_fstring() -> None: [ Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)), Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)), - Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)), + Token(type="OP", string="{", start=(2, 0), end=(2, 1)), Token(type="NAME", string="a", start=(2, 1), end=(2, 2)), - Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)), - Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)), + Token(type="OP", string="}", start=(2, 2), end=(2, 3)), Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)), Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)), Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)),