From 10e73a26650d19c3974fed95381328c1798b70ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20T=C3=B3th?=
Date: Fri, 31 May 2024 21:02:25 +0200
Subject: [PATCH] Improve parsing of variables in pyk KAST parser (#4403)

* Add lexer support for: `?X`, `_X`, `?_X`, `?_`
* Add parser support for: `X:S`

---------

Co-authored-by: Bruce Collie
Co-authored-by: rv-jenkins
---
 pyk/src/pyk/kast/lexer.py              | 91 +++++++++++++++++++++-----
 pyk/src/pyk/kast/parser.py             | 13 +++-
 pyk/src/tests/unit/kast/test_lexer.py  | 11 +++-
 pyk/src/tests/unit/kast/test_parser.py |  2 +
 4 files changed, 96 insertions(+), 21 deletions(-)

diff --git a/pyk/src/pyk/kast/lexer.py b/pyk/src/pyk/kast/lexer.py
index 03b81dac641..91690672bf2 100644
--- a/pyk/src/pyk/kast/lexer.py
+++ b/pyk/src/pyk/kast/lexer.py
@@ -14,12 +14,14 @@ class TokenType(Enum):
     LPAREN = auto()
     RPAREN = auto()
     COMMA = auto()
+    COLON = auto()
     KSEQ = auto()
     DOTK = auto()
     DOTKLIST = auto()
     TOKEN = auto()
     ID = auto()
     VARIABLE = auto()
+    SORT = auto()
     KLABEL = auto()
     STRING = auto()
 
@@ -29,7 +31,13 @@ class Token(NamedTuple):
     type: TokenType
 
 
+class State(Enum):
+    DEFAULT = auto()
+    SORT = auto()
+
+
 def lexer(text: Iterable[str]) -> Iterator[Token]:
+    state = State.DEFAULT
     it = iter(text)
     la = next(it, '')
     while True:
@@ -41,11 +49,12 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
             return
 
         try:
-            sublexer = _SUBLEXER[la]
+            sublexer = _SUBLEXER[state][la]
         except KeyError:
             raise _unexpected_char(la) from None
 
         token, la = sublexer(la, it)
+        state = _STATE.get(token.type, State.DEFAULT)
         yield token
 
 
@@ -56,6 +65,7 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
     (TokenType.LPAREN, '('),
     (TokenType.RPAREN, ')'),
     (TokenType.COMMA, ','),
+    (TokenType.COLON, ':'),
     (TokenType.KSEQ, '~>'),
     (TokenType.DOTK, '.K'),
     (TokenType.DOTKLIST, '.KList'),
@@ -67,6 +77,7 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
 _DIGIT: Final = set('0123456789')
 _LOWER: Final = set('abcdefghijklmnopqrstuvwxyz')
 _UPPER: Final = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+_ALNUM: Final = set.union(_DIGIT, _LOWER, _UPPER)
 
 _UNEXPECTED_EOF: Final = ValueError('Unexpected end of file')
 
@@ -151,13 +162,40 @@ def _id_or_token(la: str, it: Iterator[str]) -> tuple[Token, str]:
 
 
 def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
-    """_ | [A-Z][a-zA-Z0-9'_]*
+    r"""_ | \?_ | \??_?[A-Z][a-zA-Z0-9'_]*"""
+    assert la == '?' or la == '_' or la in _UPPER
+
+    # States:
+    # 0: expect '_' or _UPPER
+    # 1: continue if _UPPER
+    # 2: read while _VARIABLE_CHARS
+    state = {'?': 0, '_': 1}.get(la, 2)
 
-    '_' is handled in a separate function.
-    """
-    assert la in _UPPER
     buf = [la]
     la = next(it, '')
+
+    if state == 0:
+        if la == '_':
+            state = 1
+        elif la in _UPPER:
+            state = 2
+        else:
+            raise _unexpected_char(la)
+
+        buf += la
+        la = next(it, '')
+
+    if state == 1:
+        if la in _UPPER:
+            buf += la
+            la = next(it, '')
+            state = 2
+        else:
+            text = ''.join(buf)
+            return Token(text, TokenType.VARIABLE), la
+
+    assert state == 2
+
     while la in _VARIABLE_CHARS:
         buf += la
         la = next(it, '')
@@ -168,7 +206,7 @@ def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
 # For ease of implementation, KDOT and KDOTLIST tokens are read until _SEP
 # This allows LA(1)
 # But e.g. .KA won't be lexed, even though it can be read as [KDOT, VARIABLE]
-_SEP: Final = set(',()`"#.~ \t\r\n').union({''})
+_SEP: Final = set(',:()`"#.~ \t\r\n').union({''})
 
 
 def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
@@ -188,15 +226,34 @@ def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
     raise _unexpected_char(la)
 
 
-_SUBLEXER: Final[dict[str, SubLexer]] = {
-    '(': _simple(_TOKENS[TokenType.LPAREN]),
-    ')': _simple(_TOKENS[TokenType.RPAREN]),
-    ',': _simple(_TOKENS[TokenType.COMMA]),
-    '_': _simple(Token('_', TokenType.VARIABLE)),
-    '"': _delimited('"', TokenType.STRING),
-    '`': _delimited('`', TokenType.KLABEL),
-    '~': _kseq,
-    '.': _dotk_or_dotklist,
-    **{c: _id_or_token for c in {'#'}.union(_LOWER)},
-    **{c: _variable for c in _UPPER},
+def _sort(la: str, it: Iterator[str]) -> tuple[Token, str]:
+    assert la in _UPPER
+    buf = [la]
+    la = next(it, '')
+    while la in _ALNUM:
+        buf.append(la)
+        la = next(it, '')
+    text = ''.join(buf)
+    return Token(text, TokenType.SORT), la
+
+
+_SUBLEXER: Final[dict[State, dict[str, SubLexer]]] = {
+    State.DEFAULT: {
+        '(': _simple(_TOKENS[TokenType.LPAREN]),
+        ')': _simple(_TOKENS[TokenType.RPAREN]),
+        ',': _simple(_TOKENS[TokenType.COMMA]),
+        ':': _simple(_TOKENS[TokenType.COLON]),
+        '"': _delimited('"', TokenType.STRING),
+        '`': _delimited('`', TokenType.KLABEL),
+        '~': _kseq,
+        '.': _dotk_or_dotklist,
+        **{c: _id_or_token for c in {'#'}.union(_LOWER)},
+        **{c: _variable for c in {'?', '_'}.union(_UPPER)},
+    },
+    State.SORT: {c: _sort for c in _UPPER},
+}
+
+
+_STATE: Final[dict[TokenType, State]] = {
+    TokenType.COLON: State.SORT,
 }
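The lexer is now contextual: `_SUBLEXER` is keyed by `State` first, and `_STATE` switches the lexer into `State.SORT` immediately after a COLON token, so an upper-case name following `:` is read as a SORT token rather than a VARIABLE. A quick sketch of the resulting token stream (illustrative only, assuming this patch is applied; it uses only the public `lexer` entry point from the module above):

    from pyk.kast.lexer import TokenType, lexer

    # '?_Foo' lexes as a single VARIABLE token; the ':' then switches the
    # lexer into State.SORT, so 'Int' comes back as a SORT token.
    tokens = list(lexer('?_Foo:Int'))
    assert [(t.text, t.type) for t in tokens[:3]] == [
        ('?_Foo', TokenType.VARIABLE),
        (':', TokenType.COLON),
        ('Int', TokenType.SORT),
    ]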
diff --git a/pyk/src/pyk/kast/parser.py b/pyk/src/pyk/kast/parser.py
index c7c7c81d8e3..96c45d5dc4c 100644
--- a/pyk/src/pyk/kast/parser.py
+++ b/pyk/src/pyk/kast/parser.py
@@ -51,8 +51,12 @@ def _unexpected_token(token: Token, expected: Iterable[TokenType] = ()) -> Value
         type_str = ', '.join(typ.name for typ in types)
         return ValueError(f'Unexpected token: {token.text!r}. Expected one of: {type_str}')
 
+    def eof(self) -> bool:
+        return self._la.type is TT.EOF
+
     def k(self) -> KInner:
         if self._la.type is TT.DOTK:
+            self._consume()
             return KSequence()
 
         items = [self.kitem()]
@@ -68,9 +72,12 @@ def k(self) -> KInner:
     def kitem(self) -> KInner:
         match self._la.type:
             case TT.VARIABLE:
-                res = KVariable(self._la.text)
-                self._consume()
-                return res
+                name = self._consume()
+                sort: str | None = None
+                if self._la.type is TT.COLON:
+                    self._consume()
+                    sort = self._match(TT.SORT)
+                return KVariable(name, sort)
 
             case TT.TOKEN:
                 self._consume()
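On the parser side, a VARIABLE token may now be followed by COLON and a SORT token, which is passed to `KVariable` as the optional sort, and `k()` consumes the DOTK token so the new `eof()` check holds after a complete parse. A minimal usage sketch (assuming `KAstParser` is the parser class exercised by the unit tests below):

    from pyk.kast.inner import KVariable
    from pyk.kast.parser import KAstParser

    # 'X:Int' parses to a sorted variable; KVariable accepts the sort as a str
    parser = KAstParser('X:Int')
    assert parser.k() == KVariable('X', 'Int')
    assert parser.eof()  # all input consumed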
diff --git a/pyk/src/tests/unit/kast/test_lexer.py b/pyk/src/tests/unit/kast/test_lexer.py
index 8c9efc640a6..95a56f47a96 100644
--- a/pyk/src/tests/unit/kast/test_lexer.py
+++ b/pyk/src/tests/unit/kast/test_lexer.py
@@ -32,7 +32,16 @@
     ('#token', [Token('#token', TT.TOKEN)]),
     ('fO0', [Token('fO0', TT.ID)]),
     ('_', [Token('_', TT.VARIABLE)]),
-    ('A', [Token('A', TT.VARIABLE)]),
+    ('?_', [Token('?_', TT.VARIABLE)]),
+    ('X', [Token('X', TT.VARIABLE)]),
+    ('?X', [Token('?X', TT.VARIABLE)]),
+    ('_X', [Token('_X', TT.VARIABLE)]),
+    ('?_X', [Token('?_X', TT.VARIABLE)]),
+    ('Foo', [Token('Foo', TT.VARIABLE)]),
+    ('?Foo', [Token('?Foo', TT.VARIABLE)]),
+    ('_Foo', [Token('_Foo', TT.VARIABLE)]),
+    ('?_Foo', [Token('?_Foo', TT.VARIABLE)]),
+    ('X:Int', [Token('X', TT.VARIABLE), Token(':', TT.COLON), Token('Int', TT.SORT)]),
     (
         '`_+_`(#token("1", "Int"), X)',
         [
diff --git a/pyk/src/tests/unit/kast/test_parser.py b/pyk/src/tests/unit/kast/test_parser.py
index b92fcb210bc..6537337fa6e 100644
--- a/pyk/src/tests/unit/kast/test_parser.py
+++ b/pyk/src/tests/unit/kast/test_parser.py
@@ -16,6 +16,7 @@
 TEST_DATA: Final = (
     ('_', KVariable('_')),
     ('X', KVariable('X')),
+    ('X:Int', KVariable('X', 'Int')),
     ('#token("1", "Int")', KToken('1', 'Int')),
     (r'#token("\"foo\"", "String")', KToken('"foo"', 'String')),
     ('.K', KSequence()),
@@ -36,4 +37,5 @@ def test_parser(text: str, expected: KInner) -> None:
     actual = parser.k()
 
     # Then
+    assert parser.eof()
     assert actual == expected
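The new tests also pin down a corner case of the variable grammar: `?_` on its own is a valid anonymous `?`-variable, not a lexing error. A final sketch under the same assumptions as above:

    from pyk.kast.inner import KSequence, KVariable
    from pyk.kast.parser import KAstParser

    # '?_' alone lexes and parses as a single anonymous variable
    assert KAstParser('?_').k() == KVariable('?_')

    # '.K' now consumes the DOTK token, so eof() holds afterwards
    parser = KAstParser('.K')
    assert parser.k() == KSequence()
    assert parser.eof()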