Add textual KAST parser to pyk (#4336)
Known differences to the JavaCC implementation:
* Tokens `.::K` and `.::KList` are not supported.
* Token `#klabel` is not supported.
* When lexing `.K` and `.KList`, the lexer reads until special
character. This allows a simple LA(1) implementation. On the other hand,
e.g. `.KA` will raise an error.
* This double comma won't parse:

https://github.com/runtimeverification/k/blob/7be107f58bb97c8f50989b60247c0180f80d88c8/k-frontend/src/main/javacc/KastParser.jj#L182
* When unquoting strings (`"Int"`) and klabels (`` `<k>` ``), a more
straightforward decoding is used that simply replaces `\X` by `X`. This
is in line with how these tokens are lexed.
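The `.K` / `.KList` behavior can be seen directly on the lexer. This is a minimal sketch, assuming the module is importable as `pyk.kast.lexer` (matching the file path in this commit); token types and the error message follow the implementation in the diff below.

```python
from pyk.kast.lexer import TokenType, lexer

# `.K` and `.KList` lex fine as long as a separator character follows them
assert [t.type for t in lexer('.K ~> .KList')] == [
    TokenType.DOTK,
    TokenType.KSEQ,
    TokenType.DOTKLIST,
    TokenType.EOF,
]

# `.KA` is rejected instead of being read as DOTK followed by VARIABLE
try:
    list(lexer('.KA'))
except ValueError as err:
    print(err)  # Expected 'L', got: 'A'
```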

Potential improvements to the format:
* Support `?X`.
* Support `X:Int`.
* Support klabels with sort arguments.
* Write `foo` or `foo()` instead of `foo(.KList)`.
* Write something like `Int("1")` (like an application) or `"1":Int`
(like a variable) instead of `#token("1", "Int")` (though there might be
a reason `KToken` is represented that way).
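For reference, a small end-to-end sketch of the textual format the new parser accepts today, including the verbose `foo(.KList)` spelling the list above proposes to improve. The import paths are assumed from the file locations in this diff.

```python
from pyk.kast.inner import KApply, KSequence, KToken, KVariable
from pyk.kast.parser import KAstParser

# klabel application with a token and a K sequence as arguments
term = KAstParser('`_+_`(#token("1", "Int"), X ~> Y)').k()
assert term == KApply('_+_', [KToken('1', 'Int'), KSequence([KVariable('X'), KVariable('Y')])])

# An empty argument list currently has to be written out as `.KList`
assert KAstParser('foo(.KList)').k() == KApply('foo', [])
```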
tothtamas28 authored May 15, 2024
1 parent ebbe00e commit 320540f
Showing 4 changed files with 424 additions and 0 deletions.
202 changes: 202 additions & 0 deletions pyk/src/pyk/kast/lexer.py
@@ -0,0 +1,202 @@
from __future__ import annotations

from collections.abc import Callable, Iterator
from enum import Enum, auto
from typing import TYPE_CHECKING, NamedTuple

if TYPE_CHECKING:
from collections.abc import Iterable
from typing import Final


class TokenType(Enum):
EOF = auto()
LPAREN = auto()
RPAREN = auto()
COMMA = auto()
KSEQ = auto()
DOTK = auto()
DOTKLIST = auto()
TOKEN = auto()
ID = auto()
VARIABLE = auto()
KLABEL = auto()
STRING = auto()


class Token(NamedTuple):
text: str
type: TokenType


def lexer(text: Iterable[str]) -> Iterator[Token]:
it = iter(text)
la = next(it, '')
while True:
while la.isspace():
la = next(it, '')

if not la:
yield _TOKENS[TokenType.EOF]
return

try:
sublexer = _SUBLEXER[la]
except KeyError:
raise _unexpected_char(la) from None

token, la = sublexer(la, it)
yield token


_TOKENS: Final = {
typ: Token(txt, typ)
for typ, txt in (
(TokenType.EOF, ''),
(TokenType.LPAREN, '('),
(TokenType.RPAREN, ')'),
(TokenType.COMMA, ','),
(TokenType.KSEQ, '~>'),
(TokenType.DOTK, '.K'),
(TokenType.DOTKLIST, '.KList'),
(TokenType.TOKEN, '#token'),
)
}


_DIGIT: Final = set('0123456789')
_LOWER: Final = set('abcdefghijklmnopqrstuvwxyz')
_UPPER: Final = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')


_UNEXPECTED_EOF: Final = ValueError('Unexpected end of file')


def _unexpected_char(actual: str, expected: str | None = None) -> ValueError:
if expected is None:
return ValueError(f'Unexpected character: {actual!r}')
actual_str = repr(actual) if actual else '<EOF>'
return ValueError(f'Expected {expected!r}, got: {actual_str}')


SubLexer = Callable[[str, Iterator[str]], tuple[Token, str]]


def _simple(token: Token) -> SubLexer:
def sublexer(la: str, it: Iterator[str]) -> tuple[Token, str]:
la = next(it, '')
return token, la

return sublexer


def _delimited(delimiter: str, type: TokenType) -> SubLexer:
assert len(delimiter) == 1

def sublexer(la: str, it: Iterator[str]) -> tuple[Token, str]:
assert la == delimiter
buf = [la]
la = next(it, '')
while True:
if not la:
raise _UNEXPECTED_EOF

elif la == delimiter:
buf.append(la)
la = next(it, '')
return Token(''.join(buf), type), la

elif la == '\\':
buf.append(la)
la = next(it, '')
if not la:
raise _UNEXPECTED_EOF
buf.append(la)
la = next(it, '')

else:
buf.append(la)
la = next(it, '')

return sublexer


def _kseq(la: str, it: Iterator[str]) -> tuple[Token, str]:
assert la == '~'
la = next(it, '')
if la != '>':
raise _unexpected_char(la, '>')
la = next(it, '')
return _TOKENS[TokenType.KSEQ], la


_ID_CHARS: Final = set.union(_LOWER, _UPPER, _DIGIT)


def _id_or_token(la: str, it: Iterator[str]) -> tuple[Token, str]:
"""[#a-z](a-zA-Z0-9)*"""
assert la == '#' or la in _LOWER
buf = [la]
la = next(it, '')
while la in _ID_CHARS:
        buf.append(la)
la = next(it, '')
text = ''.join(buf)
if text == '#token':
return _TOKENS[TokenType.TOKEN], la
return Token(text, TokenType.ID), la


_VARIABLE_CHARS: Final = set.union(_LOWER, _UPPER, _DIGIT, set("'_"))


def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
"""_ | [A-Z][a-zA-Z0-9'_]*
'_' is handled in a separate function.
"""
assert la in _UPPER
buf = [la]
la = next(it, '')
while la in _VARIABLE_CHARS:
        buf.append(la)
la = next(it, '')
text = ''.join(buf)
return Token(text, TokenType.VARIABLE), la


# For ease of implementation, DOTK and DOTKLIST tokens are read until a character in _SEP.
# This allows LA(1).
# But e.g. .KA won't be lexed, even though it could be read as [DOTK, VARIABLE].
_SEP: Final = set(',()`"#.~ \t\r\n').union({''})


def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
assert la == '.'
la = next(it, '')
if la != 'K':
raise _unexpected_char(la, 'K')
la = next(it, '')
if la in _SEP:
return _TOKENS[TokenType.DOTK], la
for c in 'List':
if la != c:
raise _unexpected_char(la, c)
la = next(it, '')
if la in _SEP:
return _TOKENS[TokenType.DOTKLIST], la
raise _unexpected_char(la)


_SUBLEXER: Final[dict[str, SubLexer]] = {
'(': _simple(_TOKENS[TokenType.LPAREN]),
')': _simple(_TOKENS[TokenType.RPAREN]),
',': _simple(_TOKENS[TokenType.COMMA]),
'_': _simple(Token('_', TokenType.VARIABLE)),
'"': _delimited('"', TokenType.STRING),
'`': _delimited('`', TokenType.KLABEL),
'~': _kseq,
'.': _dotk_or_dotklist,
**{c: _id_or_token for c in {'#'}.union(_LOWER)},
**{c: _variable for c in _UPPER},
}
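A quick sketch of the dispatch design above: each entry of `_SUBLEXER` consumes characters from the shared iterator and hands back the token together with the next lookahead character, as described by the `SubLexer` alias. This assumes the definitions in `lexer.py` and reaches into the private `_SUBLEXER` table purely for illustration.

```python
from pyk.kast.lexer import Token, TokenType, _SUBLEXER

it = iter('`foo`~>')
la = next(it)
token, la = _SUBLEXER[la](la, it)  # dispatches to _delimited('`', TokenType.KLABEL)
assert token == Token('`foo`', TokenType.KLABEL)
assert la == '~'  # lookahead handed back to the main loop, ready for _kseq
```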
119 changes: 119 additions & 0 deletions pyk/src/pyk/kast/parser.py
@@ -0,0 +1,119 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING

from .inner import KApply, KLabel, KSequence, KToken, KVariable
from .lexer import TokenType, lexer

if TYPE_CHECKING:
from collections.abc import Iterable, Iterator
from typing import Final

from . import KInner
from .lexer import Token


TT = TokenType


class KAstParser:
_it: Iterator[Token]
_la: Token

def __init__(self, it: Iterable[str]):
self._it = lexer(it)
self._la = next(self._it)

def _consume(self) -> str:
text = self._la.text
self._la = next(self._it)
return text

def _match(self, expected: TokenType) -> str:
if self._la.type is not expected:
raise self._unexpected_token(self._la, [expected])
text = self._la.text
self._la = next(self._it)
return text

@staticmethod
def _unexpected_token(token: Token, expected: Iterable[TokenType] = ()) -> ValueError:
types = sorted(expected, key=lambda typ: typ.name)

if not types:
return ValueError(f'Unexpected token: {token.text!r}')

if len(types) == 1:
typ = types[0]
return ValueError(f'Unexpected token: {token.text!r}. Expected: {typ.name}')

type_str = ', '.join(typ.name for typ in types)
return ValueError(f'Unexpected token: {token.text!r}. Expected one of: {type_str}')

def k(self) -> KInner:
        if self._la.type is TT.DOTK:
            self._consume()
            return KSequence()

items = [self.kitem()]
while self._la.type is TT.KSEQ:
self._consume()
items.append(self.kitem())

if len(items) > 1:
return KSequence(items)

return items[0]

def kitem(self) -> KInner:
match self._la.type:
case TT.VARIABLE:
res = KVariable(self._la.text)
self._consume()
return res

case TT.TOKEN:
self._consume()
self._match(TT.LPAREN)
token = _unquote(self._match(TT.STRING))
self._match(TT.COMMA)
sort = _unquote(self._match(TT.STRING))
self._match(TT.RPAREN)
return KToken(token, sort)

case TT.ID | TT.KLABEL:
label = self.klabel()
self._match(TT.LPAREN)
args = self.klist()
self._match(TT.RPAREN)
return KApply(label, args)

case _:
raise self._unexpected_token(self._la, [TT.VARIABLE, TT.TOKEN, TT.ID, TT.KLABEL])

def klabel(self) -> KLabel:
match self._la.type:
case TT.ID:
return KLabel(self._consume())
case TT.KLABEL:
return KLabel(_unquote(self._consume()))
case _:
raise self._unexpected_token(self._la, [TT.ID, TT.KLABEL])

def klist(self) -> list[KInner]:
if self._la.type is TT.DOTKLIST:
self._consume()
return []

res = [self.k()]
while self._la.type is TT.COMMA:
self._consume()
res.append(self.k())
return res


_UNQUOTE_PATTERN: Final = re.compile(r'\\.')


def _unquote(s: str) -> str:
return _UNQUOTE_PATTERN.sub(lambda m: m.group(0)[1], s[1:-1])
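To illustrate the simplified decoding mentioned in the commit message, a sketch assuming the private `_unquote` helper above: the surrounding delimiter is dropped and every `\X` escape collapses to the bare `X`.

```python
from pyk.kast.parser import _unquote  # private helper, imported only for illustration

assert _unquote('"foo"') == 'foo'
assert _unquote(r'"\"quoted\""') == '"quoted"'
assert _unquote(r'`_\`_`') == '_`_'
```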
64 changes: 64 additions & 0 deletions pyk/src/tests/unit/kast/test_lexer.py
@@ -0,0 +1,64 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from pyk.kast.lexer import Token, TokenType, lexer

if TYPE_CHECKING:
from typing import Final

TT: Final = TokenType
TEST_DATA: Final[tuple[tuple[str, list[Token]], ...]] = (
('', []),
('\n', []),
('(', [Token('(', TT.LPAREN)]),
(')', [Token(')', TT.RPAREN)]),
(',', [Token(',', TT.COMMA)]),
('~>', [Token('~>', TT.KSEQ)]),
('.K', [Token('.K', TT.DOTK)]),
('.KList', [Token('.KList', TT.DOTKLIST)]),
('``', [Token('``', TT.KLABEL)]),
(r'`\x`', [Token(r'`\x`', TT.KLABEL)]),
(r'`\``', [Token(r'`\``', TT.KLABEL)]),
('`foo`', [Token('`foo`', TT.KLABEL)]),
('""', [Token('""', TT.STRING)]),
(r'"\x"', [Token(r'"\x"', TT.STRING)]),
(r'"\""', [Token(r'"\""', TT.STRING)]),
(r'"foo"', [Token('"foo"', TT.STRING)]),
('foo', [Token('foo', TT.ID)]),
('#foo', [Token('#foo', TT.ID)]),
('#token', [Token('#token', TT.TOKEN)]),
('fO0', [Token('fO0', TT.ID)]),
('_', [Token('_', TT.VARIABLE)]),
('A', [Token('A', TT.VARIABLE)]),
(
'`_+_`(#token("1", "Int"), X)',
[
Token('`_+_`', TT.KLABEL),
Token('(', TT.LPAREN),
Token('#token', TT.TOKEN),
Token('(', TT.LPAREN),
Token('"1"', TT.STRING),
Token(',', TT.COMMA),
Token('"Int"', TT.STRING),
Token(')', TT.RPAREN),
Token(',', TT.COMMA),
Token('X', TT.VARIABLE),
Token(')', TT.RPAREN),
],
),
)


@pytest.mark.parametrize('text,output', TEST_DATA, ids=[text for text, _ in TEST_DATA])
def test_lexer(text: str, output: list[Token]) -> None:
# Given
expected = output + [Token('', TT.EOF)]

# When
actual = list(lexer(text))

# Then
assert actual == expected
