Add textual KAST parser to pyk (#4336)
Known differences to the JavaCC implementation:
* Tokens `.::K` and `.::KList` are not supported.
* Token `#klabel` is not supported.
* When lexing `.K` and `.KList`, the lexer reads until special
character. This allows a simple LA(1) implementation. On the other hand,
e.g. `.KA` will raise an error.
* This double comma won't parse:

https://github.com/runtimeverification/k/blob/7be107f58bb97c8f50989b60247c0180f80d88c8/k-frontend/src/main/javacc/KastParser.jj#L182
* When unquoting strings (`"Int"`) and klabels (`` `<k>` ``), a more
straightforward decoding is used that simply replaces `\X` by `X`. This
is in line with how these tokens are lexed.
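The `.K` / `.KList` behavior can be seen directly on the lexer. This is a minimal sketch, assuming the module is importable as `pyk.kast.lexer` (matching the file path in this commit); token types and the error message follow the implementation in the diff below.

```python
from pyk.kast.lexer import TokenType, lexer

# `.K` and `.KList` lex fine as long as a separator character follows them
assert [t.type for t in lexer('.K ~> .KList')] == [
    TokenType.DOTK,
    TokenType.KSEQ,
    TokenType.DOTKLIST,
    TokenType.EOF,
]

# `.KA` is rejected instead of being read as DOTK followed by VARIABLE
try:
    list(lexer('.KA'))
except ValueError as err:
    print(err)  # Expected 'L', got: 'A'
```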

Potential improvements to the format:
* Support `?X`.
* Support `X:Int`.
* Support klabels with sort arguments.
* Write `foo` or `foo()` instead of `foo(.KList)`.
* Write something like `Int("1")` (like an application) or `"1":Int`
(like a variable) instead of `#token("1", "Int")` (though there might be
a reason `KToken` is represented that way).
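For reference, a small end-to-end sketch of the textual format the new parser accepts today, including the verbose `foo(.KList)` spelling the list above proposes to improve. The import paths are assumed from the file locations in this diff.

```python
from pyk.kast.inner import KApply, KSequence, KToken, KVariable
from pyk.kast.parser import KAstParser

# klabel application with a token and a K sequence as arguments
term = KAstParser('`_+_`(#token("1", "Int"), X ~> Y)').k()
assert term == KApply('_+_', [KToken('1', 'Int'), KSequence([KVariable('X'), KVariable('Y')])])

# An empty argument list currently has to be written out as `.KList`
assert KAstParser('foo(.KList)').k() == KApply('foo', [])
```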
tothtamas28 authored May 15, 2024
1 parent ebbe00e commit 320540f
Showing 4 changed files with 424 additions and 0 deletions.
202 changes: 202 additions & 0 deletions pyk/src/pyk/kast/lexer.py
@@ -0,0 +1,202 @@
from __future__ import annotations

from collections.abc import Callable, Iterator
from enum import Enum, auto
from typing import TYPE_CHECKING, NamedTuple

if TYPE_CHECKING:
from collections.abc import Iterable
from typing import Final


class TokenType(Enum):
EOF = auto()
LPAREN = auto()
RPAREN = auto()
COMMA = auto()
KSEQ = auto()
DOTK = auto()
DOTKLIST = auto()
TOKEN = auto()
ID = auto()
VARIABLE = auto()
KLABEL = auto()
STRING = auto()


class Token(NamedTuple):
text: str
type: TokenType


def lexer(text: Iterable[str]) -> Iterator[Token]:
it = iter(text)
la = next(it, '')
while True:
while la.isspace():
la = next(it, '')

if not la:
yield _TOKENS[TokenType.EOF]
return

try:
sublexer = _SUBLEXER[la]
except KeyError:
raise _unexpected_char(la) from None

token, la = sublexer(la, it)
yield token


_TOKENS: Final = {
typ: Token(txt, typ)
for typ, txt in (
(TokenType.EOF, ''),
(TokenType.LPAREN, '('),
(TokenType.RPAREN, ')'),
(TokenType.COMMA, ','),
(TokenType.KSEQ, '~>'),
(TokenType.DOTK, '.K'),
(TokenType.DOTKLIST, '.KList'),
(TokenType.TOKEN, '#token'),
)
}


_DIGIT: Final = set('0123456789')
_LOWER: Final = set('abcdefghijklmnopqrstuvwxyz')
_UPPER: Final = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')


_UNEXPECTED_EOF: Final = ValueError('Unexpected end of file')


def _unexpected_char(actual: str, expected: str | None = None) -> ValueError:
if expected is None:
return ValueError(f'Unexpected character: {actual!r}')
actual_str = repr(actual) if actual else '<EOF>'
return ValueError(f'Expected {expected!r}, got: {actual_str}')


SubLexer = Callable[[str, Iterator[str]], tuple[Token, str]]


def _simple(token: Token) -> SubLexer:
def sublexer(la: str, it: Iterator[str]) -> tuple[Token, str]:
la = next(it, '')
return token, la

return sublexer


def _delimited(delimiter: str, type: TokenType) -> SubLexer:
assert len(delimiter) == 1

def sublexer(la: str, it: Iterator[str]) -> tuple[Token, str]:
assert la == delimiter
buf = [la]
la = next(it, '')
while True:
if not la:
raise _UNEXPECTED_EOF

elif la == delimiter:
buf.append(la)
la = next(it, '')
return Token(''.join(buf), type), la

elif la == '\\':
buf.append(la)
la = next(it, '')
if not la:
raise _UNEXPECTED_EOF
buf.append(la)
la = next(it, '')

else:
buf.append(la)
la = next(it, '')

return sublexer


def _kseq(la: str, it: Iterator[str]) -> tuple[Token, str]:
assert la == '~'
la = next(it, '')
if la != '>':
raise _unexpected_char(la, '>')
la = next(it, '')
return _TOKENS[TokenType.KSEQ], la


_ID_CHARS: Final = set.union(_LOWER, _UPPER, _DIGIT)


def _id_or_token(la: str, it: Iterator[str]) -> tuple[Token, str]:
"""[#a-z](a-zA-Z0-9)*"""
assert la == '#' or la in _LOWER
buf = [la]
la = next(it, '')
while la in _ID_CHARS:
        buf.append(la)
la = next(it, '')
text = ''.join(buf)
if text == '#token':
return _TOKENS[TokenType.TOKEN], la
return Token(text, TokenType.ID), la


_VARIABLE_CHARS: Final = set.union(_LOWER, _UPPER, _DIGIT, set("'_"))


def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
"""_ | [A-Z][a-zA-Z0-9'_]*
'_' is handled in a separate function.
"""
assert la in _UPPER
buf = [la]
la = next(it, '')
while la in _VARIABLE_CHARS:
        buf.append(la)
la = next(it, '')
text = ''.join(buf)
return Token(text, TokenType.VARIABLE), la


# For ease of implementation, DOTK and DOTKLIST tokens are read until a character in _SEP.
# This allows LA(1).
# But e.g. .KA won't be lexed, even though it could be read as [DOTK, VARIABLE].
_SEP: Final = set(',()`"#.~ \t\r\n').union({''})


def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
assert la == '.'
la = next(it, '')
if la != 'K':
raise _unexpected_char(la, 'K')
la = next(it, '')
if la in _SEP:
return _TOKENS[TokenType.DOTK], la
for c in 'List':
if la != c:
raise _unexpected_char(la, c)
la = next(it, '')
if la in _SEP:
return _TOKENS[TokenType.DOTKLIST], la
raise _unexpected_char(la)


_SUBLEXER: Final[dict[str, SubLexer]] = {
'(': _simple(_TOKENS[TokenType.LPAREN]),
')': _simple(_TOKENS[TokenType.RPAREN]),
',': _simple(_TOKENS[TokenType.COMMA]),
'_': _simple(Token('_', TokenType.VARIABLE)),
'"': _delimited('"', TokenType.STRING),
'`': _delimited('`', TokenType.KLABEL),
'~': _kseq,
'.': _dotk_or_dotklist,
**{c: _id_or_token for c in {'#'}.union(_LOWER)},
**{c: _variable for c in _UPPER},
}
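A quick sketch of the dispatch design above: each entry of `_SUBLEXER` consumes characters from the shared iterator and hands back the token together with the next lookahead character, as described by the `SubLexer` alias. This assumes the definitions in `lexer.py` and reaches into the private `_SUBLEXER` table purely for illustration.

```python
from pyk.kast.lexer import Token, TokenType, _SUBLEXER

it = iter('`foo`~>')
la = next(it)
token, la = _SUBLEXER[la](la, it)  # dispatches to _delimited('`', TokenType.KLABEL)
assert token == Token('`foo`', TokenType.KLABEL)
assert la == '~'  # lookahead handed back to the main loop, ready for _kseq
```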
119 changes: 119 additions & 0 deletions pyk/src/pyk/kast/parser.py
@@ -0,0 +1,119 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING

from .inner import KApply, KLabel, KSequence, KToken, KVariable
from .lexer import TokenType, lexer

if TYPE_CHECKING:
from collections.abc import Iterable, Iterator
from typing import Final

from . import KInner
from .lexer import Token


TT = TokenType


class KAstParser:
_it: Iterator[Token]
_la: Token

def __init__(self, it: Iterable[str]):
self._it = lexer(it)
self._la = next(self._it)

def _consume(self) -> str:
text = self._la.text
self._la = next(self._it)
return text

def _match(self, expected: TokenType) -> str:
if self._la.type is not expected:
raise self._unexpected_token(self._la, [expected])
text = self._la.text
self._la = next(self._it)
return text

@staticmethod
def _unexpected_token(token: Token, expected: Iterable[TokenType] = ()) -> ValueError:
types = sorted(expected, key=lambda typ: typ.name)

if not types:
return ValueError(f'Unexpected token: {token.text!r}')

if len(types) == 1:
typ = types[0]
return ValueError(f'Unexpected token: {token.text!r}. Expected: {typ.name}')

type_str = ', '.join(typ.name for typ in types)
return ValueError(f'Unexpected token: {token.text!r}. Expected one of: {type_str}')

def k(self) -> KInner:
        if self._la.type is TT.DOTK:
            self._consume()
            return KSequence()

items = [self.kitem()]
while self._la.type is TT.KSEQ:
self._consume()
items.append(self.kitem())

if len(items) > 1:
return KSequence(items)

return items[0]

def kitem(self) -> KInner:
match self._la.type:
case TT.VARIABLE:
res = KVariable(self._la.text)
self._consume()
return res

case TT.TOKEN:
self._consume()
self._match(TT.LPAREN)
token = _unquote(self._match(TT.STRING))
self._match(TT.COMMA)
sort = _unquote(self._match(TT.STRING))
self._match(TT.RPAREN)
return KToken(token, sort)

case TT.ID | TT.KLABEL:
label = self.klabel()
self._match(TT.LPAREN)
args = self.klist()
self._match(TT.RPAREN)
return KApply(label, args)

case _:
raise self._unexpected_token(self._la, [TT.VARIABLE, TT.TOKEN, TT.ID, TT.KLABEL])

def klabel(self) -> KLabel:
match self._la.type:
case TT.ID:
return KLabel(self._consume())
case TT.KLABEL:
return KLabel(_unquote(self._consume()))
case _:
raise self._unexpected_token(self._la, [TT.ID, TT.KLABEL])

def klist(self) -> list[KInner]:
if self._la.type is TT.DOTKLIST:
self._consume()
return []

res = [self.k()]
while self._la.type is TT.COMMA:
self._consume()
res.append(self.k())
return res


_UNQUOTE_PATTERN: Final = re.compile(r'\\.')


def _unquote(s: str) -> str:
return _UNQUOTE_PATTERN.sub(lambda m: m.group(0)[1], s[1:-1])
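To illustrate the simplified decoding mentioned in the commit message, a sketch assuming the private `_unquote` helper above: the surrounding delimiter is dropped and every `\X` escape collapses to the bare `X`.

```python
from pyk.kast.parser import _unquote  # private helper, imported only for illustration

assert _unquote('"foo"') == 'foo'
assert _unquote(r'"\"quoted\""') == '"quoted"'
assert _unquote(r'`_\`_`') == '_`_'
```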
64 changes: 64 additions & 0 deletions pyk/src/tests/unit/kast/test_lexer.py
@@ -0,0 +1,64 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from pyk.kast.lexer import Token, TokenType, lexer

if TYPE_CHECKING:
from typing import Final

TT: Final = TokenType
TEST_DATA: Final[tuple[tuple[str, list[Token]], ...]] = (
('', []),
('\n', []),
('(', [Token('(', TT.LPAREN)]),
(')', [Token(')', TT.RPAREN)]),
(',', [Token(',', TT.COMMA)]),
('~>', [Token('~>', TT.KSEQ)]),
('.K', [Token('.K', TT.DOTK)]),
('.KList', [Token('.KList', TT.DOTKLIST)]),
('``', [Token('``', TT.KLABEL)]),
(r'`\x`', [Token(r'`\x`', TT.KLABEL)]),
(r'`\``', [Token(r'`\``', TT.KLABEL)]),
('`foo`', [Token('`foo`', TT.KLABEL)]),
('""', [Token('""', TT.STRING)]),
(r'"\x"', [Token(r'"\x"', TT.STRING)]),
(r'"\""', [Token(r'"\""', TT.STRING)]),
(r'"foo"', [Token('"foo"', TT.STRING)]),
('foo', [Token('foo', TT.ID)]),
('#foo', [Token('#foo', TT.ID)]),
('#token', [Token('#token', TT.TOKEN)]),
('fO0', [Token('fO0', TT.ID)]),
('_', [Token('_', TT.VARIABLE)]),
('A', [Token('A', TT.VARIABLE)]),
(
'`_+_`(#token("1", "Int"), X)',
[
Token('`_+_`', TT.KLABEL),
Token('(', TT.LPAREN),
Token('#token', TT.TOKEN),
Token('(', TT.LPAREN),
Token('"1"', TT.STRING),
Token(',', TT.COMMA),
Token('"Int"', TT.STRING),
Token(')', TT.RPAREN),
Token(',', TT.COMMA),
Token('X', TT.VARIABLE),
Token(')', TT.RPAREN),
],
),
)


@pytest.mark.parametrize('text,output', TEST_DATA, ids=[text for text, _ in TEST_DATA])
def test_lexer(text: str, output: list[Token]) -> None:
# Given
expected = output + [Token('', TT.EOF)]

# When
actual = list(lexer(text))

# Then
assert actual == expected
