Add textual KAST parser to pyk (#4336)
Known differences to the JavaCC implementation:

* Tokens `.::K` and `.::KList` are not supported.
* Token `#klabel` is not supported.
* When lexing `.K` and `.KList`, the lexer reads until a special character. This allows a simple LA(1) implementation. On the other hand, e.g. `.KA` will raise an error.
* This double comma won't parse: https://github.com/runtimeverification/k/blob/7be107f58bb97c8f50989b60247c0180f80d88c8/k-frontend/src/main/javacc/KastParser.jj#L182
* When unquoting strings (`"Int"`) and klabels (`` `<k>` ``), a more straightforward decoding is used that simply replaces `\X` by `X`. This is in line with how these tokens are lexed.

Potential improvements to the format:

* Support `?X`.
* Support `X:Int`.
* Support klabels with sort arguments.
* Write `foo` or `foo()` instead of `foo(.KList)`.
* Write something like `Int("1")` (like an application) or `"1":Int` (like a variable) instead of `#token("1", "Int")` (though there might be a reason `KToken` is represented that way).
1 parent ebbe00e · commit 320540f
Showing 4 changed files with 424 additions and 0 deletions.
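To make the format concrete, here is a small usage sketch of the new lexer. The import path `pyk.kast.lexer` is taken from the test file in this commit; the token stream for the sample term matches what the lexer test below asserts.

```python
from pyk.kast.lexer import lexer

# Tokenize a KAST term: the lexer yields Token(text, type) pairs and ends with an EOF token.
tokens = list(lexer('`_+_`(#token("1", "Int"), X)'))
print([(t.text, t.type.name) for t in tokens])
# [('`_+_`', 'KLABEL'), ('(', 'LPAREN'), ('#token', 'TOKEN'), ('(', 'LPAREN'),
#  ('"1"', 'STRING'), (',', 'COMMA'), ('"Int"', 'STRING'), (')', 'RPAREN'),
#  (',', 'COMMA'), ('X', 'VARIABLE'), (')', 'RPAREN'), ('', 'EOF')]

# As described above, `.K` lexes, but `.KA` is rejected outright rather than
# being split into `.K` followed by a variable.
list(lexer('.K'))  # [Token('.K', DOTK), Token('', EOF)]
try:
    list(lexer('.KA'))
except ValueError as err:
    print(err)  # once it has read past '.K', the lexer expects 'List' or a separator
```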
@@ -0,0 +1,202 @@
from __future__ import annotations

from collections.abc import Callable, Iterator
from enum import Enum, auto
from typing import TYPE_CHECKING, NamedTuple

if TYPE_CHECKING:
    from collections.abc import Iterable
    from typing import Final


class TokenType(Enum):
    EOF = auto()
    LPAREN = auto()
    RPAREN = auto()
    COMMA = auto()
    KSEQ = auto()
    DOTK = auto()
    DOTKLIST = auto()
    TOKEN = auto()
    ID = auto()
    VARIABLE = auto()
    KLABEL = auto()
    STRING = auto()


class Token(NamedTuple):
    text: str
    type: TokenType


def lexer(text: Iterable[str]) -> Iterator[Token]:
    it = iter(text)
    la = next(it, '')
    while True:
        while la.isspace():
            la = next(it, '')

        if not la:
            yield _TOKENS[TokenType.EOF]
            return

        try:
            sublexer = _SUBLEXER[la]
        except KeyError:
            raise _unexpected_char(la) from None

        token, la = sublexer(la, it)
        yield token


_TOKENS: Final = {
    typ: Token(txt, typ)
    for typ, txt in (
        (TokenType.EOF, ''),
        (TokenType.LPAREN, '('),
        (TokenType.RPAREN, ')'),
        (TokenType.COMMA, ','),
        (TokenType.KSEQ, '~>'),
        (TokenType.DOTK, '.K'),
        (TokenType.DOTKLIST, '.KList'),
        (TokenType.TOKEN, '#token'),
    )
}


_DIGIT: Final = set('0123456789')
_LOWER: Final = set('abcdefghijklmnopqrstuvwxyz')
_UPPER: Final = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')


_UNEXPECTED_EOF: Final = ValueError('Unexpected end of file')


def _unexpected_char(actual: str, expected: str | None = None) -> ValueError:
    if expected is None:
        return ValueError(f'Unexpected character: {actual!r}')
    actual_str = repr(actual) if actual else '<EOF>'
    return ValueError(f'Expected {expected!r}, got: {actual_str}')


SubLexer = Callable[[str, Iterator[str]], tuple[Token, str]]


def _simple(token: Token) -> SubLexer:
    def sublexer(la: str, it: Iterator[str]) -> tuple[Token, str]:
        la = next(it, '')
        return token, la

    return sublexer


def _delimited(delimiter: str, type: TokenType) -> SubLexer:
    assert len(delimiter) == 1

    def sublexer(la: str, it: Iterator[str]) -> tuple[Token, str]:
        assert la == delimiter
        buf = [la]
        la = next(it, '')
        while True:
            if not la:
                raise _UNEXPECTED_EOF

            elif la == delimiter:
                buf.append(la)
                la = next(it, '')
                return Token(''.join(buf), type), la

            elif la == '\\':
                buf.append(la)
                la = next(it, '')
                if not la:
                    raise _UNEXPECTED_EOF
                buf.append(la)
                la = next(it, '')

            else:
                buf.append(la)
                la = next(it, '')

    return sublexer


def _kseq(la: str, it: Iterator[str]) -> tuple[Token, str]:
    assert la == '~'
    la = next(it, '')
    if la != '>':
        raise _unexpected_char(la, '>')
    la = next(it, '')
    return _TOKENS[TokenType.KSEQ], la


_ID_CHARS: Final = set.union(_LOWER, _UPPER, _DIGIT)


def _id_or_token(la: str, it: Iterator[str]) -> tuple[Token, str]:
    """[#a-z](a-zA-Z0-9)*"""
    assert la == '#' or la in _LOWER
    buf = [la]
    la = next(it, '')
    while la in _ID_CHARS:
        buf += la
        la = next(it, '')
    text = ''.join(buf)
    if text == '#token':
        return _TOKENS[TokenType.TOKEN], la
    return Token(text, TokenType.ID), la


_VARIABLE_CHARS: Final = set.union(_LOWER, _UPPER, _DIGIT, set("'_"))


def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
    """_ | [A-Z][a-zA-Z0-9'_]*
    '_' is handled in a separate function.
    """
    assert la in _UPPER
    buf = [la]
    la = next(it, '')
    while la in _VARIABLE_CHARS:
        buf += la
        la = next(it, '')
    text = ''.join(buf)
    return Token(text, TokenType.VARIABLE), la


# For ease of implementation, KDOT and KDOTLIST tokens are read until _SEP
# This allows LA(1)
# But e.g. .KA won't be lexed, even though it can be read as [KDOT, VARIABLE]
_SEP: Final = set(',()`"#.~ \t\r\n').union({''})


def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
    assert la == '.'
    la = next(it, '')
    if la != 'K':
        raise _unexpected_char(la, 'K')
    la = next(it, '')
    if la in _SEP:
        return _TOKENS[TokenType.DOTK], la
    for c in 'List':
        if la != c:
            raise _unexpected_char(la, c)
        la = next(it, '')
    if la in _SEP:
        return _TOKENS[TokenType.DOTKLIST], la
    raise _unexpected_char(la)


_SUBLEXER: Final[dict[str, SubLexer]] = {
    '(': _simple(_TOKENS[TokenType.LPAREN]),
    ')': _simple(_TOKENS[TokenType.RPAREN]),
    ',': _simple(_TOKENS[TokenType.COMMA]),
    '_': _simple(Token('_', TokenType.VARIABLE)),
    '"': _delimited('"', TokenType.STRING),
    '`': _delimited('`', TokenType.KLABEL),
    '~': _kseq,
    '.': _dotk_or_dotklist,
    **{c: _id_or_token for c in {'#'}.union(_LOWER)},
    **{c: _variable for c in _UPPER},
}
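One note on the lexer above: the `_delimited` sublexer keeps the surrounding quotes or backquotes and any backslash escapes verbatim in the token text, so unquoting is deferred to the parser. A small sketch of what that looks like (the token texts are the same ones the lexer test in this commit uses):

```python
from pyk.kast.lexer import Token, TokenType, lexer

# Escapes and delimiters are preserved verbatim in the token text.
assert list(lexer(r'"\""')) == [Token(r'"\""', TokenType.STRING), Token('', TokenType.EOF)]
assert list(lexer(r'`\``')) == [Token(r'`\``', TokenType.KLABEL), Token('', TokenType.EOF)]
```

The parser below then strips the delimiters and replaces each `\X` with `X`, matching the simplified decoding described in the commit message.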
@@ -0,0 +1,119 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING

from .inner import KApply, KLabel, KSequence, KToken, KVariable
from .lexer import TokenType, lexer

if TYPE_CHECKING:
    from collections.abc import Iterable, Iterator
    from typing import Final

    from . import KInner
    from .lexer import Token


TT = TokenType


class KAstParser:
    _it: Iterator[Token]
    _la: Token

    def __init__(self, it: Iterable[str]):
        self._it = lexer(it)
        self._la = next(self._it)

    def _consume(self) -> str:
        text = self._la.text
        self._la = next(self._it)
        return text

    def _match(self, expected: TokenType) -> str:
        if self._la.type is not expected:
            raise self._unexpected_token(self._la, [expected])
        text = self._la.text
        self._la = next(self._it)
        return text

    @staticmethod
    def _unexpected_token(token: Token, expected: Iterable[TokenType] = ()) -> ValueError:
        types = sorted(expected, key=lambda typ: typ.name)

        if not types:
            return ValueError(f'Unexpected token: {token.text!r}')

        if len(types) == 1:
            typ = types[0]
            return ValueError(f'Unexpected token: {token.text!r}. Expected: {typ.name}')

        type_str = ', '.join(typ.name for typ in types)
        return ValueError(f'Unexpected token: {token.text!r}. Expected one of: {type_str}')

    def k(self) -> KInner:
        if self._la.type is TT.DOTK:
            return KSequence()

        items = [self.kitem()]
        while self._la.type is TT.KSEQ:
            self._consume()
            items.append(self.kitem())

        if len(items) > 1:
            return KSequence(items)

        return items[0]

    def kitem(self) -> KInner:
        match self._la.type:
            case TT.VARIABLE:
                res = KVariable(self._la.text)
                self._consume()
                return res

            case TT.TOKEN:
                self._consume()
                self._match(TT.LPAREN)
                token = _unquote(self._match(TT.STRING))
                self._match(TT.COMMA)
                sort = _unquote(self._match(TT.STRING))
                self._match(TT.RPAREN)
                return KToken(token, sort)

            case TT.ID | TT.KLABEL:
                label = self.klabel()
                self._match(TT.LPAREN)
                args = self.klist()
                self._match(TT.RPAREN)
                return KApply(label, args)

            case _:
                raise self._unexpected_token(self._la, [TT.VARIABLE, TT.TOKEN, TT.ID, TT.KLABEL])

    def klabel(self) -> KLabel:
        match self._la.type:
            case TT.ID:
                return KLabel(self._consume())
            case TT.KLABEL:
                return KLabel(_unquote(self._consume()))
            case _:
                raise self._unexpected_token(self._la, [TT.ID, TT.KLABEL])

    def klist(self) -> list[KInner]:
        if self._la.type is TT.DOTKLIST:
            self._consume()
            return []

        res = [self.k()]
        while self._la.type is TT.COMMA:
            self._consume()
            res.append(self.k())
        return res


_UNQUOTE_PATTERN: Final = re.compile(r'\\.')


def _unquote(s: str) -> str:
    return _UNQUOTE_PATTERN.sub(lambda m: m.group(0)[1], s[1:-1])
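A usage sketch for the parser above. `KAstParser` and its `k()` entry point are as defined in this file; the module path used in the import is an assumption, since the diff does not show file names. The sample term is the same one used in the lexer test below.

```python
from pyk.kast.inner import KApply, KSequence, KToken, KVariable
from pyk.kast.parser import KAstParser  # assumed module path; not shown in this diff

# Parse a full term into pyk's KInner representation.
term = KAstParser('`_+_`(#token("1", "Int"), X)').k()
assert term == KApply('_+_', [KToken('1', 'Int'), KVariable('X')])

# An empty K sequence and an empty argument list.
assert KAstParser('.K').k() == KSequence()
assert KAstParser('`foo`(.KList)').k() == KApply('foo', [])
```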
@@ -0,0 +1,64 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from pyk.kast.lexer import Token, TokenType, lexer

if TYPE_CHECKING:
    from typing import Final

TT: Final = TokenType
TEST_DATA: Final[tuple[tuple[str, list[Token]], ...]] = (
    ('', []),
    ('\n', []),
    ('(', [Token('(', TT.LPAREN)]),
    (')', [Token(')', TT.RPAREN)]),
    (',', [Token(',', TT.COMMA)]),
    ('~>', [Token('~>', TT.KSEQ)]),
    ('.K', [Token('.K', TT.DOTK)]),
    ('.KList', [Token('.KList', TT.DOTKLIST)]),
    ('``', [Token('``', TT.KLABEL)]),
    (r'`\x`', [Token(r'`\x`', TT.KLABEL)]),
    (r'`\``', [Token(r'`\``', TT.KLABEL)]),
    ('`foo`', [Token('`foo`', TT.KLABEL)]),
    ('""', [Token('""', TT.STRING)]),
    (r'"\x"', [Token(r'"\x"', TT.STRING)]),
    (r'"\""', [Token(r'"\""', TT.STRING)]),
    (r'"foo"', [Token('"foo"', TT.STRING)]),
    ('foo', [Token('foo', TT.ID)]),
    ('#foo', [Token('#foo', TT.ID)]),
    ('#token', [Token('#token', TT.TOKEN)]),
    ('fO0', [Token('fO0', TT.ID)]),
    ('_', [Token('_', TT.VARIABLE)]),
    ('A', [Token('A', TT.VARIABLE)]),
    (
        '`_+_`(#token("1", "Int"), X)',
        [
            Token('`_+_`', TT.KLABEL),
            Token('(', TT.LPAREN),
            Token('#token', TT.TOKEN),
            Token('(', TT.LPAREN),
            Token('"1"', TT.STRING),
            Token(',', TT.COMMA),
            Token('"Int"', TT.STRING),
            Token(')', TT.RPAREN),
            Token(',', TT.COMMA),
            Token('X', TT.VARIABLE),
            Token(')', TT.RPAREN),
        ],
    ),
)


@pytest.mark.parametrize('text,output', TEST_DATA, ids=[text for text, _ in TEST_DATA])
def test_lexer(text: str, output: list[str]) -> None:
    # Given
    expected = output + [Token('', TT.EOF)]

    # When
    actual = list(lexer(text))

    # Then
    assert actual == expected
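The commit only adds lexer tests; a parser-level test in the same parametrized Given/When/Then style could look like the sketch below. The `KAstParser` import path is an assumption, as noted earlier, and the expected terms follow directly from the parser code above.

```python
import pytest

from pyk.kast.inner import KApply, KInner, KSequence, KToken, KVariable
from pyk.kast.parser import KAstParser  # assumed module path; not shown in this diff

PARSER_TEST_DATA = (
    ('.K', KSequence()),
    ('X', KVariable('X')),
    ('#token("1", "Int")', KToken('1', 'Int')),
    ('`_+_`(#token("1", "Int"), X)', KApply('_+_', [KToken('1', 'Int'), KVariable('X')])),
)


@pytest.mark.parametrize('text,expected', PARSER_TEST_DATA, ids=[text for text, _ in PARSER_TEST_DATA])
def test_parser(text: str, expected: KInner) -> None:
    # When
    actual = KAstParser(text).k()

    # Then
    assert actual == expected
```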