Improve parsing of variables in pyk KAST parser (#4403)
* Add lexer support for: `?X`, `_X`, `?_X`, `?_`
* Add parser support for: `X:S`

---------

Co-authored-by: Bruce Collie <[email protected]>
Co-authored-by: rv-jenkins <[email protected]>
3 people authored May 31, 2024
1 parent aac2d03 commit 10e73a2
Showing 4 changed files with 96 additions and 21 deletions.
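In short: the lexer now accepts `?X`, `_X`, `?_X`, and `?_` as single VARIABLE tokens, and after a COLON it lexes an upper-case identifier as a SORT token, which the parser turns into the variable's sort. A minimal sanity check against this commit — `lexer`, `Token`, and `TokenType` all appear in the diff below; whether the token stream ends with an explicit EOF token is not shown here, so only the leading tokens are compared:

    from pyk.kast.lexer import TokenType, lexer

    # '?_X' is one VARIABLE token; ':' switches the lexer into a one-token
    # SORT state, so 'Int' lexes as SORT rather than VARIABLE.
    tokens = list(lexer('?_X:Int'))
    assert [(t.text, t.type) for t in tokens[:3]] == [
        ('?_X', TokenType.VARIABLE),
        (':', TokenType.COLON),
        ('Int', TokenType.SORT),
    ]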
91 changes: 74 additions & 17 deletions pyk/src/pyk/kast/lexer.py
@@ -14,12 +14,14 @@ class TokenType(Enum):
     LPAREN = auto()
     RPAREN = auto()
     COMMA = auto()
+    COLON = auto()
     KSEQ = auto()
     DOTK = auto()
     DOTKLIST = auto()
     TOKEN = auto()
     ID = auto()
     VARIABLE = auto()
+    SORT = auto()
     KLABEL = auto()
     STRING = auto()

@@ -29,7 +31,13 @@ class Token(NamedTuple):
     type: TokenType


+class State(Enum):
+    DEFAULT = auto()
+    SORT = auto()
+
+
 def lexer(text: Iterable[str]) -> Iterator[Token]:
+    state = State.DEFAULT
     it = iter(text)
     la = next(it, '')
     while True:
@@ -41,11 +49,12 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
             return

         try:
-            sublexer = _SUBLEXER[la]
+            sublexer = _SUBLEXER[state][la]
         except KeyError:
             raise _unexpected_char(la) from None

         token, la = sublexer(la, it)
+        state = _STATE.get(token.type, State.DEFAULT)
         yield token


@@ -56,6 +65,7 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
         (TokenType.LPAREN, '('),
         (TokenType.RPAREN, ')'),
         (TokenType.COMMA, ','),
+        (TokenType.COLON, ':'),
         (TokenType.KSEQ, '~>'),
         (TokenType.DOTK, '.K'),
         (TokenType.DOTKLIST, '.KList'),
@@ -67,6 +77,7 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
 _DIGIT: Final = set('0123456789')
 _LOWER: Final = set('abcdefghijklmnopqrstuvwxyz')
 _UPPER: Final = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+_ALNUM: Final = set.union(_DIGIT, _LOWER, _UPPER)


 _UNEXPECTED_EOF: Final = ValueError('Unexpected end of file')
@@ -151,13 +162,40 @@ def _id_or_token(la: str, it: Iterator[str]) -> tuple[Token, str]:


 def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
-    """_ | [A-Z][a-zA-Z0-9'_]*
-
-    '_' is handled in a separate function.
-    """
-    assert la in _UPPER
+    r"""_ | \?_ | \??_?[A-Z][a-zA-Z0-9'_]*"""
+    assert la == '?' or la == '_' or la in _UPPER
+
+    # States:
+    # 0: expect '_' or _UPPER
+    # 1: continue if _UPPER
+    # 2: read while _VARIABLE_CHARS
+    state = {'?': 0, '_': 1}.get(la, 2)
+
     buf = [la]
     la = next(it, '')
+
+    if state == 0:
+        if la == '_':
+            state = 1
+        elif la in _UPPER:
+            state = 2
+        else:
+            raise _unexpected_char(la)
+
+        buf += la
+        la = next(it, '')
+
+    if state == 1:
+        if la in _UPPER:
+            buf += la
+            la = next(it, '')
+            state = 2
+        else:
+            la = next(it, '')
+            text = ''.join(buf)
+            return Token(text, TokenType.VARIABLE), la
+
+    assert state == 2
     while la in _VARIABLE_CHARS:
         buf += la
         la = next(it, '')
@@ -168,7 +206,7 @@ def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
 # For ease of implementation, KDOT and KDOTLIST tokens are read until _SEP
 # This allows LA(1)
 # But e.g. .KA won't be lexed, even though it can be read as [KDOT, VARIABLE]
-_SEP: Final = set(',()`"#.~ \t\r\n').union({''})
+_SEP: Final = set(',:()`"#.~ \t\r\n').union({''})


 def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
@@ -188,15 +226,34 @@ def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
     raise _unexpected_char(la)


-_SUBLEXER: Final[dict[str, SubLexer]] = {
-    '(': _simple(_TOKENS[TokenType.LPAREN]),
-    ')': _simple(_TOKENS[TokenType.RPAREN]),
-    ',': _simple(_TOKENS[TokenType.COMMA]),
-    '_': _simple(Token('_', TokenType.VARIABLE)),
-    '"': _delimited('"', TokenType.STRING),
-    '`': _delimited('`', TokenType.KLABEL),
-    '~': _kseq,
-    '.': _dotk_or_dotklist,
-    **{c: _id_or_token for c in {'#'}.union(_LOWER)},
-    **{c: _variable for c in _UPPER},
+def _sort(la: str, it: Iterator[str]) -> tuple[Token, str]:
+    assert la in _UPPER
+    buf = [la]
+    la = next(it, '')
+    while la in _ALNUM:
+        buf.append(la)
+        la = next(it, '')
+    text = ''.join(buf)
+    return Token(text, TokenType.SORT), la
+
+
+_SUBLEXER: Final[dict[State, dict[str, SubLexer]]] = {
+    State.DEFAULT: {
+        '(': _simple(_TOKENS[TokenType.LPAREN]),
+        ')': _simple(_TOKENS[TokenType.RPAREN]),
+        ',': _simple(_TOKENS[TokenType.COMMA]),
+        ':': _simple(_TOKENS[TokenType.COLON]),
+        '"': _delimited('"', TokenType.STRING),
+        '`': _delimited('`', TokenType.KLABEL),
+        '~': _kseq,
+        '.': _dotk_or_dotklist,
+        **{c: _id_or_token for c in {'#'}.union(_LOWER)},
+        **{c: _variable for c in {'?', '_'}.union(_UPPER)},
+    },
+    State.SORT: {c: _sort for c in _UPPER},
 }
+
+
+_STATE: Final[dict[TokenType, State]] = {
+    TokenType.COLON: State.SORT,
+}
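The `_variable` sub-lexer above implements the three-state machine from its comments. A sketch of accepted and rejected inputs — assuming, per the `raise _unexpected_char(la)` paths and the `ValueError`-based errors in this module, that bad input surfaces as `ValueError`:

    from pyk.kast.lexer import lexer

    # Accepted shapes, per the docstring: _ | ?_ | ??_?[A-Z][a-zA-Z0-9'_]*
    for text in ('_', '?_', 'X', '?X', '_X', '?_X', "F'oo_1"):
        assert list(lexer(text))[0].text == text

    # Rejected: '?' must be followed by '_' or an upper-case letter (state 0)
    try:
        list(lexer('?1'))
    except ValueError:
        pass
    else:
        raise AssertionError('expected a lexing error for ?1')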
13 changes: 10 additions & 3 deletions pyk/src/pyk/kast/parser.py
@@ -51,8 +51,12 @@ def _unexpected_token(token: Token, expected: Iterable[TokenType] = ()) -> ValueError:
         type_str = ', '.join(typ.name for typ in types)
         return ValueError(f'Unexpected token: {token.text!r}. Expected one of: {type_str}')

+    def eof(self) -> bool:
+        return self._la.type is TT.EOF
+
     def k(self) -> KInner:
         if self._la.type is TT.DOTK:
+            self._consume()
             return KSequence()

         items = [self.kitem()]
@@ -68,9 +72,12 @@ def k(self) -> KInner:
     def kitem(self) -> KInner:
         match self._la.type:
             case TT.VARIABLE:
-                res = KVariable(self._la.text)
-                self._consume()
-                return res
+                name = self._consume()
+                sort: str | None = None
+                if self._la.type is TT.COLON:
+                    self._consume()
+                    sort = self._match(TT.SORT)
+                return KVariable(name, sort)

             case TT.TOKEN:
                 self._consume()
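On the parser side, a VARIABLE token may now be followed by COLON and a SORT token, which becomes the sort of the resulting `KVariable`. An end-to-end sketch — the class name `KAstParser` is an assumption (only its `k()` and the new `eof()` methods appear in this diff); `KVariable` is imported as in the tests below:

    from pyk.kast.inner import KVariable
    from pyk.kast.parser import KAstParser  # class name assumed

    parser = KAstParser('X:Int')
    assert parser.k() == KVariable('X', 'Int')  # variable with sort Int
    assert parser.eof()  # new: the whole input was consumed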
11 changes: 10 additions & 1 deletion pyk/src/tests/unit/kast/test_lexer.py
@@ -32,7 +32,16 @@
     ('#token', [Token('#token', TT.TOKEN)]),
     ('fO0', [Token('fO0', TT.ID)]),
     ('_', [Token('_', TT.VARIABLE)]),
-    ('A', [Token('A', TT.VARIABLE)]),
+    ('?_', [Token('?_', TT.VARIABLE)]),
+    ('X', [Token('X', TT.VARIABLE)]),
+    ('?X', [Token('?X', TT.VARIABLE)]),
+    ('_X', [Token('_X', TT.VARIABLE)]),
+    ('?_X', [Token('?_X', TT.VARIABLE)]),
+    ('Foo', [Token('Foo', TT.VARIABLE)]),
+    ('?Foo', [Token('?Foo', TT.VARIABLE)]),
+    ('_Foo', [Token('_Foo', TT.VARIABLE)]),
+    ('?_Foo', [Token('?_Foo', TT.VARIABLE)]),
+    ('X:Int', [Token('X', TT.VARIABLE), Token(':', TT.COLON), Token('Int', TT.SORT)]),
     (
         '`_+_`(#token("1", "Int"), X)',
         [
2 changes: 2 additions & 0 deletions pyk/src/tests/unit/kast/test_parser.py
@@ -16,6 +16,7 @@
 TEST_DATA: Final = (
     ('_', KVariable('_')),
     ('X', KVariable('X')),
+    ('X:Int', KVariable('X', 'Int')),
     ('#token("1", "Int")', KToken('1', 'Int')),
     (r'#token("\"foo\"", "String")', KToken('"foo"', 'String')),
     ('.K', KSequence()),
@@ -36,4 +37,5 @@ def test_parser(text: str, expected: KInner) -> None:
     actual = parser.k()

     # Then
+    assert parser.eof()
     assert actual == expected
