From 10e73a26650d19c3974fed95381328c1798b70ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20T=C3=B3th?=
Date: Fri, 31 May 2024 21:02:25 +0200
Subject: [PATCH] Improve parsing of variables in pyk KAST parser (#4403)

* Add lexer support for: `?X`, `_X`, `?_X`, `?_`
* Add parser support for: `X:S`

---------

Co-authored-by: Bruce Collie
Co-authored-by: rv-jenkins
---
 pyk/src/pyk/kast/lexer.py              | 91 +++++++++++++++++++++-----
 pyk/src/pyk/kast/parser.py             | 13 +++-
 pyk/src/tests/unit/kast/test_lexer.py  | 11 +++-
 pyk/src/tests/unit/kast/test_parser.py |  2 +
 4 files changed, 96 insertions(+), 21 deletions(-)

diff --git a/pyk/src/pyk/kast/lexer.py b/pyk/src/pyk/kast/lexer.py
index 03b81dac641..91690672bf2 100644
--- a/pyk/src/pyk/kast/lexer.py
+++ b/pyk/src/pyk/kast/lexer.py
@@ -14,12 +14,14 @@ class TokenType(Enum):
     LPAREN = auto()
     RPAREN = auto()
     COMMA = auto()
+    COLON = auto()
     KSEQ = auto()
     DOTK = auto()
     DOTKLIST = auto()
     TOKEN = auto()
     ID = auto()
     VARIABLE = auto()
+    SORT = auto()
     KLABEL = auto()
     STRING = auto()
 
@@ -29,7 +31,13 @@ class Token(NamedTuple):
     type: TokenType
 
 
+class State(Enum):
+    DEFAULT = auto()
+    SORT = auto()
+
+
 def lexer(text: Iterable[str]) -> Iterator[Token]:
+    state = State.DEFAULT
     it = iter(text)
     la = next(it, '')
     while True:
@@ -41,11 +49,12 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
             return
 
         try:
-            sublexer = _SUBLEXER[la]
+            sublexer = _SUBLEXER[state][la]
         except KeyError:
             raise _unexpected_char(la) from None
 
         token, la = sublexer(la, it)
+        state = _STATE.get(token.type, State.DEFAULT)
         yield token
 
 
@@ -56,6 +65,7 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
     (TokenType.LPAREN, '('),
     (TokenType.RPAREN, ')'),
     (TokenType.COMMA, ','),
+    (TokenType.COLON, ':'),
     (TokenType.KSEQ, '~>'),
     (TokenType.DOTK, '.K'),
     (TokenType.DOTKLIST, '.KList'),
@@ -67,6 +77,7 @@ def lexer(text: Iterable[str]) -> Iterator[Token]:
 _DIGIT: Final = set('0123456789')
 _LOWER: Final = set('abcdefghijklmnopqrstuvwxyz')
 _UPPER: Final = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+_ALNUM: Final = set.union(_DIGIT, _LOWER, _UPPER)
 
 _UNEXPECTED_EOF: Final = ValueError('Unexpected end of file')
 
@@ -151,13 +162,40 @@ def _id_or_token(la: str, it: Iterator[str]) -> tuple[Token, str]:
 
 
 def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
-    """_ | [A-Z][a-zA-Z0-9'_]*
+    r"""_ | \?_ | \??_?[A-Z][a-zA-Z0-9'_]*"""
+    assert la == '?' or la == '_' or la in _UPPER
+
+    # States:
+    # 0: expect '_' or _UPPER
+    # 1: continue if _UPPER
+    # 2: read while _VARIABLE_CHARS
+    state = {'?': 0, '_': 1}.get(la, 2)
 
-    '_' is handled in a separate function.
-    """
-    assert la in _UPPER
     buf = [la]
     la = next(it, '')
+
+    if state == 0:
+        if la == '_':
+            state = 1
+        elif la in _UPPER:
+            state = 2
+        else:
+            raise _unexpected_char(la)
+
+        buf += la
+        la = next(it, '')
+
+    if state == 1:
+        if la in _UPPER:
+            buf += la
+            la = next(it, '')
+            state = 2
+        else:
+            text = ''.join(buf)
+            return Token(text, TokenType.VARIABLE), la
+
+    assert state == 2
+
     while la in _VARIABLE_CHARS:
         buf += la
         la = next(it, '')
@@ -168,7 +206,7 @@ def _variable(la: str, it: Iterator[str]) -> tuple[Token, str]:
 # For ease of implementation, KDOT and KDOTLIST tokens are read until _SEP
 # This allows LA(1)
 # But e.g. .KA won't be lexed, even though it can be read as [KDOT, VARIABLE]
-_SEP: Final = set(',()`"#.~ \t\r\n').union({''})
+_SEP: Final = set(',:()`"#.~ \t\r\n').union({''})
 
 
 def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
@@ -188,15 +226,34 @@ def _dotk_or_dotklist(la: str, it: Iterator[str]) -> tuple[Token, str]:
     raise _unexpected_char(la)
 
 
-_SUBLEXER: Final[dict[str, SubLexer]] = {
-    '(': _simple(_TOKENS[TokenType.LPAREN]),
-    ')': _simple(_TOKENS[TokenType.RPAREN]),
-    ',': _simple(_TOKENS[TokenType.COMMA]),
-    '_': _simple(Token('_', TokenType.VARIABLE)),
-    '"': _delimited('"', TokenType.STRING),
-    '`': _delimited('`', TokenType.KLABEL),
-    '~': _kseq,
-    '.': _dotk_or_dotklist,
-    **{c: _id_or_token for c in {'#'}.union(_LOWER)},
-    **{c: _variable for c in _UPPER},
+def _sort(la: str, it: Iterator[str]) -> tuple[Token, str]:
+    assert la in _UPPER
+    buf = [la]
+    la = next(it, '')
+    while la in _ALNUM:
+        buf.append(la)
+        la = next(it, '')
+    text = ''.join(buf)
+    return Token(text, TokenType.SORT), la
+
+
+_SUBLEXER: Final[dict[State, dict[str, SubLexer]]] = {
+    State.DEFAULT: {
+        '(': _simple(_TOKENS[TokenType.LPAREN]),
+        ')': _simple(_TOKENS[TokenType.RPAREN]),
+        ',': _simple(_TOKENS[TokenType.COMMA]),
+        ':': _simple(_TOKENS[TokenType.COLON]),
+        '"': _delimited('"', TokenType.STRING),
+        '`': _delimited('`', TokenType.KLABEL),
+        '~': _kseq,
+        '.': _dotk_or_dotklist,
+        **{c: _id_or_token for c in {'#'}.union(_LOWER)},
+        **{c: _variable for c in {'?', '_'}.union(_UPPER)},
+    },
+    State.SORT: {c: _sort for c in _UPPER},
+}
+
+
+_STATE: Final[dict[TokenType, State]] = {
+    TokenType.COLON: State.SORT,
 }
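The lexer is now contextual: `_SUBLEXER` is keyed by `State` first, and `_STATE` switches the lexer into `State.SORT` immediately after a COLON token, so an upper-case name following `:` is read as a SORT token rather than a VARIABLE. A quick sketch of the resulting token stream (illustrative only, assuming this patch is applied; it uses only the public `lexer` entry point from the module above):

    from pyk.kast.lexer import TokenType, lexer

    # '?_Foo' lexes as a single VARIABLE token; the ':' then switches the
    # lexer into State.SORT, so 'Int' comes back as a SORT token.
    tokens = list(lexer('?_Foo:Int'))
    assert [(t.text, t.type) for t in tokens[:3]] == [
        ('?_Foo', TokenType.VARIABLE),
        (':', TokenType.COLON),
        ('Int', TokenType.SORT),
    ]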
diff --git a/pyk/src/pyk/kast/parser.py b/pyk/src/pyk/kast/parser.py
index c7c7c81d8e3..96c45d5dc4c 100644
--- a/pyk/src/pyk/kast/parser.py
+++ b/pyk/src/pyk/kast/parser.py
@@ -51,8 +51,12 @@ def _unexpected_token(token: Token, expected: Iterable[TokenType] = ()) -> Value
         type_str = ', '.join(typ.name for typ in types)
         return ValueError(f'Unexpected token: {token.text!r}. Expected one of: {type_str}')
 
+    def eof(self) -> bool:
+        return self._la.type is TT.EOF
+
     def k(self) -> KInner:
         if self._la.type is TT.DOTK:
+            self._consume()
             return KSequence()
 
         items = [self.kitem()]
@@ -68,9 +72,12 @@ def k(self) -> KInner:
     def kitem(self) -> KInner:
         match self._la.type:
             case TT.VARIABLE:
-                res = KVariable(self._la.text)
-                self._consume()
-                return res
+                name = self._consume()
+                sort: str | None = None
+                if self._la.type is TT.COLON:
+                    self._consume()
+                    sort = self._match(TT.SORT)
+                return KVariable(name, sort)
 
             case TT.TOKEN:
                 self._consume()
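On the parser side, a VARIABLE token may now be followed by COLON and a SORT token, which is passed to `KVariable` as the optional sort, and `k()` consumes the DOTK token so the new `eof()` check holds after a complete parse. A minimal usage sketch (assuming `KAstParser` is the parser class exercised by the unit tests below):

    from pyk.kast.inner import KVariable
    from pyk.kast.parser import KAstParser

    # 'X:Int' parses to a sorted variable; KVariable accepts the sort as a str
    parser = KAstParser('X:Int')
    assert parser.k() == KVariable('X', 'Int')
    assert parser.eof()  # all input consumed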
diff --git a/pyk/src/tests/unit/kast/test_lexer.py b/pyk/src/tests/unit/kast/test_lexer.py
index 8c9efc640a6..95a56f47a96 100644
--- a/pyk/src/tests/unit/kast/test_lexer.py
+++ b/pyk/src/tests/unit/kast/test_lexer.py
@@ -32,7 +32,16 @@
     ('#token', [Token('#token', TT.TOKEN)]),
     ('fO0', [Token('fO0', TT.ID)]),
     ('_', [Token('_', TT.VARIABLE)]),
-    ('A', [Token('A', TT.VARIABLE)]),
+    ('?_', [Token('?_', TT.VARIABLE)]),
+    ('X', [Token('X', TT.VARIABLE)]),
+    ('?X', [Token('?X', TT.VARIABLE)]),
+    ('_X', [Token('_X', TT.VARIABLE)]),
+    ('?_X', [Token('?_X', TT.VARIABLE)]),
+    ('Foo', [Token('Foo', TT.VARIABLE)]),
+    ('?Foo', [Token('?Foo', TT.VARIABLE)]),
+    ('_Foo', [Token('_Foo', TT.VARIABLE)]),
+    ('?_Foo', [Token('?_Foo', TT.VARIABLE)]),
+    ('X:Int', [Token('X', TT.VARIABLE), Token(':', TT.COLON), Token('Int', TT.SORT)]),
     (
         '`_+_`(#token("1", "Int"), X)',
         [
diff --git a/pyk/src/tests/unit/kast/test_parser.py b/pyk/src/tests/unit/kast/test_parser.py
index b92fcb210bc..6537337fa6e 100644
--- a/pyk/src/tests/unit/kast/test_parser.py
+++ b/pyk/src/tests/unit/kast/test_parser.py
@@ -16,6 +16,7 @@
 TEST_DATA: Final = (
     ('_', KVariable('_')),
     ('X', KVariable('X')),
+    ('X:Int', KVariable('X', 'Int')),
     ('#token("1", "Int")', KToken('1', 'Int')),
     (r'#token("\"foo\"", "String")', KToken('"foo"', 'String')),
     ('.K', KSequence()),
@@ -36,4 +37,5 @@ def test_parser(text: str, expected: KInner) -> None:
     actual = parser.k()
 
     # Then
+    assert parser.eof()
     assert actual == expected
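The new tests also pin down a corner case of the variable grammar: `?_` on its own is a valid anonymous `?`-variable, not a lexing error. A final sketch under the same assumptions as above:

    from pyk.kast.inner import KSequence, KVariable
    from pyk.kast.parser import KAstParser

    # '?_' alone lexes and parses as a single anonymous variable
    assert KAstParser('?_').k() == KVariable('?_')

    # '.K' now consumes the DOTK token, so eof() holds afterwards
    parser = KAstParser('.K')
    assert parser.k() == KSequence()
    assert parser.eof()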