Restrict module names to the grammar [a-zA-Z]\w*(-\w+)* in the outer lexer.

_modname now raises on any name that does not start with an ASCII letter
(the optional leading '#' is no longer accepted) and requires at least one
word character after every '-'. The _MODNAME_CHARS set is removed in favour
of a new _WORD set (alphanumerics plus '_'), and the MODNAME unit-test table
is updated to match.

diff --git a/src/pyk/kast/outer_lexer.py b/src/pyk/kast/outer_lexer.py
index ce33539e4..015d9de04 100644
--- a/src/pyk/kast/outer_lexer.py
+++ b/src/pyk/kast/outer_lexer.py
@@ -113,8 +113,9 @@ class Token(NamedTuple):
 _DIGIT: Final = set('0123456789')
 _LOWER: Final = set('abcdefghijklmnopqrstuvwxyz')
 _UPPER: Final = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
-_ALPHA: Final = set().union(_LOWER).union(_UPPER)
-_ALNUM: Final = set(_ALPHA).union(_DIGIT)
+_ALPHA: Final = _LOWER.union(_UPPER)
+_ALNUM: Final = _ALPHA.union(_DIGIT)
+_WORD: Final = {'_'}.union(_ALNUM)
 
 
 class State(Enum):
@@ -456,29 +457,42 @@ def _hash_upper_id(la: str, it: Iterator[str]) -> tuple[Token, str]:
 
 
 _MODNAME_KEYWORDS: Final = {'private', 'public'}
-_MODNAME_CHARS: Final = {'-', '_'}.union(_ALNUM)
 
 
 def _modname(la: str, it: Iterator) -> tuple[Token, str]:
+    r"""[a-zA-Z]\w*(-\w+)*"""
+
     la = _skip_ws_and_comments(la, it)
 
     consumed = []
 
-    if la == '#':
+    if la not in _ALPHA:
+        raise _unexpected_character(la)
+
+    consumed.append(la)
+    la = next(it, '')
+
+    while la in _WORD:
         consumed.append(la)
         la = next(it, '')
 
-    if not la:
-        raise _unexpected_character(la)
+    while True:
+        if la != '-':
+            break
+
+        consumed.append(la)
+        la = next(it, '')
 
-    allow_dash = False
-    while la in _MODNAME_CHARS:
-        if la == '-' and not allow_dash:
+        if la not in _WORD:
             raise _unexpected_character(la)
-        allow_dash = la != '-'
+
         consumed.append(la)
         la = next(it, '')
 
+        while la in _WORD:
+            consumed.append(la)
+            la = next(it, '')
+
     text = ''.join(consumed)
     if text in _MODNAME_KEYWORDS:
         return _KEYWORDS[text], la
diff --git a/src/tests/unit/kast/test_outer_lexer.py b/src/tests/unit/kast/test_outer_lexer.py
index 9550bec55..923fd8c54 100644
--- a/src/tests/unit/kast/test_outer_lexer.py
+++ b/src/tests/unit/kast/test_outer_lexer.py
@@ -304,10 +304,8 @@ def test_default(text: str, expected_token: Token, expected_remaining: str) -> N
     ('private MODULE', Token('private', TokenType.KW_PRIVATE), ' MODULE'),
     ('public', Token('public', TokenType.KW_PUBLIC), ''),
     ('module', Token('module', TokenType.MODNAME), ''),
-    ('module ', Token('module', TokenType.MODNAME), ' '),
     ('MODULE', Token('MODULE', TokenType.MODNAME), ''),
-    ('#module', Token('#module', TokenType.MODNAME), ''),
-    ('#module#module', Token('#module', TokenType.MODNAME), '#module'),
+    ('module#module', Token('module', TokenType.MODNAME), '#module'),
     ('mo-du-le', Token('mo-du-le', TokenType.MODNAME), ''),
     ('m0-DU_l3', Token('m0-DU_l3', TokenType.MODNAME), ''),
     ('TEST-MODULE', Token('TEST-MODULE', TokenType.MODNAME), ''),