diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3ced6f..6e7ace2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ jobs: - name: Set up monorepo overrides run: | - for pkg in rumil_parsers rumil_expressions; do + for pkg in rumil_parsers rumil_expressions rumil_tokens; do cat > $pkg/pubspec_overrides.yaml <<'EOF' dependency_overrides: rumil: @@ -37,7 +37,7 @@ jobs: - name: Install dependencies run: | - for pkg in rumil rumil_codec rumil_parsers rumil_expressions rumil_codec_builder; do + for pkg in rumil rumil_codec rumil_parsers rumil_expressions rumil_codec_builder rumil_tokens; do (cd $pkg && dart pub get) done @@ -47,7 +47,7 @@ jobs: - name: Analyze run: | - for pkg in rumil rumil_codec rumil_parsers rumil_expressions rumil_codec_builder; do + for pkg in rumil rumil_codec rumil_parsers rumil_expressions rumil_codec_builder rumil_tokens; do echo "=== $pkg ===" (cd $pkg && dart analyze --fatal-infos) done @@ -63,7 +63,7 @@ jobs: - name: Set up monorepo overrides run: | - for pkg in rumil_parsers rumil_expressions; do + for pkg in rumil_parsers rumil_expressions rumil_tokens; do cat > $pkg/pubspec_overrides.yaml <<'EOF' dependency_overrides: rumil: @@ -82,12 +82,12 @@ jobs: - name: Install dependencies run: | - for pkg in rumil rumil_codec rumil_parsers rumil_expressions rumil_codec_builder; do + for pkg in rumil rumil_codec rumil_parsers rumil_expressions rumil_codec_builder rumil_tokens; do (cd $pkg && dart pub get) done - name: Check formatting - run: dart format --output=none --set-exit-if-changed rumil rumil_codec rumil_parsers rumil_expressions rumil_codec_builder + run: dart format --output=none --set-exit-if-changed rumil rumil_codec rumil_parsers rumil_expressions rumil_codec_builder rumil_tokens test: name: Test @@ -100,7 +100,7 @@ jobs: - name: Set up monorepo overrides run: | - for pkg in rumil_parsers rumil_expressions; do + for pkg in rumil_parsers rumil_expressions rumil_tokens; do cat > $pkg/pubspec_overrides.yaml <<'EOF' dependency_overrides: rumil: @@ -110,7 +110,7 @@ jobs: - name: Install dependencies run: | - for pkg in rumil rumil_codec rumil_parsers rumil_expressions; do + for pkg in rumil rumil_codec rumil_parsers rumil_expressions rumil_tokens; do (cd $pkg && dart pub get) done @@ -130,6 +130,10 @@ jobs: working-directory: rumil_expressions run: dart test + - name: Test rumil_tokens + working-directory: rumil_tokens + run: dart test + doc: name: Documentation runs-on: ubuntu-latest @@ -141,7 +145,7 @@ jobs: - name: Set up monorepo overrides run: | - for pkg in rumil_parsers rumil_expressions; do + for pkg in rumil_parsers rumil_expressions rumil_tokens; do cat > $pkg/pubspec_overrides.yaml <<'EOF' dependency_overrides: rumil: @@ -151,13 +155,13 @@ jobs: - name: Install dependencies run: | - for pkg in rumil rumil_codec rumil_parsers rumil_expressions; do + for pkg in rumil rumil_codec rumil_parsers rumil_expressions rumil_tokens; do (cd $pkg && dart pub get) done - name: Generate docs run: | - for pkg in rumil rumil_codec; do + for pkg in rumil rumil_codec rumil_tokens; do echo "=== $pkg ===" (cd $pkg && dart doc --validate-links) done diff --git a/rumil/CHANGELOG.md b/rumil/CHANGELOG.md index dccd0d1..a53c7ee 100644 --- a/rumil/CHANGELOG.md +++ b/rumil/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.6.0 + +Synchronized release across all rumil-dart packages. Additive for +`rumil`. + +- `position()` primitive: a zero-width parser that yields the current + byte offset. 
Combines with `Zip` for span capture: + `position().zip(p).zip(position())` produces `((start, value), end)` + in one pass. + ## 0.5.0 **Interpreter optimizations and API refinements.** diff --git a/rumil/lib/src/interpreter.dart b/rumil/lib/src/interpreter.dart index da7b38a..b401333 100644 --- a/rumil/lib/src/interpreter.dart +++ b/rumil/lib/src/interpreter.dart @@ -317,6 +317,9 @@ Result interpretI(Parser parser, ParserState state) { loc, ); + case GetPosition(): + return Success(state.offset as A, 0); + case Mapped(): return _runTrampoline(p, state); diff --git a/rumil/lib/src/location.dart b/rumil/lib/src/location.dart index 3b4ea70..6719e03 100644 --- a/rumil/lib/src/location.dart +++ b/rumil/lib/src/location.dart @@ -11,7 +11,7 @@ final class Location { /// 0-indexed byte offset from start of input. final int offset; - /// Creates a location at [offset] within [input]. + /// Creates a location at [offset] within the given input string. const Location(this._input, this.offset); /// The start of input: line 1, column 1, offset 0. diff --git a/rumil/lib/src/parser.dart b/rumil/lib/src/parser.dart index b8a3292..80edd38 100644 --- a/rumil/lib/src/parser.dart +++ b/rumil/lib/src/parser.dart @@ -99,6 +99,19 @@ final class Eof extends Parser { bool get isSimple => true; } +/// Succeeds without consuming input, yielding the current byte offset. +/// +/// Use via [position] in `primitives.dart`. Typically wrapped into span +/// tracking: `position().zip(p).zip(position())` gives the start offset, +/// the parsed value, and the end offset in one pass. +final class GetPosition extends Parser { + /// Creates a position-reading parser. + const GetPosition(); + + @override + bool get isSimple => true; +} + // --------------------------------------------------------------------------- // Composition // --------------------------------------------------------------------------- diff --git a/rumil/lib/src/primitives.dart b/rumil/lib/src/primitives.dart index 2f5a0aa..8041835 100644 --- a/rumil/lib/src/primitives.dart +++ b/rumil/lib/src/primitives.dart @@ -112,6 +112,19 @@ Parser symbol(String s) => lexeme(string(s)); /// Matches end of input. Parser eof() => const Eof(); +/// Succeeds without consuming input, yielding the current byte offset. +/// +/// Combines with [Zip] to capture spans around a parser: +/// +/// ```dart +/// final spanned = position().zip(myParser).zip(position()); +/// // produces (((int startOffset, A value), int endOffset)) +/// ``` +/// +/// The offset is 0-indexed. Use [Location] (via `Location(input, offset)`) +/// when converting to line/column. +Parser position() => GetPosition(); + /// Defers parser construction for recursive grammars. Parser defer(Parser Function() thunk) => Defer(thunk); diff --git a/rumil/pubspec.yaml b/rumil/pubspec.yaml index d423ea3..59d7f3b 100644 --- a/rumil/pubspec.yaml +++ b/rumil/pubspec.yaml @@ -2,7 +2,7 @@ name: rumil description: >- Parser combinator library for Dart with left recursion, stack-safe trampolining, typed errors, lazy error construction, and sealed ADT design. 
-version: 0.5.0 +version: 0.6.0 repository: https://github.com/hakimjonas/rumil-dart topics: - parser diff --git a/rumil/test/smoke_test.dart b/rumil/test/smoke_test.dart index cb37181..f19b589 100644 --- a/rumil/test/smoke_test.dart +++ b/rumil/test/smoke_test.dart @@ -42,6 +42,30 @@ void main() { test('eof fails with remaining input', () { expectFailure(eof().run('x')); }); + + test('position at start yields 0', () { + final r = position().run('abc'); + expect(successValue(r), 0); + expect((r as Success).consumed, 0); + }); + + test('position after consumption yields offset', () { + final r = string('abc').skipThen(position()).run('abcdef'); + expect(successValue(r), 3); + }); + + test('position captures span around a parser', () { + final spanned = spaces() + .skipThen(position()) + .zip(string('hello')) + .zip(position()); + final r = spanned.run(' hello!'); + expect(r, isA>()); + final s = r as Success; + expect(s.value.$1.$1, 2); + expect(s.value.$1.$2, 'hello'); + expect(s.value.$2, 7); + }); }); group('Composition', () { diff --git a/rumil_codec/CHANGELOG.md b/rumil_codec/CHANGELOG.md index 332cbcf..1d16fb4 100644 --- a/rumil_codec/CHANGELOG.md +++ b/rumil_codec/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.0 + +Version aligned with the rumil-dart monorepo 0.6.0 release. No +functional changes in this package. + ## 0.5.0 - **New:** `dateTimeCodec` — microsecond precision, preserves UTC/local flag. diff --git a/rumil_codec/pubspec.yaml b/rumil_codec/pubspec.yaml index bef4dd5..fe838d7 100644 --- a/rumil_codec/pubspec.yaml +++ b/rumil_codec/pubspec.yaml @@ -2,7 +2,7 @@ name: rumil_codec description: >- Binary codec library for Dart with ZigZag/Varint encoding, composable BinaryCodec instances, and product type composition via records. -version: 0.5.0 +version: 0.6.0 repository: https://github.com/hakimjonas/rumil-dart topics: - codec diff --git a/rumil_codec_builder/CHANGELOG.md b/rumil_codec_builder/CHANGELOG.md index 819e34b..572949b 100644 --- a/rumil_codec_builder/CHANGELOG.md +++ b/rumil_codec_builder/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.0 + +- Depends on `rumil_codec: ^0.6.0`, `rumil_parsers: ^0.6.0`. Version + aligned with the rumil-dart monorepo 0.6.0 release. + ## 0.5.0 - Depends on `rumil_codec: ^0.5.0`, `rumil_parsers: ^0.5.0`. Version aligned. diff --git a/rumil_codec_builder/pubspec.yaml b/rumil_codec_builder/pubspec.yaml index 71e7996..71c229e 100644 --- a/rumil_codec_builder/pubspec.yaml +++ b/rumil_codec_builder/pubspec.yaml @@ -2,7 +2,7 @@ name: rumil_codec_builder description: >- Code generator for rumil_codec: derives BinaryCodec implementations for annotated classes and sealed class hierarchies. -version: 0.5.0 +version: 0.6.0 repository: https://github.com/hakimjonas/rumil-dart topics: - codec @@ -16,11 +16,11 @@ dependencies: build: ">=3.0.0 <5.0.0" source_gen: ^4.0.0 analyzer: ">=12.0.0 <13.0.0" - rumil_codec: ^0.5.0 + rumil_codec: ^0.6.0 dev_dependencies: build_runner: ">=2.4.0 <3.0.0" build_test: ^3.5.0 - rumil_parsers: ^0.5.0 + rumil_parsers: ^0.6.0 test: ^1.25.0 lints: ^6.0.0 diff --git a/rumil_expressions/CHANGELOG.md b/rumil_expressions/CHANGELOG.md index 0260af6..41c87fa 100644 --- a/rumil_expressions/CHANGELOG.md +++ b/rumil_expressions/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.0 + +- Depends on `rumil: ^0.6.0`. Version aligned with the rumil-dart + monorepo 0.6.0 release. No functional changes in this package. + ## 0.5.0 - Depends on rumil ^0.5.0. Benefits from interpreter optimizations (5-9% AOT, 30-52% WasmGC). 
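A minimal sketch of the span-capture idiom that the `rumil` 0.6.0 changelog and `primitives.dart` describe above; the input string and the destructuring are illustrative only, everything else uses combinators shown in this diff:

```dart
import 'package:rumil/rumil.dart';

void main() {
  // position().zip(p).zip(position()) yields ((start, value), end).
  final spanned = position().zip(string('hello')).zip(position());
  final result = spanned.run('hello world');
  if (result case Success(:final value)) {
    final ((start, text), end) = value;
    print('"$text" spans [$start, $end)'); // "hello" spans [0, 5)
  }
}
```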
diff --git a/rumil_expressions/pubspec.yaml b/rumil_expressions/pubspec.yaml index 0e78f21..a94d1ed 100644 --- a/rumil_expressions/pubspec.yaml +++ b/rumil_expressions/pubspec.yaml @@ -2,7 +2,7 @@ name: rumil_expressions description: >- Formula evaluator built on Rumil: arithmetic, boolean logic, string ops, variables, custom functions, and precise error locations. -version: 0.5.0 +version: 0.6.0 repository: https://github.com/hakimjonas/rumil-dart topics: - parser @@ -13,7 +13,7 @@ environment: sdk: ^3.7.0 dependencies: - rumil: ^0.5.0 + rumil: ^0.6.0 dev_dependencies: test: ^1.31.0 diff --git a/rumil_parsers/CHANGELOG.md b/rumil_parsers/CHANGELOG.md index c8e49cf..8cc21ef 100644 --- a/rumil_parsers/CHANGELOG.md +++ b/rumil_parsers/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.0 + +- Depends on `rumil: ^0.6.0`. Version aligned with the rumil-dart + monorepo 0.6.0 release. No functional changes in this package. + ## 0.5.0 **CommonMark Markdown parser. Architecture audit. 7376 tests.** diff --git a/rumil_parsers/pubspec.yaml b/rumil_parsers/pubspec.yaml index d0a6df0..2ac57c1 100644 --- a/rumil_parsers/pubspec.yaml +++ b/rumil_parsers/pubspec.yaml @@ -2,7 +2,7 @@ name: rumil_parsers description: >- Format parsers built on Rumil: JSON, CSV, XML, TOML, YAML, Proto3, HCL, and CommonMark Markdown, plus typed AST decoders with ObjectAccessor pattern. -version: 0.5.0 +version: 0.6.0 repository: https://github.com/hakimjonas/rumil-dart topics: - parser @@ -13,7 +13,7 @@ environment: sdk: ^3.7.0 dependencies: - rumil: ^0.5.0 + rumil: ^0.6.0 dev_dependencies: test: ^1.31.0 diff --git a/rumil_tokens/CHANGELOG.md b/rumil_tokens/CHANGELOG.md new file mode 100644 index 0000000..465d90c --- /dev/null +++ b/rumil_tokens/CHANGELOG.md @@ -0,0 +1,52 @@ +## 0.1.0 + +Initial in-tree cut. Source code tokenizer built on Rumil. Not +published to pub.dev; consumed via path dependency from elsewhere +in the monorepo. + +### Tokens + +- Sealed `Token` ADT: `Keyword`, `TypeName`, `StringLit`, `NumberLit`, + `Comment`, `Punctuation`, `Operator`, `Variable`, `Identifier`, + `Annotation`, `Whitespace`, `Plain`. + +### API + +- `tokenize(source, grammar)` returns a lossless `List`; + concatenating `token.text` reconstructs the source exactly. +- `tokenizeSpans(source, grammar)` returns `List>` + carrying byte offsets. Spans are half-open `[start, end)`, + contiguous, and anchored to `[0, source.length)`. +- `Spanned` is an extension type over + `(T, int, int)`. Narrow types upcast to wider ones. + +### Built-in grammars + +- `dart`, `scala`, `yaml`, `json`, `shell`. +- `grammarFor(name)` returns the matching grammar or `null`. + +### `LangGrammar` fields + +- Lexical: `keywords`, `types`, `lineComment`, `blockComment`, + `stringDelimiters`, `multiLineStringDelimiters`, `annotationPrefix`, + `punctuationChars`, `operatorChars`, `multiCharOperators`. +- Flags: `identifiersAllowDollar`, `rawStringPrefix`, + `identifierStringPrefix`, `backtickIdentifiers`, `shellVariables`, + `backtickCommandSubstitution`, `heredocs`. + +### Known limitations + +- YAML block scalars (`|`, `>`) tokenize the indented body as regular + YAML content rather than one string literal. +- Dart string interpolation (`"$x"`, `"${expr}"`) remains one + `StringLit`; no structured tokens for the interpolated parts. +- Shell braced expansions do not balance nested braces: `${x:-${y}}` + closes the outer expansion prematurely. +- Heredoc body is one `StringLit`; per-component coloring is not + available. 
+- Nested generic close (`List>`) highlights the outer + `>>` as the right-shift operator. + +### Dependencies + +- `rumil: ^0.6.0` for the `position()` primitive. diff --git a/rumil_tokens/LICENSE b/rumil_tokens/LICENSE new file mode 100644 index 0000000..cd50bba --- /dev/null +++ b/rumil_tokens/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Hakim Jonas Ghoula + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/rumil_tokens/README.md b/rumil_tokens/README.md new file mode 100644 index 0000000..033f74b --- /dev/null +++ b/rumil_tokens/README.md @@ -0,0 +1,83 @@ +# rumil_tokens + +**Status: in-tree only. Not published to pub.dev.** Consumed via path +dependency from other packages in the monorepo (e.g. `rem`). + +Source code tokenizer built on [Rumil](https://pub.dev/packages/rumil) +parser combinators. Classifies source text into typed token spans: +keywords, strings, comments, numbers, types, annotations, operators, +variables, and punctuation. Token streams are lossless; concatenating +`token.text` across a stream reconstructs the input exactly. + +## Usage + +```dart +import 'package:rumil_tokens/rumil_tokens.dart'; + +final tokens = tokenize('final x = 42; // answer', dart); +for (final token in tokens) { + print('${token.runtimeType}: ${token.text}'); +} +``` + +Use a built-in grammar (`dart`, `scala`, `yaml`, `json`, `shell`) or define +your own: + +```dart +const rust = LangGrammar( + name: 'rust', + keywords: ['fn', 'let', 'mut', 'if', 'else', 'match', 'impl', 'struct'], + types: ['i32', 'u64', 'String', 'Vec', 'Option', 'Result', 'bool'], + lineComment: '//', + blockComment: ('/*', '*/'), + stringDelimiters: ['"'], + annotationPrefix: '#', +); + +final tokens = tokenize(source, rust); +``` + +Look up a grammar by name: + +```dart +final grammar = grammarFor('dart'); // returns null for unknown languages +``` + +## Lossless property + +Concatenating `token.text` for every token reconstructs the original source: + +```dart +assert(tokens.map((t) => t.text).join() == source); +``` + +## Positions + +For tooling that needs byte offsets, use `tokenizeSpans`: + +```dart +final spans = tokenizeSpans(source, dart); +for (final s in spans) { + print('[${s.start}, ${s.end}) ${s.token}'); + assert(source.substring(s.start, s.end) == s.token.text); +} +``` + +`Spanned` is an extension type over `(Token, int, int)`. +The `[start, end)` interval is half-open; spans are contiguous +(`spans[i].end == spans[i+1].start`) and anchored (`spans.first.start == 0`, +`spans.last.end == source.length`). 
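+
+Offsets convert to line/column positions with rumil's `Location` (from
+`package:rumil/rumil.dart`). A sketch, assuming `Location` exposes the
+`line` and `column` getters its docs describe:
+
+```dart
+for (final s in tokenizeSpans(source, dart)) {
+  final loc = Location(source, s.start);
+  print('${loc.line}:${loc.column} ${s.token.runtimeType}: ${s.token.text}');
+}
+```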
+ +## Grammar coverage + +Known limitations (see `CHANGELOG.md`): + +- YAML block scalars (`|`, `>`) tokenize the indented body as regular + YAML content rather than one string literal. +- Dart string interpolation (`"$x"`, `"${expr}"`) remains one + `StringLit`. +- Shell braced expansions do not balance nested braces. +- Heredoc body is one `StringLit`. +- Nested generic close renders the outer `>>` as right-shift. + +Part of the [rumil-dart](https://github.com/hakimjonas/rumil-dart) monorepo. diff --git a/rumil_tokens/analysis_options.yaml b/rumil_tokens/analysis_options.yaml new file mode 100644 index 0000000..b414920 --- /dev/null +++ b/rumil_tokens/analysis_options.yaml @@ -0,0 +1,21 @@ +include: package:lints/recommended.yaml + +analyzer: + language: + strict-casts: true + strict-inference: true + strict-raw-types: true + +linter: + rules: + - prefer_final_locals + - prefer_final_in_for_each + - prefer_const_constructors + - prefer_const_declarations + - always_declare_return_types + - annotate_overrides + - avoid_dynamic_calls + - prefer_expression_function_bodies + - unnecessary_lambdas + - prefer_single_quotes + - public_member_api_docs diff --git a/rumil_tokens/example/example.dart b/rumil_tokens/example/example.dart new file mode 100644 index 0000000..cfbe5b7 --- /dev/null +++ b/rumil_tokens/example/example.dart @@ -0,0 +1,33 @@ +import 'package:rumil_tokens/rumil_tokens.dart'; + +void main() { + const source = ''' +void main() { + final x = 42; + // greeting + print("hello \$x"); +} +'''; + + final tokens = tokenize(source, dart); + + for (final token in tokens) { + final kind = switch (token) { + Keyword() => 'keyword', + TypeName() => 'type', + StringLit() => 'string', + NumberLit() => 'number', + Comment() => 'comment', + Annotation() => 'annotation', + Punctuation() => 'punct', + Operator() => 'op', + Variable() => 'var', + Identifier() => 'ident', + Whitespace() => 'ws', + Plain() => 'plain', + }; + if (token is! Whitespace) { + print('$kind: ${token.text}'); + } + } +} diff --git a/rumil_tokens/lib/rumil_tokens.dart b/rumil_tokens/lib/rumil_tokens.dart new file mode 100644 index 0000000..c930362 --- /dev/null +++ b/rumil_tokens/lib/rumil_tokens.dart @@ -0,0 +1,17 @@ +/// Lossless source code tokenizer built on Rumil. +library; + +// Token types +export 'src/token.dart'; + +// Spanned tokens (byte offsets into source). +export 'src/spanned.dart'; + +// Grammar definition +export 'src/grammar.dart'; + +// Tokenizer +export 'src/tokenizer.dart' show tokenize, tokenizeSpans; + +// Built-in language grammars +export 'src/languages.dart'; diff --git a/rumil_tokens/lib/src/grammar.dart b/rumil_tokens/lib/src/grammar.dart new file mode 100644 index 0000000..d087431 --- /dev/null +++ b/rumil_tokens/lib/src/grammar.dart @@ -0,0 +1,140 @@ +/// Language grammar definitions for the tokenizer. +library; + +/// Describes a language's lexical structure. +/// +/// Grammars are plain data. The tokenizer reads a grammar and builds +/// the combinator pipeline. +class LangGrammar { + /// Language identifier (e.g. `'dart'`, `'scala'`). + final String name; + + /// Reserved keywords (e.g. `['if', 'else', 'class']`). + final List keywords; + + /// Built-in or well-known type names (e.g. `['int', 'String']`). + final List types; + + /// Line comment prefix (e.g. `'//'`), or `null` if unsupported. + final String? lineComment; + + /// Block comment delimiters `(open, close)`, or `null`. + final (String, String)? blockComment; + + /// String delimiters to recognize (e.g. `['"', "'"]`). 
+ final List stringDelimiters; + + /// Multi-line string delimiters (e.g. `['"""', "'''"]`). + final List multiLineStringDelimiters; + + /// Annotation prefix (e.g. `'@'` for Dart/Java), or `null`. + final String? annotationPrefix; + + /// Structural punctuation: delimiters, separators, grouping characters. + /// + /// Typical contents: `()`, `{}`, `[]`, `,`, `;`, `:`, `.`. Distinct from + /// [operatorChars], which is reserved for value-computing operators + /// (`+`, `*`, `==`). + final String punctuationChars; + + /// Multi-character operator vocabulary, matched before single-char + /// operators or punctuation. + /// + /// Order within the list is irrelevant; the tokenizer matches in + /// longest-first order. Each entry is matched as a literal string. + /// + /// Dart example: `['=>', '<=', '>=', '==', '!=', '&&', '||', '??', + /// '?.', '<<', '>>', '~/']`. Scala adds `'<-'`, `'->'`, `'::'`. + /// + /// Matched operators emit one [Operator] token including the full + /// multi-char text. + final List multiCharOperators; + + /// Single-character operator alphabet. + /// + /// Typical contents: `+`, `-`, `*`, `/`, `%`, `=`, `&`, `|`, `^`, `~`, + /// `!`. Characters here emit one [Operator] token each; runs do not + /// coalesce. Multi-character operators must be listed explicitly in + /// [multiCharOperators]. + /// + /// When empty (e.g. JSON), no operator classification happens. + /// Overlaps with [punctuationChars] are resolved in favor of operators. + final String operatorChars; + + /// Whether identifiers may contain `$`. + /// + /// Dart allows `$` in identifiers; most other languages do not. When + /// `false`, `$` is free to carry language-specific meaning such as a + /// shell variable prefix. + final bool identifiersAllowDollar; + + /// Raw-string prefix (Dart's `'r'` for `r'no\escape'`), or `null`. + /// + /// When set, the tokenizer recognizes the single-character prefix + /// immediately followed by any [stringDelimiters] or + /// [multiLineStringDelimiters] as one [StringLit] whose text includes + /// the prefix. Escape sequences inside raw strings are not processed; + /// the body is captured verbatim up to the matching delimiter. + final String? rawStringPrefix; + + /// Whether an identifier immediately followed by a string delimiter + /// forms a string with that identifier as a prefix. + /// + /// Scala's string interpolators (`s"hi $x"`, `f"$x%.2f"`, any + /// user-defined `foo"..."`) follow this pattern. When `true`, the + /// tokenizer treats `"..."` as one [StringLit] whose text + /// includes the identifier prefix. + final bool identifierStringPrefix; + + /// Whether backtick-delimited identifiers are allowed (`` `type` ``). + /// + /// Scala uses this to escape keywords. When `true`, the tokenizer + /// recognizes `` `...` `` as one [Identifier] even when the bracketed + /// content would otherwise be a keyword. + final bool backtickIdentifiers; + + /// Whether `$` introduces a variable reference (shell-style). + /// + /// When `true`, the tokenizer recognizes: + /// - `$NAME`: one [Variable] token including the `$`. + /// - `${NAME}` and `${...}` expansions: one [Variable] token, including + /// braces and body up to the matching close brace. + /// - Bare `$` not followed by a name or `{` falls through. + final bool shellVariables; + + /// Whether backtick-delimited command substitution is recognized + /// (`` `cmd` ``). + /// + /// When `true`, the tokenizer emits [Punctuation] for each backtick; + /// the body between them is tokenized as ordinary source. 
+ final bool backtickCommandSubstitution; + + /// Whether `<<` followed by a marker introduces a heredoc. + /// + /// Shell heredocs (`<;:,.?', + operatorChars: '+-*/%=!&|^~', + multiCharOperators: [ + '<<=', + '>>=', + '>>>', + '~/=', + '??=', + '<<', + '>>', + '<=', + '>=', + '==', + '!=', + '&&', + '||', + '??', + '?.', + '=>', + '~/', + '+=', + '-=', + '*=', + '/=', + '%=', + '&=', + '|=', + '^=', + ], + identifiersAllowDollar: true, + rawStringPrefix: 'r', +); + +/// Scala language grammar. +const LangGrammar scala = LangGrammar( + name: 'scala', + keywords: [ + 'abstract', + 'as', + 'case', + 'catch', + 'class', + 'def', + 'derives', + 'do', + 'else', + 'end', + 'enum', + 'export', + 'extends', + 'extension', + 'false', + 'final', + 'finally', + 'for', + 'forSome', + 'given', + 'if', + 'implicit', + 'import', + 'inline', + 'infix', + 'lazy', + 'match', + 'new', + 'null', + 'object', + 'opaque', + 'open', + 'override', + 'package', + 'private', + 'protected', + 'return', + 'sealed', + 'super', + 'then', + 'this', + 'throw', + 'trait', + 'transparent', + 'true', + 'try', + 'type', + 'using', + 'val', + 'var', + 'while', + 'with', + 'yield', + ], + types: [ + 'Int', + 'Long', + 'Short', + 'Byte', + 'Float', + 'Double', + 'Char', + 'Boolean', + 'String', + 'Unit', + 'Nothing', + 'Any', + 'AnyRef', + 'AnyVal', + 'Option', + 'Some', + 'None', + 'Either', + 'Left', + 'Right', + 'List', + 'Map', + 'Set', + 'Seq', + 'Vector', + 'Array', + 'Future', + 'Try', + 'Success', + 'Failure', + ], + lineComment: '//', + blockComment: ('/*', '*/'), + stringDelimiters: ['"', "'"], + multiLineStringDelimiters: ['"""'], + annotationPrefix: '@', + punctuationChars: '(){}[]<>;:,.?', + operatorChars: '+-*/%=!&|^~', + multiCharOperators: [ + '<<=', + '>>=', + '>>>', + '<=', + '>=', + '==', + '!=', + '&&', + '||', + '<<', + '>>', + '<-', + '->', + '=>', + '::', + '+=', + '-=', + '*=', + '/=', + '%=', + '&=', + '|=', + '^=', + ], + identifierStringPrefix: true, + backtickIdentifiers: true, +); + +/// YAML 1.2 grammar (flow-collection keys, quoted strings, anchors, +/// aliases, comments). +/// +/// Block scalars (`|`, `>`) tokenize the indented body as regular YAML +/// content rather than as a single string literal. +const LangGrammar yaml = LangGrammar( + name: 'yaml', + keywords: ['true', 'false', 'null'], + lineComment: '#', + stringDelimiters: ['"', "'"], + punctuationChars: '[]{},:-', + operatorChars: '&*!%|>', +); + +/// JSON grammar, lenient for highlighting purposes. +/// +/// Tokenizes technically-invalid JSON inputs (hex literals, leading +/// zeros) as numbers rather than rejecting them. For strict validation +/// use a JSON parser. +const LangGrammar json = LangGrammar( + name: 'json', + keywords: ['true', 'false', 'null'], + stringDelimiters: ['"'], + punctuationChars: '{}[]:,', + operatorChars: '', +); + +/// Shell / Bash grammar. 
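+///
+/// Enables shell variables (`$NAME`, `${NAME}`), backtick command
+/// substitution, and heredocs (`<<MARKER`), matching the flags set below.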
+const LangGrammar shell = LangGrammar( + name: 'shell', + keywords: [ + 'if', + 'then', + 'else', + 'elif', + 'fi', + 'for', + 'while', + 'until', + 'do', + 'done', + 'case', + 'esac', + 'in', + 'function', + 'return', + 'exit', + 'local', + 'export', + 'readonly', + 'declare', + 'typeset', + 'source', + 'eval', + 'exec', + 'set', + 'unset', + 'true', + 'false', + ], + lineComment: '#', + stringDelimiters: ['"', "'"], + punctuationChars: '(){}[];,', + operatorChars: r'=!<>&|+-*/%', + multiCharOperators: [ + '<<=', + '>>=', + '==', + '!=', + '<=', + '>=', + '&&', + '||', + '<<', + '>>', + '+=', + '-=', + '*=', + '/=', + '%=', + ], + annotationPrefix: null, + shellVariables: true, + backtickCommandSubstitution: true, + heredocs: true, +); + +/// Look up a built-in grammar by name. +/// +/// Returns `null` if no built-in grammar matches. +LangGrammar? grammarFor(String language) => switch (language) { + 'dart' => dart, + 'scala' => scala, + 'yaml' || 'yml' => yaml, + 'json' => json, + 'sh' || 'bash' || 'shell' || 'zsh' => shell, + _ => null, +}; diff --git a/rumil_tokens/lib/src/spanned.dart b/rumil_tokens/lib/src/spanned.dart new file mode 100644 index 0000000..b198ade --- /dev/null +++ b/rumil_tokens/lib/src/spanned.dart @@ -0,0 +1,35 @@ +/// Spanned token: a [Token] paired with byte offsets into the source. +library; + +import 'token.dart'; + +/// A [Token] with byte offsets into the original source string. +/// +/// The interval `[start, end)` is half-open. `source.substring(start, end)` +/// reconstructs the token's text: +/// +/// ```dart +/// final spans = tokenizeSpans(source, grammar); +/// for (final s in spans) { +/// assert(source.substring(s.start, s.end) == s.token.text); +/// } +/// ``` +/// +/// Callers needing line/column can construct a `Location(source, offset)` +/// from `rumil`. +extension type const Spanned._((T, int, int) _) { + /// Creates a spanned token covering the half-open interval `[start, end)`. + const Spanned.of(T token, int start, int end) : this._((token, start, end)); + + /// The classified token. + T get token => _.$1; + + /// Byte offset of the first character of [token] in the original source. + int get start => _.$2; + + /// Byte offset one past the last character of [token] in the original source. + int get end => _.$3; + + /// Length of the span in code units: `end - start`. + int get length => _.$3 - _.$2; +} diff --git a/rumil_tokens/lib/src/token.dart b/rumil_tokens/lib/src/token.dart new file mode 100644 index 0000000..67a3117 --- /dev/null +++ b/rumil_tokens/lib/src/token.dart @@ -0,0 +1,129 @@ +/// Token types produced by the tokenizer. +library; + +/// A classified span of source text. +/// +/// Tokens are lossless: concatenating [text] from a token stream +/// reconstructs the original source exactly. +sealed class Token { + /// The source text this token covers. + final String text; + + /// Creates a token covering [text]. + const Token(this.text); +} + +/// A language keyword (`if`, `class`, `val`, etc.). +final class Keyword extends Token { + /// Creates a keyword token. + const Keyword(super.text); + + @override + String toString() => 'Keyword($text)'; +} + +/// A built-in or well-known type name (`int`, `String`, `List`, etc.). +final class TypeName extends Token { + /// Creates a type-name token. + const TypeName(super.text); + + @override + String toString() => 'TypeName($text)'; +} + +/// A string literal, including delimiters. +final class StringLit extends Token { + /// Creates a string-literal token. 
+ const StringLit(super.text); + + @override + String toString() => 'StringLit($text)'; +} + +/// A numeric literal (integer, float, hex, etc.). +final class NumberLit extends Token { + /// Creates a number-literal token. + const NumberLit(super.text); + + @override + String toString() => 'NumberLit($text)'; +} + +/// A comment (line or block), including delimiters. +final class Comment extends Token { + /// Creates a comment token. + const Comment(super.text); + + @override + String toString() => 'Comment($text)'; +} + +/// Structural punctuation: `(`, `)`, `{`, `}`, `[`, `]`, `,`, `;`, `:`, `.`. +/// +/// Delimits, separates, or groups. Distinct from [Operator], which is +/// reserved for value-computing operators. +final class Punctuation extends Token { + /// Creates a punctuation token. + const Punctuation(super.text); + + @override + String toString() => 'Punctuation($text)'; +} + +/// A value-computing operator: `+`, `*`, `==`, `&&`, `=>`, `->`. +/// +/// Distinct from [Punctuation]. +final class Operator extends Token { + /// Creates an operator token. + const Operator(super.text); + + @override + String toString() => 'Operator($text)'; +} + +/// A variable reference: shell `$HOME`, `${PATH}`. +/// +/// The token text includes the leading `$` and braces if present. +final class Variable extends Token { + /// Creates a variable token. + const Variable(super.text); + + @override + String toString() => 'Variable($text)'; +} + +/// An identifier that is not a keyword or type name. +final class Identifier extends Token { + /// Creates an identifier token. + const Identifier(super.text); + + @override + String toString() => 'Identifier($text)'; +} + +/// An annotation or decorator (`@override`, `#[derive]`, etc.). +final class Annotation extends Token { + /// Creates an annotation token. + const Annotation(super.text); + + @override + String toString() => 'Annotation($text)'; +} + +/// Whitespace (spaces, tabs, newlines). +final class Whitespace extends Token { + /// Creates a whitespace token. + const Whitespace(super.text); + + @override + String toString() => 'Whitespace(${text.length})'; +} + +/// Any text not matched by a language-specific rule. +final class Plain extends Token { + /// Creates a plain-text token. + const Plain(super.text); + + @override + String toString() => 'Plain($text)'; +} diff --git a/rumil_tokens/lib/src/tokenizer.dart b/rumil_tokens/lib/src/tokenizer.dart new file mode 100644 index 0000000..15501eb --- /dev/null +++ b/rumil_tokens/lib/src/tokenizer.dart @@ -0,0 +1,522 @@ +/// Lossless tokenizer built on Rumil combinators. +library; + +import 'package:rumil/rumil.dart'; + +import 'grammar.dart'; +import 'spanned.dart'; +import 'token.dart'; + +/// Tokenize [source] according to [grammar]. +/// +/// Returns a lossless token stream: concatenating every token's [Token.text] +/// reproduces [source] exactly. +/// +/// Equivalent to `tokenizeSpans(source, grammar).map((s) => s.token).toList()`. +/// Callers that need byte offsets should use [tokenizeSpans] directly. +List tokenize(String source, LangGrammar grammar) => + tokenizeSpans(source, grammar).map((s) => s.token).toList(); + +/// Tokenize [source] into [Spanned] tokens carrying byte offsets. +/// +/// The returned list satisfies: +/// +/// - Lossless: `spans.map((s) => s.token.text).join() == source`. +/// - Anchored: `spans.first.start == 0` and `spans.last.end == source.length` +/// unless [source] is empty, in which case the list is empty. 
+/// - Contiguous: `spans[i].end == spans[i+1].start` for every adjacent pair. +/// - Text matches span: `source.substring(s.start, s.end) == s.token.text`. +/// +/// On parser failure the whole source is returned as a single `Spanned` +/// covering `[0, source.length)`. +List> tokenizeSpans(String source, LangGrammar grammar) { + if (source.isEmpty) return const []; + final parser = _buildSpannedTokenizer(grammar); + final result = parser.run(source); + final spans = switch (result) { + Success>>(:final value) => value, + Partial>>(:final value) => value, + Failure>>() => >[ + Spanned.of(Plain(source), 0, source.length), + ], + }; + return _mergePlainSpans(spans); +} + +Parser>> _buildSpannedTokenizer( + LangGrammar grammar, +) { + final choice = Choice(_alternatives(grammar)); + final spanned = position() + .zip(choice) + .zip(position()) + .map>((nested) { + final ((start, token), end) = nested; + return Spanned.of(token, start, end); + }); + return spanned.many.thenSkip(eof()); +} + +List> _alternatives(LangGrammar grammar) { + final alternatives = >[]; + + if (grammar.blockComment case (final open, final close)) { + alternatives.add(_blockComment(open, close)); + } + if (grammar.lineComment case final prefix?) { + alternatives.add(_lineComment(prefix)); + } + + if (grammar.rawStringPrefix case final prefix?) { + for (final delim in grammar.multiLineStringDelimiters) { + alternatives.add(_rawMultiLineString(prefix, delim)); + } + for (final delim in grammar.stringDelimiters) { + alternatives.add(_rawStringLiteral(prefix, delim)); + } + } + + if (grammar.identifierStringPrefix) { + for (final delim in grammar.multiLineStringDelimiters) { + alternatives.add(_prefixedMultiLineString(delim)); + } + for (final delim in grammar.stringDelimiters) { + alternatives.add(_prefixedStringLiteral(delim)); + } + } + + for (final delim in grammar.multiLineStringDelimiters) { + alternatives.add(_multiLineString(delim)); + } + for (final delim in grammar.stringDelimiters) { + alternatives.add(_stringLiteral(delim)); + } + + alternatives.add(_number(grammar.operatorChars)); + + if (grammar.annotationPrefix case final prefix?) 
{ + alternatives.add(_annotation(prefix, grammar.identifiersAllowDollar)); + } + + if (grammar.backtickIdentifiers) { + alternatives.add(_backtickIdentifier()); + } + + if (grammar.heredocs) { + alternatives.add(_heredoc()); + } + + if (grammar.shellVariables) { + alternatives.add(_shellVariableBraced()); + alternatives.add(_shellVariableBare()); + } + + if (grammar.backtickCommandSubstitution) { + alternatives.add(char('`').map((c) => Punctuation(c) as Token)); + } + + alternatives.add( + _identifierOrKeyword( + grammar.keywords, + grammar.types, + grammar.identifiersAllowDollar, + ), + ); + + if (grammar.multiCharOperators.isNotEmpty) { + alternatives.add(_multiCharOperator(grammar.multiCharOperators)); + } + + if (grammar.operatorChars.isNotEmpty) { + alternatives.add(_operator(grammar.operatorChars)); + } + + if (grammar.punctuationChars.isNotEmpty) { + alternatives.add(_punctuation(grammar.punctuationChars)); + } + + alternatives.add(_whitespace()); + alternatives.add(anyChar().map(Plain.new)); + + return alternatives; +} + +Parser _lineComment(String prefix) => string(prefix) + .skipThen(satisfy((c) => c != '\n', 'comment char').many.capture) + .map((body) => Comment('$prefix$body') as Token); + +Parser _blockComment(String open, String close) { + final closeFirst = close[0]; + final body = (string(close).notFollowedBy.skipThen(anyChar())).many.capture; + return string(open) + .skipThen(body) + .thenSkip(string(close)) + .map((body) => Comment('$open$body$close') as Token) + .or( + string(open) + .skipThen(satisfy((c) => c != closeFirst, 'any char').many.capture) + .map((body) => Comment('$open$body') as Token), + ); +} + +Parser _multiLineString(String delim) { + final body = (string(delim).notFollowedBy.skipThen(anyChar())).many.capture; + return string(delim) + .skipThen(body) + .thenSkip(string(delim)) + .map((body) => StringLit('$delim$body$delim') as Token) + .or( + string(delim) + .skipThen(anyChar().many.capture) + .map((body) => StringLit('$delim$body') as Token), + ); +} + +Parser _stringLiteral(String delim) { + final escaped = string('\\').skipThen(anyChar()).capture; + final normal = satisfy((c) => c != delim && c != '\\' && c != '\n', 'char'); + final body = (escaped | normal.capture).many.map((parts) => parts.join()); + return char(delim).skipThen(body).zip(char(delim).capture.optional).map(( + pair, + ) { + final (body, close) = pair; + return StringLit('$delim$body${close ?? ''}') as Token; + }); +} + +/// Raw string literal (`r'no\escape'`). Escapes are captured verbatim; +/// the body runs until the matching delimiter or end-of-line. +Parser _rawStringLiteral(String prefix, String delim) { + final normal = satisfy((c) => c != delim && c != '\n', 'raw-string char'); + final body = normal.many.capture; + return string(prefix) + .skipThen(char(delim)) + .skipThen(body) + .zip(char(delim).capture.optional) + .map((pair) { + final (body, close) = pair; + return StringLit('$prefix$delim$body${close ?? ''}') as Token; + }); +} + +/// Raw multi-line string literal (`r'''no\escape'''`). Body runs until +/// the matching triple delimiter; a missing close is tolerated. 
+Parser _rawMultiLineString(String prefix, String delim) { + final body = (string(delim).notFollowedBy.skipThen(anyChar())).many.capture; + return string(prefix) + .skipThen(string(delim)) + .skipThen(body) + .thenSkip(string(delim)) + .map((body) => StringLit('$prefix$delim$body$delim') as Token) + .or( + string(prefix) + .skipThen(string(delim)) + .skipThen(anyChar().many.capture) + .map((body) => StringLit('$prefix$delim$body') as Token), + ); +} + +/// Identifier-prefixed string literal (`s"hi $x"`). Escapes are respected +/// like a regular string literal. +Parser _prefixedStringLiteral(String delim) { + final prefix = satisfy((c) => _isAlpha(c) || c == '_', 'interpolator prefix') + .zip( + satisfy( + (c) => _isAlpha(c) || _isDigit(c) || c == '_', + 'ident char', + ).many, + ) + .map((pair) => pair.$1 + pair.$2.join()); + final escaped = string('\\').skipThen(anyChar()).capture; + final normal = satisfy((c) => c != delim && c != '\\' && c != '\n', 'char'); + final body = (escaped | normal.capture).many.map((parts) => parts.join()); + return prefix + .zip(char(delim)) + .zip(body) + .zip(char(delim).capture.optional) + .map((nested) { + final (((p, d), b), close) = nested; + return StringLit('$p$d$b${close ?? ''}') as Token; + }); +} + +/// Identifier-prefixed multi-line string literal (`s"""hi $x"""`). +Parser _prefixedMultiLineString(String delim) { + final prefix = satisfy((c) => _isAlpha(c) || c == '_', 'interpolator prefix') + .zip( + satisfy( + (c) => _isAlpha(c) || _isDigit(c) || c == '_', + 'ident char', + ).many, + ) + .map((pair) => pair.$1 + pair.$2.join()); + final body = (string(delim).notFollowedBy.skipThen(anyChar())).many.capture; + return prefix + .zip(string(delim)) + .zip(body) + .thenSkip(string(delim)) + .map((nested) { + final ((p, d), b) = nested; + return StringLit('$p$d$b$d') as Token; + }) + .or( + prefix.zip(string(delim)).zip(anyChar().many.capture).map((nested) { + final ((p, d), b) = nested; + return StringLit('$p$d$b') as Token; + }), + ); +} + +/// Backtick-delimited identifier (`` `type` ``). Keywords inside +/// backticks are identifiers. Body runs until the matching backtick. +Parser _backtickIdentifier() { + final normal = satisfy((c) => c != '`' && c != '\n', 'backtick-ident char'); + final body = normal.many.capture; + return char('`').skipThen(body).zip(char('`').capture.optional).map((pair) { + final (body, close) = pair; + return Identifier('`$body${close ?? ''}') as Token; + }); +} + +/// Bare shell variable (`$NAME`, `$1`, `$@`, `$#`, `$?`, `$$`, `$!`). +/// +/// Matches `$` followed by an identifier name, a single digit, or a +/// special positional parameter character. A lone `$` emits `$` as +/// [Variable] so the follower tokenizes normally. +Parser _shellVariableBare() { + final name = satisfy((c) => _isAlpha(c) || c == '_', 'variable name start') + .zip( + satisfy( + (c) => _isAlpha(c) || _isDigit(c) || c == '_', + 'ident char', + ).many, + ) + .map((pair) => pair.$1 + pair.$2.join()); + final special = satisfy( + (c) => + _isDigit(c) || + c == '@' || + c == '#' || + c == '?' || + c == r'$' || + c == '!' 
|| + c == '*' || + c == '-', + 'special parameter', + ); + final body = name | special.capture | succeed(''); + return char( + r'$', + ).skipThen(body).map((name) => Variable(r'$' + name) as Token); +} + +/// Shell heredoc (`< _heredoc() { + final introducer = string('<<-') | string('<<'); + final quotedMarker = char("'") + .skipThen( + satisfy((c) => c != "'" && c != '\n', 'marker char').many.capture, + ) + .thenSkip(char("'")) + .map((m) => ("'$m'", m)); + final dquotedMarker = char('"') + .skipThen( + satisfy((c) => c != '"' && c != '\n', 'marker char').many.capture, + ) + .thenSkip(char('"')) + .map((m) => ('"$m"', m)); + final bareMarker = satisfy((c) => _isAlpha(c) || c == '_', 'marker start') + .zip( + satisfy( + (c) => _isAlpha(c) || _isDigit(c) || c == '_', + 'marker char', + ).many, + ) + .map((p) => (p.$1 + p.$2.join(), p.$1 + p.$2.join())); + final marker = quotedMarker | dquotedMarker | bareMarker; + + return introducer.zip(marker).flatMap((pair) { + final (intro, markerPair) = pair; + final (markerText, markerName) = markerPair; + final restOfLine = satisfy((c) => c != '\n', 'heredoc rest').many.capture; + final newline = char('\n'); + final tabs = satisfy((c) => c == '\t', 'tab').many.capture; + final eolOrEof = newline.capture | succeed(''); + final stripLeadingTabs = intro == '<<-'; + // Word-boundary lookahead: `EOF` terminates, `EOFISH` does not. + final markerEnd = + satisfy( + (c) => c != '\n' && (_isAlpha(c) || _isDigit(c) || c == '_'), + 'ident continuation', + ).notFollowedBy; + final terminatorLine = (stripLeadingTabs + ? tabs + : succeed('')) + .zip(string(markerName)) + .thenSkip(markerEnd) + .zip(eolOrEof) + .map((nested) { + final ((leading, mark), trailing) = nested; + return '$leading$mark$trailing'; + }); + final bodyChar = terminatorLine.notFollowedBy.skipThen(anyChar()); + final body = bodyChar.many.capture; + final full = restOfLine.thenSkip(newline).zip(body).zip(terminatorLine).map( + (nested) { + final ((rest, bodyText), term) = nested; + return StringLit('$intro$markerText$rest\n$bodyText$term') as Token; + }, + ); + final untilEof = restOfLine + .thenSkip(newline.optional) + .zip(anyChar().many.capture) + .map((pair) { + final (rest, bodyText) = pair; + return StringLit('$intro$markerText$rest\n$bodyText') as Token; + }); + return full | untilEof; + }); +} + +/// Braced shell variable (`${NAME}`, `${#NAME}`, `${NAME:-default}`, +/// `${NAME//pat/repl}`). Body runs until the matching close brace. +/// +/// Nested braces are not balanced: the first `}` at the top level +/// closes the expansion. +Parser _shellVariableBraced() { + final body = + satisfy((c) => c != '}' && c != '\n', 'expansion body').many.capture; + return string(r'${').skipThen(body).zip(char('}').capture.optional).map(( + pair, + ) { + final (body, close) = pair; + return Variable('\${$body${close ?? ''}') as Token; + }); +} + +Parser _number(String operatorChars) { + final hexLit = + string('0x').skipThen(satisfy(_isHexDigit, 'hex digit').many1).capture; + final binLit = string('0b').skipThen(oneOf('01').many1).capture; + + final digits = satisfy(_isDigit, 'digit').many1.capture; + // Digit-lookahead gate so `x.length` doesn't read `.l` as a decimal. 
+ final decimalPart = + char( + '.', + ).thenSkip(satisfy(_isDigit, 'digit').lookAhead).skipThen(digits).capture; + final exponent = + oneOf('eE').skipThen(oneOf('+-').optional).zip(digits).capture; + final suffix = oneOf('lLfFdD').capture.optional; + + final decLit = + digits + .zip(decimalPart.optional) + .zip(exponent.optional) + .zip(suffix) + .capture; + + // When `-` is an operator character the operator parser handles it; + // otherwise (JSON) we accept an optional leading `-` as part of the number. + final signed = + operatorChars.contains('-') + ? (hexLit | binLit | decLit) + : char('-').capture.optional + .zip(hexLit | binLit | decLit) + .map((pair) => (pair.$1 ?? '') + pair.$2); + + return signed.map(NumberLit.new as Token Function(String)); +} + +Parser _annotation(String prefix, bool allowDollar) => + string(prefix) + .skipThen(_identRaw(allowDollar)) + .map((id) => Annotation('$prefix$id') as Token); + +Parser _identifierOrKeyword( + List keywords, + List types, + bool allowDollar, +) { + final keywordSet = {...keywords}; + final typeSet = {...types}; + return _identRaw(allowDollar).map((id) { + if (keywordSet.contains(id)) return Keyword(id) as Token; + if (typeSet.contains(id)) return TypeName(id) as Token; + return Identifier(id) as Token; + }); +} + +Parser _identRaw(bool allowDollar) { + bool isStart(String c) => + _isAlpha(c) || c == '_' || (allowDollar && c == r'$'); + bool isCont(String c) => isStart(c) || _isDigit(c); + return satisfy(isStart, 'identifier start') + .zip(satisfy(isCont, 'identifier char').many) + .map((pair) => pair.$1 + pair.$2.join()); +} + +Parser _punctuation(String chars) => satisfy( + (c) => chars.contains(c), + 'punctuation', +).map((c) => Punctuation(c) as Token); + +/// Single-character operator parser. Emits one character per token. +/// Multi-character operators must be declared in +/// [LangGrammar.multiCharOperators]. +Parser _operator(String chars) => satisfy( + (c) => chars.contains(c), + 'operator', +).map((c) => Operator(c) as Token); + +/// Multi-character operator parser. Matches candidates longest-first. +Parser _multiCharOperator(List ops) { + final sorted = [...ops]..sort((a, b) => b.length.compareTo(a.length)); + var parser = string(sorted.first); + for (final op in sorted.skip(1)) { + parser = parser.or(string(op)); + } + return parser.map((s) => Operator(s) as Token); +} + +Parser _whitespace() => + satisfy(_isWhitespace, 'whitespace').many1.capture.map(Whitespace.new); + +/// Merges consecutive [Plain]-spanned entries. The merged span inherits +/// the first entry's `start` and the last entry's `end`. 
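+///
+/// Merging preserves the invariants documented on [tokenizeSpans]: the
+/// merged text is the concatenation of the merged entries' text, and the
+/// surrounding spans stay contiguous.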
+List> _mergePlainSpans(List> spans) { + if (spans.length < 2) return spans; + final out = >[]; + for (final cur in spans) { + if (cur.token is Plain && out.isNotEmpty && out.last.token is Plain) { + final prev = out.last; + final merged = Plain(prev.token.text + cur.token.text); + out[out.length - 1] = Spanned.of(merged, prev.start, cur.end); + } else { + out.add(cur); + } + } + return out; +} + +bool _isDigit(String c) => c.compareTo('0') >= 0 && c.compareTo('9') <= 0; + +bool _isHexDigit(String c) => + _isDigit(c) || + (c.compareTo('a') >= 0 && c.compareTo('f') <= 0) || + (c.compareTo('A') >= 0 && c.compareTo('F') <= 0); + +bool _isAlpha(String c) => + (c.compareTo('a') >= 0 && c.compareTo('z') <= 0) || + (c.compareTo('A') >= 0 && c.compareTo('Z') <= 0); + +bool _isWhitespace(String c) => c == ' ' || c == '\t' || c == '\n' || c == '\r'; diff --git a/rumil_tokens/pubspec.yaml b/rumil_tokens/pubspec.yaml new file mode 100644 index 0000000..bb1c114 --- /dev/null +++ b/rumil_tokens/pubspec.yaml @@ -0,0 +1,16 @@ +name: rumil_tokens +description: >- + Source code tokenizer built on Rumil. Classified token spans for syntax + highlighting. In-tree within the rumil-dart monorepo; not published. +version: 0.1.0 +publish_to: none + +environment: + sdk: ^3.7.0 + +dependencies: + rumil: ^0.6.0 + +dev_dependencies: + test: ^1.31.0 + lints: ^6.0.0 diff --git a/rumil_tokens/test/tokenizer_test.dart b/rumil_tokens/test/tokenizer_test.dart new file mode 100644 index 0000000..8a27c54 --- /dev/null +++ b/rumil_tokens/test/tokenizer_test.dart @@ -0,0 +1,1446 @@ +import 'package:rumil_tokens/rumil_tokens.dart'; +import 'package:test/test.dart'; + +void _expectLossless(String source, LangGrammar grammar) { + final tokens = tokenize(source, grammar); + final reconstructed = tokens.map((t) => t.text).join(); + expect(reconstructed, source, reason: 'lossless round-trip'); +} + +List _ofType(List tokens) => + tokens.whereType().toList(); + +List _textsOf(List tokens) => + _ofType(tokens).map((t) => t.text).toList(); + +void main() { + // --------------------------------------------------------------------------- + // Lossless round-trip. 
+ // --------------------------------------------------------------------------- + + group('lossless round-trip', () { + test('empty input', () { + expect(tokenize('', dart), isEmpty); + }); + + test('single character', () { + _expectLossless('x', dart); + }); + + test('only whitespace', () { + _expectLossless(' \t\n \r\n', dart); + }); + + test('Dart function', () { + const source = ''' +void main() { + final x = 42; + // comment + print('hello \$x'); +} +'''; + _expectLossless(source, dart); + }); + + test('Dart class with annotations', () { + const source = ''' +@immutable +class Point { + final int x; + final int y; + const Point(this.x, this.y); + + @override + String toString() => 'Point(\$x, \$y)'; +} +'''; + _expectLossless(source, dart); + }); + + test('Dart multi-line strings', () { + _expectLossless("var a = '''multi\nline''';\n", dart); + _expectLossless('var b = """another\nmulti""";\n', dart); + }); + + test('Dart string interpolation with escapes', () { + const source = r'''var s = "line1\nline2\t\"quoted\"\\end";'''; + _expectLossless(source, dart); + }); + + test('Dart hex and binary literals', () { + _expectLossless('var a = 0xFF; var b = 0b1010;', dart); + }); + + test('Dart floating point', () { + _expectLossless('var x = 3.14; var y = 1e10; var z = 2.5e-3;', dart); + }); + + test('Dart block comment', () { + _expectLossless('x /* block\ncomment */ y', dart); + }); + + test('unterminated string', () { + _expectLossless('var s = "unterminated', dart); + }); + + test('unterminated multi-line string', () { + _expectLossless("var s = '''unterminated", dart); + }); + + test('unterminated block comment', () { + _expectLossless('/* unterminated', dart); + }); + + test('Scala snippet', () { + const source = ''' +object Main: + def run(args: List[String]): Unit = + val x: Int = 42 + /* block comment */ + println(s"hello \$x") +'''; + _expectLossless(source, scala); + }); + + test('Scala triple-quoted string', () { + _expectLossless('val s = """raw\nstring"""', scala); + }); + + test('YAML document', () { + const source = ''' +name: rumil +version: 0.5.0 +# comment +dependencies: + rumil: ^0.5.0 + flag: true +list: + - one + - two +'''; + _expectLossless(source, yaml); + }); + + test('JSON document', () { + const source = ''' +{ + "name": "rumil", + "version": 42, + "active": true, + "data": null, + "items": [1, 2, 3] +} +'''; + _expectLossless(source, json); + }); + + test('shell script', () { + const source = ''' +#!/bin/bash +# deploy script +export PORT=8080 +for f in *.dart; do + echo "building \$f" + if [ -f "\$f" ]; then + dart compile exe "\$f" + fi +done +'''; + _expectLossless(source, shell); + }); + + test('consecutive comments', () { + _expectLossless('// one\n// two\n// three\n', dart); + }); + + test('adjacent strings', () { + _expectLossless('"a""b""c"', dart); + }); + + test('mixed content', () { + const source = 'if (x > 0) { return "yes"; } // done'; + _expectLossless(source, dart); + }); + + test('only punctuation', () { + _expectLossless('(){}[]<>;:,.', dart); + }); + + test('only numbers', () { + _expectLossless('42', dart); + }); + + test('only a string', () { + _expectLossless('"hello world"', dart); + }); + + test('only a comment', () { + _expectLossless('// just a comment', dart); + }); + + test('unicode identifiers', () { + _expectLossless(r'var $dollar = _under;', dart); + }); + }); + + // --------------------------------------------------------------------------- + // Token classification. 
+ // --------------------------------------------------------------------------- + + group('keywords', () { + test('Dart keywords', () { + final tokens = tokenize('if else class final var void return', dart); + expect(_textsOf(tokens), [ + 'if', + 'else', + 'class', + 'final', + 'var', + 'void', + 'return', + ]); + }); + + test('Scala keywords', () { + final tokens = tokenize('def val object trait sealed match', scala); + expect(_textsOf(tokens), [ + 'def', + 'val', + 'object', + 'trait', + 'sealed', + 'match', + ]); + }); + + test('YAML keywords', () { + final tokens = tokenize('true false null', yaml); + expect(_textsOf(tokens), ['true', 'false', 'null']); + }); + + test('JSON keywords', () { + final tokens = tokenize('true false null', json); + expect(_textsOf(tokens), ['true', 'false', 'null']); + }); + + test('shell keywords', () { + final tokens = tokenize('if then else fi for do done', shell); + expect(_textsOf(tokens), [ + 'if', + 'then', + 'else', + 'fi', + 'for', + 'do', + 'done', + ]); + }); + + test('keyword not matched inside identifier', () { + final tokens = tokenize('classify iffy', dart); + expect(_textsOf(tokens), ['classify', 'iffy']); + expect(_ofType(tokens), isEmpty); + }); + + test('keyword not matched as prefix of identifier', () { + final tokens = tokenize('ifTrue forEachItem', dart); + expect(_textsOf(tokens), ['ifTrue', 'forEachItem']); + expect(_ofType(tokens), isEmpty); + }); + + test('keyword followed by punctuation', () { + final tokens = tokenize('if(x)', dart); + expect(_textsOf(tokens), ['if']); + expect(_textsOf(tokens), ['x']); + }); + }); + + group('type names', () { + test('Dart types', () { + final tokens = tokenize('int String List Map Future', dart); + expect(_textsOf(tokens), [ + 'int', + 'String', + 'List', + 'Map', + 'Future', + ]); + }); + + test('Scala types', () { + final tokens = tokenize('Int Boolean Option Either Unit', scala); + expect(_textsOf(tokens), [ + 'Int', + 'Boolean', + 'Option', + 'Either', + 'Unit', + ]); + }); + + test('type not matched inside identifier', () { + final tokens = tokenize('integer Stringify', dart); + expect(_textsOf(tokens), ['integer', 'Stringify']); + expect(_ofType(tokens), isEmpty); + }); + }); + + group('identifiers', () { + test('simple identifiers', () { + final tokens = tokenize('foo bar baz', dart); + expect(_textsOf(tokens), ['foo', 'bar', 'baz']); + }); + + test('underscore identifiers', () { + final tokens = tokenize('_private __dunder _a1', dart); + expect(_textsOf(tokens), ['_private', '__dunder', '_a1']); + }); + + test('dollar identifiers', () { + final tokens = tokenize(r'$ref $$double', dart); + expect(_textsOf(tokens), [r'$ref', r'$$double']); + }); + + test('alphanumeric identifiers', () { + final tokens = tokenize('item1 item2 a123b', dart); + expect(_textsOf(tokens), ['item1', 'item2', 'a123b']); + }); + }); + + group('string literals', () { + test('double-quoted string', () { + final tokens = tokenize('"hello"', dart); + expect(_textsOf(tokens), ['"hello"']); + }); + + test('single-quoted string', () { + final tokens = tokenize("'world'", dart); + expect(_textsOf(tokens), ["'world'"]); + }); + + test('empty string', () { + final tokens = tokenize('""', dart); + expect(_textsOf(tokens), ['""']); + }); + + test('string with escapes', () { + final tokens = tokenize(r'"hello\nworld"', dart); + expect(_textsOf(tokens), [r'"hello\nworld"']); + }); + + test('string with escaped quote', () { + final tokens = tokenize(r'"say \"hi\""', dart); + expect(_textsOf(tokens), [r'"say \"hi\""']); + }); + + 
+    test('string with escaped backslash', () {
+      final tokens = tokenize(r'"path\\file"', dart);
+      expect(_textsOf<StringLit>(tokens), [r'"path\\file"']);
+    });
+
+    test('multi-line string (triple single)', () {
+      final tokens = tokenize("'''multi\nline'''", dart);
+      expect(_textsOf<StringLit>(tokens), ["'''multi\nline'''"]);
+    });
+
+    test('multi-line string (triple double)', () {
+      final tokens = tokenize('"""multi\nline"""', dart);
+      expect(_textsOf<StringLit>(tokens), ['"""multi\nline"""']);
+    });
+
+    test('unterminated string stops at newline', () {
+      final tokens = tokenize('"unterminated\nnext', dart);
+      final strings = _textsOf<StringLit>(tokens);
+      expect(strings.length, 1);
+      expect(strings.first, '"unterminated');
+    });
+
+    test('adjacent strings', () {
+      final tokens = tokenize('"a""b"', dart);
+      expect(_textsOf<StringLit>(tokens), ['"a"', '"b"']);
+    });
+
+    test('JSON only has double-quoted strings', () {
+      final tokens = tokenize('"value"', json);
+      expect(_textsOf<StringLit>(tokens), ['"value"']);
+    });
+  });
+
+  group('number literals', () {
+    test('integer', () {
+      final tokens = tokenize('42', dart);
+      expect(_textsOf<NumberLit>(tokens), ['42']);
+    });
+
+    test('float', () {
+      final tokens = tokenize('3.14', dart);
+      expect(_textsOf<NumberLit>(tokens), ['3.14']);
+    });
+
+    test('hex literal', () {
+      final tokens = tokenize('0xFF', dart);
+      expect(_textsOf<NumberLit>(tokens), ['0xFF']);
+    });
+
+    test('hex lowercase', () {
+      final tokens = tokenize('0xdeadbeef', dart);
+      expect(_textsOf<NumberLit>(tokens), ['0xdeadbeef']);
+    });
+
+    test('binary literal', () {
+      final tokens = tokenize('0b1010', dart);
+      expect(_textsOf<NumberLit>(tokens), ['0b1010']);
+    });
+
+    test('scientific notation', () {
+      final tokens = tokenize('1e10', dart);
+      expect(_textsOf<NumberLit>(tokens), ['1e10']);
+    });
+
+    test('scientific with decimal', () {
+      final tokens = tokenize('2.5e-3', dart);
+      expect(_textsOf<NumberLit>(tokens), ['2.5e-3']);
+    });
+
+    test('multiple numbers', () {
+      final tokens = tokenize('1 2 3', dart);
+      expect(_textsOf<NumberLit>(tokens), ['1', '2', '3']);
+    });
+
+    test('number followed by punctuation', () {
+      final tokens = tokenize('42;', dart);
+      expect(_textsOf<NumberLit>(tokens), ['42']);
+      expect(_textsOf<Punctuation>(tokens), [';']);
+    });
+
+    test('number inside expression', () {
+      final tokens = tokenize('x+42', dart);
+      expect(_textsOf<NumberLit>(tokens), ['42']);
+    });
+  });
+
+  group('comments', () {
+    test('line comment', () {
+      final tokens = tokenize('x // comment\ny', dart);
+      expect(_textsOf<Comment>(tokens), ['// comment']);
+    });
+
+    test('line comment at start', () {
+      final tokens = tokenize('// first line', dart);
+      expect(_textsOf<Comment>(tokens), ['// first line']);
+    });
+
+    test('line comment with no space', () {
+      final tokens = tokenize('//compact', dart);
+      expect(_textsOf<Comment>(tokens), ['//compact']);
+    });
+
+    test('empty line comment', () {
+      final tokens = tokenize('//\nx', dart);
+      expect(_textsOf<Comment>(tokens), ['//']);
+    });
+
+    test('block comment single line', () {
+      final tokens = tokenize('/* block */', dart);
+      expect(_textsOf<Comment>(tokens), ['/* block */']);
+    });
+
+    test('block comment multi-line', () {
+      final tokens = tokenize('/* line1\nline2 */', dart);
+      expect(_textsOf<Comment>(tokens), ['/* line1\nline2 */']);
+    });
+
+    test('block comment with stars', () {
+      final tokens = tokenize('/** doc comment */', dart);
+      expect(_textsOf<Comment>(tokens), ['/** doc comment */']);
+    });
+
+    test('unterminated block comment', () {
+      final tokens = tokenize('/* unterminated', dart);
+      expect(_textsOf<Comment>(tokens), ['/* unterminated']);
+    });
+
+    test('consecutive line comments', () {
+      final tokens = tokenize('// one\n// two\n// three', dart);
+      expect(_textsOf<Comment>(tokens), ['// one', '// two', '// three']);
+    });
+
+    test('hash comment (YAML)', () {
+      final tokens = tokenize('key: value # comment', yaml);
+      expect(_textsOf<Comment>(tokens), ['# comment']);
+    });
+
+    test('hash comment (shell)', () {
+      final tokens = tokenize('echo hi # comment', shell);
+      expect(_textsOf<Comment>(tokens), ['# comment']);
+    });
+
+    test('JSON has no comments', () {
+      final tokens = tokenize('// not a comment', json);
+      expect(_ofType<Comment>(tokens), isEmpty);
+    });
+  });
+
+  group('annotations', () {
+    test('Dart annotation', () {
+      final tokens = tokenize('@override', dart);
+      expect(_textsOf<Annotation>(tokens), ['@override']);
+    });
+
+    test('Dart annotation before declaration', () {
+      final tokens = tokenize('@deprecated void f() {}', dart);
+      expect(_textsOf<Annotation>(tokens), ['@deprecated']);
+    });
+
+    test('multiple annotations', () {
+      final tokens = tokenize('@immutable @sealed class X {}', dart);
+      expect(_textsOf<Annotation>(tokens), ['@immutable', '@sealed']);
+    });
+
+    test('Scala annotation', () {
+      final tokens = tokenize('@tailrec def f(): Unit = ???', scala);
+      expect(_textsOf<Annotation>(tokens), ['@tailrec']);
+    });
+
+    test('no annotations in YAML', () {
+      final tokens = tokenize('@value', yaml);
+      expect(_ofType<Annotation>(tokens), isEmpty);
+    });
+
+    test('no annotations in JSON', () {
+      final tokens = tokenize('@value', json);
+      expect(_ofType<Annotation>(tokens), isEmpty);
+    });
+  });
+
+  group('punctuation', () {
+    test('parentheses', () {
+      final tokens = tokenize('()', dart);
+      expect(_textsOf<Punctuation>(tokens), ['(', ')']);
+    });
+
+    test('braces', () {
+      final tokens = tokenize('{}', dart);
+      expect(_textsOf<Punctuation>(tokens), ['{', '}']);
+    });
+
+    test('brackets', () {
+      final tokens = tokenize('[]', dart);
+      expect(_textsOf<Punctuation>(tokens), ['[', ']']);
+    });
+
+    test('mixed punctuation', () {
+      final tokens = tokenize('f(x, y);', dart);
+      expect(_textsOf<Punctuation>(tokens), ['(', ',', ')', ';']);
+    });
+
+    test('operators are Operator, not Punctuation', () {
+      final tokens = tokenize('a + b * c', dart);
+      expect(_textsOf<Punctuation>(tokens), isEmpty);
+      expect(_textsOf<Operator>(tokens), ['+', '*']);
+    });
+
+    test('JSON punctuation', () {
+      final tokens = tokenize('{[]:,}', json);
+      expect(_textsOf<Punctuation>(tokens), ['{', '[', ']', ':', ',', '}']);
+    });
+
+    test('YAML colon', () {
+      final tokens = tokenize('key: value', yaml);
+      expect(_textsOf<Punctuation>(tokens), [':']);
+    });
+  });
+
+  group('whitespace', () {
+    test('spaces', () {
+      final tokens = tokenize('a b', dart);
+      expect(tokens[1], isA<Whitespace>());
+      expect(tokens[1].text, ' ');
+    });
+
+    test('tabs', () {
+      final tokens = tokenize('a\tb', dart);
+      expect(tokens[1], isA<Whitespace>());
+      expect(tokens[1].text, '\t');
+    });
+
+    test('newlines', () {
+      final tokens = tokenize('a\nb', dart);
+      expect(tokens[1], isA<Whitespace>());
+      expect(tokens[1].text, '\n');
+    });
+
+    test('mixed whitespace collapsed', () {
+      final tokens = tokenize('a \t\n b', dart);
+      expect(tokens[1], isA<Whitespace>());
+      expect(tokens[1].text, ' \t\n ');
+    });
+
+    test('only whitespace', () {
+      final tokens = tokenize(' ', dart);
+      expect(tokens.length, 1);
+      expect(tokens.first, isA<Whitespace>());
+    });
+  });
+
+  // ---------------------------------------------------------------------------
+  // Language-specific integration tests
+  // ---------------------------------------------------------------------------
+
+  group('Dart integration', () {
+    test('full function', () {
+      const source = '''
+Future<int> compute(List<String> args) async {
+  final result = await fetch("url");
+  if (result == null) return -1;
+  // process
+  return result.length;
+}
+''';
+      _expectLossless(source, dart);
+      final tokens = tokenize(source, dart);
+      expect(_textsOf<Keyword>(tokens), contains('async'));
+      expect(_textsOf<Keyword>(tokens), contains('await'));
+      expect(_textsOf<Keyword>(tokens), contains('if'));
+      expect(_textsOf<Keyword>(tokens), contains('return'));
+      expect(_textsOf<TypeName>(tokens), contains('Future'));
+      expect(_textsOf<TypeName>(tokens), contains('int'));
+      expect(_textsOf<TypeName>(tokens), contains('List'));
+      expect(_textsOf<TypeName>(tokens), contains('String'));
+      expect(_textsOf<StringLit>(tokens), ['"url"']);
+      expect(_textsOf<NumberLit>(tokens), ['1']);
+      expect(_textsOf<Comment>(tokens), ['// process']);
+    });
+
+    test('sealed class with pattern matching', () {
+      const source = '''
+sealed class Shape {}
+final class Circle extends Shape {
+  final double radius;
+  const Circle(this.radius);
+}
+''';
+      _expectLossless(source, dart);
+      final tokens = tokenize(source, dart);
+      expect(_textsOf<Keyword>(tokens), contains('sealed'));
+      expect(_textsOf<Keyword>(tokens), contains('extends'));
+      expect(_textsOf<TypeName>(tokens), contains('double'));
+    });
+  });
+
+  group('Scala integration', () {
+    test('case class and match', () {
+      const source = '''
+case class Point(x: Int, y: Int)
+
+val p = Point(1, 2)
+val desc = p match
+  case Point(0, 0) => "origin"
+  case Point(x, y) => s"(\$x, \$y)"
+''';
+      _expectLossless(source, scala);
+      final tokens = tokenize(source, scala);
+      expect(_textsOf<Keyword>(tokens), contains('case'));
+      expect(_textsOf<Keyword>(tokens), contains('class'));
+      expect(_textsOf<Keyword>(tokens), contains('val'));
+      expect(_textsOf<Keyword>(tokens), contains('match'));
+      expect(_textsOf<TypeName>(tokens), contains('Int'));
+    });
+  });
+
+  group('YAML integration', () {
+    test('nested structure', () {
+      const source = '''
+server:
+  host: "localhost"
+  port: 8080
+  debug: true
+  tags:
+    - web
+    - api
+''';
+      _expectLossless(source, yaml);
+      final tokens = tokenize(source, yaml);
+      expect(_textsOf<StringLit>(tokens), ['"localhost"']);
+      expect(_textsOf<NumberLit>(tokens), ['8080']);
+      expect(_textsOf<Keyword>(tokens), contains('true'));
+    });
+  });
+
+  group('JSON integration', () {
+    test('nested object', () {
+      const source = '{"a": 1, "b": [true, false, null], "c": "text"}';
+      _expectLossless(source, json);
+      final tokens = tokenize(source, json);
+      expect(_textsOf<Keyword>(tokens), ['true', 'false', 'null']);
+      expect(_textsOf<NumberLit>(tokens), ['1']);
+      expect(_textsOf<StringLit>(tokens), ['"a"', '"b"', '"c"', '"text"']);
+    });
+  });
+
+  group('shell integration', () {
+    test('script with conditionals and loops', () {
+      const source = '''
+if [ -d "build" ]; then
+  for f in build/*; do
+    echo "removing \$f"
+  done
+fi
+''';
+      _expectLossless(source, shell);
+      final tokens = tokenize(source, shell);
+      expect(_textsOf<Keyword>(tokens), contains('if'));
+      expect(_textsOf<Keyword>(tokens), contains('then'));
+      expect(_textsOf<Keyword>(tokens), contains('for'));
+      expect(_textsOf<Keyword>(tokens), contains('do'));
+      expect(_textsOf<Keyword>(tokens), contains('done'));
+      expect(_textsOf<Keyword>(tokens), contains('fi'));
+    });
+  });
+
+  // ---------------------------------------------------------------------------
+  // Custom grammars
+  // ---------------------------------------------------------------------------
+
+  group('custom grammar', () {
+    test('minimal grammar', () {
+      const minimal = LangGrammar(name: 'minimal');
+      const source = 'hello 42 "world"';
+      _expectLossless(source, minimal);
+    });
+
+    test('custom keywords', () {
+      const custom = LangGrammar(
+        name: 'custom',
+        keywords: ['fn', 'let'],
+        types: ['u32'],
+      );
+      final tokens = tokenize('fn main() { let x: u32 = 1; }', custom);
+      expect(_textsOf<Keyword>(tokens), ['fn', 'let']);
+      expect(_textsOf<TypeName>(tokens), ['u32']);
+    });
+
+    test('custom comment syntax', () {
+      const custom = LangGrammar(
+        name: 'custom',
+        lineComment: '--',
+        blockComment: ('{-', '-}'),
+      );
+      final tokens = tokenize('x -- line comment\ny {- block -} z', custom);
+      expect(_textsOf<Comment>(tokens), ['-- line comment', '{- block -}']);
+    });
+  });
+
+  // ---------------------------------------------------------------------------
+  // grammarFor lookup
+  // ---------------------------------------------------------------------------
+
+  group('grammarFor', () {
+    test('dart', () {
+      expect(grammarFor('dart')?.name, 'dart');
+    });
+
+    test('scala', () {
+      expect(grammarFor('scala')?.name, 'scala');
+    });
+
+    test('yaml aliases', () {
+      expect(grammarFor('yaml')?.name, 'yaml');
+      expect(grammarFor('yml')?.name, 'yaml');
+    });
+
+    test('json', () {
+      expect(grammarFor('json')?.name, 'json');
+    });
+
+    test('shell aliases', () {
+      expect(grammarFor('sh')?.name, 'shell');
+      expect(grammarFor('bash')?.name, 'shell');
+      expect(grammarFor('shell')?.name, 'shell');
+      expect(grammarFor('zsh')?.name, 'shell');
+    });
+
+    test('unknown returns null', () {
+      expect(grammarFor('brainfuck'), isNull);
+      expect(grammarFor(''), isNull);
+    });
+  });
+
+  // ---------------------------------------------------------------------------
+  // Edge cases
+  // ---------------------------------------------------------------------------
+
+  group('edge cases', () {
+    test('empty input returns empty list', () {
+      expect(tokenize('', dart), isEmpty);
+    });
+
+    test('single keyword', () {
+      final tokens = tokenize('if', dart);
+      expect(tokens.length, 1);
+      expect(tokens.first, isA<Keyword>());
+    });
+
+    test('single number', () {
+      final tokens = tokenize('42', dart);
+      expect(tokens.length, 1);
+      expect(tokens.first, isA<NumberLit>());
+    });
+
+    test('single string', () {
+      final tokens = tokenize('"x"', dart);
+      expect(tokens.length, 1);
+      expect(tokens.first, isA<StringLit>());
+    });
+
+    test('single comment', () {
+      final tokens = tokenize('// comment', dart);
+      expect(tokens.length, 1);
+      expect(tokens.first, isA<Comment>());
+    });
+
+    test('dot between identifiers is Punctuation, not part of a float', () {
+      final tokens = tokenize('x.length', dart);
+      expect(_textsOf<Identifier>(tokens), ['x', 'length']);
+      expect(_textsOf<Punctuation>(tokens), ['.']);
+    });
+
+    test('annotation without following identifier is Plain', () {
+      // `@` is only valid as an annotation prefix in Dart. On its own
+      // it falls through to Plain.
+      final tokens = tokenize('@ ', dart);
+      expect(_ofType<Annotation>(tokens), isEmpty);
+      expect(_textsOf<Plain>(tokens), ['@']);
+    });
+
+    test('special characters merged into single Plain token', () {
+      final tokens = tokenize('\u00e9\u00e8\u00ea', dart);
+      expect(tokens.length, 1);
+      expect(tokens.first, isA<Plain>());
+      expect(tokens.first.text, '\u00e9\u00e8\u00ea');
+    });
+
+    test('very long input', () {
+      final source = 'var x = ${List.filled(1000, '42').join(' + ')};';
+      _expectLossless(source, dart);
+    });
+  });
+
+  // ---------------------------------------------------------------------------
+  // tokenizeSpans: byte offsets into source.
+  // ---------------------------------------------------------------------------
+
+  void expectSpanInvariants(String source, LangGrammar grammar) {
+    final spans = tokenizeSpans(source, grammar);
+    if (source.isEmpty) {
+      expect(spans, isEmpty);
+      return;
+    }
+    // Anchored.
+    expect(spans.first.start, 0, reason: 'first span starts at 0');
+    expect(
+      spans.last.end,
+      source.length,
+      reason: 'last span ends at source.length',
+    );
+    // Contiguous.
+ for (var i = 0; i + 1 < spans.length; i++) { + expect( + spans[i].end, + spans[i + 1].start, + reason: 'spans[$i].end == spans[${i + 1}].start', + ); + } + // Text matches substring. + for (final s in spans) { + expect( + source.substring(s.start, s.end), + s.token.text, + reason: 'substring matches token text at [${s.start}, ${s.end})', + ); + expect(s.length, s.end - s.start, reason: 'length == end - start'); + } + // Lossless (already covered by substring-matches + contiguous, but + // explicit for readability). + expect( + spans.map((s) => s.token.text).join(), + source, + reason: 'lossless join', + ); + } + + group('tokenizeSpans invariants', () { + test('empty source returns empty list', () { + expect(tokenizeSpans('', dart), isEmpty); + }); + + test('single-character source', () { + expectSpanInvariants('x', dart); + }); + + test('Dart snippet', () { + const source = ''' +void main() { + final x = 42; + // greeting + print("hello \$x"); +} +'''; + expectSpanInvariants(source, dart); + }); + + test('Scala snippet', () { + const source = ''' +object Main: + def run(args: List[String]): Unit = + val x: Int = 42 +'''; + expectSpanInvariants(source, scala); + }); + + test('YAML snippet', () { + const source = ''' +name: rumil_tokens +version: 0.6.0 +# comment +'''; + expectSpanInvariants(source, yaml); + }); + + test('JSON snippet', () { + expectSpanInvariants('{"a": 1, "b": true, "c": null}', json); + }); + + test('shell snippet', () { + const source = ''' +# deploy +for f in *.dart; do + echo "\$f" +done +'''; + expectSpanInvariants(source, shell); + }); + + test('unterminated string preserves end-of-source anchor', () { + expectSpanInvariants('var s = "unterminated', dart); + }); + + test('unterminated block comment preserves end-of-source anchor', () { + expectSpanInvariants('/* unterminated', dart); + }); + + test('only whitespace', () { + expectSpanInvariants(' \t\n ', dart); + }); + + test('only punctuation', () { + expectSpanInvariants('(){}[]<>;:,.', dart); + }); + }); + + group('tokenizeSpans parity with tokenize', () { + // Token has no == / hashCode so we compare by (runtimeType, text). 
+    (Type, String) key(Token t) => (t.runtimeType, t.text);
+    void expectParity(String source, LangGrammar grammar) {
+      final tokens = tokenize(source, grammar);
+      final spans = tokenizeSpans(source, grammar);
+      expect(
+        spans.map((s) => key(s.token)).toList(),
+        tokens.map(key).toList(),
+        reason: 'tokenizeSpans token sequence matches tokenize',
+      );
+    }
+
+    test('Dart', () {
+      expectParity('void f() { return 42; }', dart);
+    });
+
+    test('Scala', () {
+      expectParity('val x: Int = 42', scala);
+    });
+
+    test('YAML', () {
+      expectParity('key: "value" # c\n', yaml);
+    });
+
+    test('JSON', () {
+      expectParity('[1, 2, 3]', json);
+    });
+
+    test('shell', () {
+      expectParity('if [ -f "x" ]; then echo hi; fi', shell);
+    });
+  });
+
+  group('tokenizeSpans span boundaries', () {
+    test('keyword span covers exact characters', () {
+      final spans = tokenizeSpans('if x', dart);
+      expect(spans[0].token, isA<Keyword>());
+      expect(spans[0].start, 0);
+      expect(spans[0].end, 2);
+    });
+
+    test('whitespace span covers exact characters', () {
+      final spans = tokenizeSpans('a  b', dart);
+      expect(spans[1].token, isA<Whitespace>());
+      expect(spans[1].start, 1);
+      expect(spans[1].end, 3);
+    });
+
+    test('number span covers exact characters', () {
+      final spans = tokenizeSpans('x = 42', dart);
+      final number = spans.firstWhere((s) => s.token is NumberLit);
+      expect(number.start, 4);
+      expect(number.end, 6);
+    });
+
+    test('string span includes delimiters', () {
+      final spans = tokenizeSpans('"hi"', dart);
+      expect(spans.single.token, isA<StringLit>());
+      expect(spans.single.start, 0);
+      expect(spans.single.end, 4);
+    });
+
+    test('line comment span runs to end of line (exclusive of newline)', () {
+      final spans = tokenizeSpans('// note\nx', dart);
+      final comment = spans.firstWhere((s) => s.token is Comment);
+      expect(comment.start, 0);
+      expect(comment.end, 7);
+      // Newline is a separate Whitespace token.
+      expect(spans[1].token, isA<Whitespace>());
+      expect(spans[1].start, 7);
+    });
+
+    test('merged Plain spans cover the full run', () {
+      // Unicode chars the tokenizer doesn't classify → Plain tokens that
+      // get merged. The merged span must cover the whole run.
+      const source = 'éèê';
+      final spans = tokenizeSpans(source, dart);
+      expect(spans, hasLength(1));
+      expect(spans.single.token, isA<Plain>());
+      expect(spans.single.start, 0);
+      expect(spans.single.end, source.length);
+      expect(spans.single.token.text, source);
+    });
+
+    test('annotation span includes prefix', () {
+      final spans = tokenizeSpans('@override', dart);
+      expect(spans.single.token, isA<Annotation>());
+      expect(spans.single.start, 0);
+      expect(spans.single.end, 9);
+    });
+  });
+
+  group('Spanned generics', () {
+    test('Spanned exposes record fields through getters', () {
+      const s = Spanned.of(Keyword('if'), 3, 5);
+      expect(s.token.text, 'if');
+      expect(s.start, 3);
+      expect(s.end, 5);
+      expect(s.length, 2);
+    });
+
+    test('narrow type parameter upcasts to Spanned<Token>', () {
+      const kw = Spanned.of(Keyword('if'), 0, 2);
+      // Covariance through the record type parameter.
+      const Spanned<Token> wide = kw;
+      expect(wide.token, isA<Keyword>());
+      expect(wide.start, 0);
+    });
+  });
+
+  // ---------------------------------------------------------------------------
+  // Grammar correctness (0.6.0 Path C fixes)
+  // ---------------------------------------------------------------------------
+
+  group('Dart raw strings', () {
+    test("r'...' is one StringLit including the r prefix", () {
+      final tokens = tokenize("r'no\\escape'", dart);
+      expect(_textsOf<StringLit>(tokens), ["r'no\\escape'"]);
+      expect(_ofType<Identifier>(tokens), isEmpty);
+    });
+
+    test('r"..." is one StringLit including the r prefix', () {
+      final tokens = tokenize('r"no\\escape"', dart);
+      expect(_textsOf<StringLit>(tokens), ['r"no\\escape"']);
+    });
+
+    test("r'''...''' (triple-single) is one StringLit", () {
+      final tokens = tokenize("r'''no\\escape'''", dart);
+      expect(_textsOf<StringLit>(tokens), ["r'''no\\escape'''"]);
+    });
+
+    test('r"""...""" (triple-double) is one StringLit', () {
+      final tokens = tokenize('r"""no\\escape"""', dart);
+      expect(_textsOf<StringLit>(tokens), ['r"""no\\escape"""']);
+    });
+
+    test('non-raw identifier r followed by space is Identifier', () {
+      final tokens = tokenize("r 'x'", dart);
+      expect(_textsOf<Identifier>(tokens), ['r']);
+      expect(_textsOf<StringLit>(tokens), ["'x'"]);
+    });
+  });
+
+  group('Scala backtick identifiers', () {
+    test('`type` is one Identifier even though `type` is a keyword', () {
+      final tokens = tokenize('val `type` = 1', scala);
+      expect(_textsOf<Identifier>(tokens), ['`type`']);
+      // `type` inside backticks must NOT appear as a separate keyword.
+      expect(_textsOf<Keyword>(tokens), ['val']);
+    });
+
+    test('unterminated backtick identifier is tolerated', () {
+      final tokens = tokenize('val `noclose', scala);
+      expect(_textsOf<Identifier>(tokens), ['`noclose']);
+    });
+
+    test('backtick identifier with spaces', () {
+      final tokens = tokenize('val `hello world` = 1', scala);
+      expect(_textsOf<Identifier>(tokens), ['`hello world`']);
+    });
+  });
+
+  group('Scala string interpolator prefix', () {
+    test('s"..." is one StringLit including the s prefix', () {
+      final tokens = tokenize(r'val s = s"hi $name"', scala);
+      expect(_textsOf<StringLit>(tokens), ['s"hi \$name"']);
+      // Only `s` as in `val s = ...` should be an Identifier, not the
+      // interpolator prefix.
+      expect(_textsOf<Identifier>(tokens), ['s']);
+    });
+
+    test('f"..." is one StringLit including the f prefix', () {
+      final tokens = tokenize(r'val x = f"$v%.2f"', scala);
+      expect(_textsOf<StringLit>(tokens).last, 'f"\$v%.2f"');
+    });
+
+    test('arbitrary identifier prefix (my_interp"...")', () {
+      final tokens = tokenize('val x = my_interp"body"', scala);
+      expect(_textsOf<StringLit>(tokens), ['my_interp"body"']);
+    });
+
+    test('triple-quoted with prefix (raw"""...""")', () {
+      final tokens = tokenize('val x = raw"""body"""', scala);
+      expect(_textsOf<StringLit>(tokens), ['raw"""body"""']);
+    });
+
+    test('no prefix still works (plain "...")', () {
+      final tokens = tokenize('val x = "body"', scala);
+      expect(_textsOf<StringLit>(tokens), ['"body"']);
+    });
+  });
+
+  group('JSON negative numbers', () {
+    test('-1 is one NumberLit', () {
+      final tokens = tokenize('{"n": -1}', json);
+      expect(_textsOf<NumberLit>(tokens), ['-1']);
+      // No separate `-` token anywhere.
+      expect(_ofType<Operator>(tokens), isEmpty);
+      expect(_textsOf<Plain>(tokens), isEmpty);
+    });
+
+    test('-3.14 is one NumberLit', () {
+      final tokens = tokenize('{"n": -3.14}', json);
+      expect(_textsOf<NumberLit>(tokens), ['-3.14']);
+    });
+
+    test('-1e10 is one NumberLit', () {
+      final tokens = tokenize('{"n": -1e10}', json);
+      expect(_textsOf<NumberLit>(tokens), ['-1e10']);
+    });
+
+    test('positive numbers still work', () {
+      final tokens = tokenize('{"n": 42}', json);
+      expect(_textsOf<NumberLit>(tokens), ['42']);
+    });
+  });
+
+  group('YAML flow collections', () {
+    test('flow sequence [1, 2, 3] classifies as punctuation', () {
+      final tokens = tokenize('[1, 2, 3]', yaml);
+      expect(_textsOf<Punctuation>(tokens), ['[', ',', ',', ']']);
+      expect(_textsOf<NumberLit>(tokens), ['1', '2', '3']);
+    });
+
+    test('flow map {a: 1} classifies as punctuation', () {
+      final tokens = tokenize('{a: 1}', yaml);
+      expect(_textsOf<Punctuation>(tokens), ['{', ':', '}']);
+    });
+
+    test('YAML 1.1 keywords removed: yes/no/on/off are identifiers', () {
+      final tokens = tokenize('a: yes\nb: no\nc: on\nd: off', yaml);
+      // In YAML 1.2 these are strings (we treat as identifiers for highlighting).
+      expect(_textsOf<Keyword>(tokens), isEmpty);
+      expect(_textsOf<Identifier>(tokens), [
+        'a',
+        'yes',
+        'b',
+        'no',
+        'c',
+        'on',
+        'd',
+        'off',
+      ]);
+    });
+
+    test('YAML 1.2 booleans still classified as Keyword', () {
+      final tokens = tokenize('a: true\nb: false\nc: null', yaml);
+      expect(_textsOf<Keyword>(tokens), ['true', 'false', 'null']);
+    });
+  });
+
+  group('Operator vs Punctuation classification', () {
+    test('Dart: + and * are Operator, not Punctuation', () {
+      final tokens = tokenize('a + b * c', dart);
+      expect(_textsOf<Operator>(tokens), ['+', '*']);
+      expect(_ofType<Punctuation>(tokens), isEmpty);
+    });
+
+    test('Dart: parens and comma are Punctuation', () {
+      final tokens = tokenize('f(1, 2)', dart);
+      expect(_textsOf<Punctuation>(tokens), ['(', ',', ')']);
+      expect(_ofType<Operator>(tokens), isEmpty);
+    });
+
+    test('Dart: multi-char operators coalesce into one token', () {
+      final tokens = tokenize('a == b && c', dart);
+      expect(_textsOf<Operator>(tokens), ['==', '&&']);
+    });
+
+    test('Dart: arrow => is one Operator', () {
+      final tokens = tokenize('(x) => x', dart);
+      expect(_textsOf<Operator>(tokens), ['=>']);
+    });
+
+    test('Scala: <- is one Operator (for-comprehensions)', () {
+      final tokens = tokenize('for { x <- xs }', scala);
+      expect(_textsOf<Operator>(tokens), ['<-']);
+    });
+
+    test('JSON: no operators (no operator classification happens)', () {
+      final tokens = tokenize('{"n": 1}', json);
+      expect(_ofType<Operator>(tokens), isEmpty);
+    });
+  });
+
+  group('Shell variables', () {
+    test('bare \$NAME is one Variable', () {
+      final tokens = tokenize(r'echo $HOME', shell);
+      expect(_textsOf<Variable>(tokens), [r'$HOME']);
+      // $ must not leak out as a separate Plain token.
+      expect(_textsOf<Plain>(tokens), isEmpty);
+    });
+
+    test(r'${NAME} (braced) is one Variable', () {
+      final tokens = tokenize(r'echo ${HOME}', shell);
+      expect(_textsOf<Variable>(tokens), [r'${HOME}']);
+    });
+
+    test(r'${NAME:-default} captures full expansion', () {
+      final tokens = tokenize(r'echo ${X:-hi}', shell);
+      expect(_textsOf<Variable>(tokens), [r'${X:-hi}']);
+    });
+
+    test(r'${#NAME} (string length) captures full expansion', () {
+      final tokens = tokenize(r'echo ${#PATH}', shell);
+      expect(_textsOf<Variable>(tokens), [r'${#PATH}']);
+    });
+
+    test('special parameters: \$1, \$@, \$?, \$\$', () {
+      final tokens = tokenize(r'echo $1 $@ $? $$', shell);
+      expect(_textsOf<Variable>(tokens), [r'$1', r'$@', r'$?', r'$$']);
+    });
+
+    test(r'lone $ before ( (for $(...)) emits $ as Variable', () {
+      final tokens = tokenize(r'echo $(ls)', shell);
+      final vars = _textsOf<Variable>(tokens);
+      expect(vars, [r'$']);
+      expect(_textsOf<Punctuation>(tokens), ['(', ')']);
+      expect(_textsOf<Identifier>(tokens), ['echo', 'ls']);
+    });
+
+    test('unterminated \${ is tolerated', () {
+      final tokens = tokenize(r'echo ${foo', shell);
+      expect(_textsOf<Variable>(tokens), [r'${foo']);
+    });
+
+    test('Dart: \$ in identifier does NOT produce Variable', () {
+      final tokens = tokenize(r'var $x = 1;', dart);
+      // Dart allows $ in idents; no Variable classification.
+      expect(_ofType<Variable>(tokens), isEmpty);
+      expect(_textsOf<Identifier>(tokens), [r'$x']);
+    });
+  });
+
+  group('Shell backtick command substitution', () {
+    test('backticks classified as Punctuation', () {
+      final tokens = tokenize('echo `ls`', shell);
+      expect(_textsOf<Punctuation>(tokens), ['`', '`']);
+      expect(_textsOf<Identifier>(tokens), ['echo', 'ls']);
+    });
+
+    test('non-shell grammars do not recognize backticks as punctuation', () {
+      final tokens = tokenize('x `y`', json);
+      // backtick falls through to Plain in JSON.
+      expect(_textsOf<Plain>(tokens), contains('`'));
+    });
+  });
+
+  group('Shell heredocs', () {
+    test('<<EOF heredoc is one token', () {
+      const source = 'cat <<EOF\nbody\nEOF\n';
+      final tokens = tokenize(source, shell);
+      final heredoc = tokens.whereType<StringLit>().single;
+      expect(heredoc.text, startsWith('<<EOF'));
+      expect(heredoc.text, contains('body'));
+    });
+
+    test('<<-EOF heredoc keeps the dash in the token text', () {
+      const source = 'cat <<-EOF\n\tbody\nEOF\n';
+      final tokens = tokenize(source, shell);
+      final heredoc = tokens.whereType<StringLit>().single;
+      expect(heredoc.text, startsWith('<<-EOF'));
+      expect(heredoc.text, contains('\tbody'));
+    });
+
+    test("<<'EOF' single-quoted marker", () {
+      const source = "cat <<'EOF'\nbody\nEOF\n";
+      final tokens = tokenize(source, shell);
+      final heredoc = tokens.whereType<StringLit>().single;
+      expect(heredoc.text, startsWith("<<'EOF'"));
+      expect(heredoc.text, contains('body'));
+    });
+
+    test('unterminated heredoc consumes to end-of-source', () {
+      const source = 'cat <<EOF\nbody with no terminator';
+      final tokens = tokenize(source, shell);
+      final heredoc = tokens.whereType<StringLit>().single;
+      expect(heredoc.text, startsWith('<<EOF'));
+      expect(heredoc.text, endsWith('no terminator'));
+    });
+
+    test('line merely starting with the marker does not terminate', () {
+      const source = 'cat <<EOF\nEOFISH\nEOF\n';
+      final tokens = tokenize(source, shell);
+      final heredoc = tokens.whereType<StringLit>().single;
+      expect(heredoc.text, contains('EOFISH'));
+      expect(heredoc.text, endsWith('EOF\n'));
+    });
+  });
+
+  group('Multi-char operators and generics (0.6.0 polish)', () {
+    test('Dart: Map<String, int>: < and > are Punctuation (generics)', () {
+      final tokens = tokenize('Map<String, int>', dart);
+      expect(_textsOf<Punctuation>(tokens), ['<', ',', '>']);
+      expect(_ofType<Operator>(tokens), isEmpty);
+    });
+
+    test('Dart: a <= b is one Operator', () {
+      final tokens = tokenize('a <= b', dart);
+      expect(_textsOf<Operator>(tokens), ['<=']);
+      expect(_ofType<Punctuation>(tokens), isEmpty);
+    });
+
+    test('Dart: a >= b is one Operator', () {
+      final tokens = tokenize('a >= b', dart);
+      expect(_textsOf<Operator>(tokens), ['>=']);
+    });
+
+    test('Dart: a ?? b is one Operator', () {
+      final tokens = tokenize('a ?? b', dart);
+      expect(_textsOf<Operator>(tokens), ['??']);
+    });
+
+    test('Dart: a?.b is one Operator', () {
+      final tokens = tokenize('a?.b', dart);
+      expect(_textsOf<Operator>(tokens), ['?.']);
+    });
+
+    test('Dart: nullable type String?: ? is Punctuation', () {
+      final tokens = tokenize('String? x', dart);
+      expect(_textsOf<Punctuation>(tokens), ['?']);
+    });
+
+    test('Dart: arrow => is one Operator, not = plus >', () {
+      final tokens = tokenize('(x) => x', dart);
+      expect(_textsOf<Operator>(tokens), ['=>']);
+    });
+
+    test('Dart: compound assign += -= *= etc', () {
+      final tokens = tokenize('x += 1; y -= 2; z *= 3;', dart);
+      expect(_textsOf<Operator>(tokens), ['+=', '-=', '*=']);
+    });
+
+    test('Dart: ??= compound assign', () {
+      final tokens = tokenize('x ??= 1', dart);
+      expect(_textsOf<Operator>(tokens), ['??=']);
+    });
+
+    test('Dart: x=-1 keeps = and - as separate Operators', () {
+      final tokens = tokenize('x=-1', dart);
+      expect(_textsOf<Operator>(tokens), ['=', '-']);
+      expect(_textsOf<NumberLit>(tokens), ['1']);
+    });
+
+    test('Scala: <- is one Operator', () {
+      final tokens = tokenize('for { x <- xs }', scala);
+      expect(_textsOf<Operator>(tokens), ['<-']);
+    });
+
+    test('Scala: -> is one Operator', () {
+      final tokens = tokenize('val m = Map(1 -> "a")', scala);
+      expect(_textsOf<Operator>(tokens), contains('->'));
+    });
+
+    test('Scala: :: is one Operator', () {
+      final tokens = tokenize('1 :: Nil', scala);
+      expect(_textsOf<Operator>(tokens), ['::']);
+    });
+
+    test('Shell: && is one Operator', () {
+      final tokens = tokenize('a && b', shell);
+      expect(_textsOf<Operator>(tokens), ['&&']);
+    });
+
+    test('Shell: || is one Operator', () {
+      final tokens = tokenize('a || b', shell);
+      expect(_textsOf<Operator>(tokens), ['||']);
+    });
+  });
+
+  group('lossless roundtrip under new grammar rules', () {
+    // Every grammar-fix input must still round-trip losslessly.
+    test('Dart raw strings round-trip', () {
+      _expectLossless("r'no\\escape' + r\"also\" + r'''triple'''", dart);
+    });
+
+    test('Scala interpolators round-trip', () {
+      _expectLossless(r'val s = s"hi $name"; val f = f"$x%.2f"', scala);
+    });
+
+    test('Scala backtick idents round-trip', () {
+      _expectLossless('val `type` = 1; val `hello world` = 2', scala);
+    });
+
+    test('JSON negatives round-trip', () {
+      _expectLossless('{"a": -1, "b": -3.14, "c": -1e10}', json);
+    });
+
+    test('YAML flow collections round-trip', () {
+      _expectLossless('a: [1, 2, 3]\nb: {x: 1, y: 2}\n', yaml);
+    });
+
+    test('shell variables round-trip', () {
+      _expectLossless(r'echo $HOME ${PATH:-/bin} $(ls) `pwd`', shell);
+    });
+
+    test('shell heredocs round-trip', () {
+      _expectLossless('cat <<EOF\nbody\nEOF\n', shell);
+    });
+  });
+