From 6679a8971b004fe9dcae2b3a52a6e77640842e53 Mon Sep 17 00:00:00 2001 From: David Neto Date: Mon, 13 Feb 2023 14:54:21 -0500 Subject: [PATCH] wgsl: discover template lists early (Lookahead disambiguation of less-than vs template argument list (v2)) (#3803) * Implement a tree-sitter scanner for template disambiguation Use a custom scanner to disambiguate between template-argument-lists and less-than / greater-than. * Build the treesitter shared library on our own The py-tree-sitter compilation doesn't work on macOS because it doesn't know to use -std=c++17 when compiling C++ code. * Grammar analyzer understands many extra external tokens * Only do slow actions when data is newer. * Grammar.py: Generate syntax_sym references for extra external tokens * Allow syntactic tokens to be named without backticks * Regenerate recursive grammar * type_specifier is fully_qualified_ident This is much simpler than using "expression". But note that in future we may want to have type expressions like unions, as in TypeScript. That door is still open, as the grammar was unambiguous (and recursive-descentable) even with type being "expression" * Add TODOs * Remove extraneous grammar grouping * analyze/Grammar.py: Make _disambiguate_template an empty token In the template-matching scheme, it doesn't appear in source text. Make it empty so that first and follow sets are computed correctly. * Explain the custom tokens Delete redundant syntactic token definitions * Add more TODOs in scan, and more comments * Explain why "var" is special-cased Ben answered * disambiguate token does not have to be optional * Refactor extract-grammar.py Add argument parsing Add a --flow argument to specify which step to run. * Add WGSL parsing unit tests * Fix tree-sitter parsing of bitcast Need to add _disambiguate_template before the template list so they are parsed as templates. * Add more unit tests * analyze/Grammar.py: disambiguate token treated as nonempty * scanner.cc: better comments, and dump "valids" array when dumping * "var" is followed by template list scanner.cc: Remove the "var" exception * Better explanation of synthetic token * Fix comment from merge conflict * Add explicit dependencies on the fixed wgsl include files * Support elements that are hidden spans. * Change _disambiguate_template to a hidden span token It's a Treesitter implementation detail, so it doesn't belong in the spec. * The relational and shift tokens are displayed plainly in the HTML They are remapped to custom tokens by the extract-grammar process. Pretty printing in analyze/Grammar.py has to remove leading underscores so they link correctly to definitions. * Fix formatting of bitcast rule * Start writing template disambiguation * Add TODOs in the custom scanner * Describe the template list discovery algorithm. * Add missing disambiguation for fully-qualified-ident * Custom scanner: correctly mark an ordinary greater-than code point * Better wording of CurrentPosition * scanner.cc: Show more details about expected valid tokens * Add another disambiguation token ident use in core_lhs_expression makes simple assignments work. * scanner.cc: Add more logging * Lots more logging * Many more unit tests * Make types and type-generating names keywords again Insert disambiguating spans and template_args_start and _end where needed.
Fixes: #3770 * scanner.cc: Better comment about the sentinel value * extract-grammar.py: GCC wants -shared and -fPIC link flags * extract-grammar.py: GCC requires link flags after, not before --------- Co-authored-by: Ben Clayton --- .clang-format | 2 + wgsl/Makefile | 32 +- wgsl/analyze/Grammar.py | 56 +- wgsl/extract-grammar.py | 962 ++++++++++++++++++++------------ wgsl/index.bs | 207 ++++++- wgsl/scanner.cc | 992 +++++++++++++++++++++++++++++++++ wgsl/wgsl.recursive.bs.include | 46 +- wgsl/wgsl_unit_tests.py | 150 +++++ 8 files changed, 2037 insertions(+), 410 deletions(-) create mode 100644 .clang-format create mode 100644 wgsl/scanner.cc create mode 100644 wgsl/wgsl_unit_tests.py diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000..2fb833a5df --- /dev/null +++ b/.clang-format @@ -0,0 +1,2 @@ +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +BasedOnStyle: Chromium diff --git a/wgsl/Makefile b/wgsl/Makefile index 5374b9d6cf..6bd8439878 100644 --- a/wgsl/Makefile +++ b/wgsl/Makefile @@ -1,14 +1,14 @@ .PHONY: all clean nfkc validate lalr validate-examples all: index.html nfkc validate test diagrams -validate: lalr validate-examples -validate-examples: grammar/grammar.js +validate: lalr unit_tests validate-examples clean: - rm -f index.html index.pre.html grammar/grammar.js wgsl.lalr.txt + rm -f index.html index.pre.html index.bs.pre grammar/grammar.js grammar/build wgsl.lalr.txt + # Generate spec HTML from Bikeshed source. -WGSL_SOURCES:=index.bs $(wildcard wgsl.*.bs.include) +WGSL_SOURCES:=index.bs scanner.cc wgsl.recursive.bs.include wgsl.reserved.bs.include index.pre.html: $(WGSL_SOURCES) DIE_ON=everything bash ../tools/invoke-bikeshed.sh $@ $(WGSL_SOURCES) @@ -23,14 +23,28 @@ diagrams: $(MERMAID_OUTPUTS) img/%.mmd.svg: diagrams/%.mmd ../tools/invoke-mermaid.sh ../tools/mermaid.json sh ../tools/invoke-mermaid.sh -i $< -o $@ -# Extract WGSL grammar from the spec, validate it with Treesitter, -# and use Treesitter to parse many code examples in the spec. -grammar/grammar.js: index.bs extract-grammar.py - python3 ./extract-grammar.py index.bs grammar/grammar.js +TREESITTER_GRAMMAR_INPUT := grammar/grammar.js +TREESITTER_PARSER := grammar/build/wgsl.so + +# Extract WGSL grammar from the spec, validate it by building a Treesitter parser from it. +$(TREESITTER_GRAMMAR_INPUT) $(TREESITTER_PARSER): index.bs scanner.cc extract-grammar.py + python3 ./extract-grammar.py --spec index.bs --scanner scanner.cc --tree-sitter-dir grammar --flow xb + +.PHONY: validate-examples +# Use Treesitter to parse many code examples in the spec. +validate-examples: $(TREESITTER_PARSER) + python3 ./extract-grammar.py --flow e + +.PHONY: unit_tests +# Use Treesitter to parse code samples +unit_tests: $(TREESITTER_PARSER) wgsl_unit_tests.py + python3 wgsl_unit_tests.py --parser $(TREESITTER_PARSER) # The grammar in JSON form, emitted by Treesitter.
WGSL_GRAMMAR=grammar/src/grammar.json -$(WGSL_GRAMMAR) : grammar/grammar.js +$(WGSL_GRAMMAR) : $(TREESITTER_GRAMMAR_INPUT) + +wgsl_unit_tests: .PHONY: nfkc nfkc: diff --git a/wgsl/analyze/Grammar.py b/wgsl/analyze/Grammar.py index d051856205..f51b7a99ee 100755 --- a/wgsl/analyze/Grammar.py +++ b/wgsl/analyze/Grammar.py @@ -44,6 +44,7 @@ import json import functools +import sys from ObjectRegistry import RegisterableObject, ObjectRegistry from collections import defaultdict @@ -323,8 +324,25 @@ def with_meta(phrase,metachar,print_option): # Print ourselves if print_option.bikeshed: context = 'recursive descent syntax' - if print_option.grammar.rules[name].is_token(): + g = print_option.grammar + if g.rules[name].is_token(): context = 'syntax' + if name in g.extra_externals: + context = 'syntax_sym' + if name == '_disambiguate_template': + # This is an implementation detail, so make it invisible. + return '' + else: + without_underscore = ['_less_than', + '_less_than_equal', + '_greater_than', + '_greater_than_equal', + '_shift_left', + '_shift_left_assign', + '_shift_right', + '_shift_right_assign'] + if name in without_underscore: + name = name[1:] return "[={}/{}=]".format(context,name) return name if isinstance(rule,Choice): @@ -350,7 +368,7 @@ def with_meta(phrase,metachar,print_option): # If it's not canonical, then it can have nesting. return "(" + inside + nl + ")" if isinstance(rule,Seq): - return " ".join([i.pretty_str(print_option) for i in rule]) + return " ".join(filter(lambda i: len(i)>0, [i.pretty_str(print_option) for i in rule])) if isinstance(rule,Repeat1): return "( " + "".join([i.pretty_str(print_option) for i in rule]) + " )+" raise RuntimeError("unexpected node: {}".format(str(rule))) @@ -859,6 +877,21 @@ def is_accepting(self): def at_end(self): return self.position == len(self.items()) +def json_externals(json): + """ + Returns the set of names of symbols in the "externals" section of the + Treesitter JSON grammar. + + Data looks like this, for section "externals". + { + "externals": [ + { "type": "SYMBOL", "name": "_block_comment" }, + { "type": "SYMBOL", "name": "_error_sentinel" } + ] + } + """ + return set([ x["name"] for x in json.get("externals",[]) ]) + def json_hook(grammar,memo,tokens_only,dct): """ @@ -1801,6 +1834,22 @@ def __init__(self, json_text, start_symbol, ignore='_reserved'): # First decode it without any interpretation. pass0 = json.loads(json_text) + + # Get the external tokens; these are not necessarily represented in the rules. + external_tokens = json_externals(pass0) + #print(external_tokens,file=sys.stderr) + defined_rules = set(pass0["rules"].keys()) + # The set of external tokens that don't have an ordinary definition in the grammar. + self.extra_externals = external_tokens - defined_rules + for e in self.extra_externals: + content = "\\u200B{}".format(e) + if e == '_disambiguate_template': + # This is a zero-width token used for Treesitter's benefit + #content = '' + pass + # Create a placeholder definition + pass0["rules"][e] = {"type":"TOKEN","content":{"type":"PATTERN","value":content}} + # Remove any rules that should be ignored # The WGSL grammar has _reserved, which includes 'attribute' but # that is also the name of a different grammar rule. @@ -1922,6 +1971,7 @@ def pretty_str(self,print_option=PrintOption()): token_rules = set() + # Look for defined rules that look better as absorbed into their uses. for name, rule in self.rules.items(): # Star-able is also optional-able, so starrable must come first.
starred_phrase = rule.as_starred(name) @@ -1938,6 +1988,8 @@ def pretty_str(self,print_option=PrintOption()): if len(phrase)==1 and phrase[0].is_token(): token_rules.add(name) + # A rule that was generated to satisfy canonicalization is better + # presented as absorbed in its original parent. for name, rule in self.rules.items(): # We only care about rules generated during canonicalization if name.find('.') > 0 or name.find('/') > 0: diff --git a/wgsl/extract-grammar.py b/wgsl/extract-grammar.py index 938c9fdae1..9329fe08eb 100755 --- a/wgsl/extract-grammar.py +++ b/wgsl/extract-grammar.py @@ -3,28 +3,50 @@ from datetime import date from string import Template +import argparse import os import re import subprocess import sys +import shutil +import wgsl_unit_tests +from distutils.ccompiler import new_compiler +from distutils.unixccompiler import UnixCCompiler from tree_sitter import Language, Parser -HEADER = """ -// Copyright (C) [$YEAR] World Wide Web Consortium, -// (Massachusetts Institute of Technology, European Research Consortium for -// Informatics and Mathematics, Keio University, Beihang). -// All Rights Reserved. -// -// This work is distributed under the W3C (R) Software License [1] in the hope -// that it will be useful, but WITHOUT ANY WARRANTY; without even the implied -// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -// -// [1] http://www.w3.org/Consortium/Legal/copyright-software - -// **** This file is auto-generated. Do not edit. **** - -""".lstrip() +class Options(): + def __init__(self,bs_filename, tree_sitter_dir, scanner_cc_filename): + self.script = 'extract-grammar.py' + self.bs_filename = bs_filename + self.grammar_dir = tree_sitter_dir + self.scanner_cc_filename = scanner_cc_filename + self.wgsl_shared_lib = os.path.join(self.grammar_dir,"build","wgsl.so") + self.grammar_filename = os.path.join(self.grammar_dir,"grammar.js") + self.verbose = False + + def __str__(self): + parts = [] + parts.append("script = {}".format(self.script)) + parts.append("bs_filename = {}".format(self.bs_filename)) + parts.append("grammar_dir = {}".format(self.grammar_dir)) + parts.append("grammar_filename = {}".format(self.grammar_filename)) + parts.append("scanner_cc_filename = {}".format(self.scanner_cc_filename)) + parts.append("wgsl_shared_lib = {}".format(self.wgsl_shared_lib)) + return "Options({})".format(",".join(parts)) + +def newer_than(first,second): + """ + Returns true if file 'first' is newer than 'second', + or if 'second' does not exist + """ + if not os.path.exists(first): + raise Exception("Missing file {}".format(first)) + if not os.path.exists(second): + return True + first_time = os.path.getmtime(first) + second_time = os.path.getmtime(second) + return first_time >= second_time def read_lines_from_file(filename, exclusions): @@ -34,7 +56,8 @@ def read_lines_from_file(filename, exclusions): """ file = open(filename, "r") # Break up the input into lines, and skip empty lines. 
- parts = [j for i in [i.split("\n") for i in file.readlines()] for j in i if len(j) > 0] + parts = [j for i in [i.split("\n") for i in file.readlines()] + for j in i if len(j) > 0] result = [] include_re = re.compile('path:\s+(\S+)') for line in parts: @@ -42,35 +65,17 @@ def read_lines_from_file(filename, exclusions): if m: included_file = m.group(1) if included_file not in exclusions: - print("including {}".format(included_file)) - result.extend(read_lines_from_file(included_file,exclusions)) + result.extend(read_lines_from_file(included_file, exclusions)) continue result.append(line) return result -scanner_lines = read_lines_from_file(sys.argv[1], {'wgsl.recursive.bs.include'}) - -# Skip lines like: -# <pre class='def'>
-# </pre>
-scanner_lines = filter(lambda s: not s.startswith("<pre") and not s.startswith("</pre"), scanner_lines)
-
-# Replace comments in rule text
-scanner_lines = [re.sub('<!--.*-->', '', line) for line in scanner_lines]
-
-grammar_filename = sys.argv[2]
-grammar_path = os.path.dirname(grammar_filename)
-os.makedirs(grammar_path, exist_ok=True)
-grammar_file = open(grammar_filename, "w")
-
-# Global variable holding the current line text.
-line = ""
 
 """
 Scanner classes are used to parse contiguous sets of lines in the WGSL bikeshed
 source text.
 """
+
 class Scanner:
 
     @staticmethod
@@ -130,6 +135,9 @@ def parse(lines, i):
 
 
 class scanner_rule(Scanner):
+    """
+    A scanner that reads grammar rules from bikeshed source text.
+    """
     @staticmethod
     def name():
         return "rule"
@@ -188,7 +196,10 @@ def parse(lines, i):
         return (None, None, None)
 
 
-class scanner_example(Scanner):  # Not an example of a scanner, scanner of examples from specification
+class scanner_example(Scanner):
+    """
+    A scanner that reads WGSL program examples from bikeshed source text.
+    """
     @staticmethod
     def name():
         return "example"
@@ -231,134 +242,24 @@ def parse(lines, i):
         return (None, line, 0)
 
 
-scanner_spans = [scanner_rule,
-                 scanner_example]
-
-
-scanner_components = {i.name(): {} for i in scanner_spans}
-
-scanner_i = 0 # line number of the current line
-scanner_span = None
-scanner_record = False
-last_key = None   # The rule name, if the most recently parsed thing was a rule.
-last_value = None # The most recently parsed thing
-while scanner_i < len(scanner_lines):
-    # Try both the rule and the example scanners.
-    for j in scanner_spans:
-        scanner_begin = j.begin(scanner_lines, scanner_i)
-        if scanner_begin[0]:
-            # Use this scanner
-            scanner_span = None
-            scanner_record = False
-            last_key = None
-            last_value = None
-            scanner_span = j
-            if scanner_begin[1] != None:
-                last_key = scanner_begin[1]
-            scanner_i += scanner_begin[-1]
-        if scanner_span == j:
-            # Check if we should stop using this scanner.
-            scanner_end = j.end(scanner_lines, scanner_i)
-            if scanner_end[0]:
-                # Yes, stop using this scanner.
-                scanner_span = None
-                scanner_record = False
-                last_key = None
-                last_value = None
-                scanner_i += scanner_end[-1]
-    if scanner_span != None:
-        # We're are in the middle of scanning a span of lines.
-        if scanner_record:
-            scanner_skip = scanner_span.skip(scanner_lines, scanner_i)
-            if scanner_skip[0]:
-                # Stop recording
-                scanner_record = False
-                scanner_i += scanner_skip[-1]  # Advance past this line
-        else:
-            # Should we start recording?
-            scanner_record_value = scanner_span.record(
-                scanner_lines, scanner_i)
-            if scanner_record_value[0]:
-                # Start recording
-                scanner_record = True
-                if last_key != None and scanner_span.name() == "example":  # TODO Remove special case
-                    if last_key in scanner_components[scanner_span.name()]:
-                        raise RuntimeError("line " + str(scanner_i) + ": example with duplicate name: " + last_key)
-                    else:
-                        scanner_components[scanner_span.name()][last_key] = []
-                scanner_i += scanner_record_value[-1]
-        if scanner_record and scanner_span.valid(scanner_lines, scanner_i):
-            # Try parsing this line
-            scanner_parse = scanner_span.parse(scanner_lines, scanner_i)
-            if scanner_parse[2] < 0:
-                # This line continues the rule parsed on the immediately preceding lines.
-                if (scanner_parse[1] != None and
-                        last_key != None and
-                        last_value != None and
-                        last_key in scanner_components[scanner_span.name()] and
-                        len(scanner_components[scanner_span.name()][last_key]) > 0):
-                    scanner_components[scanner_span.name(
-                    )][last_key][-1] += scanner_parse[1]
-            else:
-                if scanner_parse[0] != None:
-                    # It's a rule, with name in the 0'th position.
-                    last_key = scanner_parse[0]
-                    if scanner_parse[1] != None:
-                        last_value = scanner_parse[1]
-                        if last_key not in scanner_components[scanner_span.name()]:
-                            # Create a new entry for this rule
-                            scanner_components[scanner_span.name()][last_key] = [
-                                last_value]
-                        else:
-                            # Append to the existing entry.
-                            scanner_components[scanner_span.name()][last_key].append(
-                                last_value)
-                    else:
-                        # Reset
-                        last_value = None
-                        scanner_components[scanner_span.name()][last_key] = []
-                else:
-                    # It's example text
-                    if scanner_parse[1] != None:
-                        last_value = scanner_parse[1]
-                        scanner_components[scanner_span.name()][last_key].append(
-                            last_value)
-                scanner_i += scanner_parse[-1] # Advance line index
-    scanner_i += 1
-
-
-grammar_source = ""
-
-grammar_source += r"""
-module.exports = grammar({
-    name: 'wgsl',
-
-    externals: $ => [
-        $._block_comment,
-    ],
-
-    extras: $ => [
-        $._comment,
-        $._block_comment,
-        $._blankspace,
-    ],
-
-    inline: $ => [
-        $.global_decl,
-        $._reserved,
-    ],
-
-    // WGSL has no parsing conflicts.
-    conflicts: $ => [],
-
-    word: $ => $.ident_pattern_token,
-
-    rules: {
-"""[1:-1]
-grammar_source += "\n"
-
+# These fixed tokens must be parsed by the custom scanner.
+# This is needed to support template disambiguation.
+custom_simple_tokens = {
+    '>': '_greater_than',
+    '>=': '_greater_than_equal',
+    '<': '_less_than',
+    '<=': '_less_than_equal',
+    '<<': '_shift_left',
+    '>>': '_shift_right',
+    '<<=': '_shift_left_assign',
+    '>>=': '_shift_right_assign'
+}
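To illustrate the mapping this table drives — a hypothetical standalone snippet, not part of extract-grammar.py, mirroring the backtick-quoted-terminal branch of grammar_from_rule_item below:

```python
# Terminals that collide with template-list delimiters are emitted as
# references to the custom scanner's tokens; all others stay literal.
def spell_terminal(item, custom_simple_tokens):
    # From "`'>>'`" pick out '>>'.
    content = item[2:-2]
    if content in custom_simple_tokens:
        return custom_simple_tokens[content]  # e.g. '_shift_right'
    return f"token('{content}')"              # e.g. "token('&&')"

# Example, using the table above:
#   spell_terminal("`'>>'`", custom_simple_tokens) == '_shift_right'
#   spell_terminal("`'&&'`", custom_simple_tokens) == "token('&&')"
```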
 
 def grammar_from_rule_item(rule_item):
+    """
+    Returns a string for the JavaScript expression for this rule.
+    """
+    global custom_simple_tokens
     result = ""
     item_choice = False
     items = []
@@ -369,16 +270,50 @@ def grammar_from_rule_item(rule_item):
         i_skip = 0
         i_item = ""
         if rule_item[i].startswith("[=syntax/"):
+            # From '[=syntax/foobar=]' pick out 'foobar'
             i_item = rule_item[i].split("[=syntax/")[1].split("=]")[0]
             i_item = f"$.{i_item}"
         elif rule_item[i].startswith("`/"):
+            # From "`/pattern/`" pick out '/pattern/'
             i_item = f"token({rule_item[i][1:-1]})"
         elif rule_item[i].startswith("`'"):
-            i_item = f"token({rule_item[i][1:-1]})"
+            # From "`'&&'`" pick out '&&'
+            content = rule_item[i][2:-2]
+            # If the name maps to a custom token, then use that, otherwise,
+            # use the content name itself.
+            if content in custom_simple_tokens:
+                i_item = custom_simple_tokens[content]
+            else:
+                i_item = f"token('{content}')"
+        elif rule_item[i].startswith("<span"):
+            # From ['<span', 'class=hidden>_disambiguate_template</span>']
+            # pick out '_disambiguate_template'
+            match = re.fullmatch("[^>]*>(.*)</span>",rule_item[i+1])
+            token = match.group(1)
+            i_item = f"$.{token}"
+            i += 1
+        elif rule_item[i].startswith("<a"):
+            # From ['<a', 'for=syntax_kw', "lt=true>`'true'`</a>"]
+            # pick out "true"
+            match = re.fullmatch("[^>]*>`'(.*)'`</a>",rule_item[i+2])
+            if match:
+                token = match.group(1)
+            else:
+                # Now try it without `' '` surrounding the element content text.
+                # From ['<a', 'for=syntax_sym', 'lt=_disam>_disam</a>']
+                # pick out "_disam"
+                match = re.fullmatch("[^>]*>(.*)</a>",rule_item[i+2])
+                token = match.group(1)
+            if token in custom_simple_tokens:
+                token = custom_simple_tokens[token]
+                i_item = f"$.{token}"
+            elif token.startswith("_") and token != "_":
+                i_item = f"$.{token}"
+            else:
+                i_item = f"""token('{token}')"""
             i += 2
         elif rule_item[i] == "(":
+            # Extract a parenthesized rule
             j = i + 1
             j_span = 0
             rule_subitem = []
@@ -433,236 +368,557 @@ def grammar_from_rule(key, value):
     return result
 
 
-scanner_components[scanner_rule.name()]["_comment"] = [["`/\\/\\/.*/`"]]
+class ScanResult(dict):
+    """
+    A dictionary containing the results of scanning the WGSL spec.
+
+    self['raw']
+         A list of the Bikeshed source text lines, after include expansion and before
+         further filtering
+    self['rule']
+         A dictionary mapping a parsed grammar rule to its definition.
+    self['example']
+         A dictionary mapping the name of an example to the
+         WGSL source text for the example.
+         The name is taken from the "heading" attribute of the <div class=example>
element. + """ + def __init__(self): + self['rule'] = dict() + self['example'] = dict() + self['raw'] = [] -# Following sections are to allow out-of-order per syntactic grammar appearance of rules +def read_spec(options): + """ + Returns a ScanResult from parsing the Bikeshed source of the WGSL spec. + """ + result = ScanResult() -rule_skip = set() + # Get the input bikeshed text. + scanner_lines = read_lines_from_file( + options.bs_filename, {'wgsl.recursive.bs.include'}) + # Make a *copy* of the text input because we'll filter it later. + result['raw'] = [x for x in scanner_lines] -for rule in ["translation_unit", "global_directive", "global_decl"]: - grammar_source += grammar_from_rule( - rule, scanner_components[scanner_rule.name()][rule]) + ",\n" - rule_skip.add(rule) + # Skip lines like: + #
+    #  
+ scanner_lines = filter(lambda s: not s.startswith( + "
") and not s.startswith("
', '', line) for line in scanner_lines]
 
+    os.makedirs(options.grammar_dir, exist_ok=True)
 
-for key, value in scanner_components[scanner_rule.name()].items():
-    if key.endswith("_literal") and key not in rule_skip:
-        grammar_source += grammar_from_rule(key, value) + ",\n"
-        rule_skip.add(key)
+    # Global variable holding the current line text.
+    line = ""
 
 
-# Extract constituents
+    scanner_spans = [scanner_rule,
+                     scanner_example]
 
+    scanner_i = 0  # line number of the current line
+    scanner_span = None
+    scanner_record = False
+    # The rule name, if the most recently parsed thing was a rule.
+    last_key = None
+    last_value = None  # The most recently parsed thing
+    while scanner_i < len(scanner_lines):
+        # Try both the rule and the example scanners.
+        for j in scanner_spans:
+            scanner_begin = j.begin(scanner_lines, scanner_i)
+            if scanner_begin[0]:
+                # Use this scanner
+                scanner_span = None
+                scanner_record = False
+                last_key = None
+                last_value = None
+                scanner_span = j
+                if scanner_begin[1] != None:
+                    last_key = scanner_begin[1]
+                scanner_i += scanner_begin[-1]
+            if scanner_span == j:
+                # Check if we should stop using this scanner.
+                scanner_end = j.end(scanner_lines, scanner_i)
+                if scanner_end[0]:
+                    # Yes, stop using this scanner.
+                    scanner_span = None
+                    scanner_record = False
+                    last_key = None
+                    last_value = None
+                    scanner_i += scanner_end[-1]
+        if scanner_span != None:
+            # We are in the middle of scanning a span of lines.
+            if scanner_record:
+                scanner_skip = scanner_span.skip(scanner_lines, scanner_i)
+                if scanner_skip[0]:
+                    # Stop recording
+                    scanner_record = False
+                    scanner_i += scanner_skip[-1]  # Advance past this line
+            else:
+                # Should we start recording?
+                scanner_record_value = scanner_span.record(
+                    scanner_lines, scanner_i)
+                if scanner_record_value[0]:
+                    # Start recording
+                    scanner_record = True
+                    if last_key != None and scanner_span.name() == "example":  # TODO Remove special case
+                        if last_key in result[scanner_span.name()]:
+                            raise RuntimeError(
+                                "line " + str(scanner_i) + ": example with duplicate name: " + last_key)
+                        else:
+                            result[scanner_span.name()][last_key] = []
+                    scanner_i += scanner_record_value[-1]
+            if scanner_record and scanner_span.valid(scanner_lines, scanner_i):
+                # Try parsing this line
+                scanner_parse = scanner_span.parse(scanner_lines, scanner_i)
+                if scanner_parse[2] < 0:
+                    # This line continues the rule parsed on the immediately preceding lines.
+                    if (scanner_parse[1] != None and
+                            last_key != None and
+                            last_value != None and
+                            last_key in result[scanner_span.name()] and
+                            len(result[scanner_span.name()][last_key]) > 0):
+                        result[scanner_span.name(
+                        )][last_key][-1] += scanner_parse[1]
+                else:
+                    if scanner_parse[0] != None:
+                        # It's a rule, with name in the 0'th position.
+                        last_key = scanner_parse[0]
+                        if scanner_parse[1] != None:
+                            last_value = scanner_parse[1]
+                            if last_key not in result[scanner_span.name()]:
+                                # Create a new entry for this rule
+                                result[scanner_span.name()][last_key] = [
+                                    last_value]
+                            else:
+                                # Append to the existing entry.
+                                result[scanner_span.name()][last_key].append(
+                                    last_value)
+                        else:
+                            # Reset
+                            last_value = None
+                            result[scanner_span.name()][last_key] = []
+                    else:
+                        # It's example text
+                        if scanner_parse[1] != None:
+                            last_value = scanner_parse[1]
+                            result[scanner_span.name()][last_key].append(
+                                last_value)
+                    scanner_i += scanner_parse[-1]  # Advance line index
+        scanner_i += 1
 
-def not_token_only(value):
-    result = False
-    for i in value:
-        result = result or len(
-            [j for j in i if not j.startswith("`/") and not j.startswith("`'")]) > 0
+    result[scanner_rule.name()]["_comment"] = [["`/\\/\\/.*/`"]]
     return result
 
 
-for key, value in scanner_components[scanner_rule.name()].items():
-    if not key.startswith("_") and key != "ident" and not_token_only(value) and key not in rule_skip:
-        grammar_source += grammar_from_rule(key, value) + ",\n"
-        rule_skip.add(key)
+def flow_extract(options, scan_result):
+    """
+    Write the tree-sitter grammar definition for WGSL
+
+    Args:
+        options: Options
+        scan_result: the ScanResult holding rules and examples extracted from the WGSL spec
+    """
+    print("{}: Extract...".format(options.script))
 
+    input_bs_is_fresh = True
+    previously_scanned_bs_file = options.bs_filename + ".pre"
+    if not os.path.exists(options.grammar_filename):
+        # Must regenerate the tree-sitter grammar file
+        pass
+    else:
+        # Check against previously scanned text
+        if os.path.exists(previously_scanned_bs_file):
+            with open(previously_scanned_bs_file,"r") as previous_file:
+                previous_lines = previous_file.readlines()
+                if previous_lines == scan_result['raw']:
+                    input_bs_is_fresh = False
 
-# Extract tokens
+    if input_bs_is_fresh:
+        rules = scan_result['rule']
 
+        grammar_source = ""
 
-for key, value in scanner_components[scanner_rule.name()].items():
-    if not key.startswith("_") and key != "ident" and key not in rule_skip:
-        grammar_source += grammar_from_rule(key, value) + ",\n"
-        rule_skip.add(key)
+        grammar_source += r"""
+        module.exports = grammar({
+            name: 'wgsl',
 
+            externals: $ => [
+                $._block_comment,
+                $._disambiguate_template,
+                $._template_args_start,
+                $._template_args_end,
+                $._less_than,
+                $._less_than_equal,
+                $._shift_left,
+                $._shift_left_assign,
+                $._greater_than,
+                $._greater_than_equal,
+                $._shift_right,
+                $._shift_right_assign,
+                $._error_sentinel,
+            ],
 
-# Extract underscore
+            extras: $ => [
+                $._comment,
+                $._block_comment,
+                $._blankspace,
+            ],
 
+            inline: $ => [
+                $.global_decl,
+                $._reserved,
+            ],
 
-for key, value in scanner_components[scanner_rule.name()].items():
-    if key.startswith("_") and key != "_comment" and key != "_blankspace" and key not in rule_skip:
-        grammar_source += grammar_from_rule(key, value) + ",\n"
-        rule_skip.add(key)
+            // WGSL has no parsing conflicts.
+            conflicts: $ => [],
 
+            word: $ => $.ident_pattern_token,
 
-# Extract ident
+            rules: {
+        """[1:-1]
+        grammar_source += "\n"
 
+        # The following sections allow rules to be emitted out of order, relative to their appearance in the syntactic grammar.
 
-grammar_source += grammar_from_rule(
-    "ident", scanner_components[scanner_rule.name()]["ident"]) + ",\n"
-rule_skip.add("ident")
+        rule_skip = set()
 
+        for rule in ["translation_unit", "global_directive", "global_decl"]:
+            grammar_source += grammar_from_rule(
+                rule, rules[rule]) + ",\n"
+            rule_skip.add(rule)
 
-# Extract comment
 
+        # Extract literals
 
-grammar_source += grammar_from_rule(
-    "_comment", scanner_components[scanner_rule.name()]["_comment"]) + ",\n"
-rule_skip.add("_comment")
 
+        for key, value in rules.items():
+            if key.endswith("_literal") and key not in rule_skip:
+                grammar_source += grammar_from_rule(key, value) + ",\n"
+                rule_skip.add(key)
 
-# Extract space
 
+        # Extract constituents
 
-grammar_source += grammar_from_rule(
-    "_blankspace", scanner_components[scanner_rule.name()]["_blankspace"])
-rule_skip.add("_blankspace")
 
+        def not_token_only(value):
+            result = False
+            for i in value:
+                result = result or len(
+                    [j for j in i if not j.startswith("`/") and not j.startswith("`'")]) > 0
+            return result
 
-grammar_source += "\n"
-grammar_source += r"""
-    },
-});
-"""[1:-1]
 
-headerTemplate = Template(HEADER)
-grammar_file.write(headerTemplate.substitute(
-    YEAR=date.today().year) + grammar_source + "\n")
-grammar_file.close()
+        for key, value in rules.items():
+            if not key.startswith("_") and not_token_only(value) and key not in rule_skip:
+                grammar_source += grammar_from_rule(key, value) + ",\n"
+                rule_skip.add(key)
 
-with open(grammar_path + "/package.json", "w") as grammar_package:
-    grammar_package.write('{\n')
-    grammar_package.write('    "name": "tree-sitter-wgsl",\n')
-    grammar_package.write('    "dependencies": {\n')
-    grammar_package.write('        "nan": "^2.15.0"\n')
-    grammar_package.write('    },\n')
-    grammar_package.write('    "devDependencies": {\n')
-    grammar_package.write('        "tree-sitter-cli": "^0.20.0"\n')
-    grammar_package.write('    },\n')
-    grammar_package.write('    "main": "bindings/node"\n')
-    grammar_package.write('}\n')
 
-# External scanner for nested block comments
-# For the API, see https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
-# See: https://github.com/tree-sitter/tree-sitter-rust/blob/master/src/scanner.c
+        # Extract tokens
 
-os.makedirs(os.path.join(grammar_path, "src"), exist_ok=True)
-with open(os.path.join(grammar_path, "src", "scanner.c"), "w") as external_scanner:
-    external_scanner.write(r"""
-#include 
-#include 
 
-enum TokenType {
-  BLOCK_COMMENT,
-};
+        for key, value in rules.items():
+            if not key.startswith("_") and key not in rule_skip:
+                grammar_source += grammar_from_rule(key, value) + ",\n"
+                rule_skip.add(key)
 
-void *tree_sitter_wgsl_external_scanner_create() { return NULL; }
-void tree_sitter_wgsl_external_scanner_destroy(void *p) {}
-unsigned tree_sitter_wgsl_external_scanner_serialize(void *p, char *buffer) { return 0; }
-void tree_sitter_wgsl_external_scanner_deserialize(void *p, const char *b, unsigned n) {}
 
-static void advance(TSLexer *lexer) {
-  lexer->advance(lexer, false);
-}
+        # Extract underscore
 
-bool tree_sitter_wgsl_external_scanner_scan(void *payload, TSLexer *lexer,
-                                            const bool *valid_symbols) {
-  while (iswspace(lexer->lookahead)) lexer->advance(lexer, true);
-
-  if (lexer->lookahead == '/') {
-    advance(lexer);
-    if (lexer->lookahead != '*') return false;
-    advance(lexer);
-
-    bool after_star = false;
-    unsigned nesting_depth = 1;
-    for (;;) {
-      switch (lexer->lookahead) {
-        case '\0':
-          /* This signals the end of input. Since nesting depth is
-           * greater than zero, the scanner is in the middle of
-           * a block comment. Block comments must be affirmatively
-           * terminated.
-           */
-          return false;
-        case '*':
-          advance(lexer);
-          after_star = true;
-          break;
-        case '/':
-          if (after_star) {
-            advance(lexer);
-            after_star = false;
-            nesting_depth--;
-            if (nesting_depth == 0) {
-              lexer->result_symbol = BLOCK_COMMENT;
-              return true;
-            }
-          } else {
-            advance(lexer);
-            after_star = false;
-            if (lexer->lookahead == '*') {
-              nesting_depth++;
-              advance(lexer);
-            }
-          }
-          break;
-        default:
-          advance(lexer);
-          after_star = false;
-          break;
-      }
-    }
-  }
-
-  return false;
-}
-"""[1:-1])
-
-
-# Use "npm install" to create the tree-sitter CLI that has WGSL
-# support.  But "npm install" fetches data over the network.
-# That can be flaky, so only invoke it when needed.
-if os.path.exists("grammar/node_modules/tree-sitter-cli") and os.path.exists("grammar/node_modules/nan"):
-    # "npm install" has been run already.
-    pass
-else:
-    subprocess.run(["npm", "install"], cwd=grammar_path, check=True)
-subprocess.run(["npx", "tree-sitter", "generate"],
-               cwd=grammar_path, check=True)
-# Following are commented for future reference to expose playground
-# Remove "--docker" if local environment matches with the container
-# subprocess.run(["npx", "tree-sitter", "build-wasm", "--docker"],
-#                cwd=grammar_path, check=True)
-
-Language.build_library(
-    grammar_path + "/build/wgsl.so",
-    [
-        grammar_path,
-    ]
-)
-
-WGSL_LANGUAGE = Language(grammar_path + "/build/wgsl.so", "wgsl")
-
-parser = Parser()
-parser.set_language(WGSL_LANGUAGE)
-
-error_list = []
-
-for key, value in scanner_components[scanner_example.name()].items():
-    if "expect-error" in key:
-        continue
-    value = value[:]
-    if "function-scope" in key:
-        value = ["fn function__scope____() {"] + value + ["}"]
-    if "type-scope" in key:
-        # Initiailize with zero-value expression.
-        value = ["const type_scope____: "] + value + ["="] + value + ["()"] + [";"]
-    program = "\n".join(value)
-    tree = parser.parse(bytes(program, "utf8"))
-    if tree.root_node.has_error:
-        error_list.append((program, tree))
-    # TODO Semantic CI
-
-if len(error_list) > 0:
-    for error in error_list:
-        print("Example:")
-        print(error[0])
-        print("Tree:")
-        print(error[1].root_node.sexp())
-    raise Exception("Grammar is not compatible with examples!")
+
+        for key, value in rules.items():
+            if key.startswith("_") and key != "_comment" and key != "_blankspace" and key not in rule_skip:
+                grammar_source += grammar_from_rule(key, value) + ",\n"
+                rule_skip.add(key)
+
+
+        # Extract ident
+
+
+        grammar_source += grammar_from_rule( "ident", rules["ident"]) + ",\n"
+        rule_skip.add("ident")
+
+
+        # Extract comment
+
+
+        grammar_source += grammar_from_rule(
+            "_comment", rules["_comment"]) + ",\n"
+        rule_skip.add("_comment")
+
+
+        # Extract space
+
+
+        grammar_source += grammar_from_rule(
+            "_blankspace", rules["_blankspace"])
+        rule_skip.add("_blankspace")
+
+
+        grammar_source += "\n"
+        grammar_source += r"""
+            },
+        });
+        """[1:-1]
+
+        HEADER = """
+        // Copyright (C) [$YEAR] World Wide Web Consortium,
+        // (Massachusetts Institute of Technology, European Research Consortium for
+        // Informatics and Mathematics, Keio University, Beihang).
+        // All Rights Reserved.
+        //
+        // This work is distributed under the W3C (R) Software License [1] in the hope
+        // that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
+        // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+        //
+        // [1] http://www.w3.org/Consortium/Legal/copyright-software
+
+        // **** This file is auto-generated. Do not edit. ****
+
+        """.lstrip()
+
+    if input_bs_is_fresh:
+        print("{}: ...Creating tree-sitter parser".format(options.script,options.grammar_filename))
+        with open(options.grammar_filename, "w") as grammar_file:
+            headerTemplate = Template(HEADER)
+            grammar_file.write(headerTemplate.substitute(
+                YEAR=date.today().year) + grammar_source + "\n")
+
+    if input_bs_is_fresh:
+        # Save scanned lines for next time.
+        with open(previously_scanned_bs_file,"w") as previous_file:
+            for line in scan_result['raw']:
+                previous_file.write(line + "\n")
+
+    with open(os.path.join(options.grammar_dir,"package.json"), "w") as grammar_package:
+        grammar_package.write('{\n')
+        grammar_package.write('    "name": "tree-sitter-wgsl",\n')
+        grammar_package.write('    "dependencies": {\n')
+        grammar_package.write('        "nan": "^2.15.0"\n')
+        grammar_package.write('    },\n')
+        grammar_package.write('    "devDependencies": {\n')
+        grammar_package.write('        "tree-sitter-cli": "^0.20.7"\n')
+        grammar_package.write('    },\n')
+        grammar_package.write('    "main": "bindings/node"\n')
+        grammar_package.write('}\n')
+
+    return True
+
+def flow_build(options):
+    """
+    Build the shared library for the custom tree-sitter scanner.
+    """
+
+    print("{}: Build...".format(options.script))
+    if not os.path.exists(options.grammar_filename):
+        print("missing grammar file: {}")
+        return False
+
+    # External scanner for nested block comments
+    # For the API, see https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
+    # See: https://github.com/tree-sitter/tree-sitter-rust/blob/master/src/scanner.c
+
+    os.makedirs(os.path.join(options.grammar_dir, "src"), exist_ok=True)
+
+    # Remove the old custom scanner, if it exists.
+    scanner_c_staging = os.path.join(options.grammar_dir, "src", "scanner.c")
+    if os.path.exists(scanner_c_staging):
+        os.remove(scanner_c_staging)
+    # Copy the new scanner into place, if newer
+    scanner_cc_staging = os.path.join(options.grammar_dir, "src", "scanner.cc")
+    if newer_than(options.scanner_cc_filename, scanner_cc_staging):
+        shutil.copyfile(options.scanner_cc_filename, scanner_cc_staging)
+
+
+    # Use "npm install" to create the tree-sitter CLI that has WGSL
+    # support.  But "npm install" fetches data over the network.
+    # That can be flaky, so only invoke it when needed.
+    if os.path.exists("grammar/node_modules/tree-sitter-cli") and os.path.exists("grammar/node_modules/nan"):
+        # "npm install" has been run already.
+        pass
+    else:
+        subprocess.run(["npm", "install"], cwd=options.grammar_dir, check=True)
+    subprocess.run(["npx", "tree-sitter", "generate"],
+                   cwd=options.grammar_dir, check=True)
+    # Following are commented for future reference to expose playground
+    # Remove "--docker" if local environment matches with the container
+    # subprocess.run(["npx", "tree-sitter", "build-wasm", "--docker"],
+    #                cwd=options.grammar_dir, check=True)
+
+
+    def build_library(output_file, input_files):
+        # The py-tree-sitter build_library method wasn't compiling with C++17 flags,
+        # so invoke the compile ourselves.
+        compiler = new_compiler()
+        clang_like = isinstance(compiler, UnixCCompiler)
+
+        # Compile .c and .cc files down to object files.
+        object_files = []
+        includes = [os.path.dirname(input_files[0])]
+        for src in input_files:
+            flags = []
+            if src.endswith(".cc"):
+                if clang_like:
+                    flags.extend(["-fPIC", "-std=c++17"])
+                else:
+                    flags.extend(["/std:c++17"])
+            objects = compiler.compile([src],
+                                       extra_preargs=flags,
+                                       include_dirs=includes)
+            object_files.extend(objects)
+
+        # Link object files to a single shared library.
+        link_flags = []
+        if clang_like:
+            link_flags.append("-lstdc++")
+        compiler.link_shared_object(
+                object_files,
+                output_file,
+                target_lang="c++",
+                extra_postargs=link_flags)
+
+    if newer_than(scanner_cc_staging, options.wgsl_shared_lib) or newer_than(options.grammar_filename,options.wgsl_shared_lib):
+        print("{}: ...Building custom scanner: {}".format(options.script,options.wgsl_shared_lib))
+        build_library(options.wgsl_shared_lib,
+                      [scanner_cc_staging,
+                       os.path.join(options.grammar_dir,"src","parser.c")])
+    return True
+
+def flow_examples(options,scan_result):
+    """
+    Check the tree-sitter parser can parse the examples from the WGSL spec.
+
+    Args:
+        options: Options
+        scan_result: the ScanResult holding rules and examples extracted from the WGSL spec
+    """
+    print("{}: Examples...".format(options.script))
+
+    examples = scan_result['example']
+    WGSL_LANGUAGE = Language(options.wgsl_shared_lib, "wgsl")
+
+    parser = Parser()
+    parser.set_language(WGSL_LANGUAGE)
+
+    errors = 0
+    for key, value in examples.items():
+        print(".",flush=True,end='')
+        if "expect-error" in key:
+            continue
+        value = value[:]
+        if "function-scope" in key:
+            value = ["fn function__scope____() {"] + value + ["}"]
+        if "type-scope" in key:
+            # Initialize with zero-value expression.
+            value = ["const type_scope____: "] + \
+                value + ["="] + value + ["()"] + [";"]
+        program = "\n".join(value)
+        # print("**************** BEGIN ****************")
+        # print(program)
+        # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+        tree = parser.parse(bytes(program, "utf8"))
+        if tree.root_node.has_error:
+            print("Example:")
+            print(program)
+            print("Tree:")
+            print(tree.root_node.sexp())
+            errors = errors + 1
+        # print("***************** END *****************")
+        # print("")
+        # print("")
+
+        # TODO Semantic CI
+
+    if errors > 0:
+        raise Exception("Grammar is not compatible with examples!")
+    print("Ok",flush=True)
+    return True
+
+
+FLOW_HELP = """
+A complete flow has the following steps, in order:
+    'x' (think 'extract'): Generate a tree-sitter grammar definition from the
+          bikeshed source for the WGSL specification.
+    'b' (think 'build'): Build the tree-sitter parser
+    'e' (think 'example'): Check the examples from the WGSL spec parse correctly.
+    't' (think 'test'): Run parser unit tests.
+
+You can be more selective by specifying the --flow option followed by a word
+containing the letters for the steps to run.
+
+For example, the following will extract the grammar, build the tree-sitter parser,
+and check that the examples from the spec parse correctly:
+
+    extract-grammar.py --flow xbe
+
+The order of the letters is not significant. The steps will always run in the
+same relative order as the default flow.
+"""
+DEFAULT_FLOW="xbet"
+
+def main():
+    argparser = argparse.ArgumentParser(
+            prog="extract-grammar.py",
+            description="Extract the grammar from the WGSL spec and run checks",
+            add_help=False # We want to print our own additional formatted help
+            )
+    argparser.add_argument("--help","-h",
+                           action='store_true',
+                           help="Show this help message, then exit")
+    argparser.add_argument("--verbose","-v",
+                           action='store_true',
+                           help="Be verbose")
+    argparser.add_argument("--flow",
+                           action='store',
+                           help="The flow steps to run. Default is the whole flow.",
+                           default=DEFAULT_FLOW)
+    argparser.add_argument("--tree-sitter-dir",
+                           help="Target directory for the tree-sitter parser",
+                           default="grammar")
+    argparser.add_argument("--spec",
+                           action='store',
+                           help="Bikeshed source file for the WGSL spec",
+                           default="index.bs")
+    argparser.add_argument("--scanner",
+                           action='store',
+                           help="source file for the tree-sitter custom scanner",
+                           default="scanner.cc")
+
+    args = argparser.parse_args()
+    if args.help:
+        print(argparser.format_help())
+        print(FLOW_HELP)
+        return 0
+
+    options = Options(args.spec,args.tree_sitter_dir,args.scanner)
+    options.verbose = args.verbose
+    if args.verbose:
+        print(options)
+
+    scan_result = None
+
+    if 'x' in args.flow:
+        scan_result = read_spec(options)
+        if not flow_extract(options,scan_result):
+            return 1
+    if 'b' in args.flow:
+        if not flow_build(options):
+            return 1
+    if 'e' in args.flow:
+        if scan_result is None:
+            scan_result = read_spec(options)
+        if not flow_examples(options,scan_result):
+            return 1
+    if 't' in args.flow:
+        test_options = wgsl_unit_tests.Options(options.wgsl_shared_lib)
+        if not wgsl_unit_tests.run_tests(test_options):
+            return 1
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/wgsl/index.bs b/wgsl/index.bs
index 4ce62da0ec..942e84fc22 100644
--- a/wgsl/index.bs
+++ b/wgsl/index.bs
@@ -110,6 +110,9 @@ div.syntax > p > a > code {
   font-style: normal;
   font-weight: bold;
 }
+.hidden {
+  display: none
+}
 table.data.builtin tbody{
   border-bottom: 0;
 }
@@ -532,16 +535,23 @@ The program text [=shader-creation error|must not=] include a null code point (`
 ## Parsing ## {#parsing}
 
 To parse a WGSL program:
+
1. Remove [=comments=]: * Replace the first comment with a space code point (`U+0020`). * Repeat until no comments remain. -2. Parse the whole text, attempting to match the [=syntax/translation_unit=] grammar rule. +2. Find [=template lists=], using the [=template list discovery|algorithm=] in [[#template-lists-sec]]. +3. Parse the whole text, attempting to match the [=syntax/translation_unit=] grammar rule. Parsing uses a LALR(1) parser (one token of lookahead) [[!DeRemer1969]], with the following customization: * Tokenization is interleaved with parsing, and is context-aware. When the parser requests the next token: * Consume and ignore an initial sequence of [=blankspace=] code points. - * A token candidate is any WGSL [=token=] formed from the non-empty prefix of the remaining unconsumed code points. - * The token returned is the longest [=token candidate=] that is also a valid lookahead token for the current parser state. [[!VanWyk2007]] + * If the next code point is the start of a [=template list=], consume it and return [=syntax_sym/_template_args_start=]. + * If the next code point is the end of a [=template list=], consume it and return [=syntax_sym/_template_args_end=]. + * Otherwise: + * A token candidate is any WGSL [=token=] formed from the non-empty prefix of the remaining unconsumed code points. + * The token returned is the longest [=token candidate=] that is also a valid lookahead token for the current parser state. [[!VanWyk2007]] + +
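A minimal sketch of that customized tokenizer, assuming hypothetical helpers — `candidates_at` enumerating WGSL token candidates and `parser_can_accept` consulting the LALR(1) state; neither is spec API — where `starts` and `ends` hold the positions recorded by template list discovery:

```python
# Sketch only: '<' and '>' at positions recorded during template-list
# discovery become dedicated tokens; otherwise return the longest
# candidate that is a valid lookahead for the current parser state.
def next_token(src, pos, starts, ends, candidates_at, parser_can_accept):
    if pos in starts:
        return ('_template_args_start', 1)
    if pos in ends:
        return ('_template_args_end', 1)
    best = None
    for kind, length in candidates_at(src, pos):
        if parser_can_accept(kind) and (best is None or length > best[1]):
            best = (kind, length)
    return best  # None signals a tokenization (shader-creation) error
```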
A [=shader-creation error=] results if: * the entire source text cannot be converted into a finite sequence of valid tokens, or @@ -966,6 +976,149 @@ The spelling of the token may be the same as an [=identifier=], but the token do Section [[#context-dependent-name-tokens]] lists all such tokens. +## Template Lists ## {#template-lists-sec} + +Template parameterization is a way to specify parameters that modify a general concept. +To write a template parameterization, write the general concept, followed by a [=template list=]. + +Ignoring [=comments=] and [=blankspace=], a template list is: +* An initial `'<'` (U+003C) code point, then +* A [=syntax_sym/comma=]-separated list of one or more template parameters, then +* An optional trailing [=syntax_sym/comma=], then +* A terminating `'>'` (U+003E) code point. + +The form of a [=template parameter=] is implicitly defined by the [=template list discovery=] algorithm below. +Generally, they are names, expressions, or types. + +Note: For example, the phrase `vec3<f32>` is a template parameterization where `vec3` is the general concept being modified, +and `<f32>` is a template list containing one parameter, the [=f32=] type. +Together, `vec3<f32>` denotes a specific [=vector=] type. + +Note: For example, the phrase `var<storage,read_write>` modifies the general `var` concept with template parameters `storage` and `read_write`. + +
+Note: For example, the phrase `array<vec4<f32>>` has two template parameterizations: +* `vec4<f32>` modifies the general `vec4` concept with template parameter `f32`. +* `array<vec4<f32>>` modifies the general `array` concept with template parameter `vec4<f32>`. + +
+ +The `'<'` (U+003C) and `'>'` (U+003E) code points that delimit a template list are also used when spelling: +* A comparison operator in a [=syntax/relational_expression=]. +* A shift operator in a [=syntax/shift_expression=]. +* A [=syntax/compound_assignment_operator=] for performing a shift operation followed by an assignment. + +The syntactic ambiguity is resolved in favour of template lists: +* Template lists are discovered in an early phase of parsing, before [=declarations=], [=expressions=], and [=statements=] are parsed. +* During tokenization in a later phase, + the initial `'<'` (U+003C) of a template list is mapped to a [=syntax_sym/_template_args_start=] token, and + the terminating `'>'` (U+003E) of a template list is mapped to a [=syntax_sym/_template_args_end=] token. + + +The template list discovery algorithm is as follows. +
+**Input:** The program source text. + +**Record types:** + +Let |UnclosedCandidate| be a record type containing: + * |position|, a location in the source text + * |depth|, an integer, the expression nesting depth at |position| + +Let |TemplateList| be a record type containing: + * |start_position|, the source location of the `'<'` (U+003C) code point that starts this template list. + * |end_position|, the source location of the `'>'` (U+003E) code point that ends this template list. + +**Output:** |DiscoveredTemplateLists|, a list of |TemplateList| records. + +**Algorithm:** +* Initialize |DiscoveredTemplateLists| to an empty list. +* Initialize a |Pending| variable to be an empty stack of |UnclosedCandidate| records. +* Initialize a |CurrentPosition| integer variable to 0. + It encodes the position of the code point currently being examined, as a count of the number of code points after the start of the source text. + * This variable will advance forward in the text while executing the algorithm. + When the end of text is reached, terminate the algorithm immediately and have it return |DiscoveredTemplateLists|. +* Initialize a |NestingDepth| integer variable to 0. +* Repeat the following steps: + * Advance |CurrentPosition| past [=blankspace=], [=comments=], and [=literals=]. + * If [=syntax/ident_pattern_token=] matches the text at |CurrentPosition|, then: + * Advance |CurrentPosition| past the [=syntax/ident_pattern_token=]. + * Advance |CurrentPosition| past blankspace and comments, if present. + * If `'<'` (U+003C) appears at |CurrentPosition|, then: + * Note: This code point is a candidate for being the start of a template list. Save enough state so it can be matched against a terminating `'>'` (U+003E) appearing later in the input. + * Push |UnclosedCandidate|(|position|=|CurrentPosition|,|depth|=|NestingDepth|) onto the |Pending| stack. + * Advance |CurrentPosition| to the next code point, and start the next iteration of the loop. + * If `'>'` (U+003E) appears at |CurrentPosition| then: + * Note: This code point is a candidate for being the end of a template list. + * If |Pending| is not empty, then let |T| be its top entry, and: + * If |T|.|depth| equals |NestingDepth| then: + * Note: This code point ends the current template list whose start is recorded in |T|. + * Add |TemplateList|(|start_position|=|T|.|position|, |end_position|=|CurrentPosition|) to |DiscoveredTemplateLists|. + * Pop |T| off the |Pending| stack. + * Advance |CurrentPosition| past this code point, and start the next iteration of the loop. + * If `'('` (U+0028) or `'['` (U+005B) appears at |CurrentPosition| then: + * Note: Enter a nested expression. + * Add 1 to |NestingDepth|. + * Advance |CurrentPosition| past this code point, and start the next iteration of the loop. + * If `')'` (U+0029) or `']'` (U+005D) appears at |CurrentPosition| then: + * Note: Exit a nested expression. + * Pop entries from the |Pending| stack until it is empty, or until its top entry has |depth| < |NestingDepth|. + * Set |NestingDepth| to 0 or |NestingDepth| − 1, whichever is larger. + * Advance |CurrentPosition| past this code point, and start the next iteration of the loop. + * If `';'` (U+003B) or `'{'` (U+007B) or `'='` (U+003D) or `':'` (U+003A) appears at |CurrentPosition| then: + * Note: These code points cannot appear in an expression, and therefore cannot appear in a template list. Clear pending unclosed candidates. + * Set |NestingDepth| to 0. + * Remove all entries from the |Pending| stack.
+        * Advance |CurrentPosition| past this code point, and start the next iteration of the loop.
+    * If `'&&'` or `'||'` matches the text at |CurrentPosition|, then:
+        * Note: These are operators that have lower precedence than comparisons. Reject any pending unclosed candidates at the current expression level.
+        * Note: With this rule, no template list will be found in the program fragment `a<b || c>d`.
+            Instead it will be recognized as the short-circuiting disjunction of two comparisons.
+        * Pop entries from the |Pending| stack until it is empty, or until its top entry has |depth| < |NestingDepth|.
+        * Advance |CurrentPosition| past the two code points, and start the next iteration of the loop.
+    * Advance |CurrentPosition| past the current code point.
+
+ +
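+Note: For illustration only, the discovery algorithm transcribes almost directly into executable form.
+The following Python sketch is non-normative: the function name is invented for this example, identifiers are matched with a simplified test instead of the full [=syntax/ident_pattern_token=], and the skipping of [=comments=] and [=literals=] is omitted.
+
+```python
+def discover_template_lists(src):
+    """Returns a list of (start, end) index pairs, one per template list."""
+    discovered = []  # DiscoveredTemplateLists
+    pending = []     # stack of UnclosedCandidate records: (position, depth)
+    depth = 0        # NestingDepth
+    pos = 0          # CurrentPosition
+
+    def is_ident_char(c):
+        return c.isalnum() or c == '_'
+
+    while pos < len(src):
+        c = src[pos]
+        if is_ident_char(c) and not c.isdigit():
+            # Advance past the identifier, then past blankspace.
+            while pos < len(src) and is_ident_char(src[pos]):
+                pos += 1
+            while pos < len(src) and src[pos].isspace():
+                pos += 1
+            if pos < len(src) and src[pos] == '<':
+                pending.append((pos, depth))  # candidate template-list start
+                pos += 1
+            continue
+        if c == '>':
+            # Candidate end: it must pair with a '<' at the same depth.
+            if pending and pending[-1][1] == depth:
+                start, _ = pending.pop()
+                discovered.append((start, pos))
+            pos += 1
+            continue
+        if c in '([':      # enter a nested expression
+            depth += 1
+        elif c in ')]':    # exit a nested expression
+            while pending and pending[-1][1] >= depth:
+                pending.pop()
+            depth = max(0, depth - 1)
+        elif c in ';{=:':  # cannot appear inside a template list
+            depth = 0
+            pending.clear()
+        elif src[pos:pos + 2] in ('&&', '||'):
+            while pending and pending[-1][1] >= depth:
+                pending.pop()
+            pos += 2
+            continue
+        pos += 1
+    return discovered
+
+assert discover_template_lists("vec3<f32>") == [(4, 8)]
+assert discover_template_lists("a<b || c>d") == []
+```
+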
+Note: The algorithm can be modified to find the source ranges for [=template parameters=], as follows:
+
+* Modify |UnclosedCandidate| to add the following fields:
+    * |parameters|, a list of source ranges of template parameters.
+    * |parameter_start_position|, a source location.
+* Modify |TemplateList| to add a field:
+    * |parameters|, a list of source ranges of template parameters.
+* When pushing a new |UnclosedCandidate| onto the |Pending| stack:
+    * Set its |parameters| field to an empty list.
+    * Set |parameter_start_position| to one code point past |CurrentPosition|.
+* When adding a |TemplateList|, |TL|, to |DiscoveredTemplateLists|:
+    * Let |T| be the top of the |Pending| stack, as in the original algorithm.
+    * Push the source range starting at |T|.|parameter_start_position| and ending at |CurrentPosition|−1 onto |T|.|parameters|.
+    * Prepare |TL| as in the original algorithm.
+    * Set |TL|.|parameters| to |T|.|parameters|.
+* Insert a check at the end of the loop, just before advancing past the current code point:
+    * If '`,`' (U+002C) appears at |CurrentPosition|, and |Pending| is not empty, and the top entry of |Pending| has |depth| equal to |NestingDepth|, then:
+        * Let |T| be the top of the |Pending| stack.
+        * Push the source range starting at |T|.|parameter_start_position| and ending at |CurrentPosition|−1 onto |T|.|parameters|.
+        * Set |T|.|parameter_start_position| to |CurrentPosition|+1.
+
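+Note: As a non-normative companion to the note above, the parameter-recording variant also transcribes into Python.
+Again the function name is invented for this example and the same tokenizer simplifications apply; ranges are inclusive (first, last) index pairs.
+
+```python
+def discover_with_parameters(src):
+    """Like discover_template_lists, but each result is
+    (start, end, parameters), where parameters is a list of
+    inclusive (first, last) index pairs, one per template parameter."""
+    discovered = []
+    # UnclosedCandidate records, extended with the two new fields:
+    # [position, depth, parameters, parameter_start_position]
+    pending = []
+    depth = 0
+    pos = 0
+
+    def is_ident_char(c):
+        return c.isalnum() or c == '_'
+
+    while pos < len(src):
+        c = src[pos]
+        if is_ident_char(c) and not c.isdigit():
+            while pos < len(src) and is_ident_char(src[pos]):
+                pos += 1
+            while pos < len(src) and src[pos].isspace():
+                pos += 1
+            if pos < len(src) and src[pos] == '<':
+                pending.append([pos, depth, [], pos + 1])
+                pos += 1
+            continue
+        if c == '>':
+            if pending and pending[-1][1] == depth:
+                start, _, params, param_start = pending.pop()
+                params.append((param_start, pos - 1))  # final parameter
+                discovered.append((start, pos, params))
+            pos += 1
+            continue
+        if c in '([':
+            depth += 1
+        elif c in ')]':
+            while pending and pending[-1][1] >= depth:
+                pending.pop()
+            depth = max(0, depth - 1)
+        elif c in ';{=:':
+            depth = 0
+            pending.clear()
+        elif src[pos:pos + 2] in ('&&', '||'):
+            while pending and pending[-1][1] >= depth:
+                pending.pop()
+            pos += 2
+            continue
+        elif c == ',' and pending and pending[-1][1] == depth:
+            # End the current parameter of the nearest enclosing candidate.
+            pending[-1][2].append((pending[-1][3], pos - 1))
+            pending[-1][3] = pos + 1
+        pos += 1
+    return discovered
+
+# 'f32' spans indices 6..8; the second parameter ' 4' spans 10..11
+# (leading blankspace is kept, as in the algorithm above).
+assert discover_with_parameters("array<f32, 4>") == \
+    [(5, 12, [(6, 8), (10, 11)])]
+```
+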
+
+Note: The algorithm explicitly skips past literals because some numeric literals end in a letter, for example `1.0f`.
+The terminating `f` should not be mistaken for the start of an [=syntax/ident_pattern_token=].
+
+Note: In the phrase `A ( B < C, D > ( E ) )`, the segment `< C, D >` is a [=template list=].
+
+Note: The algorithm respects expression nesting: The start and end of a particular template list cannot appear at different expression nesting levels.
+For example, in `array<i32,2,select(2,3,a>b)>`, the template list has three parameters, where the last one is `select(2,3,a>b)`.
+The `'>'` in `a>b` does not terminate the template list because it is enclosed in a parenthesized part of the expression calling the `select` function.
+
+Note: Both ends of a template list must appear within the same array indexing phrase. For example `a[b<d]>()` does not contain a valid template list.
+
+
 ## Attributes ## {#attributes}
 
 An attribute modifies an object.
@@ -2311,7 +2464,7 @@ This includes the [=store type=] of a workgroup variable.
array_type_specifier : - | `'array'` `'<'` [=syntax/type_specifier=] ( `','` [=syntax/element_count_expression=] ) ? `'>'` + | `'array'` _template_args_start [=syntax/type_specifier=] ( `','` [=syntax/element_count_expression=] ) ? _template_args_end
element_count_expression : @@ -3993,11 +4146,11 @@ sampler_comparison | [=syntax/depth_texture_type=] - | [=syntax/sampled_texture_type=] `'<'` [=syntax/type_specifier=] `'>'` + | [=syntax/sampled_texture_type=] _template_args_start [=syntax/type_specifier=] _template_args_end - | [=syntax/multisampled_texture_type=] `'<'` [=syntax/type_specifier=] `'>'` + | [=syntax/multisampled_texture_type=] _template_args_start [=syntax/type_specifier=] _template_args_end - | [=syntax/storage_texture_type=] `'<'` [=syntax/texel_format=] `','` [=syntax/access_mode=] `'>'` + | [=syntax/storage_texture_type=] _template_args_start [=syntax/texel_format=] `','` [=syntax/access_mode=] _template_args_end
sampler_type : @@ -4102,15 +4255,15 @@ all properties of the members of *S*, including attributes, carry over to the me | `'u32'` - | [=syntax/vec_prefix=] `'<'` [=syntax/type_specifier=] `'>'` + | [=syntax/vec_prefix=] _template_args_start [=syntax/type_specifier=] _template_args_end - | [=syntax/mat_prefix=] `'<'` [=syntax/type_specifier=] `'>'` + | [=syntax/mat_prefix=] _template_args_start [=syntax/type_specifier=] _template_args_end - | `'ptr'` `'<'` [=syntax/address_space=] `','` [=syntax/type_specifier=] ( `','` [=syntax/access_mode=] ) ? `'>'` + | `'ptr'` _template_args_start [=syntax/address_space=] `','` [=syntax/type_specifier=] ( `','` [=syntax/access_mode=] ) ? _template_args_end | [=syntax/array_type_specifier=] - | `'atomic'` `'<'` [=syntax/type_specifier=] `'>'` + | `'atomic'` _template_args_start [=syntax/type_specifier=] _template_args_end | [=syntax/texture_and_sampler_types=]
@@ -4714,7 +4867,7 @@ such that the redundant loads are eliminated.
variable_decl : - | `'var'` [=syntax/variable_qualifier=] ? [=syntax/optionally_typed_ident=] + | `'var'` [=syntax/variable_qualifier=] ? [=syntax/optionally_typed_ident=]
optionally_typed_ident : @@ -4724,7 +4877,7 @@ such that the redundant loads are eliminated.
variable_qualifier : - | `'<'` [=syntax/address_space=] ( `','` [=syntax/access_mode=] ) ? `'>'` + | _template_args_start [=syntax/address_space=] ( `','` [=syntax/access_mode=] ) ? _template_args_end
@@ -6727,7 +6880,7 @@ When an identifier is used as a [=syntax/callable=] item, it is one of: | [=syntax/paren_expression=] - | `'bitcast'` `'<'` [=syntax/type_specifier=] `'>'` [=syntax/paren_expression=] + | `'bitcast'` _template_args_start [=syntax/type_specifier=] _template_args_end [=syntax/paren_expression=]
call_expression : @@ -6747,11 +6900,11 @@ Note: The [=syntax/call_expression=] rule exists to ensure [=type checking=] app | [=syntax/type_specifier_without_ident=] - | [=syntax/vec_prefix=] + | [=syntax/vec_prefix=] - | [=syntax/mat_prefix=] + | [=syntax/mat_prefix=] - | `'array'` + | `'array'`
paren_expression : @@ -7051,7 +7204,7 @@ to bind with this operator. This column is necessary for linearly listing operat # Statements # {#statements} -Statements are program fragments that control its execution. +A statement is a program fragment that controls execution. Statements are generally executed in sequential order; however, [[#control-flow|control flow statements]] may cause a program to execute in non-sequential order. @@ -7239,9 +7392,9 @@ An [=statement/assignment=] is a compound assignment when th | `'^='` - | `'>>='` + | `'>>='` - | `'<<='` + | `'<<='`
 The type requirements, semantics, and behavior of each statement is defined as if
@@ -11096,7 +11249,7 @@ A syntactic token is a sequence of special code points, used:
 * to spell an expression operator, or
 * as punctuation: to group, sequence, or separate other grammar elements.
 
-List of [=syntactic tokens=]:
+The [=syntactic tokens=] are:
 
 * `'&'` (Code point: `U+0026`)
 * `'&&'` (Code points: `U+0026` `U+0026`)
@@ -11142,8 +11295,16 @@ List of [=syntactic tokens=]:
 * `'&='` (Code points: `U+0026` `U+003D`)
 * `'|='` (Code points: `U+007C` `U+003D`)
 * `'^='` (Code points: `U+005E` `U+003D`)
-* `'>>='` (Code points: `U+003E` `U+003E` `U+003D`)
-* `'<<='` (Code points: `U+003C` `U+003C` `U+003D`)
+* `'>>='` (Code points: `U+003E` `U+003E` `U+003D`)
+* `'<<='` (Code points: `U+003C` `U+003C` `U+003D`)
+* `_template_args_end`
+    * Text: `'>'` (Code point: `U+003E`)
+    * This token is textually the same as the [=syntax_sym/greater_than=] syntactic token.
+    * It is generated by [=template list discovery=], and is used as the last token in a [=template list=].
+* `_template_args_start`
+    * Text: `'<'` (Code point: `U+003C`)
+    * This token is textually the same as the [=syntax_sym/less_than=] syntactic token.
+    * It is generated by [=template list discovery=], and is used as the first token in a [=template list=].
 
 ## Context-Dependent Name Tokens ## {#context-dependent-name-tokens}
 
diff --git a/wgsl/scanner.cc b/wgsl/scanner.cc
new file mode 100644
index 0000000000..59879988d4
--- /dev/null
+++ b/wgsl/scanner.cc
@@ -0,0 +1,992 @@
+
+#include <tree_sitter/parser.h>
+
+#include <algorithm>
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <initializer_list>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#define ENABLE_LOGGING 0
+
+#if ENABLE_LOGGING
+#define LOG(msg, ...) printf(msg "\n", ##__VA_ARGS__)
+#else
+#define LOG(...)
+#endif
+
+namespace {
+
+/// The possible external tokens matched by this custom scanner.
+/// The order of the entries in this enumerator must match the 'externals' in
+/// the grammar.js.
+enum Token {
+  BLOCK_COMMENT,
+  DISAMBIGUATE_TEMPLATE,  // A zero-length token used to scan ahead
+  TEMPLATE_ARGS_START,
+  TEMPLATE_ARGS_END,
+  LESS_THAN,           // '<'
+  LESS_THAN_EQUAL,     // '<='
+  SHIFT_LEFT,          // '<<'
+  SHIFT_LEFT_ASSIGN,   // '<<='
+  GREATER_THAN,        // '>'
+  GREATER_THAN_EQUAL,  // '>='
+  SHIFT_RIGHT,         // '>>'
+  SHIFT_RIGHT_ASSIGN,  // '>>='
+
+  // A sentinel value used to signal an error has occurred already.
+  // https://tree-sitter.github.io/tree-sitter/creating-parsers#other-external-scanner-details
+  ERROR,
+};
+
+const char* str(Token tok, bool brief = false) {
+  switch (tok) {
+    case Token::BLOCK_COMMENT:
+      return "BLOCK_COMMENT";
+    case Token::DISAMBIGUATE_TEMPLATE:
+      return "DISAMBIGUATE_TEMPLATE";
+    case Token::TEMPLATE_ARGS_START:
+      return "TEMPLATE_ARGS_START";
+    case Token::TEMPLATE_ARGS_END:
+      return "TEMPLATE_ARGS_END";
+    case Token::LESS_THAN:
+      return brief ? "<" : "LESS_THAN";
+    case Token::LESS_THAN_EQUAL:
+      return brief ? "<=" : "LESS_THAN_EQUAL";
+    case Token::SHIFT_LEFT:
+      return brief ? "<<" : "SHIFT_LEFT";
+    case Token::SHIFT_LEFT_ASSIGN:
+      return brief ? "<<=" : "SHIFT_LEFT_ASSIGN";
+    case Token::GREATER_THAN:
+      return brief ? ">" : "GREATER_THAN";
+    case Token::GREATER_THAN_EQUAL:
+      return brief ? ">=" : "GREATER_THAN_EQUAL";
+    case Token::SHIFT_RIGHT:
+      return brief ? ">>" : "SHIFT_RIGHT";
+    case Token::SHIFT_RIGHT_ASSIGN:
+      return brief ?
">>=" : "SHIFT_RIGHT_ASSIGN"; + case Token::ERROR: + return "ERROR"; + default: + return ""; + } +} + +using CodePoint = uint32_t; + +static constexpr CodePoint kEOF = 0; + +struct CodePointRange { + CodePoint first; // First code point in the interval + CodePoint last; // Last code point in the interval (inclusive) +}; + +inline bool operator<(CodePoint code_point, CodePointRange range) { + return code_point < range.first; +} +inline bool operator<(CodePointRange range, CodePoint code_point) { + return range.last < code_point; +} + +// Interval ranges of all code points in the Unicode 14 XID_Start set +// This array needs to be in ascending order. +constexpr CodePointRange kXIDStartRanges[] = { + {0x00041, 0x0005a}, {0x00061, 0x0007a}, {0x000aa, 0x000aa}, + {0x000b5, 0x000b5}, {0x000ba, 0x000ba}, {0x000c0, 0x000d6}, + {0x000d8, 0x000f6}, {0x000f8, 0x002c1}, {0x002c6, 0x002d1}, + {0x002e0, 0x002e4}, {0x002ec, 0x002ec}, {0x002ee, 0x002ee}, + {0x00370, 0x00374}, {0x00376, 0x00377}, {0x0037b, 0x0037d}, + {0x0037f, 0x0037f}, {0x00386, 0x00386}, {0x00388, 0x0038a}, + {0x0038c, 0x0038c}, {0x0038e, 0x003a1}, {0x003a3, 0x003f5}, + {0x003f7, 0x00481}, {0x0048a, 0x0052f}, {0x00531, 0x00556}, + {0x00559, 0x00559}, {0x00560, 0x00588}, {0x005d0, 0x005ea}, + {0x005ef, 0x005f2}, {0x00620, 0x0064a}, {0x0066e, 0x0066f}, + {0x00671, 0x006d3}, {0x006d5, 0x006d5}, {0x006e5, 0x006e6}, + {0x006ee, 0x006ef}, {0x006fa, 0x006fc}, {0x006ff, 0x006ff}, + {0x00710, 0x00710}, {0x00712, 0x0072f}, {0x0074d, 0x007a5}, + {0x007b1, 0x007b1}, {0x007ca, 0x007ea}, {0x007f4, 0x007f5}, + {0x007fa, 0x007fa}, {0x00800, 0x00815}, {0x0081a, 0x0081a}, + {0x00824, 0x00824}, {0x00828, 0x00828}, {0x00840, 0x00858}, + {0x00860, 0x0086a}, {0x00870, 0x00887}, {0x00889, 0x0088e}, + {0x008a0, 0x008c9}, {0x00904, 0x00939}, {0x0093d, 0x0093d}, + {0x00950, 0x00950}, {0x00958, 0x00961}, {0x00971, 0x00980}, + {0x00985, 0x0098c}, {0x0098f, 0x00990}, {0x00993, 0x009a8}, + {0x009aa, 0x009b0}, {0x009b2, 0x009b2}, {0x009b6, 0x009b9}, + {0x009bd, 0x009bd}, {0x009ce, 0x009ce}, {0x009dc, 0x009dd}, + {0x009df, 0x009e1}, {0x009f0, 0x009f1}, {0x009fc, 0x009fc}, + {0x00a05, 0x00a0a}, {0x00a0f, 0x00a10}, {0x00a13, 0x00a28}, + {0x00a2a, 0x00a30}, {0x00a32, 0x00a33}, {0x00a35, 0x00a36}, + {0x00a38, 0x00a39}, {0x00a59, 0x00a5c}, {0x00a5e, 0x00a5e}, + {0x00a72, 0x00a74}, {0x00a85, 0x00a8d}, {0x00a8f, 0x00a91}, + {0x00a93, 0x00aa8}, {0x00aaa, 0x00ab0}, {0x00ab2, 0x00ab3}, + {0x00ab5, 0x00ab9}, {0x00abd, 0x00abd}, {0x00ad0, 0x00ad0}, + {0x00ae0, 0x00ae1}, {0x00af9, 0x00af9}, {0x00b05, 0x00b0c}, + {0x00b0f, 0x00b10}, {0x00b13, 0x00b28}, {0x00b2a, 0x00b30}, + {0x00b32, 0x00b33}, {0x00b35, 0x00b39}, {0x00b3d, 0x00b3d}, + {0x00b5c, 0x00b5d}, {0x00b5f, 0x00b61}, {0x00b71, 0x00b71}, + {0x00b83, 0x00b83}, {0x00b85, 0x00b8a}, {0x00b8e, 0x00b90}, + {0x00b92, 0x00b95}, {0x00b99, 0x00b9a}, {0x00b9c, 0x00b9c}, + {0x00b9e, 0x00b9f}, {0x00ba3, 0x00ba4}, {0x00ba8, 0x00baa}, + {0x00bae, 0x00bb9}, {0x00bd0, 0x00bd0}, {0x00c05, 0x00c0c}, + {0x00c0e, 0x00c10}, {0x00c12, 0x00c28}, {0x00c2a, 0x00c39}, + {0x00c3d, 0x00c3d}, {0x00c58, 0x00c5a}, {0x00c5d, 0x00c5d}, + {0x00c60, 0x00c61}, {0x00c80, 0x00c80}, {0x00c85, 0x00c8c}, + {0x00c8e, 0x00c90}, {0x00c92, 0x00ca8}, {0x00caa, 0x00cb3}, + {0x00cb5, 0x00cb9}, {0x00cbd, 0x00cbd}, {0x00cdd, 0x00cde}, + {0x00ce0, 0x00ce1}, {0x00cf1, 0x00cf2}, {0x00d04, 0x00d0c}, + {0x00d0e, 0x00d10}, {0x00d12, 0x00d3a}, {0x00d3d, 0x00d3d}, + {0x00d4e, 0x00d4e}, {0x00d54, 0x00d56}, {0x00d5f, 0x00d61}, + {0x00d7a, 0x00d7f}, {0x00d85, 0x00d96}, {0x00d9a, 
0x00db1}, + {0x00db3, 0x00dbb}, {0x00dbd, 0x00dbd}, {0x00dc0, 0x00dc6}, + {0x00e01, 0x00e30}, {0x00e32, 0x00e32}, {0x00e40, 0x00e46}, + {0x00e81, 0x00e82}, {0x00e84, 0x00e84}, {0x00e86, 0x00e8a}, + {0x00e8c, 0x00ea3}, {0x00ea5, 0x00ea5}, {0x00ea7, 0x00eb0}, + {0x00eb2, 0x00eb2}, {0x00ebd, 0x00ebd}, {0x00ec0, 0x00ec4}, + {0x00ec6, 0x00ec6}, {0x00edc, 0x00edf}, {0x00f00, 0x00f00}, + {0x00f40, 0x00f47}, {0x00f49, 0x00f6c}, {0x00f88, 0x00f8c}, + {0x01000, 0x0102a}, {0x0103f, 0x0103f}, {0x01050, 0x01055}, + {0x0105a, 0x0105d}, {0x01061, 0x01061}, {0x01065, 0x01066}, + {0x0106e, 0x01070}, {0x01075, 0x01081}, {0x0108e, 0x0108e}, + {0x010a0, 0x010c5}, {0x010c7, 0x010c7}, {0x010cd, 0x010cd}, + {0x010d0, 0x010fa}, {0x010fc, 0x01248}, {0x0124a, 0x0124d}, + {0x01250, 0x01256}, {0x01258, 0x01258}, {0x0125a, 0x0125d}, + {0x01260, 0x01288}, {0x0128a, 0x0128d}, {0x01290, 0x012b0}, + {0x012b2, 0x012b5}, {0x012b8, 0x012be}, {0x012c0, 0x012c0}, + {0x012c2, 0x012c5}, {0x012c8, 0x012d6}, {0x012d8, 0x01310}, + {0x01312, 0x01315}, {0x01318, 0x0135a}, {0x01380, 0x0138f}, + {0x013a0, 0x013f5}, {0x013f8, 0x013fd}, {0x01401, 0x0166c}, + {0x0166f, 0x0167f}, {0x01681, 0x0169a}, {0x016a0, 0x016ea}, + {0x016ee, 0x016f8}, {0x01700, 0x01711}, {0x0171f, 0x01731}, + {0x01740, 0x01751}, {0x01760, 0x0176c}, {0x0176e, 0x01770}, + {0x01780, 0x017b3}, {0x017d7, 0x017d7}, {0x017dc, 0x017dc}, + {0x01820, 0x01878}, {0x01880, 0x018a8}, {0x018aa, 0x018aa}, + {0x018b0, 0x018f5}, {0x01900, 0x0191e}, {0x01950, 0x0196d}, + {0x01970, 0x01974}, {0x01980, 0x019ab}, {0x019b0, 0x019c9}, + {0x01a00, 0x01a16}, {0x01a20, 0x01a54}, {0x01aa7, 0x01aa7}, + {0x01b05, 0x01b33}, {0x01b45, 0x01b4c}, {0x01b83, 0x01ba0}, + {0x01bae, 0x01baf}, {0x01bba, 0x01be5}, {0x01c00, 0x01c23}, + {0x01c4d, 0x01c4f}, {0x01c5a, 0x01c7d}, {0x01c80, 0x01c88}, + {0x01c90, 0x01cba}, {0x01cbd, 0x01cbf}, {0x01ce9, 0x01cec}, + {0x01cee, 0x01cf3}, {0x01cf5, 0x01cf6}, {0x01cfa, 0x01cfa}, + {0x01d00, 0x01dbf}, {0x01e00, 0x01f15}, {0x01f18, 0x01f1d}, + {0x01f20, 0x01f45}, {0x01f48, 0x01f4d}, {0x01f50, 0x01f57}, + {0x01f59, 0x01f59}, {0x01f5b, 0x01f5b}, {0x01f5d, 0x01f5d}, + {0x01f5f, 0x01f7d}, {0x01f80, 0x01fb4}, {0x01fb6, 0x01fbc}, + {0x01fbe, 0x01fbe}, {0x01fc2, 0x01fc4}, {0x01fc6, 0x01fcc}, + {0x01fd0, 0x01fd3}, {0x01fd6, 0x01fdb}, {0x01fe0, 0x01fec}, + {0x01ff2, 0x01ff4}, {0x01ff6, 0x01ffc}, {0x02071, 0x02071}, + {0x0207f, 0x0207f}, {0x02090, 0x0209c}, {0x02102, 0x02102}, + {0x02107, 0x02107}, {0x0210a, 0x02113}, {0x02115, 0x02115}, + {0x02118, 0x0211d}, {0x02124, 0x02124}, {0x02126, 0x02126}, + {0x02128, 0x02128}, {0x0212a, 0x02139}, {0x0213c, 0x0213f}, + {0x02145, 0x02149}, {0x0214e, 0x0214e}, {0x02160, 0x02188}, + {0x02c00, 0x02ce4}, {0x02ceb, 0x02cee}, {0x02cf2, 0x02cf3}, + {0x02d00, 0x02d25}, {0x02d27, 0x02d27}, {0x02d2d, 0x02d2d}, + {0x02d30, 0x02d67}, {0x02d6f, 0x02d6f}, {0x02d80, 0x02d96}, + {0x02da0, 0x02da6}, {0x02da8, 0x02dae}, {0x02db0, 0x02db6}, + {0x02db8, 0x02dbe}, {0x02dc0, 0x02dc6}, {0x02dc8, 0x02dce}, + {0x02dd0, 0x02dd6}, {0x02dd8, 0x02dde}, {0x03005, 0x03007}, + {0x03021, 0x03029}, {0x03031, 0x03035}, {0x03038, 0x0303c}, + {0x03041, 0x03096}, {0x0309d, 0x0309f}, {0x030a1, 0x030fa}, + {0x030fc, 0x030ff}, {0x03105, 0x0312f}, {0x03131, 0x0318e}, + {0x031a0, 0x031bf}, {0x031f0, 0x031ff}, {0x03400, 0x04dbf}, + {0x04e00, 0x0a48c}, {0x0a4d0, 0x0a4fd}, {0x0a500, 0x0a60c}, + {0x0a610, 0x0a61f}, {0x0a62a, 0x0a62b}, {0x0a640, 0x0a66e}, + {0x0a67f, 0x0a69d}, {0x0a6a0, 0x0a6ef}, {0x0a717, 0x0a71f}, + {0x0a722, 0x0a788}, {0x0a78b, 0x0a7ca}, {0x0a7d0, 0x0a7d1}, + 
{0x0a7d3, 0x0a7d3}, {0x0a7d5, 0x0a7d9}, {0x0a7f2, 0x0a801}, + {0x0a803, 0x0a805}, {0x0a807, 0x0a80a}, {0x0a80c, 0x0a822}, + {0x0a840, 0x0a873}, {0x0a882, 0x0a8b3}, {0x0a8f2, 0x0a8f7}, + {0x0a8fb, 0x0a8fb}, {0x0a8fd, 0x0a8fe}, {0x0a90a, 0x0a925}, + {0x0a930, 0x0a946}, {0x0a960, 0x0a97c}, {0x0a984, 0x0a9b2}, + {0x0a9cf, 0x0a9cf}, {0x0a9e0, 0x0a9e4}, {0x0a9e6, 0x0a9ef}, + {0x0a9fa, 0x0a9fe}, {0x0aa00, 0x0aa28}, {0x0aa40, 0x0aa42}, + {0x0aa44, 0x0aa4b}, {0x0aa60, 0x0aa76}, {0x0aa7a, 0x0aa7a}, + {0x0aa7e, 0x0aaaf}, {0x0aab1, 0x0aab1}, {0x0aab5, 0x0aab6}, + {0x0aab9, 0x0aabd}, {0x0aac0, 0x0aac0}, {0x0aac2, 0x0aac2}, + {0x0aadb, 0x0aadd}, {0x0aae0, 0x0aaea}, {0x0aaf2, 0x0aaf4}, + {0x0ab01, 0x0ab06}, {0x0ab09, 0x0ab0e}, {0x0ab11, 0x0ab16}, + {0x0ab20, 0x0ab26}, {0x0ab28, 0x0ab2e}, {0x0ab30, 0x0ab5a}, + {0x0ab5c, 0x0ab69}, {0x0ab70, 0x0abe2}, {0x0ac00, 0x0d7a3}, + {0x0d7b0, 0x0d7c6}, {0x0d7cb, 0x0d7fb}, {0x0f900, 0x0fa6d}, + {0x0fa70, 0x0fad9}, {0x0fb00, 0x0fb06}, {0x0fb13, 0x0fb17}, + {0x0fb1d, 0x0fb1d}, {0x0fb1f, 0x0fb28}, {0x0fb2a, 0x0fb36}, + {0x0fb38, 0x0fb3c}, {0x0fb3e, 0x0fb3e}, {0x0fb40, 0x0fb41}, + {0x0fb43, 0x0fb44}, {0x0fb46, 0x0fbb1}, {0x0fbd3, 0x0fc5d}, + {0x0fc64, 0x0fd3d}, {0x0fd50, 0x0fd8f}, {0x0fd92, 0x0fdc7}, + {0x0fdf0, 0x0fdf9}, {0x0fe71, 0x0fe71}, {0x0fe73, 0x0fe73}, + {0x0fe77, 0x0fe77}, {0x0fe79, 0x0fe79}, {0x0fe7b, 0x0fe7b}, + {0x0fe7d, 0x0fe7d}, {0x0fe7f, 0x0fefc}, {0x0ff21, 0x0ff3a}, + {0x0ff41, 0x0ff5a}, {0x0ff66, 0x0ff9d}, {0x0ffa0, 0x0ffbe}, + {0x0ffc2, 0x0ffc7}, {0x0ffca, 0x0ffcf}, {0x0ffd2, 0x0ffd7}, + {0x0ffda, 0x0ffdc}, {0x10000, 0x1000b}, {0x1000d, 0x10026}, + {0x10028, 0x1003a}, {0x1003c, 0x1003d}, {0x1003f, 0x1004d}, + {0x10050, 0x1005d}, {0x10080, 0x100fa}, {0x10140, 0x10174}, + {0x10280, 0x1029c}, {0x102a0, 0x102d0}, {0x10300, 0x1031f}, + {0x1032d, 0x1034a}, {0x10350, 0x10375}, {0x10380, 0x1039d}, + {0x103a0, 0x103c3}, {0x103c8, 0x103cf}, {0x103d1, 0x103d5}, + {0x10400, 0x1049d}, {0x104b0, 0x104d3}, {0x104d8, 0x104fb}, + {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10570, 0x1057a}, + {0x1057c, 0x1058a}, {0x1058c, 0x10592}, {0x10594, 0x10595}, + {0x10597, 0x105a1}, {0x105a3, 0x105b1}, {0x105b3, 0x105b9}, + {0x105bb, 0x105bc}, {0x10600, 0x10736}, {0x10740, 0x10755}, + {0x10760, 0x10767}, {0x10780, 0x10785}, {0x10787, 0x107b0}, + {0x107b2, 0x107ba}, {0x10800, 0x10805}, {0x10808, 0x10808}, + {0x1080a, 0x10835}, {0x10837, 0x10838}, {0x1083c, 0x1083c}, + {0x1083f, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089e}, + {0x108e0, 0x108f2}, {0x108f4, 0x108f5}, {0x10900, 0x10915}, + {0x10920, 0x10939}, {0x10980, 0x109b7}, {0x109be, 0x109bf}, + {0x10a00, 0x10a00}, {0x10a10, 0x10a13}, {0x10a15, 0x10a17}, + {0x10a19, 0x10a35}, {0x10a60, 0x10a7c}, {0x10a80, 0x10a9c}, + {0x10ac0, 0x10ac7}, {0x10ac9, 0x10ae4}, {0x10b00, 0x10b35}, + {0x10b40, 0x10b55}, {0x10b60, 0x10b72}, {0x10b80, 0x10b91}, + {0x10c00, 0x10c48}, {0x10c80, 0x10cb2}, {0x10cc0, 0x10cf2}, + {0x10d00, 0x10d23}, {0x10e80, 0x10ea9}, {0x10eb0, 0x10eb1}, + {0x10f00, 0x10f1c}, {0x10f27, 0x10f27}, {0x10f30, 0x10f45}, + {0x10f70, 0x10f81}, {0x10fb0, 0x10fc4}, {0x10fe0, 0x10ff6}, + {0x11003, 0x11037}, {0x11071, 0x11072}, {0x11075, 0x11075}, + {0x11083, 0x110af}, {0x110d0, 0x110e8}, {0x11103, 0x11126}, + {0x11144, 0x11144}, {0x11147, 0x11147}, {0x11150, 0x11172}, + {0x11176, 0x11176}, {0x11183, 0x111b2}, {0x111c1, 0x111c4}, + {0x111da, 0x111da}, {0x111dc, 0x111dc}, {0x11200, 0x11211}, + {0x11213, 0x1122b}, {0x11280, 0x11286}, {0x11288, 0x11288}, + {0x1128a, 0x1128d}, {0x1128f, 0x1129d}, {0x1129f, 0x112a8}, + {0x112b0, 0x112de}, 
{0x11305, 0x1130c}, {0x1130f, 0x11310}, + {0x11313, 0x11328}, {0x1132a, 0x11330}, {0x11332, 0x11333}, + {0x11335, 0x11339}, {0x1133d, 0x1133d}, {0x11350, 0x11350}, + {0x1135d, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144a}, + {0x1145f, 0x11461}, {0x11480, 0x114af}, {0x114c4, 0x114c5}, + {0x114c7, 0x114c7}, {0x11580, 0x115ae}, {0x115d8, 0x115db}, + {0x11600, 0x1162f}, {0x11644, 0x11644}, {0x11680, 0x116aa}, + {0x116b8, 0x116b8}, {0x11700, 0x1171a}, {0x11740, 0x11746}, + {0x11800, 0x1182b}, {0x118a0, 0x118df}, {0x118ff, 0x11906}, + {0x11909, 0x11909}, {0x1190c, 0x11913}, {0x11915, 0x11916}, + {0x11918, 0x1192f}, {0x1193f, 0x1193f}, {0x11941, 0x11941}, + {0x119a0, 0x119a7}, {0x119aa, 0x119d0}, {0x119e1, 0x119e1}, + {0x119e3, 0x119e3}, {0x11a00, 0x11a00}, {0x11a0b, 0x11a32}, + {0x11a3a, 0x11a3a}, {0x11a50, 0x11a50}, {0x11a5c, 0x11a89}, + {0x11a9d, 0x11a9d}, {0x11ab0, 0x11af8}, {0x11c00, 0x11c08}, + {0x11c0a, 0x11c2e}, {0x11c40, 0x11c40}, {0x11c72, 0x11c8f}, + {0x11d00, 0x11d06}, {0x11d08, 0x11d09}, {0x11d0b, 0x11d30}, + {0x11d46, 0x11d46}, {0x11d60, 0x11d65}, {0x11d67, 0x11d68}, + {0x11d6a, 0x11d89}, {0x11d98, 0x11d98}, {0x11ee0, 0x11ef2}, + {0x11fb0, 0x11fb0}, {0x12000, 0x12399}, {0x12400, 0x1246e}, + {0x12480, 0x12543}, {0x12f90, 0x12ff0}, {0x13000, 0x1342e}, + {0x14400, 0x14646}, {0x16800, 0x16a38}, {0x16a40, 0x16a5e}, + {0x16a70, 0x16abe}, {0x16ad0, 0x16aed}, {0x16b00, 0x16b2f}, + {0x16b40, 0x16b43}, {0x16b63, 0x16b77}, {0x16b7d, 0x16b8f}, + {0x16e40, 0x16e7f}, {0x16f00, 0x16f4a}, {0x16f50, 0x16f50}, + {0x16f93, 0x16f9f}, {0x16fe0, 0x16fe1}, {0x16fe3, 0x16fe3}, + {0x17000, 0x187f7}, {0x18800, 0x18cd5}, {0x18d00, 0x18d08}, + {0x1aff0, 0x1aff3}, {0x1aff5, 0x1affb}, {0x1affd, 0x1affe}, + {0x1b000, 0x1b122}, {0x1b150, 0x1b152}, {0x1b164, 0x1b167}, + {0x1b170, 0x1b2fb}, {0x1bc00, 0x1bc6a}, {0x1bc70, 0x1bc7c}, + {0x1bc80, 0x1bc88}, {0x1bc90, 0x1bc99}, {0x1d400, 0x1d454}, + {0x1d456, 0x1d49c}, {0x1d49e, 0x1d49f}, {0x1d4a2, 0x1d4a2}, + {0x1d4a5, 0x1d4a6}, {0x1d4a9, 0x1d4ac}, {0x1d4ae, 0x1d4b9}, + {0x1d4bb, 0x1d4bb}, {0x1d4bd, 0x1d4c3}, {0x1d4c5, 0x1d505}, + {0x1d507, 0x1d50a}, {0x1d50d, 0x1d514}, {0x1d516, 0x1d51c}, + {0x1d51e, 0x1d539}, {0x1d53b, 0x1d53e}, {0x1d540, 0x1d544}, + {0x1d546, 0x1d546}, {0x1d54a, 0x1d550}, {0x1d552, 0x1d6a5}, + {0x1d6a8, 0x1d6c0}, {0x1d6c2, 0x1d6da}, {0x1d6dc, 0x1d6fa}, + {0x1d6fc, 0x1d714}, {0x1d716, 0x1d734}, {0x1d736, 0x1d74e}, + {0x1d750, 0x1d76e}, {0x1d770, 0x1d788}, {0x1d78a, 0x1d7a8}, + {0x1d7aa, 0x1d7c2}, {0x1d7c4, 0x1d7cb}, {0x1df00, 0x1df1e}, + {0x1e100, 0x1e12c}, {0x1e137, 0x1e13d}, {0x1e14e, 0x1e14e}, + {0x1e290, 0x1e2ad}, {0x1e2c0, 0x1e2eb}, {0x1e7e0, 0x1e7e6}, + {0x1e7e8, 0x1e7eb}, {0x1e7ed, 0x1e7ee}, {0x1e7f0, 0x1e7fe}, + {0x1e800, 0x1e8c4}, {0x1e900, 0x1e943}, {0x1e94b, 0x1e94b}, + {0x1ee00, 0x1ee03}, {0x1ee05, 0x1ee1f}, {0x1ee21, 0x1ee22}, + {0x1ee24, 0x1ee24}, {0x1ee27, 0x1ee27}, {0x1ee29, 0x1ee32}, + {0x1ee34, 0x1ee37}, {0x1ee39, 0x1ee39}, {0x1ee3b, 0x1ee3b}, + {0x1ee42, 0x1ee42}, {0x1ee47, 0x1ee47}, {0x1ee49, 0x1ee49}, + {0x1ee4b, 0x1ee4b}, {0x1ee4d, 0x1ee4f}, {0x1ee51, 0x1ee52}, + {0x1ee54, 0x1ee54}, {0x1ee57, 0x1ee57}, {0x1ee59, 0x1ee59}, + {0x1ee5b, 0x1ee5b}, {0x1ee5d, 0x1ee5d}, {0x1ee5f, 0x1ee5f}, + {0x1ee61, 0x1ee62}, {0x1ee64, 0x1ee64}, {0x1ee67, 0x1ee6a}, + {0x1ee6c, 0x1ee72}, {0x1ee74, 0x1ee77}, {0x1ee79, 0x1ee7c}, + {0x1ee7e, 0x1ee7e}, {0x1ee80, 0x1ee89}, {0x1ee8b, 0x1ee9b}, + {0x1eea1, 0x1eea3}, {0x1eea5, 0x1eea9}, {0x1eeab, 0x1eebb}, + {0x20000, 0x2a6df}, {0x2a700, 0x2b738}, {0x2b740, 0x2b81d}, + {0x2b820, 0x2cea1}, {0x2ceb0, 0x2ebe0}, 
{0x2f800, 0x2fa1d}, + {0x30000, 0x3134a}, +}; + +// Number of ranges in kXIDStartRanges +constexpr size_t kNumXIDStartRanges = + sizeof(kXIDStartRanges) / sizeof(kXIDStartRanges[0]); + +// The additional code point interval ranges for the Unicode 14 XID_Continue +// set. This extends the values in kXIDStartRanges. +// This array needs to be in ascending order. +constexpr CodePointRange kXIDContinueRanges[] = { + {0x00030, 0x00039}, {0x0005f, 0x0005f}, {0x000b7, 0x000b7}, + {0x00300, 0x0036f}, {0x00387, 0x00387}, {0x00483, 0x00487}, + {0x00591, 0x005bd}, {0x005bf, 0x005bf}, {0x005c1, 0x005c2}, + {0x005c4, 0x005c5}, {0x005c7, 0x005c7}, {0x00610, 0x0061a}, + {0x0064b, 0x00669}, {0x00670, 0x00670}, {0x006d6, 0x006dc}, + {0x006df, 0x006e4}, {0x006e7, 0x006e8}, {0x006ea, 0x006ed}, + {0x006f0, 0x006f9}, {0x00711, 0x00711}, {0x00730, 0x0074a}, + {0x007a6, 0x007b0}, {0x007c0, 0x007c9}, {0x007eb, 0x007f3}, + {0x007fd, 0x007fd}, {0x00816, 0x00819}, {0x0081b, 0x00823}, + {0x00825, 0x00827}, {0x00829, 0x0082d}, {0x00859, 0x0085b}, + {0x00898, 0x0089f}, {0x008ca, 0x008e1}, {0x008e3, 0x00903}, + {0x0093a, 0x0093c}, {0x0093e, 0x0094f}, {0x00951, 0x00957}, + {0x00962, 0x00963}, {0x00966, 0x0096f}, {0x00981, 0x00983}, + {0x009bc, 0x009bc}, {0x009be, 0x009c4}, {0x009c7, 0x009c8}, + {0x009cb, 0x009cd}, {0x009d7, 0x009d7}, {0x009e2, 0x009e3}, + {0x009e6, 0x009ef}, {0x009fe, 0x009fe}, {0x00a01, 0x00a03}, + {0x00a3c, 0x00a3c}, {0x00a3e, 0x00a42}, {0x00a47, 0x00a48}, + {0x00a4b, 0x00a4d}, {0x00a51, 0x00a51}, {0x00a66, 0x00a71}, + {0x00a75, 0x00a75}, {0x00a81, 0x00a83}, {0x00abc, 0x00abc}, + {0x00abe, 0x00ac5}, {0x00ac7, 0x00ac9}, {0x00acb, 0x00acd}, + {0x00ae2, 0x00ae3}, {0x00ae6, 0x00aef}, {0x00afa, 0x00aff}, + {0x00b01, 0x00b03}, {0x00b3c, 0x00b3c}, {0x00b3e, 0x00b44}, + {0x00b47, 0x00b48}, {0x00b4b, 0x00b4d}, {0x00b55, 0x00b57}, + {0x00b62, 0x00b63}, {0x00b66, 0x00b6f}, {0x00b82, 0x00b82}, + {0x00bbe, 0x00bc2}, {0x00bc6, 0x00bc8}, {0x00bca, 0x00bcd}, + {0x00bd7, 0x00bd7}, {0x00be6, 0x00bef}, {0x00c00, 0x00c04}, + {0x00c3c, 0x00c3c}, {0x00c3e, 0x00c44}, {0x00c46, 0x00c48}, + {0x00c4a, 0x00c4d}, {0x00c55, 0x00c56}, {0x00c62, 0x00c63}, + {0x00c66, 0x00c6f}, {0x00c81, 0x00c83}, {0x00cbc, 0x00cbc}, + {0x00cbe, 0x00cc4}, {0x00cc6, 0x00cc8}, {0x00cca, 0x00ccd}, + {0x00cd5, 0x00cd6}, {0x00ce2, 0x00ce3}, {0x00ce6, 0x00cef}, + {0x00d00, 0x00d03}, {0x00d3b, 0x00d3c}, {0x00d3e, 0x00d44}, + {0x00d46, 0x00d48}, {0x00d4a, 0x00d4d}, {0x00d57, 0x00d57}, + {0x00d62, 0x00d63}, {0x00d66, 0x00d6f}, {0x00d81, 0x00d83}, + {0x00dca, 0x00dca}, {0x00dcf, 0x00dd4}, {0x00dd6, 0x00dd6}, + {0x00dd8, 0x00ddf}, {0x00de6, 0x00def}, {0x00df2, 0x00df3}, + {0x00e31, 0x00e31}, {0x00e33, 0x00e3a}, {0x00e47, 0x00e4e}, + {0x00e50, 0x00e59}, {0x00eb1, 0x00eb1}, {0x00eb3, 0x00ebc}, + {0x00ec8, 0x00ecd}, {0x00ed0, 0x00ed9}, {0x00f18, 0x00f19}, + {0x00f20, 0x00f29}, {0x00f35, 0x00f35}, {0x00f37, 0x00f37}, + {0x00f39, 0x00f39}, {0x00f3e, 0x00f3f}, {0x00f71, 0x00f84}, + {0x00f86, 0x00f87}, {0x00f8d, 0x00f97}, {0x00f99, 0x00fbc}, + {0x00fc6, 0x00fc6}, {0x0102b, 0x0103e}, {0x01040, 0x01049}, + {0x01056, 0x01059}, {0x0105e, 0x01060}, {0x01062, 0x01064}, + {0x01067, 0x0106d}, {0x01071, 0x01074}, {0x01082, 0x0108d}, + {0x0108f, 0x0109d}, {0x0135d, 0x0135f}, {0x01369, 0x01371}, + {0x01712, 0x01715}, {0x01732, 0x01734}, {0x01752, 0x01753}, + {0x01772, 0x01773}, {0x017b4, 0x017d3}, {0x017dd, 0x017dd}, + {0x017e0, 0x017e9}, {0x0180b, 0x0180d}, {0x0180f, 0x01819}, + {0x018a9, 0x018a9}, {0x01920, 0x0192b}, {0x01930, 0x0193b}, + {0x01946, 0x0194f}, {0x019d0, 
0x019da}, {0x01a17, 0x01a1b}, + {0x01a55, 0x01a5e}, {0x01a60, 0x01a7c}, {0x01a7f, 0x01a89}, + {0x01a90, 0x01a99}, {0x01ab0, 0x01abd}, {0x01abf, 0x01ace}, + {0x01b00, 0x01b04}, {0x01b34, 0x01b44}, {0x01b50, 0x01b59}, + {0x01b6b, 0x01b73}, {0x01b80, 0x01b82}, {0x01ba1, 0x01bad}, + {0x01bb0, 0x01bb9}, {0x01be6, 0x01bf3}, {0x01c24, 0x01c37}, + {0x01c40, 0x01c49}, {0x01c50, 0x01c59}, {0x01cd0, 0x01cd2}, + {0x01cd4, 0x01ce8}, {0x01ced, 0x01ced}, {0x01cf4, 0x01cf4}, + {0x01cf7, 0x01cf9}, {0x01dc0, 0x01dff}, {0x0203f, 0x02040}, + {0x02054, 0x02054}, {0x020d0, 0x020dc}, {0x020e1, 0x020e1}, + {0x020e5, 0x020f0}, {0x02cef, 0x02cf1}, {0x02d7f, 0x02d7f}, + {0x02de0, 0x02dff}, {0x0302a, 0x0302f}, {0x03099, 0x0309a}, + {0x0a620, 0x0a629}, {0x0a66f, 0x0a66f}, {0x0a674, 0x0a67d}, + {0x0a69e, 0x0a69f}, {0x0a6f0, 0x0a6f1}, {0x0a802, 0x0a802}, + {0x0a806, 0x0a806}, {0x0a80b, 0x0a80b}, {0x0a823, 0x0a827}, + {0x0a82c, 0x0a82c}, {0x0a880, 0x0a881}, {0x0a8b4, 0x0a8c5}, + {0x0a8d0, 0x0a8d9}, {0x0a8e0, 0x0a8f1}, {0x0a8ff, 0x0a909}, + {0x0a926, 0x0a92d}, {0x0a947, 0x0a953}, {0x0a980, 0x0a983}, + {0x0a9b3, 0x0a9c0}, {0x0a9d0, 0x0a9d9}, {0x0a9e5, 0x0a9e5}, + {0x0a9f0, 0x0a9f9}, {0x0aa29, 0x0aa36}, {0x0aa43, 0x0aa43}, + {0x0aa4c, 0x0aa4d}, {0x0aa50, 0x0aa59}, {0x0aa7b, 0x0aa7d}, + {0x0aab0, 0x0aab0}, {0x0aab2, 0x0aab4}, {0x0aab7, 0x0aab8}, + {0x0aabe, 0x0aabf}, {0x0aac1, 0x0aac1}, {0x0aaeb, 0x0aaef}, + {0x0aaf5, 0x0aaf6}, {0x0abe3, 0x0abea}, {0x0abec, 0x0abed}, + {0x0abf0, 0x0abf9}, {0x0fb1e, 0x0fb1e}, {0x0fe00, 0x0fe0f}, + {0x0fe20, 0x0fe2f}, {0x0fe33, 0x0fe34}, {0x0fe4d, 0x0fe4f}, + {0x0ff10, 0x0ff19}, {0x0ff3f, 0x0ff3f}, {0x0ff9e, 0x0ff9f}, + {0x101fd, 0x101fd}, {0x102e0, 0x102e0}, {0x10376, 0x1037a}, + {0x104a0, 0x104a9}, {0x10a01, 0x10a03}, {0x10a05, 0x10a06}, + {0x10a0c, 0x10a0f}, {0x10a38, 0x10a3a}, {0x10a3f, 0x10a3f}, + {0x10ae5, 0x10ae6}, {0x10d24, 0x10d27}, {0x10d30, 0x10d39}, + {0x10eab, 0x10eac}, {0x10f46, 0x10f50}, {0x10f82, 0x10f85}, + {0x11000, 0x11002}, {0x11038, 0x11046}, {0x11066, 0x11070}, + {0x11073, 0x11074}, {0x1107f, 0x11082}, {0x110b0, 0x110ba}, + {0x110c2, 0x110c2}, {0x110f0, 0x110f9}, {0x11100, 0x11102}, + {0x11127, 0x11134}, {0x11136, 0x1113f}, {0x11145, 0x11146}, + {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111b3, 0x111c0}, + {0x111c9, 0x111cc}, {0x111ce, 0x111d9}, {0x1122c, 0x11237}, + {0x1123e, 0x1123e}, {0x112df, 0x112ea}, {0x112f0, 0x112f9}, + {0x11300, 0x11303}, {0x1133b, 0x1133c}, {0x1133e, 0x11344}, + {0x11347, 0x11348}, {0x1134b, 0x1134d}, {0x11357, 0x11357}, + {0x11362, 0x11363}, {0x11366, 0x1136c}, {0x11370, 0x11374}, + {0x11435, 0x11446}, {0x11450, 0x11459}, {0x1145e, 0x1145e}, + {0x114b0, 0x114c3}, {0x114d0, 0x114d9}, {0x115af, 0x115b5}, + {0x115b8, 0x115c0}, {0x115dc, 0x115dd}, {0x11630, 0x11640}, + {0x11650, 0x11659}, {0x116ab, 0x116b7}, {0x116c0, 0x116c9}, + {0x1171d, 0x1172b}, {0x11730, 0x11739}, {0x1182c, 0x1183a}, + {0x118e0, 0x118e9}, {0x11930, 0x11935}, {0x11937, 0x11938}, + {0x1193b, 0x1193e}, {0x11940, 0x11940}, {0x11942, 0x11943}, + {0x11950, 0x11959}, {0x119d1, 0x119d7}, {0x119da, 0x119e0}, + {0x119e4, 0x119e4}, {0x11a01, 0x11a0a}, {0x11a33, 0x11a39}, + {0x11a3b, 0x11a3e}, {0x11a47, 0x11a47}, {0x11a51, 0x11a5b}, + {0x11a8a, 0x11a99}, {0x11c2f, 0x11c36}, {0x11c38, 0x11c3f}, + {0x11c50, 0x11c59}, {0x11c92, 0x11ca7}, {0x11ca9, 0x11cb6}, + {0x11d31, 0x11d36}, {0x11d3a, 0x11d3a}, {0x11d3c, 0x11d3d}, + {0x11d3f, 0x11d45}, {0x11d47, 0x11d47}, {0x11d50, 0x11d59}, + {0x11d8a, 0x11d8e}, {0x11d90, 0x11d91}, {0x11d93, 0x11d97}, + {0x11da0, 0x11da9}, {0x11ef3, 0x11ef6}, {0x16a60, 
0x16a69},
+    {0x16ac0, 0x16ac9}, {0x16af0, 0x16af4}, {0x16b30, 0x16b36},
+    {0x16b50, 0x16b59}, {0x16f4f, 0x16f4f}, {0x16f51, 0x16f87},
+    {0x16f8f, 0x16f92}, {0x16fe4, 0x16fe4}, {0x16ff0, 0x16ff1},
+    {0x1bc9d, 0x1bc9e}, {0x1cf00, 0x1cf2d}, {0x1cf30, 0x1cf46},
+    {0x1d165, 0x1d169}, {0x1d16d, 0x1d172}, {0x1d17b, 0x1d182},
+    {0x1d185, 0x1d18b}, {0x1d1aa, 0x1d1ad}, {0x1d242, 0x1d244},
+    {0x1d7ce, 0x1d7ff}, {0x1da00, 0x1da36}, {0x1da3b, 0x1da6c},
+    {0x1da75, 0x1da75}, {0x1da84, 0x1da84}, {0x1da9b, 0x1da9f},
+    {0x1daa1, 0x1daaf}, {0x1e000, 0x1e006}, {0x1e008, 0x1e018},
+    {0x1e01b, 0x1e021}, {0x1e023, 0x1e024}, {0x1e026, 0x1e02a},
+    {0x1e130, 0x1e136}, {0x1e140, 0x1e149}, {0x1e2ae, 0x1e2ae},
+    {0x1e2ec, 0x1e2f9}, {0x1e8d0, 0x1e8d6}, {0x1e944, 0x1e94a},
+    {0x1e950, 0x1e959}, {0x1fbf0, 0x1fbf9}, {0xe0100, 0xe01ef},
+};
+
+// Number of ranges in kXIDContinueRanges
+constexpr size_t kNumXIDContinueRanges =
+    sizeof(kXIDContinueRanges) / sizeof(kXIDContinueRanges[0]);
+
+/// @param code_point the input code_point
+/// @return true if the code_point is part of the XIDStart unicode set
+bool is_xid_start(CodePoint code_point) {
+  // Fast path for ASCII.
+  if ((code_point >= 'a' && code_point <= 'z') ||
+      (code_point >= 'A' && code_point <= 'Z')) {
+    return true;
+  }
+  // With [a-zA-Z] handled, nothing less than the next sequence start can be
+  // XIDStart, so filter them all out. This catches most of the common symbols
+  // that are used in ASCII.
+  if (code_point < 0x000aa) {
+    return false;
+  }
+  return std::binary_search(kXIDStartRanges,
+                            kXIDStartRanges + kNumXIDStartRanges, code_point);
+}
+
+/// @param code_point the input code_point
+/// @return true if the code_point is part of the XIDContinue unicode set
+bool is_xid_continue(CodePoint code_point) {
+  // Short circuit ASCII. The binary search will find these last, but most
+  // of our current source is ASCII, so handle them quicker.
+  if ((code_point >= '0' && code_point <= '9') || code_point == '_') {
+    return true;
+  }
+  return is_xid_start(code_point) ||
+         std::binary_search(kXIDContinueRanges,
+                            kXIDContinueRanges + kNumXIDContinueRanges,
+                            code_point);
+}
+
+/// @return true if @p code_point is considered whitespace
+bool is_space(CodePoint code_point) {
+  switch (code_point) {
+    case 0x0020:
+    case 0x0009:
+    case 0x000a:
+    case 0x000b:
+    case 0x000c:
+    case 0x000d:
+    case 0x0085:
+    case 0x200e:
+    case 0x200f:
+    case 0x2028:
+    case 0x2029:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/// A fixed capacity, dynamic sized queue of bits (expressed as bools)
+template <size_t CAPACITY_IN_BITS>
+class BitQueue {
+ public:
+  /// @param index the index of the bit starting from the front
+  /// @return the bit value
+  auto operator[](size_t index) {
+    assert(index < count());  // TODO(dneto): this should error out.
+    return bits_[(index + read_offset_) % CAPACITY_IN_BITS];
+  }
+
+  /// Removes the bit at the front of the queue
+  /// @returns the value of the bit that was removed
+  bool pop_front() {
+    assert(count_ > 0);
+    bool value = (*this)[0];
+    count_--;
+    read_offset_++;
+    return value;
+  }
+
+  /// Appends a bit to the back of the queue
+  void push_back(bool value) {
+    assert(count_ < CAPACITY_IN_BITS);
+    count_++;
+    (*this)[count_ - 1] = value;
+  }
+
+  /// @returns true if the queue holds no bits.
+  bool empty() const { return count_ == 0; }
+
+  /// @returns the number of bits held by the queue.
+  size_t count() const { return count_; }
+
+ private:
+  std::bitset<CAPACITY_IN_BITS> bits_;
+  size_t count_ = 0;        // number of bits contained
+  size_t read_offset_ = 0;  // read offset in bits
+  //
+#if ENABLE_LOGGING
+ public:
+  void to_chars(std::string& str) {
+    std::stringstream ss;
+    ss << count_ << ":";
+    for (size_t i = 0; i < count_; ++i) {
+      bool is_template = (*this)[i];
+      ss << (is_template ? "#" : ".");
+    }
+    str = ss.str();
+  }
+#endif
+};
+
+class Lexer {
+ public:
+  Lexer(TSLexer* l) : lexer_(l) {}
+
+  /// Advances the lexer by one code point.
+  void advance() { lexer_->advance(lexer_, /* whitespace */ false); }
+
+  /// Returns the next code point, advancing the lexer by one code point.
+  CodePoint next() {
+    // TODO(dneto): should assert !lexer_->eof(lexer_)
+    CodePoint lookahead = lexer_->lookahead;
+    advance();
+    return lookahead;
+  }
+
+  /// @return the next code point without advancing the lexer, or kEOF if there
+  /// are no more code points
+  CodePoint peek() { return lexer_->eof(lexer_) ? kEOF : lexer_->lookahead; }
+
+  /// @return true if the next code point is equal to @p code_point.
+  /// @note if the code point was found, then the lexer is advanced to that
+  /// code point.
+  bool match(CodePoint code_point) {
+    if (peek() == code_point) {
+      advance();
+      return true;
+    }
+    return false;
+  }
+
+  /// @return true if the next code point is found in @p code_points.
+  /// @note if the code point was found, then the lexer is advanced to that
+  /// code point.
+  bool match_anyof(std::initializer_list<CodePoint> code_points) {
+    for (CodePoint code_point : code_points) {
+      if (match(code_point)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /// Attempts to match an identifier pattern that starts with XIDStart
+  /// followed by any number of XIDContinue code points.
+  bool match_identifier() {
+    if (!is_xid_start(peek())) {
+      return false;
+    }
+
+    std::stringstream ss;
+    bool is_ascii = true;
+    if (CodePoint start = next(); start < 0x80) {
+      ss.put(char(start));
+    } else {
+      is_ascii = false;
+    }
+
+    while (true) {
+      if (!is_xid_continue(peek())) {
+        break;
+      }
+      if (CodePoint code_point = next(); code_point < 0x80) {
+        ss.put(char(code_point));
+      } else {
+        is_ascii = false;
+      }
+    }
+
+    if (is_ascii) {
+      LOG("ident: '%s'", ss.str().c_str());
+    } else {
+      LOG("ident");
+    }
+
+    return true;
+  }
+
+  /// Attempts to match a /* block comment */
+  bool match_block_comment() {
+    // TODO(dneto): Need to un-advance if matched '/' but not '*'
+    if (!match('/') || !match('*')) {
+      return false;
+    }
+
+    size_t nesting = 1;
+    while (nesting > 0 && !match(kEOF)) {
+      // TODO(dneto): If we match '/' but not '*' there is no way to un-advance
+      // back to make '/' the lookahead.
+      if (match('/') && match('*')) {
+        nesting++;
+        // TODO(dneto): Same here, need to be able to un-advance to before '*'
+      } else if (match('*') && match('/')) {
+        nesting--;
+      } else {
+        next();
+      }
+    }
+    return true;
+  }
+
+  /// Advances the lexer while the next code point is considered whitespace
+  void skip_whitespace() {
+    while (is_space(peek())) {
+      lexer_->advance(lexer_, /* whitespace */ true);
+    }
+  }
+
+ private:
+  TSLexer* lexer_;
+};
+
+struct Scanner {
+  struct State {
+    BitQueue<1024> lt_is_tmpl;  // Queue of disambiguated '<'
+    BitQueue<1024> gt_is_tmpl;  // Queue of disambiguated '>'
+    bool empty() const { return lt_is_tmpl.empty() && gt_is_tmpl.empty(); }
+  };
+  State state;
+  static_assert(sizeof(State) < TREE_SITTER_SERIALIZATION_BUFFER_SIZE);
+  // State is trivially copyable, so it can be serialized and deserialized
+  // with memcpy.
+  static_assert(std::is_trivially_copyable<State>::value);
+
+  /// Updates #state with the disambiguated '<' and '>' tokens.
+  /// The following assumptions are made on entry:
+  /// * lexer has just advanced to the end of an identifier
+  /// On exit, all '<' and '>' template tokens will be paired up to the closing
+  /// '>' for the first '<'.
+  void classify_template_args(Lexer& lexer) {
+    LOG("classify_template_args()");
+
+    if (!lexer.match('<')) {
+      LOG("  missing '<'");
+      return;
+    }
+
+    // The current expression nesting depth.
+    size_t expr_depth = 0;
+
+    // A stack of '<' tokens.
+    // Used to pair '<' and '>' tokens at the same expression depth.
+    struct StackEntry {
+      size_t index;       // Index of the opening '<' in lt_is_tmpl
+      size_t expr_depth;  // The value of 'expr_depth' for the opening '<'
+    };
+    std::vector<StackEntry> lt_stack;
+
+    LOG("classify_template_args() '<'");
+    lt_stack.push_back(StackEntry{state.lt_is_tmpl.count(), expr_depth});
+    state.lt_is_tmpl.push_back(false);  // Default to less-than
+
+    while (!lt_stack.empty() && !lexer.match(kEOF)) {
+      lexer.skip_whitespace();
+
+      // TODO: skip line-ending comments.
+      if (lexer.match_block_comment()) {
+        continue;
+      }
+
+      if (lexer.match_identifier()) {
+        lexer.skip_whitespace();  // TODO: Skip comments
+        if (lexer.match('<')) {
+          LOG("classify_template_args() '<'");
+          lt_stack.push_back(StackEntry{state.lt_is_tmpl.count(), expr_depth});
+          state.lt_is_tmpl.push_back(false);  // Default to less-than
+        }
+        continue;
+      }
+
+      if (lexer.match('>')) {
+        LOG("classify_template_args() '>'");
+        if (!lt_stack.empty() && lt_stack.back().expr_depth == expr_depth) {
+          LOG("  TEMPLATE MATCH");
+          state.gt_is_tmpl.push_back(true);
+          state.lt_is_tmpl[lt_stack.back().index] = true;
+          lt_stack.pop_back();
+        } else {
+          LOG("  non-template '>'");
+          state.gt_is_tmpl.push_back(false);
+        }
+        continue;
+      }
+
+      if (lexer.match_anyof({'(', '['})) {
+        LOG("  expr_depth++");
+        // Entering a nested expression
+        expr_depth++;
+        continue;
+      }
+
+      if (lexer.match_anyof({')', ']'})) {
+        LOG("  expr_depth--");
+        // Exiting a nested expression
+        // Pop the stack until we return to the current expression
+        // expr_depth
+        while (!lt_stack.empty() && lt_stack.back().expr_depth == expr_depth) {
+          lt_stack.pop_back();
+        }
+        if (expr_depth > 0) {
+          expr_depth--;
+        }
+        continue;
+      }
+
+      if (lexer.match_anyof({';', '{', '=', ':'})) {
+        LOG("  expression terminator");
+        // Expression terminating tokens. No open template list can
+        // hold these tokens, so clear the stack and expression depth.
+        expr_depth = 0;
+        lt_stack.clear();
+        continue;
+      }
+
+      bool short_circuit = false;
+      if (lexer.match('&')) {
+        short_circuit = lexer.match('&');
+      } else if (lexer.match('|')) {
+        short_circuit = lexer.match('|');
+      }
+      if (short_circuit) {
+        LOG("  short-circuiting expression");
+        // Treat 'a < b || c > d' as a logical binary operator of two
+        // comparison operators instead of a single template argument
+        // 'b||c'. Use parentheses around 'b||c' to parse as a
+        // template argument list.
+        while (!lt_stack.empty() && lt_stack.back().expr_depth == expr_depth) {
+          lt_stack.pop_back();
+        }
+        continue;
+      }
+
+      LOG("  skip: '%c'", char(lexer.peek()));
+      lexer.next();
+    }
+  }
+
+  std::string valids(const bool* const valid_symbols) {
+    std::string result;
+    for (int i = 0; i < static_cast<int>(ERROR); i++) {
+      result += std::string(valid_symbols[i] ? "+" : "_");
+    }
+    for (int i = 0; i < static_cast<int>(ERROR); i++) {
+      if (valid_symbols[i]) {
+        result += std::string(" ") + str(static_cast<Token>(i), true);
+      }
+    }
+    return result;
+  }
+
+  /// The external token scanner function. Handles block comments and
+  /// template-argument-list vs less-than / greater-than disambiguation.
+  /// @return true if lexer->result_symbol was assigned a Token, or
+  /// false if the token should be taken from the regular WGSL tree-sitter
+  /// grammar.
+  bool scan(TSLexer* ts_lexer, const bool* const valid_symbols) {
+    Lexer lexer{ts_lexer};
+
+    LOG("scan: '%c' [%u] %s", char(lexer.peek()),
+        unsigned(ts_lexer->get_column(ts_lexer)),
+        valids(valid_symbols).c_str());
+
+    if (valid_symbols[Token::ERROR]) {
+      ts_lexer->result_symbol = Token::ERROR;
+      return true;
+    }
+
+    if (valid_symbols[Token::DISAMBIGUATE_TEMPLATE]) {
+      // The parser is telling us the _disambiguate_template token
+      // may appear at the current position.
+      // The next token may be the start of a template list, so
+      // scan forward and use the token-list disambiguation
+      // algorithm to mark template-list-start and template-list-end
+      // tokens. These are recorded in the lt and gt bit queues.
+
+      // Call mark_end so that we can "advance" past code points without
+      // automatically including them in the resulting token.
+      ts_lexer->mark_end(ts_lexer);
+      ts_lexer->result_symbol = Token::DISAMBIGUATE_TEMPLATE;
+
+      // TODO(dneto): should also skip comments, both line comments
+      // and block comments.
+      lexer.skip_whitespace();
+      if (lexer.peek() == '<') {
+        if (state.lt_is_tmpl.empty()) {
+          classify_template_args(lexer);
+        }
+      }
+
+      // This has to return true so that Treesitter will save
+      // the state generated by the disambiguation scan.
+      return true;
+    }
+
+    lexer.skip_whitespace();
+
+    auto match = [&](Token token) {
+      ts_lexer->mark_end(ts_lexer);
+      ts_lexer->result_symbol = token;
+      return true;
+    };
+
+    // TODO(dneto): checkpoint and rewind if failed.
+    if (lexer.match_block_comment()) {
+      return match(Token::BLOCK_COMMENT);
+    }
+
+    // TODO(dneto): Check valid array first.
+    if (lexer.match('<')) {
+      if (!state.lt_is_tmpl.empty() && state.lt_is_tmpl.pop_front()) {
+        return match(Token::TEMPLATE_ARGS_START);
+      }
+      if (lexer.match('=')) {
+        return match(Token::LESS_THAN_EQUAL);
+      }
+      if (lexer.match('<')) {
+        if (lexer.match('=')) {
+          return match(Token::SHIFT_LEFT_ASSIGN);
+        }
+        return match(Token::SHIFT_LEFT);
+      }
+      return match(Token::LESS_THAN);
+    }
+
+    // TODO(dneto): check valid array first.
+    if (lexer.match('>')) {
+      if (!state.gt_is_tmpl.empty() && state.gt_is_tmpl.pop_front()) {
+        return match(Token::TEMPLATE_ARGS_END);
+      }
+      if (lexer.match('=')) {
+        return match(Token::GREATER_THAN_EQUAL);
+      }
+      if (lexer.match('>')) {
+        if (lexer.match('=')) {
+          return match(Token::SHIFT_RIGHT_ASSIGN);
+        }
+        return match(Token::SHIFT_RIGHT);
+      }
+      return match(Token::GREATER_THAN);
+    }
+
+    return false;  // Use regular parsing
+  }
+
+  /// Serializes the scanner state into @p buffer.
+  unsigned serialize(char* buffer) {
+    if (state.empty()) {
+      return 0;
+    }
+#if ENABLE_LOGGING
+    std::string lt_str; state.lt_is_tmpl.to_chars(lt_str);
+    std::string gt_str; state.gt_is_tmpl.to_chars(gt_str);
+    LOG("serialize(lt_is_tmpl: %s, gt_is_tmpl: %s)",
+        lt_str.c_str(), gt_str.c_str());
+#endif
+    size_t bytes_written = 0;
+    auto write = [&](const void* data, size_t num_bytes) {
+      assert(bytes_written + num_bytes <=
+             TREE_SITTER_SERIALIZATION_BUFFER_SIZE);
+      memcpy(buffer + bytes_written, data, num_bytes);
+      bytes_written += num_bytes;
+    };
+    write(&state.lt_is_tmpl, sizeof(state.lt_is_tmpl));
+    write(&state.gt_is_tmpl, sizeof(state.gt_is_tmpl));
+    // TODO(dneto): the implicit conversion may be narrowing.
+    return bytes_written;
+  }
+
+  /// Deserializes the scanner state from @p buffer.
+  void deserialize(const char* const buffer, unsigned length) {
+    if (length == 0) {
+      state = {};
+    } else {
+      size_t bytes_read = 0;
+      auto read = [&](void* data, size_t num_bytes) {
+        assert(bytes_read + num_bytes <= length);
+        memcpy(data, buffer + bytes_read, num_bytes);
+        bytes_read += num_bytes;
+      };
+      read(&state.lt_is_tmpl, sizeof(state.lt_is_tmpl));
+      read(&state.gt_is_tmpl, sizeof(state.gt_is_tmpl));
+#if ENABLE_LOGGING
+      std::string lt_str; state.lt_is_tmpl.to_chars(lt_str);
+      std::string gt_str; state.gt_is_tmpl.to_chars(gt_str);
+      LOG("deserialize(lt_is_tmpl: %s, gt_is_tmpl: %s)",
+          lt_str.c_str(), gt_str.c_str());
+#endif
+      assert(bytes_read == length);
+    }
+  }
+};
+
+}  // anonymous namespace
+
+extern "C" {
+// Called once when language is set on a parser.
+// Allocates memory for storing scanner state.
+void* tree_sitter_wgsl_external_scanner_create() {
+  return new Scanner();
+}
+
+// Called once parser is deleted or different language set.
+// Frees memory storing scanner state.
+void tree_sitter_wgsl_external_scanner_destroy(void* const payload) {
+  Scanner* const scanner = static_cast<Scanner*>(payload);
+  delete scanner;
+}
+
+// Called whenever this scanner recognizes a token.
+// Serializes scanner state into buffer.
+unsigned tree_sitter_wgsl_external_scanner_serialize(void* const payload,
+                                                     char* const buffer) {
+  Scanner* scanner = static_cast<Scanner*>(payload);
+  return scanner->serialize(buffer);
+}
+
+// Called when handling edits and ambiguities.
+// Deserializes scanner state from buffer.
+void tree_sitter_wgsl_external_scanner_deserialize(void* const payload,
+                                                   const char* const buffer,
+                                                   unsigned const length) {
+  Scanner* const scanner = static_cast<Scanner*>(payload);
+  scanner->deserialize(buffer, length);
+}
+
+// Scans for tokens.
+bool tree_sitter_wgsl_external_scanner_scan(void* const payload,
+                                            TSLexer* const lexer,
+                                            const bool* const valid_symbols) {
+  Scanner* const scanner = static_cast<Scanner*>(payload);
+  if (scanner->scan(lexer, valid_symbols)) {
+    LOG("scan returned: %s", str(static_cast<Token>(lexer->result_symbol)));
+    return true;
+  }
+  return false;
+}
+
+}  // extern "C"
diff --git a/wgsl/wgsl.recursive.bs.include b/wgsl/wgsl.recursive.bs.include
index 9f333d580b..f295215069 100644
--- a/wgsl/wgsl.recursive.bs.include
+++ b/wgsl/wgsl.recursive.bs.include
@@ -163,6 +163,10 @@
compound_assignment_operator: + | [=syntax_sym/shift_left_assign=] + + | [=syntax_sym/shift_right_assign=] + | `'%='` | `'&='` @@ -175,10 +179,6 @@ | `'/='` - | `'<<='` - - | `'>>='` - | `'^='` | `'|='` @@ -287,7 +287,7 @@ | [=recursive descent syntax/attribute=] * `'override'` [=recursive descent syntax/optionally_typed_ident=] ( `'='` [=recursive descent syntax/expression=] )? `';'` - | [=recursive descent syntax/attribute=] * `'var'` ( `'<'` [=recursive descent syntax/address_space=] ( `','` [=recursive descent syntax/access_mode=] )? `'>'` )? [=recursive descent syntax/optionally_typed_ident=] ( `'='` [=recursive descent syntax/expression=] )? `';'` + | [=recursive descent syntax/attribute=] * `'var'` ( [=syntax_sym/_template_args_start=] [=recursive descent syntax/address_space=] ( `','` [=recursive descent syntax/access_mode=] )? [=syntax_sym/_template_args_end=] )? [=recursive descent syntax/optionally_typed_ident=] ( `'='` [=recursive descent syntax/expression=] )? `';'` | `';'` @@ -423,7 +423,7 @@ | `'('` [=recursive descent syntax/expression=] `')'` - | `'bitcast'` `'<'` [=recursive descent syntax/type_specifier=] `'>'` `'('` [=recursive descent syntax/expression=] `')'` + | `'bitcast'` [=syntax_sym/_template_args_start=] [=recursive descent syntax/type_specifier=] [=syntax_sym/_template_args_end=] `'('` [=recursive descent syntax/expression=] `')'`
@@ -431,17 +431,17 @@ | [=recursive descent syntax/shift_expression.post.unary_expression=] - | [=recursive descent syntax/shift_expression.post.unary_expression=] `'!='` [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] + | [=recursive descent syntax/shift_expression.post.unary_expression=] [=syntax_sym/greater_than=] [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] - | [=recursive descent syntax/shift_expression.post.unary_expression=] `'<'` [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] + | [=recursive descent syntax/shift_expression.post.unary_expression=] [=syntax_sym/greater_than_equal=] [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] - | [=recursive descent syntax/shift_expression.post.unary_expression=] `'<='` [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] + | [=recursive descent syntax/shift_expression.post.unary_expression=] [=syntax_sym/less_than=] [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] - | [=recursive descent syntax/shift_expression.post.unary_expression=] `'=='` [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] + | [=recursive descent syntax/shift_expression.post.unary_expression=] [=syntax_sym/less_than_equal=] [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] - | [=recursive descent syntax/shift_expression.post.unary_expression=] `'>'` [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] + | [=recursive descent syntax/shift_expression.post.unary_expression=] `'!='` [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] - | [=recursive descent syntax/shift_expression.post.unary_expression=] `'>='` [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=] + | [=recursive descent syntax/shift_expression.post.unary_expression=] `'=='` [=recursive descent syntax/unary_expression=] [=recursive descent syntax/shift_expression.post.unary_expression=]
@@ -473,9 +473,9 @@ | ( [=recursive descent syntax/multiplicative_operator=] [=recursive descent syntax/unary_expression=] )* ( [=recursive descent syntax/additive_operator=] [=recursive descent syntax/unary_expression=] ( [=recursive descent syntax/multiplicative_operator=] [=recursive descent syntax/unary_expression=] )* )* - | `'<<'` [=recursive descent syntax/unary_expression=] + | [=syntax_sym/shift_left=] [=recursive descent syntax/unary_expression=] - | `'>>'` [=recursive descent syntax/unary_expression=] + | [=syntax_sym/shift_right=] [=recursive descent syntax/unary_expression=]
@@ -595,13 +595,13 @@ | [=recursive descent syntax/depth_texture_type=] - | [=recursive descent syntax/sampled_texture_type=] `'<'` [=recursive descent syntax/type_specifier=] `'>'` + | [=recursive descent syntax/sampled_texture_type=] [=syntax_sym/_template_args_start=] [=recursive descent syntax/type_specifier=] [=syntax_sym/_template_args_end=] | [=recursive descent syntax/sampler_type=] - | [=recursive descent syntax/storage_texture_type=] `'<'` [=recursive descent syntax/texel_format=] `','` [=recursive descent syntax/access_mode=] `'>'` + | [=recursive descent syntax/storage_texture_type=] [=syntax_sym/_template_args_start=] [=recursive descent syntax/texel_format=] `','` [=recursive descent syntax/access_mode=] [=syntax_sym/_template_args_end=] - | [=syntax/multisampled_texture_type=] `'<'` [=recursive descent syntax/type_specifier=] `'>'` + | [=syntax/multisampled_texture_type=] [=syntax_sym/_template_args_start=] [=recursive descent syntax/type_specifier=] [=syntax_sym/_template_args_end=]
@@ -621,15 +621,15 @@
type_specifier_without_ident: - | [=recursive descent syntax/mat_prefix=] `'<'` [=recursive descent syntax/type_specifier=] `'>'` + | [=recursive descent syntax/mat_prefix=] [=syntax_sym/_template_args_start=] [=recursive descent syntax/type_specifier=] [=syntax_sym/_template_args_end=] | [=recursive descent syntax/texture_and_sampler_types=] - | [=recursive descent syntax/vec_prefix=] `'<'` [=recursive descent syntax/type_specifier=] `'>'` + | [=recursive descent syntax/vec_prefix=] [=syntax_sym/_template_args_start=] [=recursive descent syntax/type_specifier=] [=syntax_sym/_template_args_end=] - | `'array'` `'<'` [=recursive descent syntax/type_specifier=] ( `','` [=recursive descent syntax/element_count_expression=] )? `'>'` + | `'array'` [=syntax_sym/_template_args_start=] [=recursive descent syntax/type_specifier=] ( `','` [=recursive descent syntax/element_count_expression=] )? [=syntax_sym/_template_args_end=] - | `'atomic'` `'<'` [=recursive descent syntax/type_specifier=] `'>'` + | `'atomic'` [=syntax_sym/_template_args_start=] [=recursive descent syntax/type_specifier=] [=syntax_sym/_template_args_end=] | `'bool'` @@ -639,7 +639,7 @@ | `'i32'` - | `'ptr'` `'<'` [=recursive descent syntax/address_space=] `','` [=recursive descent syntax/type_specifier=] ( `','` [=recursive descent syntax/access_mode=] )? `'>'` + | `'ptr'` [=syntax_sym/_template_args_start=] [=recursive descent syntax/address_space=] `','` [=recursive descent syntax/type_specifier=] ( `','` [=recursive descent syntax/access_mode=] )? [=syntax_sym/_template_args_end=] | `'u32'`
@@ -663,7 +663,7 @@
variable_decl: - | `'var'` ( `'<'` [=recursive descent syntax/address_space=] ( `','` [=recursive descent syntax/access_mode=] )? `'>'` )? [=recursive descent syntax/optionally_typed_ident=] + | `'var'` ( [=syntax_sym/_template_args_start=] [=recursive descent syntax/address_space=] ( `','` [=recursive descent syntax/access_mode=] )? [=syntax_sym/_template_args_end=] )? [=recursive descent syntax/optionally_typed_ident=]
diff --git a/wgsl/wgsl_unit_tests.py b/wgsl/wgsl_unit_tests.py
new file mode 100644
index 0000000000..5fadef0a77
--- /dev/null
+++ b/wgsl/wgsl_unit_tests.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+#
+# Copyright 2023 Google LLC
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of works must retain the original copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the original
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. Neither the name of the W3C nor the names of its contributors
+# may be used to endorse or promote products derived from this work
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+import argparse
+import os
+import sys
+from tree_sitter import Language, Parser
+
+SCRIPT='wgsl_unit_tests.py'
+
+class Case:
+    """
+    A test case
+    """
+    def __init__(self,text,expect_pass=True):
+        self.text = text
+        self.expect_pass = (expect_pass == True)
+
+    def __str__(self):
+        expectation = "expect_pass" if self.expect_pass else "expect_fail"
+        return "Case:{}:\n---\n{}\n---".format(expectation,self.text)
+
+class XFail(Case):
+    def __init__(self,text):
+        super().__init__(text,expect_pass=False)
+
+cases = [
+    XFail("this fails"),
+    XFail("#version 450"),
+    Case("const pi = 3.14;"),
+    Case("const b = bitcast(1u);"),
+    Case("var s: sampler;"),
+    Case("@group(0) @binding(0) var s: sampler;"),
+    Case("var w: i32;"),
+    Case("fn foo() {var f: i32;}"),
+    Case("var w: array,1>;"),
+    XFail("var w: array,(vec(1).x)>;"),
+    Case("var w: array,(vec3(1).x)>;"),
+    XFail("const c = arrayb>;"),
+    Case("var c : arrayb)>;"),
+    Case("const a = arrayb))>();"),
+    Case("const b = arrayb)>();"),
+    XFail("const d : arrayb)>();"),
+    Case("fn main(){i=1;}"),
+    Case("fn main(){var i:i32; i=1;}"),
+    Case("var w: array;"),
+    Case("var w: array,1>;"),
+    Case("var w: vec3;"),
+    Case("alias t = vec3;"),
+    Case("alias t = vec3;"),
+    Case("alias t = array;"),
+    Case("var c : array;"),
+    XFail("var c : array<(a>b)>;"), # Type specifier must start with identifier
+    Case("fn f(p: ptr) {}"),
+    Case("fn m(){x++;}"),
+    Case("fn m(){x--;}"),
+    Case("fn m(){x();}"),
+]
+
+class Options:
+    def __init__(self,shared_lib):
+        self.shared_lib = shared_lib
+        self.verbose = False
+
+def run_tests(options):
+    """
+    Returns True if all tests passed
+    """
+    global cases
+    if not os.path.exists(options.shared_lib):
+        raise RuntimeError("missing shared library {}".format(options.shared_lib))
+
+    language = Language(options.shared_lib, "wgsl")
+    parser = Parser()
+    parser.set_language(language)
+
+    print("{}: ".format(SCRIPT),flush=True,end='')
+
+    num_cases = 0
+    num_errors = 0
+    for case in cases:
+        num_cases += 1
+        print(".",flush=True,end='')
+        if options.verbose:
+            print(case)
+        tree = parser.parse(bytes(case.text,"utf8"))
+        if case.expect_pass == tree.root_node.has_error:
+            num_errors += 1
+            print("**Error**")
+            print(case)
+            print(tree.root_node.sexp())
+            print("---Case end\n",flush=True)
+
+    print("{} pass {} fail ".format(num_cases-num_errors,num_errors),flush=True)
+
+    return num_errors == 0
+
+def main():
+    argparser = argparse.ArgumentParser(
+            prog='wgsl_unit_tests.py',
+            description='Unit tests for the tree-sitter WGSL parser')
+    argparser.add_argument("--verbose","-v",
+                           action='store_true',
+                           help="be verbose")
+    argparser.add_argument("--parser",
+                           help="path to the shared library for the WGSL tree-sitter parser",
+                           default="grammar/build/wgsl.so")
+
+    args = argparser.parse_args()
+    options = Options(args.parser)
+    options.verbose = args.verbose
+
+    if not run_tests(options):
+        return 1
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
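
For quick reference, the py-tree-sitter entry points used by wgsl_unit_tests.py can also be exercised on their own. The following is a non-normative sketch, not part of the patch: it assumes the parser shared library has already been built at the default path used above (grammar/build/wgsl.so), and the WGSL program text is an invented example.

```python
#!/usr/bin/env python3
# Minimal driver mirroring the py-tree-sitter usage in wgsl_unit_tests.py.
# Assumes grammar/build/wgsl.so has already been built.
from tree_sitter import Language, Parser

language = Language("grammar/build/wgsl.so", "wgsl")
parser = Parser()
parser.set_language(language)

# Parse a tiny program and inspect the result.
tree = parser.parse(bytes("const pi = 3.14;", "utf8"))
print(tree.root_node.sexp())     # s-expression dump of the parse tree
print(tree.root_node.has_error)  # False: the program parses cleanly
```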