Lexer: isTag: allow whole tag (except first char) to be any non-space (#3957)

smemsh · smemsh · commit 6a15239d6628 · 2025-09-28T14:19:43.000-07:00
Instead of walking the string and stopping at the first character that
fails isIdentifierNext() (see Lexer::isIdentifierStart(),
isIdentifierNext() and isSingleCharOperator()), walk all the way to the
end of the word.  This allows even punctuation and other non-whitespace
characters to be used in tags, anywhere after the first character.

We still need to use isIdentifierStart() for the first character, to
disambiguate a tag from a negative number or a subtraction.  Apparently
there is no command context available, so the parser cannot "know"
whether it's doing a "task calc" and apply different parse rules.  The
lexing seems to happen before the arguments are broken down into
commands for dispatch.
diff --git a/src/Lexer.cpp b/src/Lexer.cpp
@@ -708,7 +708,7 @@ bool Lexer::isSet(std::string& token, Lexer::Type& type) {
 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::tag
 //   ^ | '(' | ')' | <unicodeWhitespace>
-//     [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]*
+//     [ +|- ] <isIdentifierStart> [ <word> ]
 bool Lexer::isTag(std::string& token, Lexer::Type& type) {
   std::size_t marker = _cursor;
 
@@ -721,14 +721,12 @@ bool Lexer::isTag(std::string& token, Lexer::Type& type) {
     ++marker;
 
     if (isIdentifierStart(_text[marker])) {
-      utf8_next_char(_text, marker);
-
-      while (isIdentifierNext(_text[marker])) utf8_next_char(_text, marker);
-
-      token = _text.substr(_cursor, marker - _cursor);
-      type = Lexer::Type::tag;
-      _cursor = marker;
-      return true;
+      if (readWord(_text, marker, token)) {
+        token = _text.substr(_cursor, marker - _cursor);
+        type = Lexer::Type::tag;
+        _cursor = marker;
+        return true;
+      }
     }
   }
 
diff --git a/test/tag.test.py b/test/tag.test.py
@@ -594,6 +594,32 @@ def test_tag_filter_partial_match(self):
         self.assertNotIn("three", out)
         self.assertIn("four", out)
 
+class TestIssue3957(TestCase):
+    def setUp(self):
+        """Executed before each test in the class"""
+        self.t = Task()
+
+    def test_tag_dashes_and_utf8_glyphs(self):
+        """
+        - All non-whitespace valid if not first char
+        - Single char tags work
+        - UTF-8 can work as first/only chars too
+        """
+        crabglyph = "\U0001f980"
+        testtags = ['', 'this-test', 'that-test.foo', f"foo-{crabglyph}"]
+
+        self.t(f"add{' +'.join(testtags)} one")
+        code, out, err = self.t("_get 1.tags")
+        self.assertEqual(sorted(testtags[1:]), sorted(out.strip().split(",")))
+
+        self.t("add +x two")
+        code, out, err = self.t("_get 2.tags")
+        self.assertEqual("x\n", out)
+
+        self.t(f"add +{crabglyph} three")
+        code, out, err = self.t("_get 3.tags")
+        self.assertEqual(f"{crabglyph}\n", out)
+
 
 if __name__ == "__main__":
     from simpletap import TAPTestRunner