From 6ec8859929a16f9725319cc398b716acf913b01f Mon Sep 17 00:00:00 2001
From: Eric Bailey <git@esb.lol>
Date: Fri, 1 Mar 2024 12:44:52 -0600
Subject: [PATCH] Improve tag detection (#2260)

* Allow tags to lead with and contain only numbers

* Break tags on other whitespace characters

* Export regexes from rich text detection

* Add test

* Add test

* Disallow number-only tags

* Avoid combining enclosing screen chars

* Allow full-width number sign

* Clarify tests

* Fix punctuation edge case

* Reorder

* Simplify, add another test

* Another test, comment
---
 .changeset/chatty-cows-kick.md                |  5 +++
 .changeset/lovely-pandas-pretend.md           |  5 +++
 .changeset/quick-ducks-joke.md                |  5 +++
 packages/api/src/index.ts                     |  1 +
 packages/api/src/rich-text/detection.ts       | 18 ++++++++---
 packages/api/src/rich-text/util.ts            | 11 +++++++
 .../api/tests/rich-text-detection.test.ts     | 31 +++++++++++++++++--
 7 files changed, 68 insertions(+), 8 deletions(-)
 create mode 100644 .changeset/chatty-cows-kick.md
 create mode 100644 .changeset/lovely-pandas-pretend.md
 create mode 100644 .changeset/quick-ducks-joke.md
 create mode 100644 packages/api/src/rich-text/util.ts
diff --git a/.changeset/chatty-cows-kick.md b/.changeset/chatty-cows-kick.md
new file mode 100644
index 00000000000..76bd82f015d
--- /dev/null
+++ b/.changeset/chatty-cows-kick.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Export regexes from rich text detection
diff --git a/.changeset/lovely-pandas-pretend.md b/.changeset/lovely-pandas-pretend.md
new file mode 100644
index 00000000000..3a75be2877b
--- /dev/null
+++ b/.changeset/lovely-pandas-pretend.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Disallow rare Unicode whitespace characters in tags
diff --git a/.changeset/quick-ducks-joke.md b/.changeset/quick-ducks-joke.md
new file mode 100644
index 00000000000..923b2fe2fe0
--- /dev/null
+++ b/.changeset/quick-ducks-joke.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Allow tags to lead with numbers
diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts
index 9e407142aba..87cf1ccf01a 100644
--- a/packages/api/src/index.ts
+++ b/packages/api/src/index.ts
@@ -14,6 +14,7 @@ export * from './agent'
 export * from './rich-text/rich-text'
 export * from './rich-text/sanitization'
 export * from './rich-text/unicode'
+export * from './rich-text/util'
 export * from './moderation'
 export * from './moderation/types'
 export { LABELS } from './moderation/const/labels'
diff --git a/packages/api/src/rich-text/detection.ts b/packages/api/src/rich-text/detection.ts
index 7b5444a68a5..22c5db1b087 100644
--- a/packages/api/src/rich-text/detection.ts
+++ b/packages/api/src/rich-text/detection.ts
@@ -1,6 +1,12 @@
 import TLDs from 'tlds'
 import { AppBskyRichtextFacet } from '../client'
 import { UnicodeString } from './unicode'
+import {
+  URL_REGEX,
+  MENTION_REGEX,
+  TAG_REGEX,
+  TRAILING_PUNCTUATION_REGEX,
+} from './util'
 
 export type Facet = AppBskyRichtextFacet.Main
 
@@ -9,7 +15,7 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
   const facets: Facet[] = []
   {
     // mentions
-    const re = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g
+    const re = MENTION_REGEX
     while ((match = re.exec(text.utf16))) {
       if (!isValidDomain(match[3]) && !match[3].endsWith('.test')) {
         continue // probably not a handle
@@ -33,8 +39,7 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
   }
   {
     // links
-    const re =
-      /(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim
+    const re = URL_REGEX
     while ((match = re.exec(text.utf16))) {
       let uri = match[2]
       if (!uri.startsWith('http')) {
@@ -70,11 +75,14 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
     }
   }
   {
-    const re = /(^|\s)#((?!\ufe0f)[^\d\s]\S*)(?=\s)?/g
+    const re = TAG_REGEX
     while ((match = re.exec(text.utf16))) {
       let [, leading, tag] = match
 
-      tag = tag.trim().replace(/\p{P}+$/gu, '') // strip ending punctuation
+      if (!tag) continue
+
+      // strip ending punctuation and any spaces
+      tag = tag.trim().replace(TRAILING_PUNCTUATION_REGEX, '')
 
       if (tag.length === 0 || tag.length > 64) continue
 
diff --git a/packages/api/src/rich-text/util.ts b/packages/api/src/rich-text/util.ts
new file mode 100644
index 00000000000..ab50c66212d
--- /dev/null
+++ b/packages/api/src/rich-text/util.ts
@@ -0,0 +1,11 @@
+export const MENTION_REGEX = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g
+export const URL_REGEX =
+  /(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim
+export const TRAILING_PUNCTUATION_REGEX = /\p{P}+$/gu
+
+/**
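+ * `\ufe0f` variation selector-16 (emoji presentation); a tag may not start with it
+ * `\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2` soft hyphen, word joiner, hair space, zero-width space/non-joiner/joiner, combining enclosing screen; excluded from tags (list likely incomplete)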
+ */
+export const TAG_REGEX =
+  /(^|\s)[#＃]((?!\ufe0f)[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*[^\d\s\p{P}\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]+[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*)?/gu
diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts
index 0eafb65a3b1..084b5440a48 100644
--- a/packages/api/tests/rich-text-detection.test.ts
+++ b/packages/api/tests/rich-text-detection.test.ts
@@ -218,7 +218,7 @@ describe('detectFacets', () => {
     }
   })
 
-  it('correctly detects tags inline', async () => {
+  describe('correctly detects tags inline', () => {
     const inputs: [
       string,
       string[],
@@ -234,11 +234,13 @@ describe('detectFacets', () => {
         ],
       ],
       ['#1', [], []],
+      ['#1a', ['1a'], [{ byteStart: 0, byteEnd: 3 }]],
       ['#tag', ['tag'], [{ byteStart: 0, byteEnd: 4 }]],
       ['body #tag', ['tag'], [{ byteStart: 5, byteEnd: 9 }]],
       ['#tag body', ['tag'], [{ byteStart: 0, byteEnd: 4 }]],
       ['body #tag body', ['tag'], [{ byteStart: 5, byteEnd: 9 }]],
       ['body #1', [], []],
+      ['body #1a', ['1a'], [{ byteStart: 5, byteEnd: 8 }]],
       ['body #a1', ['a1'], [{ byteStart: 5, byteEnd: 8 }]],
       ['#', [], []],
       ['#?', [], []],
@@ -254,12 +256,18 @@ describe('detectFacets', () => {
         [],
         [],
       ],
+      [
+        'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!',
+        ['thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'],
+        [{ byteStart: 5, byteEnd: 70 }],
+      ],
       [
         'its a #double#rainbow',
         ['double#rainbow'],
         [{ byteStart: 6, byteEnd: 21 }],
       ],
       ['##hashash', ['#hashash'], [{ byteStart: 0, byteEnd: 9 }]],
+      ['##', [], []],
       ['some #n0n3s@n5e!', ['n0n3s@n5e'], [{ byteStart: 5, byteEnd: 15 }]],
       [
         'works #with,punctuation',
@@ -319,9 +327,26 @@ describe('detectFacets', () => {
           },
         ],
       ],
+      ['no match (\\u200B): #​', [], []],
+      ['no match (\\u200Ba): #​a', [], []],
+      ['match (a\\u200Bb): #a​b', ['a'], [{ byteStart: 18, byteEnd: 20 }]],
+      ['match (ab\\u200B): #ab​', ['ab'], [{ byteStart: 18, byteEnd: 21 }]],
+      ['no match (\\u20e2tag): #⃢tag', [], []],
+      ['match (a\\u20e2b): #a⃢b', ['a'], [{ byteStart: 18, byteEnd: 20 }]],
+      [
+        'match full width number sign (tag): ＃tag',
+        ['tag'],
+        [{ byteStart: 36, byteEnd: 42 }],
+      ],
+      [
+        'match full width number sign (tag): ＃#️⃣tag',
+        ['#️⃣tag'],
+        [{ byteStart: 36, byteEnd: 49 }],
+      ],
+      ['no match 1?: #1?', [], []],
     ]
 
-    for (const [input, tags, indices] of inputs) {
+    it.each(inputs)('%s', async (input, tags, indices) => {
       const rt = new RichText({ text: input })
       await rt.detectFacets(agent)
 
@@ -340,7 +365,7 @@ describe('detectFacets', () => {
 
       expect(detectedTags).toEqual(tags)
       expect(detectedIndices).toEqual(indices)
-    }
+    })
   })
 })