From 6ec8859929a16f9725319cc398b716acf913b01f Mon Sep 17 00:00:00 2001 From: Eric Bailey Date: Fri, 1 Mar 2024 12:44:52 -0600 Subject: [PATCH] Improve tag detection (#2260) * Allow tags to lead with and contain only numbers * Break tags on other whitespace characters * Export regexes from rich text detection * Add test * Add test * Disallow number-only tags * Avoid combining enclosing screen chars * Allow full-width number sign * Clarify tests * Fix punctuation edge case * Reorder * Simplify, add another test * Another test, comment --- .changeset/chatty-cows-kick.md | 5 +++ .changeset/lovely-pandas-pretend.md | 5 +++ .changeset/quick-ducks-joke.md | 5 +++ packages/api/src/index.ts | 1 + packages/api/src/rich-text/detection.ts | 18 ++++++++--- packages/api/src/rich-text/util.ts | 11 +++++++ .../api/tests/rich-text-detection.test.ts | 31 +++++++++++++++++-- 7 files changed, 68 insertions(+), 8 deletions(-) create mode 100644 .changeset/chatty-cows-kick.md create mode 100644 .changeset/lovely-pandas-pretend.md create mode 100644 .changeset/quick-ducks-joke.md create mode 100644 packages/api/src/rich-text/util.ts diff --git a/.changeset/chatty-cows-kick.md b/.changeset/chatty-cows-kick.md new file mode 100644 index 00000000000..76bd82f015d --- /dev/null +++ b/.changeset/chatty-cows-kick.md @@ -0,0 +1,5 @@ +--- +'@atproto/api': patch +--- + +Export regex from rich text detection diff --git a/.changeset/lovely-pandas-pretend.md b/.changeset/lovely-pandas-pretend.md new file mode 100644 index 00000000000..3a75be2877b --- /dev/null +++ b/.changeset/lovely-pandas-pretend.md @@ -0,0 +1,5 @@ +--- +'@atproto/api': patch +--- + +Disallow rare unicode whitespace characters from tags diff --git a/.changeset/quick-ducks-joke.md b/.changeset/quick-ducks-joke.md new file mode 100644 index 00000000000..923b2fe2fe0 --- /dev/null +++ b/.changeset/quick-ducks-joke.md @@ -0,0 +1,5 @@ +--- +'@atproto/api': patch +--- + +Allow tags to lead with numbers diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts index 9e407142aba..87cf1ccf01a 100644 --- a/packages/api/src/index.ts +++ b/packages/api/src/index.ts @@ -14,6 +14,7 @@ export * from './agent' export * from './rich-text/rich-text' export * from './rich-text/sanitization' export * from './rich-text/unicode' +export * from './rich-text/util' export * from './moderation' export * from './moderation/types' export { LABELS } from './moderation/const/labels' diff --git a/packages/api/src/rich-text/detection.ts b/packages/api/src/rich-text/detection.ts index 7b5444a68a5..22c5db1b087 100644 --- a/packages/api/src/rich-text/detection.ts +++ b/packages/api/src/rich-text/detection.ts @@ -1,6 +1,12 @@ import TLDs from 'tlds' import { AppBskyRichtextFacet } from '../client' import { UnicodeString } from './unicode' +import { + URL_REGEX, + MENTION_REGEX, + TAG_REGEX, + TRAILING_PUNCTUATION_REGEX, +} from './util' export type Facet = AppBskyRichtextFacet.Main @@ -9,7 +15,7 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined { const facets: Facet[] = [] { // mentions - const re = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g + const re = MENTION_REGEX while ((match = re.exec(text.utf16))) { if (!isValidDomain(match[3]) && !match[3].endsWith('.test')) { continue // probably not a handle @@ -33,8 +39,7 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined { } { // links - const re = - /(^|\s|\()((https?:\/\/[\S]+)|((?[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim + const re = URL_REGEX while ((match = re.exec(text.utf16))) { let uri = match[2] if (!uri.startsWith('http')) { @@ -70,11 +75,14 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined { } } { - const re = /(^|\s)#((?!\ufe0f)[^\d\s]\S*)(?=\s)?/g + const re = TAG_REGEX while ((match = re.exec(text.utf16))) { let [, leading, tag] = match - tag = tag.trim().replace(/\p{P}+$/gu, '') // strip ending punctuation + if (!tag) continue + + // strip ending punctuation and any spaces + tag = tag.trim().replace(TRAILING_PUNCTUATION_REGEX, '') if (tag.length === 0 || tag.length > 64) continue diff --git a/packages/api/src/rich-text/util.ts b/packages/api/src/rich-text/util.ts new file mode 100644 index 00000000000..ab50c66212d --- /dev/null +++ b/packages/api/src/rich-text/util.ts @@ -0,0 +1,11 @@ +export const MENTION_REGEX = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g +export const URL_REGEX = + /(^|\s|\()((https?:\/\/[\S]+)|((?[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim +export const TRAILING_PUNCTUATION_REGEX = /\p{P}+$/gu + +/** + * `\ufe0f` emoji modifier + * `\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2` zero-width spaces (likely incomplete) + */ +export const TAG_REGEX = + /(^|\s)[##]((?!\ufe0f)[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*[^\d\s\p{P}\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]+[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*)?/gu diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts index 0eafb65a3b1..084b5440a48 100644 --- a/packages/api/tests/rich-text-detection.test.ts +++ b/packages/api/tests/rich-text-detection.test.ts @@ -218,7 +218,7 @@ describe('detectFacets', () => { } }) - it('correctly detects tags inline', async () => { + describe('correctly detects tags inline', () => { const inputs: [ string, string[], @@ -234,11 +234,13 @@ describe('detectFacets', () => { ], ], ['#1', [], []], + ['#1a', ['1a'], [{ byteStart: 0, byteEnd: 3 }]], ['#tag', ['tag'], [{ byteStart: 0, byteEnd: 4 }]], ['body #tag', ['tag'], [{ byteStart: 5, byteEnd: 9 }]], ['#tag body', ['tag'], [{ byteStart: 0, byteEnd: 4 }]], ['body #tag body', ['tag'], [{ byteStart: 5, byteEnd: 9 }]], ['body #1', [], []], + ['body #1a', ['1a'], [{ byteStart: 5, byteEnd: 8 }]], ['body #a1', ['a1'], [{ byteStart: 5, byteEnd: 8 }]], ['#', [], []], ['#?', [], []], @@ -254,12 +256,18 @@ describe('detectFacets', () => { [], [], ], + [ + 'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!', + ['thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], + [{ byteStart: 5, byteEnd: 70 }], + ], [ 'its a #double#rainbow', ['double#rainbow'], [{ byteStart: 6, byteEnd: 21 }], ], ['##hashash', ['#hashash'], [{ byteStart: 0, byteEnd: 9 }]], + ['##', [], []], ['some #n0n3s@n5e!', ['n0n3s@n5e'], [{ byteStart: 5, byteEnd: 15 }]], [ 'works #with,punctuation', @@ -319,9 +327,26 @@ describe('detectFacets', () => { }, ], ], + ['no match (\\u200B): #​', [], []], + ['no match (\\u200Ba): #​a', [], []], + ['match (a\\u200Bb): #a​b', ['a'], [{ byteStart: 18, byteEnd: 20 }]], + ['match (ab\\u200B): #ab​', ['ab'], [{ byteStart: 18, byteEnd: 21 }]], + ['no match (\\u20e2tag): #⃢tag', [], []], + ['no match (a\\u20e2b): #a⃢b', ['a'], [{ byteStart: 21, byteEnd: 23 }]], + [ + 'match full width number sign (tag): #tag', + ['tag'], + [{ byteStart: 36, byteEnd: 42 }], + ], + [ + 'match full width number sign (tag): ##️⃣tag', + ['#️⃣tag'], + [{ byteStart: 36, byteEnd: 49 }], + ], + ['no match 1?: #1?', [], []], ] - for (const [input, tags, indices] of inputs) { + it.each(inputs)('%s', async (input, tags, indices) => { const rt = new RichText({ text: input }) await rt.detectFacets(agent) @@ -340,7 +365,7 @@ describe('detectFacets', () => { expect(detectedTags).toEqual(tags) expect(detectedIndices).toEqual(indices) - } + }) }) })