A (better) trigram tokenizer for SQLite3 FTS5
@@ -571,6 +574,8 @@ index 0000000..dec011f + +This tokenizer fixes this by treating spaces as a word boundary. The result is that `i am a bird` gets tokenized as `['i', 'am', 'a', 'bir', 'ird']` and `SELECT * FROM fts_table WHERE title MATCH 'a bird'` correctly returns the expected results. You get all the benefits for substring matching just with a wider range of queries. + ++Furthermore, the built-in `trigram` tokenizer treats CJK as normal characters and creates trigrams out of them. The problem is, in CJK a single Unicode character can be a whole word. `better-trigram` fixes this by treating each CJK character as its own token. For example: `李红:那是钢笔` gets tokenized as `['李','红',':','那','是','钢','笔']` and if there are any non-CJK words mixed in the input, they also get properly tokenized automatically. ++ +## Compatibility with `trigram` + +`better-trigram` is 99% compatible with `trigram`. This means it has full UTF-8 support, handles all the same edge cases etc. To ensure `better-trigram` remains compatible, it passes all the `trigram` tokenizer tests. Yay! @@ -642,16 +647,16 @@ index 0000000..dec011f + May you share freely, never taking more than you give. +``` diff --git a/node_modules/react-native-quick-sqlite/better-trigram/VERSION b/node_modules/react-native-quick-sqlite/better-trigram/VERSION -new file mode 100644 -index 0000000..7bcd0e3 +new file mode 100755 +index 0000000..6812f81 --- /dev/null +++ b/node_modules/react-native-quick-sqlite/better-trigram/VERSION @@ -0,0 +1 @@ -+0.0.2 ++0.0.3 \ No newline at end of file diff --git a/node_modules/react-native-quick-sqlite/better-trigram/bench.ts b/node_modules/react-native-quick-sqlite/better-trigram/bench.ts -new file mode 100644 -index 0000000..e050a3e +new file mode 100755 +index 0000000..bd9fe70 --- /dev/null +++ b/node_modules/react-native-quick-sqlite/better-trigram/bench.ts @@ -0,0 +1,44 @@ @@ -673,7 +678,7 @@ index 0000000..e050a3e +const text = lorem.generateParagraphs(100); +function initDatabase() { + const db = new Database(":memory:"); -+ db.loadExtension("./better-trigram.so"); ++ db.loadExtension("./dist/better-trigram.so"); + return db; +} +const db = initDatabase(); @@ -700,7 +705,7 @@ index 0000000..e050a3e + +await run(); diff --git a/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.c b/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.c -new file mode 100644 +new file mode 100755 index 0000000..010fce1 --- /dev/null +++ b/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.c @@ -857,20 +862,18 @@ index 0000000..010fce1 +} +#endif diff --git a/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.h b/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.h -new file mode 100644 -index 0000000..ac602bc +new file mode 100755 +index 0000000..ed2be79 --- /dev/null +++ b/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.h -@@ -0,0 +1,44 @@ +@@ -0,0 +1,41 @@ +#ifndef SQLITE_BETTER_TRIGRAM_H +#define SQLITE_BETTER_TRIGRAM_H + +#ifndef SQLITE_CORE +#include "sqlite3ext.h" +#else -+ +#include "sqlite3.h" -+ +#endif + +#ifndef SQLITE_PRIVATE @@ -893,7 +896,6 @@ index 0000000..ac602bc + +#ifdef SQLITE_CORE +SQLITE_PRIVATE int sqlite3Fts5BetterTrigramInit(sqlite3 *db); -+ +#else +SQLITE_BETTER_TRIGRAM_API int +sqlite3_bettertrigram_init(sqlite3 *db, char **pzErrMsg, @@ -908,11 +910,11 @@ index 0000000..ac602bc +#endif /* ifndef SQLITE_BETTER_TRIGRAM_H */ \ No newline at end of file diff --git a/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.test.ts b/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.test.ts -new file mode 100644 -index 0000000..daa9364 +new file mode 100755 +index 0000000..190a003 --- /dev/null +++ b/node_modules/react-native-quick-sqlite/better-trigram/better-trigram.test.ts -@@ -0,0 +1,796 @@ +@@ -0,0 +1,860 @@ +/* + ** 2024-10-21 + ** @@ -1674,6 +1676,70 @@ index 0000000..daa9364 + }); +}); + ++describe("cjk", () => { ++ const db = initDatabase(); ++ afterAll(() => db.close()); ++ ++ test("1.0", () => { ++ [ ++ `CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize='better_trigram remove_diacritics 1');`, ++ `INSERT INTO t1 VALUES('王明:这是什么?');`, ++ `INSERT INTO t1 VALUES('李红:这是书。');`, ++ `INSERT INTO t1 VALUES('王明:那是什么?');`, ++ `INSERT INTO t1 VALUES('李红:那是钢笔。');`, ++ `INSERT INTO t1 VALUES('王明:那是杂志吗?');`, ++ `INSERT INTO t1 VALUES('李红:不,那不是杂志。那是字典。');`, ++ `INSERT INTO t1 VALUES('some 李红:不,那不 text 是杂志。in 那是 chinese 字典。');`, ++ ].forEach((stmt) => db.query(stmt).run()); ++ }); ++ ++ sqlTest( ++ db, ++ `1.1`, ++ `SELECT highlight(t1, 0, '(', ')') as res FROM t1('王明');`, ++ [], ++ ["(王明):这是什么?", "(王明):那是什么?", "(王明):那是杂志吗?"] ++ ); ++ ++ sqlTest( ++ db, ++ `1.2`, ++ `SELECT highlight(t1, 0, '(', ')') as res FROM t1('那是');`, ++ [], ++ [ ++ "王明:(那是)什么?", ++ "李红:(那是)钢笔。", ++ "王明:(那是)杂志吗?", ++ "李红:不,那不是杂志。(那是)字典。", ++ "some 李红:不,那不 text 是杂志。in (那是) chinese 字典。", ++ ] ++ ); ++ ++ sqlTest( ++ db, ++ `1.3`, ++ `SELECT highlight(t1, 0, '(', ')') as res FROM t1('钢');`, ++ [], ++ ["李红:那是(钢)笔。"] ++ ); ++ ++ sqlTest( ++ db, ++ `1.4`, ++ `SELECT highlight(t1, 0, '(', ')') as res FROM t1('王明:');`, ++ [], ++ ["(王明:)这是什么?", "(王明:)那是什么?", "(王明:)那是杂志吗?"] ++ ); ++ ++ sqlTest( ++ db, ++ `1.4`, ++ `SELECT highlight(t1, 0, '(', ')') as res FROM t1('some 李红');`, ++ [], ++ ["(some) (李红):不,那不 text 是杂志。in 那是 chinese 字典。"] ++ ); ++}); ++ +function sqlTest( + db: Database, + version: string, @@ -2496,7 +2562,7 @@ index 0000000..2133d5d + aAscii[0] = 0; /* 0x00 is never a token character */ +} diff --git a/node_modules/react-native-quick-sqlite/better-trigram/sqlite-dist.toml b/node_modules/react-native-quick-sqlite/better-trigram/sqlite-dist.toml -new file mode 100644 +new file mode 100755 index 0000000..423e573 --- /dev/null +++ b/node_modules/react-native-quick-sqlite/better-trigram/sqlite-dist.toml @@ -2516,11 +2582,11 @@ index 0000000..423e573 + +npm = {} diff --git a/node_modules/react-native-quick-sqlite/better-trigram/tokenizer.c b/node_modules/react-native-quick-sqlite/better-trigram/tokenizer.c -new file mode 100644 -index 0000000..6a22dc1 +new file mode 100755 +index 0000000..059015a --- /dev/null +++ b/node_modules/react-native-quick-sqlite/better-trigram/tokenizer.c -@@ -0,0 +1,160 @@ +@@ -0,0 +1,195 @@ +/* +** 2024-10-21 +** @@ -2547,7 +2613,6 @@ index 0000000..6a22dc1 +** from the sqlite3 source file: +*https://github.com/sqlite/sqlite/blob/88282af521692b398b0d0cc58a8bdb220a8ff58c/ext/fts5/fts5_tokenize.c. +*/ -+//#ifndef SQLITE_CORE +static const unsigned char sqlite3Utf8Trans1[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, @@ -2597,7 +2662,31 @@ index 0000000..6a22dc1 + } \ + } \ + } -+//#endif ++ ++static const int CJK[6][2] = {{0x3400, 0x4DBF}, {0x4E00, 0x9FFF}, ++ {0xF900, 0xFAFF}, {0x20000, 0x2EBEF}, ++ {0x2F800, 0x2FA1F}, {0x30000, 0x3134F}}; ++// https://jrgraphix.net/research/unicode_blocks.php ++// https://en.wikipedia.org/wiki/CJK_Unified_Ideographs ++// 3400 — 4DBF CJK Unified Ideographs Extension A ++// 4E00 — 9FFF CJK Unified Ideographs ++// F900 — FAFF CJK Compatibility Ideographs ++// 20000 — 2A6DF 2A700-2B73F 2B740–2B81F. 2B820–2CEAF. 2CEB0–2EBEF. CJK ++// Unified Ideographs Extension B/C/D/E/F 2F800 — 2FA1F CJK ++// Compatibility Ideographs Supplement ++// 30000–3134F. CJK Unified Ideographs Extension G ++static inline int isCJK(int iCode) { ++ for (int i = 0; i < 6; i++) { ++ if (iCode < CJK[i][0]) { // smaller ++ break; ++ } ++ if (iCode <= CJK[i][1]) { // in range ++ return 1; ++ } ++ // bigger than range ++ } ++ return 0; ++} + +/* Function to optionally fold case and remove diacritics */ +static inline int customFold(int iCode, int foldCase, int removeDiacritics) { @@ -2655,7 +2744,9 @@ index 0000000..6a22dc1 + isPartial = 1; + } + -+ if (iCode == 32) { ++ int cjk = isCJK(iCode); ++ int isSpace = iCode == 32; ++ if (isSpace || cjk) { + // write words smaller than 3 characters directly to output + // but make sure we aren't at the end of a word + if (!isPartial && i && i < 3) { @@ -2664,6 +2755,16 @@ index 0000000..6a22dc1 + break; + } + ++ if (cjk) { ++ zOut = aBuf; ++ WRITE_UTF8(zOut, iCode); ++ ++ int result = ++ xToken(pCtx, 0, aBuf, zOut - aBuf, start, start + (zOut - aBuf)); ++ if (result != 0) ++ break; ++ } ++ + // reset for next word + isPartial = 0; + i = 0; @@ -109616,25 +109717,27 @@ index 9b284d2..bda794a 100644 + +#endif diff --git a/node_modules/react-native-quick-sqlite/cpp/sqliteBridge.cpp b/node_modules/react-native-quick-sqlite/cpp/sqliteBridge.cpp -index 9ffe010..748a1f6 100644 +index 9ffe010..bdbd44e 100644 --- a/node_modules/react-native-quick-sqlite/cpp/sqliteBridge.cpp +++ b/node_modules/react-native-quick-sqlite/cpp/sqliteBridge.cpp -@@ -20,6 +20,8 @@ +@@ -20,6 +20,9 @@ using namespace std; using namespace facebook; +extern "C" int sqlite3Fts5BetterTrigramInit(sqlite3 *db); ++extern "C" int sqlite3Fts5HtmlInit(sqlite3 *db); + mapThis domain is for use in illustrative examples in documents. You may use this ++ domain in literature without prior coordination or asking for permission.
++ ++This domain is for use in illustrative examples in documents. You may use this
');`, ++ ].forEach((stmt) => db.query(stmt).run()); ++ }); ++ ++ const queries = { ++ hello: 0, ++ domain: 1, ++ use: 1, ++ }; ++ let i = 1; ++ for (const query in queries) { ++ sqlTest( ++ db, ++ `1.${i++}.${query}`, ++ `SELECT count(*) as res FROM test WHERE test MATCH '${query}'`, ++ [], ++ [queries[query]], ++ ); ++ } ++}); ++ ++function sqlTest( ++ db: Database, ++ version: string, ++ query: string, ++ params: string[], ++ expected: string | number | undefined | (string | number | undefined)[], ++) { ++ test(version, () => { ++ const result = db.query(query).all(...params) as { ++ res: string | number; ++ }[]; ++ if (Array.isArray(expected)) { ++ expect(Array.isArray(expected)).toBeTrue(); ++ expect(result.length).toBe(expected.length); ++ result.forEach((result, i) => expect(result.res).toBe(expected[i]!)); ++ } else { ++ expect(result[0]?.res).toBe(expected!); ++ } ++ }); ++} ++ ++function explainQueryPlanTest( ++ db: Database, ++ version: string, ++ query: string, ++ params: string[], ++ expected: string, ++) { ++ test(version, () => { ++ const result = db.query(`EXPLAIN QUERY PLAN ${query}`).get(...params) as { ++ detail?: string; ++ }; ++ expect(result?.detail).toInclude(expected); ++ }); ++} +diff --git a/node_modules/react-native-quick-sqlite/sqlite3-fts5-html/genhtmlentity.ts b/node_modules/react-native-quick-sqlite/sqlite3-fts5-html/genhtmlentity.ts +new file mode 100755 +index 0000000..8878773 +--- /dev/null ++++ b/node_modules/react-native-quick-sqlite/sqlite3-fts5-html/genhtmlentity.ts +@@ -0,0 +1,80 @@ ++// @ts-ignore ++import * as fs from "node:fs/promises"; ++ ++interface EntityDefinition { ++ characters: string; ++} ++ ++const FILENAME = "fts5-html.c"; ++const entities: Record