Merge pull request #211 from bitcoin-sv/fix/utf8ToArray

Fix/utf8 to array
bitcoin-sv · Feb 28, 2025 · 585dd77 · 585dd77
2 parents b717db3 + b87bd4b
commit 585dd77
Show file tree

Hide file tree

Showing 5 changed files with 115 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. The format
 ## Table of Contents
 
 - [Unreleased](#unreleased)
+- [1.3.25 - 2025-02-27](#1325---2025-02-27)
 - [1.3.24 - 2025-02-22](#1324---2025-02-22)
 - [1.3.23 - 2025-02-21](#1323---2025-02-21)
 - [1.3.22 - 2025-02-19](#1322---2025-02-19)
@@ -90,6 +91,14 @@ All notable changes to this project will be documented in this file. The format
 
 ---
 
+## [1.3.25] - 2025-02-27
+
+### Fixed
+
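+- `utf8ToArray` now produces standard UTF-8. Previously, it split each character’s 16-bit code unit into two bytes (emitting the high byte only when it was non-zero), which was correct only for ASCII and failed on non-ASCII/multi-byte characters. Multi-byte characters, including emojis, are now encoded correctly, and lone surrogates are replaced with the replacement character (U+FFFD).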
+
+---
+
 ## [1.3.24] - 2025-02-22
 
 ### Added

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@bsv/sdk",
-  "version": "1.3.24",
+  "version": "1.3.25",
   "type": "module",
   "description": "BSV Blockchain Software Development Kit",
   "main": "dist/cjs/mod.js",

diff --git a/src/primitives/__tests/utils.test.ts b/src/primitives/__tests/utils.test.ts
@@ -15,7 +15,8 @@ describe('utils', () => {
     expect(toArray('1234', 'hex')).toEqual([0x12, 0x34])
     expect(toArray('1234')).toEqual([49, 50, 51, 52])
     expect(toArray('1234', 'utf8')).toEqual([49, 50, 51, 52])
-    expect(toArray('\u1234234')).toEqual([18, 52, 50, 51, 52])
+    expect(toArray('\u1234', 'utf8')).toEqual([225, 136, 180])
+    expect(toArray('\u1234' + '234', 'utf8')).toEqual([225, 136, 180, 50, 51, 52])
     expect(toArray([1, 2, 3, 4])).toEqual([1, 2, 3, 4])
   })
 
@@ -156,4 +157,53 @@ describe('utils', () => {
       })
     })
   })
+
+  test('should return an empty array for an empty string', () => {
+    expect(toArray("")).toEqual([])
+  })
+
+  test('should encode ASCII characters correctly', () => {
+    const input = "Hello, World!"
+    const expected = [72, 101, 108, 108, 111, 44, 32, 87, 111, 114, 108, 100, 33]
+    expect(toArray(input)).toEqual(expected)
+  })
+
+  test('should encode 2-byte characters correctly', () => {
+    // "é" (U+00E9) should encode to [0xC3, 0xA9]
+    expect(toArray("é")).toEqual([0xC3, 0xA9])
+  })
+
+  test('should encode 3-byte characters correctly', () => {
+    // "€" (U+20AC) should encode to [0xE2, 0x82, 0xAC]
+    expect(toArray("€")).toEqual([0xE2, 0x82, 0xAC])
+  })
+
+  test('should encode 4-byte characters correctly', () => {
+    // "😃" (U+1F603) should encode to [0xF0, 0x9F, 0x98, 0x83]
+    expect(toArray("😃")).toEqual([0xF0, 0x9F, 0x98, 0x83])
+  })
+
+  test('should encode mixed content correctly', () => {
+    // "Hello, 😃! €" contains ASCII, an emoji, and a 3-byte character.
+    const input = "Hello, 😃! €"
+    const expected = [
+      // "Hello, " => ASCII bytes:
+      72, 101, 108, 108, 111, 44, 32,
+      // "😃" => 4-byte sequence:
+      0xF0, 0x9F, 0x98, 0x83,
+      // "!" => ASCII, then space:
+      33, 32,
+      // "€" => 3-byte sequence:
+      0xE2, 0x82, 0xAC
+    ]
+    expect(toArray(input)).toEqual(expected)
+  })
+
+  test('should replace lone surrogates with the replacement character', () => {
+    // An unpaired high surrogate "\uD800" should be replaced with U+FFFD,
+    // which is encoded in UTF-8 as [0xEF, 0xBF, 0xBD]
+    const input = "\uD800"
+    const expected = [0xEF, 0xBF, 0xBD]
+    expect(toArray(input)).toEqual(expected)
+  })
 })
diff --git a/src/primitives/utils.ts b/src/primitives/utils.ts
@@ -84,19 +84,61 @@ const base64ToArray = (msg: string): number[] => {
   return result
 }
 
-const utf8ToArray = (msg: string): number[] => {
-  const res: number[] = []
-  for (let i = 0; i < msg.length; i++) {
-    const c = msg.charCodeAt(i)
-    const hi = c >> 8
-    const lo = c & 0xff
-    if (hi !== 0) {
-      res.push(hi, lo)
+/**
+ * Encodes a string into an array of bytes representing its UTF-8 encoding.
+ * Any lone surrogates are replaced with the Unicode replacement character (U+FFFD).
+ *
+ * @param str - The string to encode.
+ * @returns An array of numbers, each representing a byte in the UTF-8 encoded string.
+ */
+function utf8ToArray (str: string): number[] {
+  const result: number[] = []
+
+  for (let i = 0; i < str.length; i++) {
+    const cp = str.codePointAt(i)
+    if (cp === undefined) {
+      // Should never be out of range.
+      throw new Error(`Index out of range: ${i}`)
+    }
+    let codePoint = cp
+
+    if (codePoint > 0xFFFF) {
+      // Valid surrogate pair => skip the next code unit because codePointAt
+      // has already combined them into a single code point.
+      i++
+    } else {
+      // Check if codePoint is a lone (unpaired) high surrogate or low surrogate.
+      if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
+        // Replace with the replacement character (U+FFFD).
+        codePoint = 0xFFFD
+      }
+    }
+
+    // Encode according to the UTF-8 standard
+    if (codePoint <= 0x7F) {
+      result.push(codePoint)
+    } else if (codePoint <= 0x7FF) {
+      result.push(
+        0xC0 | (codePoint >> 6),
+        0x80 | (codePoint & 0x3F)
+      )
+    } else if (codePoint <= 0xFFFF) {
+      result.push(
+        0xE0 | (codePoint >> 12),
+        0x80 | ((codePoint >> 6) & 0x3F),
+        0x80 | (codePoint & 0x3F)
+      )
     } else {
-      res.push(lo)
+      result.push(
+        0xF0 | (codePoint >> 18),
+        0x80 | ((codePoint >> 12) & 0x3F),
+        0x80 | ((codePoint >> 6) & 0x3F),
+        0x80 | (codePoint & 0x3F)
+      )
     }
   }
-  return res
+
+  return result
 }
 
 /**