Skip to content

Commit

Permalink
Merge pull request #211 from bitcoin-sv/fix/utf8ToArray
Browse files Browse the repository at this point in the history
Fix/utf8 to array
  • Loading branch information
ty-everett authored Feb 28, 2025
2 parents b717db3 + b87bd4b commit 585dd77
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 14 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. The format
## Table of Contents

- [Unreleased](#unreleased)
- [1.3.25 - 2025-02-27](#1325---2025-02-27)
- [1.3.24 - 2025-02-22](#1324---2025-02-22)
- [1.3.23 - 2025-02-21](#1323---2025-02-21)
- [1.3.22 - 2025-02-19](#1322---2025-02-19)
Expand Down Expand Up @@ -90,6 +91,14 @@ All notable changes to this project will be documented in this file. The format

---

## [1.3.25] - 2025-02-27

### Fixed

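- `utf8ToArray` now produces real UTF-8. Previously, it split each character’s 16-bit code unit into raw bytes (emitting the high byte only when it was non-zero), which gave correct output only for ASCII and failed on non-ASCII/multi-byte characters. Multi-byte characters such as emojis are now encoded correctly, and lone surrogates are replaced with U+FFFD.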

---

## [1.3.24] - 2025-02-22

### Added
Expand Down
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@bsv/sdk",
"version": "1.3.24",
"version": "1.3.25",
"type": "module",
"description": "BSV Blockchain Software Development Kit",
"main": "dist/cjs/mod.js",
Expand Down
52 changes: 51 additions & 1 deletion src/primitives/__tests/utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ describe('utils', () => {
expect(toArray('1234', 'hex')).toEqual([0x12, 0x34])
expect(toArray('1234')).toEqual([49, 50, 51, 52])
expect(toArray('1234', 'utf8')).toEqual([49, 50, 51, 52])
expect(toArray('\u1234234')).toEqual([18, 52, 50, 51, 52])
expect(toArray('\u1234', 'utf8')).toEqual([225, 136, 180])
expect(toArray('\u1234' + '234', 'utf8')).toEqual([225, 136, 180, 50, 51, 52])
expect(toArray([1, 2, 3, 4])).toEqual([1, 2, 3, 4])
})

Expand Down Expand Up @@ -156,4 +157,53 @@ describe('utils', () => {
})
})
})

// UTF-8 encoding cases for toArray (default encoding): one test per encoded
// width (1–4 bytes), a mixed-width string, and lone-surrogate replacement.
// String literals use single quotes to match the rest of this test file.
test('should return an empty array for an empty string', () => {
  expect(toArray('')).toEqual([])
})

test('should encode ASCII characters correctly', () => {
  const input = 'Hello, World!'
  const expected = [72, 101, 108, 108, 111, 44, 32, 87, 111, 114, 108, 100, 33]
  expect(toArray(input)).toEqual(expected)
})

test('should encode 2-byte characters correctly', () => {
  // "é" (U+00E9) should encode to [0xC3, 0xA9]
  expect(toArray('é')).toEqual([0xC3, 0xA9])
})

test('should encode 3-byte characters correctly', () => {
  // "€" (U+20AC) should encode to [0xE2, 0x82, 0xAC]
  expect(toArray('€')).toEqual([0xE2, 0x82, 0xAC])
})

test('should encode 4-byte characters correctly', () => {
  // "😃" (U+1F603) should encode to [0xF0, 0x9F, 0x98, 0x83]
  expect(toArray('😃')).toEqual([0xF0, 0x9F, 0x98, 0x83])
})

test('should encode mixed content correctly', () => {
  // "Hello, 😃! €" contains ASCII, an emoji, and a 3-byte character.
  const input = 'Hello, 😃! €'
  const expected = [
    // "Hello, " => ASCII bytes:
    72, 101, 108, 108, 111, 44, 32,
    // "😃" => 4-byte sequence:
    0xF0, 0x9F, 0x98, 0x83,
    // "!" => ASCII, then space:
    33, 32,
    // "€" => 3-byte sequence:
    0xE2, 0x82, 0xAC
  ]
  expect(toArray(input)).toEqual(expected)
})

test('should replace lone surrogates with the replacement character', () => {
  // U+FFFD is encoded in UTF-8 as [0xEF, 0xBF, 0xBD].
  const replacement = [0xEF, 0xBF, 0xBD]

  // An unpaired high surrogate.
  expect(toArray('\uD800')).toEqual(replacement)

  // An unpaired low surrogate.
  expect(toArray('\uDC00')).toEqual(replacement)

  // A high surrogate followed by a non-surrogate: only the surrogate is
  // replaced, the following character is encoded normally.
  expect(toArray('\uD800A')).toEqual([...replacement, 0x41])

  // A reversed pair (low then high) is two lone surrogates, not one pair.
  expect(toArray('\uDC00\uD800')).toEqual([...replacement, ...replacement])
})
})
62 changes: 52 additions & 10 deletions src/primitives/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,19 +84,61 @@ const base64ToArray = (msg: string): number[] => {
return result
}

// NOTE(review): this span is a diff hunk whose removed and added lines are
// interleaved. The arrow function below (parameter `msg`, accumulator `res`)
// is the superseded implementation: it emitted each UTF-16 code unit as raw
// bytes — the high byte only when non-zero, then the low byte — which is not
// UTF-8.
const utf8ToArray = (msg: string): number[] => {
const res: number[] = []
for (let i = 0; i < msg.length; i++) {
const c = msg.charCodeAt(i)
const hi = c >> 8
const lo = c & 0xff
if (hi !== 0) {
res.push(hi, lo)
/**
 * Encodes a string into an array of bytes representing its UTF-8 encoding.
 *
 * The string is walked one code point at a time via `codePointAt`, so a valid
 * surrogate pair is encoded as a single 4-byte sequence. Any lone (unpaired)
 * surrogate is replaced with the Unicode replacement character (U+FFFD)
 * before encoding.
 *
 * @param str - The string to encode.
 * @returns An array of numbers, each representing a byte in the UTF-8 encoded string.
 * @throws Error if `codePointAt` returns `undefined` for an index inside the
 *   loop bounds (a defensive guard; the loop keeps `i < str.length`).
 */
function utf8ToArray (str: string): number[] {
const result: number[] = []

for (let i = 0; i < str.length; i++) {
const cp = str.codePointAt(i)
if (cp === undefined) {
// Should never be out of range.
throw new Error(`Index out of range: ${i}`)
}
// Mutable copy so a lone surrogate can be swapped for U+FFFD below.
let codePoint = cp

if (codePoint > 0xFFFF) {
// Valid surrogate pair => skip the next code unit because codePointAt
// has already combined them into a single code point.
i++
} else {
// Check if codePoint is a lone (unpaired) high surrogate or low surrogate.
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
// Replace with the replacement character (U+FFFD).
codePoint = 0xFFFD
}
}

// Encode according to the UTF-8 standard
if (codePoint <= 0x7F) {
// 1 byte: 0xxxxxxx
result.push(codePoint)
} else if (codePoint <= 0x7FF) {
// 2 bytes: 110xxxxx 10xxxxxx
result.push(
0xC0 | (codePoint >> 6),
0x80 | (codePoint & 0x3F)
)
} else if (codePoint <= 0xFFFF) {
// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
result.push(
0xE0 | (codePoint >> 12),
0x80 | ((codePoint >> 6) & 0x3F),
0x80 | (codePoint & 0x3F)
)
} else {
// NOTE(review): `res.push(lo)` on the next line is a removed line from the
// superseded implementation; neither `res` nor `lo` is declared in this
// function.
res.push(lo)
// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
result.push(
0xF0 | (codePoint >> 18),
0x80 | ((codePoint >> 12) & 0x3F),
0x80 | ((codePoint >> 6) & 0x3F),
0x80 | (codePoint & 0x3F)
)
}
}
// NOTE(review): `return res` on the next line is a removed line from the
// superseded implementation; this function's accumulator is `result`.
return res

return result
}

/**
Expand Down

0 comments on commit 585dd77

Please sign in to comment.