Skip to content

Commit d0e69bd

Browse files
authored
Merge pull request #46 from treeform/surrogates
Fix for utf16 surrogates.
2 parents 41a9fe0 + 08f719d commit d0e69bd

File tree

2 files changed

+47
-14
lines changed

2 files changed

+47
-14
lines changed

src/jsony.nim

+23-12
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,26 @@ proc parseHook*(s: string, i: var int, v: var SomeFloat) =
129129
i += chars
130130
v = f
131131

132+
proc parseUnicodeEscape(s: string, i: var int): int =
  ## Parses the four hex digits of a JSON `\uXXXX` escape and returns the
  ## resulting code point.
  ##
  ## `i` is expected to index the `u` of the escape on entry. On return it
  ## indexes the last hex digit that was consumed.
  ##
  ## When the first escape is a UTF-16 high surrogate (0xD800..0xDBFF) it must
  ## be followed immediately by a second `\uXXXX` escape holding a low
  ## surrogate (0xDC00..0xDFFF); the two are combined into a single code point
  ## above 0xFFFF. Anything else after a high surrogate is reported through
  ## `error` as an orphan surrogate.
  inc i
  # Report truncated input through `error` rather than an out-of-range slice.
  if i + 4 > s.len:
    error("Expected 4 hex digits in unicode escape.", i)
  result = parseHexInt(s[i ..< i + 4])
  i += 3
  # Deal with UTF-16 surrogates. Most of the time strings are encoded as UTF-8,
  # but some APIs reply with UTF-16 surrogate pairs, which need to be dealt
  # with.
  if (result and 0xfc00) == 0xd800:
    inc i
    if i >= s.len or s[i] != '\\':
      error("Found an Orphan Surrogate.", i)
    inc i
    if i >= s.len or s[i] != 'u':
      error("Found an Orphan Surrogate.", i)
    inc i
    if i + 4 > s.len:
      error("Found an Orphan Surrogate.", i)
    let nextRune = parseHexInt(s[i ..< i + 4])
    i += 3
    # A high surrogate is only meaningful when paired with a low surrogate.
    # Previously a non-matching second escape was consumed and silently
    # dropped, leaving a lone high surrogate as the result.
    if (nextRune and 0xfc00) != 0xdc00:
      error("Found an Orphan Surrogate.", i)
    result = 0x10000 + (((result - 0xd800) shl 10) or (nextRune - 0xdc00))
151+
132152
proc parseStringSlow(s: string, i: var int, v: var string) =
133153
while i < s.len:
134154
let c = s[i]
@@ -146,10 +166,7 @@ proc parseStringSlow(s: string, i: var int, v: var string) =
146166
of 'r': v.add '\r'
147167
of 't': v.add '\t'
148168
of 'u':
149-
inc i
150-
let u = parseHexInt(s[i ..< i + 4])
151-
i += 3
152-
v.add(Rune(u).toUTF8())
169+
v.add(Rune(parseUnicodeEscape(s, i)).toUTF8())
153170
else:
154171
v.add(c)
155172
else:
@@ -173,10 +190,7 @@ proc parseStringFast(s: string, i: var int, v: var string) =
173190
let c = s[j]
174191
case c
175192
of 'u':
176-
inc j
177-
let u = parseHexInt(s[j ..< j + 4])
178-
j += 3
179-
ll += Rune(u).toUTF8().len
193+
ll += Rune(parseUnicodeEscape(s, j)).toUTF8().len
180194
else:
181195
inc ll
182196
else:
@@ -207,10 +221,7 @@ proc parseStringFast(s: string, i: var int, v: var string) =
207221
of 'r': ss.add '\r'
208222
of 't': ss.add '\t'
209223
of 'u':
210-
inc i
211-
let u = parseHexInt(s[i ..< i + 4])
212-
i += 3
213-
for c in Rune(u).toUTF8():
224+
for c in Rune(parseUnicodeEscape(s, i)).toUTF8():
214225
ss.add(c)
215226
else:
216227
ss.add(c)

tests/test_strings.nim

+24-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import jsony, json
1+
import json, jsony
22

33
block:
44
var s = """ "hello" """
@@ -12,7 +12,7 @@ block:
1212
var v = s.fromJson(string)
1313
doAssert v == "new\nline"
1414
doAssert v.toJson().fromJson(string) == v
15-
echo v.toJson().fromJson().toJson().fromJson()
15+
doAssert v.toJson().fromJson().toJson().fromJson() == newJString("new\nline")
1616

1717
block:
1818
var s = """ "quote\"inside" """
@@ -31,3 +31,25 @@ block:
3131
var v = s.fromJson(string)
3232
doAssert v == "unicode: \u0020 \u0F88 \u1F21"
3333
doAssert v.toJson().fromJson(string) == v
34+
35+
block:
  # https://github.com/treeform/jsony/issues/45
  # Strings containing UTF-16 surrogate pair escapes must decode to the same
  # text with jsony as with std/json. The first sample holds the 🔒 emoji
  # encoded both as normal UTF-8 and as a surrogate pair; the second is a
  # longer run of surrogate pairs.
  type
    TestObj = object
      content: string

  let samples = [
    """{"content":"\uD83D\uDD12🔒"}""",
    """{"content":"\u00A1\uD835\uDC7D\uD835\uDC96\uD835\uDC86\uD835\uDC8D\uD835\uDC97\uD835\uDC86\uD835\uDC8F \uD835\uDC8F\uD835\uDC96\uD835\uDC86\uD835\uDC94\uD835\uDC95\uD835\uDC93\uD835\uDC90\uD835\uDC94 \uD835\uDC89\uD835\uDC8A\uD835\uDC8F\uD835\uDC84\uD835\uDC89\uD835\uDC82\uD835\uDC94!"}"""
  ]

  for raw in samples:
    let
      viaJsony = raw.fromJson(TestObj)
      viaStd = parseJson(raw).to(TestObj)
    echo "jsony - ", viaJsony.content
    echo "std/json - ", viaStd.content
    doAssert viaJsony.content == viaStd.content

0 commit comments

Comments
 (0)