Merge pull request #46 from treeform/surrogates

treeform · web-flow · commit d0e69bddf838 · 2022-01-14T10:20:52.000-08:00
Fix decoding of UTF-16 surrogate pairs in \uXXXX string escapes.
diff --git a/src/jsony.nim b/src/jsony.nim
@@ -129,6 +129,26 @@ proc parseHook*(s: string, i: var int, v: var SomeFloat) =
   i += chars
   v = f
 
+proc parseUnicodeEscape(s: string, i: var int): int =
+  ## Decodes the `\uXXXX` escape whose `u` is at `s[i]`; `i` ends on the last
+  ## hex digit read. A UTF-16 surrogate pair (two escapes, sent by some APIs)
+  ## becomes one code point; an unpaired high surrogate goes to `error`.
+  inc i
+  result = parseHexInt(s[i ..< i + 4])
+  i += 3
+  if (result and 0xfc00) == 0xd800:
+    inc i
+    if s[i] != '\\': error("Found an Orphan Surrogate.", i)
+    inc i
+    if s[i] != 'u': error("Found an Orphan Surrogate.", i)
+    inc i
+    let nextRune = parseHexInt(s[i ..< i + 4])
+    i += 3
+    if (nextRune and 0xfc00) == 0xdc00:
+      result = 0x10000 + (((result - 0xd800) shl 10) or (nextRune - 0xdc00))
+    else: # A high surrogate needs a low one; don't drop this escape silently.
+      error("Found an Orphan Surrogate.", i)
+
 proc parseStringSlow(s: string, i: var int, v: var string) =
   while i < s.len:
     let c = s[i]
@@ -146,10 +166,7 @@ proc parseStringSlow(s: string, i: var int, v: var string) =
       of 'r': v.add '\r'
       of 't': v.add '\t'
       of 'u':
-        inc i
-        let u = parseHexInt(s[i ..< i + 4])
-        i += 3
-        v.add(Rune(u).toUTF8())
+        v.add(Rune(parseUnicodeEscape(s, i)).toUTF8())
       else:
         v.add(c)
     else:
@@ -173,10 +190,7 @@ proc parseStringFast(s: string, i: var int, v: var string) =
       let c = s[j]
       case c
       of 'u':
-        inc j
-        let u = parseHexInt(s[j ..< j + 4])
-        j += 3
-        ll += Rune(u).toUTF8().len
+        ll += Rune(parseUnicodeEscape(s, j)).toUTF8().len
       else:
         inc ll
     else:
@@ -207,10 +221,7 @@ proc parseStringFast(s: string, i: var int, v: var string) =
         of 'r': ss.add '\r'
         of 't': ss.add '\t'
         of 'u':
-          inc i
-          let u = parseHexInt(s[i ..< i + 4])
-          i += 3
-          for c in Rune(u).toUTF8():
+          for c in Rune(parseUnicodeEscape(s, i)).toUTF8():
             ss.add(c)
         else:
           ss.add(c)
diff --git a/tests/test_strings.nim b/tests/test_strings.nim
@@ -1,4 +1,4 @@
-import jsony, json
+import json, jsony
 
 block:
   var s = """ "hello" """
@@ -12,7 +12,7 @@ block:
   var v = s.fromJson(string)
   doAssert v == "new\nline"
   doAssert v.toJson().fromJson(string) == v
-  echo v.toJson().fromJson().toJson().fromJson()
+  doAssert v.toJson().fromJson().toJson().fromJson() == newJString("new\nline")
 
 block:
   var s = """ "quote\"inside" """
@@ -31,3 +31,25 @@ block:
   var v = s.fromJson(string)
   doAssert v == "unicode: \u0020 \u0F88 \u1F21"
   doAssert v.toJson().fromJson(string) == v
+
+block:
+  # Regression test for https://github.com/treeform/jsony/issues/45
+  # 🔒 is given as an escaped surrogate pair and then again as raw UTF-8.
+  type TestObj = object
+    content: string
+
+  # jsony and std/json must decode the same content from the same input.
+  let raw = """{"content":"\uD83D\uDD12🔒"}"""
+  let parsed = raw.fromJson(TestObj)
+  let parsedStd = parseJson(raw).to(TestObj)
+  echo "jsony - ", parsed.content
+  echo "std/json - ", parsedStd.content
+  doAssert parsed.content == parsedStd.content
+
+  # Mostly escaped surrogate pairs, plus one non-surrogate escape (\u00A1).
+  let raw2 = """{"content":"\u00A1\uD835\uDC7D\uD835\uDC96\uD835\uDC86\uD835\uDC8D\uD835\uDC97\uD835\uDC86\uD835\uDC8F \uD835\uDC8F\uD835\uDC96\uD835\uDC86\uD835\uDC94\uD835\uDC95\uD835\uDC93\uD835\uDC90\uD835\uDC94 \uD835\uDC89\uD835\uDC8A\uD835\uDC8F\uD835\uDC84\uD835\uDC89\uD835\uDC82\uD835\uDC94!"}"""
+  let parsed2 = raw2.fromJson(TestObj)
+  let parsedStd2 = parseJson(raw2).to(TestObj)
+  echo "jsony - ", parsed2.content
+  echo "std/json - ", parsedStd2.content
+  doAssert parsed2.content == parsedStd2.content