From 91dad15bf68fca165761e56c56bb52010435f4b5 Mon Sep 17 00:00:00 2001 From: guowei zhang Date: Mon, 1 Mar 2021 23:37:15 +0800 Subject: [PATCH 1/7] support unicode --- .gitignore | 2 +- build.sbt | 2 +- .../io/github/shopee/idata/sjson/JSONUtil.scala | 4 ++++ .../io/github/shopee/idata/sjson/ParseTest.scala | 12 ++++++++++++ .../github/shopee/idata/sjson/TokenParserTest.scala | 4 +++- 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index ab4cffb..bdb34bd 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,4 @@ lib_managed/ src_managed/ project/boot/ project/plugins/project/ - +.idea/ diff --git a/build.sbt b/build.sbt index 0fe7ffa..e720a0d 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "sjson" organization := "io.github.lock-free" -version := "0.2.0" +version := "0.2.2" scalaVersion := "2.12.4" useGpg := true diff --git a/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala b/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala index 4bb3437..ea8e73c 100644 --- a/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala +++ b/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala @@ -126,8 +126,12 @@ object JSONUtil { case 'n' => '\n' case 'f' => '\f' case 'r' => '\r' + case 'u' => 'u' case _ => next } + if(newChar == 'u') { + txtBuilder.append('\\') + } txtBuilder.append(newChar) i += 2 } else { diff --git a/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala b/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala index b036850..603134f 100644 --- a/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala +++ b/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala @@ -12,6 +12,18 @@ class ParseTest extends org.scalatest.FunSuite { assert(JSON.parse(JSON.stringify(v)) == v) } + test("parse: map unicode") { + val input = "[{\"\\u\":\"\\u\"}]" + val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\\u") + assert(output == "\\u") + } + + test("parse: map unicode2") { + val input = "[{\"\\uD83\":\"\\uD83\"}]" + val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\\uD83") + assert(output == "\\uD83") + } + test("parse: true|false|null") { List[Any](true, false, null).map(testParseSym) assert(JSON.parse(JSON.stringify(None)) == null) diff --git a/src/test/scala/io/github/shopee/idata/sjson/TokenParserTest.scala b/src/test/scala/io/github/shopee/idata/sjson/TokenParserTest.scala index e0abf05..3311083 100644 --- a/src/test/scala/io/github/shopee/idata/sjson/TokenParserTest.scala +++ b/src/test/scala/io/github/shopee/idata/sjson/TokenParserTest.scala @@ -29,7 +29,9 @@ class TokenParseTest extends org.scalatest.FunSuite { } test("toTokens: single string") { - List(s"""""""", s""""hello, world"""", s""""123"""", s""""\\""""", s""""\n"""", s""""\t"""", s""""\\\\"""").map((txt) => { + List(s"""""""", s""""hello, world"""", s""""123"""", s""""\\""""", + s""""\\n"""", s""""\t"""", s""""\\\\"""", s""""\\r"""", s""""\\b"""", + s""""\\f"""", s""""\\/"""").map((txt) => { testToToken(txt, List(JSONToken(JSONToken.STRING, txt))) }) } From c2a5aeb94b793302bdca8a99b3dde8e3d7f305a4 Mon Sep 17 00:00:00 2001 From: guowei zhang Date: Wed, 3 Mar 2021 11:43:38 +0800 Subject: [PATCH 2/7] translate unicode --- .../github/shopee/idata/sjson/JSONUtil.scala | 27 ++++++++++++++++--- .../github/shopee/idata/sjson/ParseTest.scala | 20 ++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala b/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala index ea8e73c..ecebab9 100644 --- a/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala +++ b/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala @@ -110,6 +110,16 @@ object JSONUtil { s""""${txtBuilder.toString()}"""" } + def isUnicode(txt: String, pos: Int): Boolean = { + val txtLen = txt.length - 1 + if (pos + 6 > txtLen) return false + //start iterating from /uD835 first hexadecimal character ('D' in this case) + val hexString = txt.substring(pos + 2, pos + 6) + val hexStringRegex = """[0-9a-fA-F]{4}""" + if(hexString.matches(hexStringRegex)) return true + false + } + def unescapeString(txt: String): String = { val txtBuilder = new StringBuilder // use txt builder to collect text @@ -129,11 +139,20 @@ object JSONUtil { case 'u' => 'u' case _ => next } - if(newChar == 'u') { - txtBuilder.append('\\') + if(newChar == 'u' && isUnicode(txt, i)) { + val unicodeString = s"\\u${txt.substring(i + 2, i + 6)}" + val unicodeChar = Integer.parseInt(unicodeString.drop(2), 16).toChar + txtBuilder.append(unicodeChar) + i += 6 + } + else if (newChar == 'u') { + txtBuilder.append("\\u") + i += 2 + } + else { + txtBuilder.append(newChar) + i += 2 } - txtBuilder.append(newChar) - i += 2 } else { txtBuilder.append(ch) i += 1 diff --git a/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala b/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala index 603134f..c079d7c 100644 --- a/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala +++ b/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala @@ -24,6 +24,26 @@ class ParseTest extends org.scalatest.FunSuite { assert(output == "\\uD83") } + test("parse: map unicode3") { + val input = "[{\"\\uD835\\uDC07\":\"\\uD835\\uDC07\"}]" + //𝐇 + val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\uD835\uDC07") + assert(output == "\uD835\uDC07") + } + + test("parse: map unicode4") { + val input = "[{\"γ‚·γƒͺをラむゼーション\":\"γ‚·γƒͺをラむゼーション\"}]" + val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("γ‚·γƒͺをラむゼーション") + assert(output == "γ‚·γƒͺをラむゼーション") + } + + test("parse: map unicode5") { + val input = "[{\"\\uD835\\uDC07\":\"\\uD835\\uDC07\"}]" + //𝐇 + val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\uD835\uDC07") + assert(output == "\uD835\uDC07") + } + test("parse: true|false|null") { List[Any](true, false, null).map(testParseSym) assert(JSON.parse(JSON.stringify(None)) == null) From f720cf92629a3e866883e275daf58aa62b2eb994 Mon Sep 17 00:00:00 2001 From: guowei zhang Date: Wed, 3 Mar 2021 17:24:15 +0800 Subject: [PATCH 3/7] update version to 0.2.3 --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index e720a0d..53e0840 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "sjson" organization := "io.github.lock-free" -version := "0.2.2" +version := "0.2.3" scalaVersion := "2.12.4" useGpg := true From d508c724061a84b5ab3d93f8dec8fa61df4c93ef Mon Sep 17 00:00:00 2001 From: guowei zhang Date: Tue, 16 Mar 2021 14:41:41 +0800 Subject: [PATCH 4/7] optimize unicode decoding --- build.sbt | 4 ++- .../github/shopee/idata/sjson/JSONUtil.scala | 29 ++++++++++++----- .../github/shopee/idata/sjson/JSONUtil.scala | 32 +++++++++++++++++++ .../github/shopee/idata/sjson/ParseTest.scala | 18 +++++------ 4 files changed, 65 insertions(+), 18 deletions(-) diff --git a/build.sbt b/build.sbt index 53e0840..7789060 100644 --- a/build.sbt +++ b/build.sbt @@ -11,5 +11,7 @@ publishTo := sonatypePublishTo.value libraryDependencies ++= Seq( "org.scala-lang" % "scala-reflect" % scalaVersion.value, // test suite - "org.scalatest" %% "scalatest" % "3.0.1" % Test + "org.scalatest" %% "scalatest" % "3.0.1" % Test, + //performance test suite + "com.storm-enroute" %% "scalameter" % "0.18" ) diff --git a/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala b/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala index ecebab9..2928513 100644 --- a/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala +++ b/src/main/scala/io/github/shopee/idata/sjson/JSONUtil.scala @@ -110,18 +110,31 @@ object JSONUtil { s""""${txtBuilder.toString()}"""" } - def isUnicode(txt: String, pos: Int): Boolean = { - val txtLen = txt.length - 1 - if (pos + 6 > txtLen) return false - //start iterating from /uD835 first hexadecimal character ('D' in this case) - val hexString = txt.substring(pos + 2, pos + 6) - val hexStringRegex = """[0-9a-fA-F]{4}""" - if(hexString.matches(hexStringRegex)) return true + //returns a boolean array indicating where the start of a unicode substring is + //e.g. "\"\\uD835\" returns [False, True, False, False, False, False, False] + def uniCodeArrayBuilder(txt: String): Array[Boolean] = { + val unicodeArray = new Array[Boolean](txt.length) + val (allStringHexaDecimal, startPos) = (4, 3) + var (hexadecimalCount, startPtr) = (0, 0) + for (endPtr ← startPos to unicodeArray.length - 1){ + startPtr = endPtr - 4 + if(txt.charAt(endPtr).isDigit || isHexAlphabet(txt.charAt(endPtr))) hexadecimalCount = hexadecimalCount + 1 + //startPtr starts from first possible hexadecimal character (e.g. Ds in "\uD835) that needs to be removed + //in a sliding window + if(startPtr >= 3 && (txt.charAt(startPtr).isDigit || isHexAlphabet(txt.charAt(startPtr)))) hexadecimalCount = hexadecimalCount - 1 + if(hexadecimalCount == allStringHexaDecimal) unicodeArray.update(startPtr - 1, true) + } + unicodeArray + } + + def isHexAlphabet(ch: Character): Boolean = { + if(ch >= 'A' && ch <= 'F') return true false } def unescapeString(txt: String): String = { val txtBuilder = new StringBuilder // use txt builder to collect text + val uniCodeArray = uniCodeArrayBuilder(txt) var i = 1 var len = txt.length - 1 @@ -139,7 +152,7 @@ object JSONUtil { case 'u' => 'u' case _ => next } - if(newChar == 'u' && isUnicode(txt, i)) { + if(newChar == 'u' && uniCodeArray(i)) { val unicodeString = s"\\u${txt.substring(i + 2, i + 6)}" val unicodeChar = Integer.parseInt(unicodeString.drop(2), 16).toChar txtBuilder.append(unicodeChar) diff --git a/src/test/scala/io/github/shopee/idata/sjson/JSONUtil.scala b/src/test/scala/io/github/shopee/idata/sjson/JSONUtil.scala index 937cf9e..fbc082c 100644 --- a/src/test/scala/io/github/shopee/idata/sjson/JSONUtil.scala +++ b/src/test/scala/io/github/shopee/idata/sjson/JSONUtil.scala @@ -1,5 +1,7 @@ package io.github.free.lock.sjson +import org.scalatest.Matchers.{convertToAnyShouldWrapper, equal} + class JSONUtilTest extends org.scalatest.FunSuite { test("unescapeString") { assert(JSONUtil.unescapeString(JSONUtil.escapeString("1234")) == "1234") @@ -10,4 +12,34 @@ class JSONUtilTest extends org.scalatest.FunSuite { assert(JSONUtil.unescapeString(JSONUtil.escapeString("12\f34")) == "12\f34") assert(JSONUtil.unescapeString(JSONUtil.escapeString("12\\34")) == "12\\34") } + + test("unicodeArrayBuilderSimple"){ + val currString = "\"\\uD835\\uD83\"" + val unicodeArray = new Array[Boolean](currString.length) + unicodeArray.update(1, true) + JSONUtil.uniCodeArrayBuilder(currString) should equal (unicodeArray) + } + + test("unicodeArrayBuilderWithInvalidUniCodeInBetween2"){ + val currString = "\"\\uD835\\uD835\\uD83\\uDC07\"" + val unicodeArray = new Array[Boolean](currString.length) + unicodeArray.update(1, true) + unicodeArray.update(7, true) + unicodeArray.update(18, true) + JSONUtil.uniCodeArrayBuilder(currString) should equal (unicodeArray) + } + + test("unicodeArrayBuilderWithInvalidUniCodeInBetween"){ + val currString = "\"\\uD835\\uD83\\uD835\"" + val unicodeArray = new Array[Boolean](currString.length) + unicodeArray.update(1, true) + unicodeArray.update(12, true) + JSONUtil.uniCodeArrayBuilder(currString) should equal (unicodeArray) + } + + test("unicodeArrayNoUnicode"){ + val currString = "\"\\uD83za\\uD83zxd\"" + val unicodeArray = new Array[Boolean](currString.length) + JSONUtil.uniCodeArrayBuilder(currString) should equal (unicodeArray) + } } diff --git a/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala b/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala index c079d7c..ca5b08c 100644 --- a/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala +++ b/src/test/scala/io/github/shopee/idata/sjson/ParseTest.scala @@ -19,16 +19,16 @@ class ParseTest extends org.scalatest.FunSuite { } test("parse: map unicode2") { - val input = "[{\"\\uD83\":\"\\uD83\"}]" - val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\\uD83") - assert(output == "\\uD83") + val input = "[{\"\\uD835\":\"\\uD835\"}]" + val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\uD835") + assert(output == "\uD835") } test("parse: map unicode3") { - val input = "[{\"\\uD835\\uDC07\":\"\\uD835\\uDC07\"}]" - //𝐇 - val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\uD835\uDC07") - assert(output == "\uD835\uDC07") + //actual string is π‡π¨π§ππš + val input = "[{\"\\uD835\\uDC07\\uD835\\uDC28\\uD835\\uDC27\\uD835\\uDC1D\\uD835\\uDC1A\":\"\\uD835\\uDC07\\uD835\\uDC28\\uD835\\uDC27\\uD835\\uDC1D\\uD835\\uDC1A\"}]" + val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\uD835\uDC07\uD835\uDC28\uD835\uDC27\uD835\uDC1D\uD835\uDC1A") + assert(output == "\uD835\uDC07\uD835\uDC28\uD835\uDC27\uD835\uDC1D\uD835\uDC1A") } test("parse: map unicode4") { @@ -38,9 +38,9 @@ class ParseTest extends org.scalatest.FunSuite { } test("parse: map unicode5") { - val input = "[{\"\\uD835\\uDC07\":\"\\uD835\\uDC07\"}]" + val input = "[{\"\\uDC\\uD835\\uDC07\\uDC\":\"\\uD835\\uDC07\"}]" //𝐇 - val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\uD835\uDC07") + val output = JSON.parse(input).asInstanceOf[List[Map[String, String]]](0)("\\uDC\uD835\uDC07\\uDC") assert(output == "\uD835\uDC07") } From 4b18f9c2b8e1fcf95a46148effbe6678828f3ea1 Mon Sep 17 00:00:00 2001 From: guowei zhang Date: Tue, 16 Mar 2021 15:30:04 +0800 Subject: [PATCH 5/7] update version to 0.2.4 --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 7789060..06cd574 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "sjson" organization := "io.github.lock-free" -version := "0.2.3" +version := "0.2.4" scalaVersion := "2.12.4" useGpg := true From 91ad26a837dcff71d78ccfaa0d0b826a7121bca6 Mon Sep 17 00:00:00 2001 From: Tong Chen Date: Thu, 20 Jan 2022 14:59:01 +0800 Subject: [PATCH 6/7] :art: change organization --- build.sbt | 2 +- project/plugins.sbt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 06cd574..362fb0c 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "sjson" organization := "io.github.lock-free" -version := "0.2.4" +version := "0.2.5" scalaVersion := "2.12.4" useGpg := true diff --git a/project/plugins.sbt b/project/plugins.sbt index 883e76d..0aa459a 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -2,4 +2,4 @@ addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.3") addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3") -addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.0") +addSbtPlugin("com.github.sbt" % "sbt-pgp" % "1.1.0") From d5bb522cef82d278b9c2a676bc64ea399792c7eb Mon Sep 17 00:00:00 2001 From: Tong Chen Date: Thu, 20 Jan 2022 15:40:27 +0800 Subject: [PATCH 7/7] :art: version to 2.1.2 --- project/plugins.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/plugins.sbt b/project/plugins.sbt index 0aa459a..574ba59 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -2,4 +2,4 @@ addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.3") addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3") -addSbtPlugin("com.github.sbt" % "sbt-pgp" % "1.1.0") +addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2")