From ed9af56139ad6a066910483104bae165cef53d16 Mon Sep 17 00:00:00 2001
From: He-Pin
Date: Thu, 30 Apr 2026 13:02:22 +0800
Subject: [PATCH] perf: chunk long string byte escaping

Motivation:
Split the JMH-positive long-string rendering piece out of #776 without carrying over the broader Scala Native render-pipeline experiment.

Modification:
- Add CharSWAR.findFirstEscapeChar for byte arrays on JVM, JS, and Native.
- Keep the existing UTF-8 byte array for long strings, but locate escape bytes and copy clean chunks with System.arraycopy.
- Escape only the matching bytes inline.
- Precompute the exact escaped output length before writing dirty strings so ByteBuilder does not grow repeatedly.

Result:
This keeps the change JDK17/JIT/GC friendly: straight byte-array loops, no internal JDK APIs, no extra temporary arrays beyond the existing UTF-8 encoding, and no regression on clean long strings.
---
Reviewer notes (commentary only; text between the "---" line and the first
"diff --git" line is not part of the commit):

- findFirstEscapeChar(arr, from, to) returns the index of the first byte in
  [from, to) that is < 32, '"' or '\\', or -1 when there is none. Bytes are
  masked with 0xff before comparison, so bytes >= 0x80 never match "< 32".
- The JVM and Native versions read 8 bytes per step while i < to - 7, re-check
  the 8 bytes one by one whenever swarHasMatch(word) is true, and finish the
  remaining bytes with a per-byte loop. The JS version is a per-byte loop only.
  The Native version additionally delegates ranges shorter than 8 bytes to
  findFirstEscapeCharScalar.
- visitLongString keeps the existing bulk-copy path when no escape byte is
  found. Otherwise it reserves escapedStringLength(...) bytes up front, copies
  each clean run with System.arraycopy and emits one escape per matching byte.
- escapedStringLength starts from bLen + 2 (the two quotes) and adds, per
  escaped byte, 1 for '"', '\\', \b, \f, \n, \r, \t and 5 for any other byte
  (written as \u00XX using BaseByteRenderer.HEX_CHARS).
- NOTE(review): line breaks and leading whitespace in this copy of the patch
  were restored by convention (2-space indentation assumed, including the Java
  file); confirm against the original commit before applying it.

 sjsonnet/src-js/sjsonnet/CharSWAR.scala      |  10 ++
 sjsonnet/src-jvm/sjsonnet/CharSWAR.java      |  25 +++++
 sjsonnet/src-native/sjsonnet/CharSWAR.scala  |  36 +++++++
 sjsonnet/src/sjsonnet/BaseByteRenderer.scala | 100 +++++++++++++++++--
 4 files changed, 160 insertions(+), 11 deletions(-)

diff --git a/sjsonnet/src-js/sjsonnet/CharSWAR.scala b/sjsonnet/src-js/sjsonnet/CharSWAR.scala
index bcdb85e7..6e64ce82 100644
--- a/sjsonnet/src-js/sjsonnet/CharSWAR.scala
+++ b/sjsonnet/src-js/sjsonnet/CharSWAR.scala
@@ -33,4 +33,14 @@ object CharSWAR {
     }
     false
   }
+
+  def findFirstEscapeChar(arr: Array[Byte], from: Int, to: Int): Int = {
+    var i = from
+    while (i < to) {
+      val b = arr(i) & 0xff
+      if (b < 32 || b == '"' || b == '\\') return i
+      i += 1
+    }
+    -1
+  }
 }
diff --git a/sjsonnet/src-jvm/sjsonnet/CharSWAR.java b/sjsonnet/src-jvm/sjsonnet/CharSWAR.java
index 46bc7d11..97716ca0 100644
--- a/sjsonnet/src-jvm/sjsonnet/CharSWAR.java
+++ b/sjsonnet/src-jvm/sjsonnet/CharSWAR.java
@@ -90,6 +90,31 @@ static boolean hasEscapeChar(char[] arr, int from, int to) {
     return false;
   }
 
+  /**
+   * Find the first byte in {@code arr[from..to)} that needs JSON string escaping, or {@code -1}
+   * when the range is clean.
+   */
+  static int findFirstEscapeChar(byte[] arr, int from, int to) {
+    int i = from;
+    int limit = to - 7;
+    while (i < limit) {
+      long word = (long) LONG_VIEW.get(arr, i);
+      if (swarHasMatch(word)) {
+        for (int j = i; j < i + 8; j++) {
+          int b = arr[j] & 0xFF;
+          if (b < 32 || b == '"' || b == '\\') return j;
+        }
+      }
+      i += 8;
+    }
+    while (i < to) {
+      int b = arr[i] & 0xFF;
+      if (b < 32 || b == '"' || b == '\\') return i;
+      i++;
+    }
+    return -1;
+  }
+
   private static boolean hasEscapeCharSWAR(byte[] arr, int from, int to) {
     int i = from;
     int limit = to - 7; // 8 bytes per VarHandle.get
diff --git a/sjsonnet/src-native/sjsonnet/CharSWAR.scala b/sjsonnet/src-native/sjsonnet/CharSWAR.scala
index 5331c012..abe6afd3 100644
--- a/sjsonnet/src-native/sjsonnet/CharSWAR.scala
+++ b/sjsonnet/src-native/sjsonnet/CharSWAR.scala
@@ -89,6 +89,32 @@ object CharSWAR {
     false
   }
 
+  def findFirstEscapeChar(arr: Array[Byte], from: Int, to: Int): Int = {
+    val len = to - from
+    if (len < 8) return findFirstEscapeCharScalar(arr, from, to)
+    val barr = arr.asInstanceOf[ByteArray]
+    var i = from
+    val limit = to - 7
+    while (i < limit) {
+      val word = Intrinsics.loadLong(barr.atRawUnsafe(i))
+      if (swarHasMatch(word)) {
+        var j = i
+        while (j < i + 8) {
+          val b = arr(j) & 0xff
+          if (b < 32 || b == '"' || b == '\\') return j
+          j += 1
+        }
+      }
+      i += 8
+    }
+    while (i < to) {
+      val b = arr(i) & 0xff
+      if (b < 32 || b == '"' || b == '\\') return i
+      i += 1
+    }
+    -1
+  }
+
   @inline private def hasEscapeCharScalar(s: String, len: Int): Boolean = {
     var i = 0
     while (i < len) {
@@ -108,4 +134,14 @@ object CharSWAR {
     }
     false
   }
+
+  @inline private def findFirstEscapeCharScalar(arr: Array[Byte], from: Int, to: Int): Int = {
+    var i = from
+    while (i < to) {
+      val b = arr(i) & 0xff
+      if (b < 32 || b == '"' || b == '\\') return i
+      i += 1
+    }
+    -1
+  }
 }
diff --git a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala
index 95a67aef..711c5c50 100644
--- a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala
+++ b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala
@@ -307,13 +307,14 @@ class BaseByteRenderer[T <: java.io.OutputStream](
   }
 
   /**
-   * SWAR-accelerated path for long strings. Converts to UTF-8 bytes once, scans with SWAR, and
-   * bulk-copies if clean. The getBytes allocation is amortized by avoiding per-char processing.
+   * SWAR-accelerated path for long strings. Converts to UTF-8 bytes once, then bulk-copies clean
+   * chunks and escapes only the bytes that require it.
    */
   private def visitLongString(str: String): Unit = {
     val bytes = str.getBytes(java.nio.charset.StandardCharsets.UTF_8)
-    if (!CharSWAR.hasEscapeChar(bytes, 0, bytes.length)) {
-      val bLen = bytes.length
+    val bLen = bytes.length
+    val firstEscape = CharSWAR.findFirstEscapeChar(bytes, 0, bLen)
+    if (firstEscape < 0) {
       elemBuilder.ensureLength(bLen + 2)
       val arr = elemBuilder.arr
       val pos = elemBuilder.length
@@ -322,13 +323,87 @@ class BaseByteRenderer[T <: java.io.OutputStream](
       arr(pos + 1 + bLen) = '"'.toByte
       elemBuilder.length = pos + bLen + 2
     } else {
-      upickle.core.RenderUtils.escapeByte(
-        unicodeCharBuilder,
-        elemBuilder,
-        str,
-        escapeUnicode = false,
-        wrapQuotes = true
-      )
+      val escapedLen = escapedStringLength(bytes, bLen, firstEscape)
+      elemBuilder.ensureLength(escapedLen)
+      elemBuilder.appendUnsafeC('"')
+      var from = 0
+      var escPos = firstEscape
+      while (escPos >= 0) {
+        if (escPos > from) {
+          val chunkLen = escPos - from
+          elemBuilder.ensureLength(chunkLen)
+          val arr = elemBuilder.arr
+          val pos = elemBuilder.length
+          System.arraycopy(bytes, from, arr, pos, chunkLen)
+          elemBuilder.length = pos + chunkLen
+        }
+        escapeByteInline(bytes(escPos) & 0xff)
+        from = escPos + 1
+        escPos = if (from < bLen) CharSWAR.findFirstEscapeChar(bytes, from, bLen) else -1
+      }
+      if (from < bLen) {
+        val tailLen = bLen - from
+        elemBuilder.ensureLength(tailLen)
+        val arr = elemBuilder.arr
+        val pos = elemBuilder.length
+        System.arraycopy(bytes, from, arr, pos, tailLen)
+        elemBuilder.length = pos + tailLen
+      }
+      elemBuilder.ensureLength(1)
+      elemBuilder.appendUnsafeC('"')
+    }
+  }
+
+  private def escapedStringLength(bytes: Array[Byte], bLen: Int, firstEscape: Int): Int = {
+    var len = bLen + 2
+    var from = firstEscape
+    var escPos = firstEscape
+    while (escPos >= 0) {
+      len += escapeExtraLength(bytes(escPos) & 0xff)
+      from = escPos + 1
+      escPos = if (from < bLen) CharSWAR.findFirstEscapeChar(bytes, from, bLen) else -1
+    }
+    len
+  }
+
+  @inline private def escapeExtraLength(b: Int): Int =
+    (b: @scala.annotation.switch) match {
+      case '"' | '\\' | '\b' | '\f' | '\n' | '\r' | '\t' => 1
+      case _ => 5
+    }
+
+  /** Inline JSON escape for one byte that is known to require escaping. */
+  private def escapeByteInline(b: Int): Unit = {
+    elemBuilder.ensureLength(6)
+    (b: @scala.annotation.switch) match {
+      case '"' =>
+        elemBuilder.appendUnsafeC('\\')
+        elemBuilder.appendUnsafeC('"')
+      case '\\' =>
+        elemBuilder.appendUnsafeC('\\')
+        elemBuilder.appendUnsafeC('\\')
+      case '\b' =>
+        elemBuilder.appendUnsafeC('\\')
+        elemBuilder.appendUnsafeC('b')
+      case '\f' =>
+        elemBuilder.appendUnsafeC('\\')
+        elemBuilder.appendUnsafeC('f')
+      case '\n' =>
+        elemBuilder.appendUnsafeC('\\')
+        elemBuilder.appendUnsafeC('n')
+      case '\r' =>
+        elemBuilder.appendUnsafeC('\\')
+        elemBuilder.appendUnsafeC('r')
+      case '\t' =>
+        elemBuilder.appendUnsafeC('\\')
+        elemBuilder.appendUnsafeC('t')
+      case c =>
+        elemBuilder.appendUnsafeC('\\')
+        elemBuilder.appendUnsafeC('u')
+        elemBuilder.appendUnsafeC('0')
+        elemBuilder.appendUnsafeC('0')
+        elemBuilder.appendUnsafeC(BaseByteRenderer.HEX_CHARS((c >> 4) & 0xf))
+        elemBuilder.appendUnsafeC(BaseByteRenderer.HEX_CHARS(c & 0xf))
     }
   }
 
@@ -377,6 +452,9 @@ object BaseByteRenderer {
     a
   }
 
+  /** Hex digits used by inline byte escaping for control chars. */
+  private[sjsonnet] val HEX_CHARS: Array[Char] = "0123456789abcdef".toCharArray
+
   /**
    * Reusable scratch buffer for writeLongDirect (max 20 bytes for Long.MinValue). Not thread-safe,
    * but renderers are single-threaded.