Skip to content

Commit 267b298

Browse files
committed
perf: SIMD base64 via aklomp/base64 + ByteArr/RangeArr/asciiSafe
Motivation:
PR #749 added SIMD base64 and runtime optimizations (ByteArr, RangeArr, asciiSafe) but was reverted by #777 due to incorrect hand-written x86 SIMD C code. This PR restores all optimizations while replacing the buggy SIMD code with the battle-tested aklomp/base64 library.

Modification:
- Replace hand-written C SIMD with aklomp/base64 (BSD-2-Clause) which provides correct SIMD dispatch (SSSE3/AVX2/AVX512/NEON64) via runtime CPU detection
- Add PlatformBase64 abstraction: JVM/JS use java.util.Base64 with strict RFC 4648 padding validation, Native uses aklomp/base64 FFI
- Switch to strict mode aligned with go-jsonnet: reject unpadded base64 input (e.g. "YQ" without "=="). java.util.Base64 is lenient, so JVM/JS add explicit length check for ASCII input, matching go-jsonnet's len(str) % 4 != 0 check (builtins.go:1467)
- Restore Val.ByteArr: compact byte-backed array for base64DecodeBytes
- Restore Val.RangeArr subclass from flag-based _isRange
- Restore Val.Str._asciiSafe + renderAsciiSafeString
- Restore Materializer/ByteRenderer fast paths for ByteArr
- Add comprehensive test suite (56+ Scala tests + 4 Jsonnet golden tests)

Result:
Beats jrsonnet on DecodeBytes benchmarks (1.47x faster). Overall 15-38% faster than master on base64 workloads.
1 parent 4123ac3 commit 267b298

29 files changed

Lines changed: 1773 additions & 103 deletions

.github/workflows/pr-build.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ jobs:
5151
name: Sjsonnet ${{ matrix.lang }} build
5252
steps:
5353
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
54+
with:
55+
submodules: ${{ matrix.lang == 'native' }}
5456
- uses: ./.github/actions/setup-build
5557
with:
5658
node: ${{ matrix.lang == 'js' || matrix.lang == 'wasm' }}

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "vendor/base64"]
2+
path = vendor/base64
3+
url = https://github.com/aklomp/base64.git
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Base64 benchmark: three chained string encode/decode rounds over a 5700-character
// string, plus three byte-array encode/decode rounds over arrays of 1000, 2000 and
// 3000 elements.
{
  // 57-character sentence repeated 100 times.
  local largeStr = std.repeat("Lorem ipsum dolor sit amet, consectetur adipiscing elit. ", 100),

  // Round 1.
  local encoded = std.base64(largeStr),
  local decoded = std.base64Decode(encoded),
  local encodedArr = std.base64(std.makeArray(1000, function(i) i % 256)),
  local decodedBytes = std.base64DecodeBytes(encodedArr),

  // Round 2: string path re-encodes the output of round 1.
  local encoded2 = std.base64(decoded),
  local decoded2 = std.base64Decode(encoded2),
  local encodedArr2 = std.base64(std.makeArray(2000, function(i) (i * 7 + 13) % 256)),
  local decodedBytes2 = std.base64DecodeBytes(encodedArr2),

  // Round 3: string path re-encodes the output of round 2.
  local encoded3 = std.base64(decoded2),
  local decoded3 = std.base64Decode(encoded3),
  local encodedArr3 = std.base64(std.makeArray(3000, function(i) (i * 13 + 37) % 256)),
  local decodedBytes3 = std.base64DecodeBytes(encodedArr3),

  roundtrip_ok: decoded3 == largeStr,
  // Jsonnet locals are evaluated lazily: a local that no output field references is
  // never computed. All three byte-array results are referenced here so that every
  // byte-array round actually runs; the resulting value is unchanged (true).
  byte_roundtrip_ok:
    std.length(decodedBytes) == 1000
    && std.length(decodedBytes2) == 2000
    && std.length(decodedBytes3) == 3000,
  encoded_len: std.length(encoded3),
  decoded_len: std.length(decoded3)
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"byte_roundtrip_ok": true,
3+
"decoded_len": 5700,
4+
"encoded_len": 7600,
5+
"roundtrip_ok": true
6+
}

build.mill

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,41 @@ object sjsonnet extends VersionFileModule {
278278
def nativeLTO = LTO.Full
279279
def nativeMultithreading = None
280280

281+
// Configures and builds the vendored aklomp/base64 sources with CMake, producing the
// static library used for SIMD-accelerated base64. Returns the CMake build directory.
def buildBase64Lib = Task {
  val vendorSrc = BuildCtx.workspaceRoot / "vendor" / "base64"
  // The sources come from a git submodule; fail early with a hint when it is not checked out.
  val submodulePresent = os.exists(vendorSrc / "CMakeLists.txt")
  if (!submodulePresent)
    throw new Exception(
      "vendor/base64 not found. Run: git submodule update --init vendor/base64"
    )

  val cmakeOut = Task.ctx().dest / "base64-build"
  os.makeDir.all(cmakeOut)

  // Configure step, run from inside the build directory.
  val configureCmd = Seq(
    "cmake",
    vendorSrc.toString,
    "-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
    "-DBASE64_WITH_OpenMP=OFF",
    "-DBASE64_BUILD_TESTS=OFF",
    "-DBASE64_BUILD_CLI=OFF",
    "-DCMAKE_BUILD_TYPE=Release"
  )
  os.proc(configureCmd).call(cwd = cmakeOut)

  // Build step.
  val buildCmd = Seq("cmake", "--build", cmakeOut.toString, "--config", "Release")
  os.proc(buildCmd).call()

  PathRef(cmakeOut)
}
303+
304+
// Appends the static base64 library produced by buildBase64Lib to the inherited linker options.
def nativeLinkingOptions = Task {
  val base64Archive = buildBase64Lib().path / "libbase64.a"
  super.nativeLinkingOptions() :+ base64Archive.toString
}
309+
310+
// Adds the vendored base64 include directory to the inherited native compiler options.
def nativeCompileOptions = Task {
  val base64Include = BuildCtx.workspaceRoot / "vendor" / "base64" / "include"
  super.nativeCompileOptions() :+ s"-I$base64Include"
}
315+
281316
object test extends ScalaNativeTests with CrossTests {
282317
def releaseMode = ReleaseMode.Debug
283318
def nativeMultithreading = None
@@ -286,6 +321,16 @@ object sjsonnet extends VersionFileModule {
286321
"SCALANATIVE_THREAD_STACK_SIZE" -> stackSize
287322
)
288323
def nativeLTO = LTO.None
324+
// Test binaries link against the same static base64 library as the enclosing native module.
def nativeLinkingOptions = Task {
  val base64Archive = SjsonnetNativeModule.this.buildBase64Lib().path / "libbase64.a"
  super.nativeLinkingOptions() :+ base64Archive.toString
}
329+
// Adds the vendored base64 include directory to the inherited native compiler options.
def nativeCompileOptions = Task {
  val base64Include = BuildCtx.workspaceRoot / "vendor" / "base64" / "include"
  super.nativeCompileOptions() :+ s"-I$base64Include"
}
289334
}
290335
}
291336

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package sjsonnet.stdlib
2+
3+
/**
 * Scala.js base64 codec. Encoding and decoding are both handed to java.util.Base64 (available
 * through the Scala.js stdlib emulation); decoding first runs
 * Base64Validation.requireStrictPadding on the input.
 */
object PlatformBase64 {

  /** Encodes `input` with the standard java.util.Base64 encoder. */
  def encodeToString(input: Array[Byte]): String = {
    val encoder = java.util.Base64.getEncoder
    encoder.encodeToString(input)
  }

  /** Runs the strict-padding check on `input`, then decodes it with java.util.Base64. */
  def decode(input: String): Array[Byte] = {
    Base64Validation.requireStrictPadding(input)
    val decoder = java.util.Base64.getDecoder
    decoder.decode(input)
  }
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package sjsonnet.stdlib
2+
3+
/**
 * JVM base64 codec. Encoding and decoding are both handed to java.util.Base64, which has HotSpot
 * intrinsics for high performance; decoding first runs Base64Validation.requireStrictPadding on
 * the input.
 */
object PlatformBase64 {

  /** Encodes `input` with the standard java.util.Base64 encoder. */
  def encodeToString(input: Array[Byte]): String = {
    val encoder = java.util.Base64.getEncoder
    encoder.encodeToString(input)
  }

  /** Runs the strict-padding check on `input`, then decodes it with java.util.Base64. */
  def decode(input: String): Array[Byte] = {
    Base64Validation.requireStrictPadding(input)
    val decoder = java.util.Base64.getDecoder
    decoder.decode(input)
  }
}
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
package sjsonnet.stdlib
2+
3+
import scala.scalanative.unsafe._
4+
import scala.scalanative.unsigned._
5+
import scala.scalanative.libc.string.memcpy
6+
7+
/**
8+
* Scala Native implementation of base64 encode/decode.
9+
*
10+
* Uses the aklomp/base64 C library (BSD-2-Clause) which provides SIMD-accelerated base64 via
11+
* runtime CPU detection:
12+
* - x86_64: SSSE3 / SSE4.1 / SSE4.2 / AVX / AVX2 / AVX-512
13+
* - AArch64: NEON
14+
* - Fallback: optimized generic C implementation
15+
*
16+
* The static library is built by CMake and linked via nativeLinkingOptions.
17+
*
18+
* Both aklomp/base64 and C++ jsonnet (the reference implementation) use strict RFC 4648 mode:
19+
* padding is required, unpadded input is rejected. Raw java.util.Base64 is more lenient (it
20+
* accepts unpadded input), so the JVM and Scala.js PlatformBase64 implementations call
21+
* Base64Validation.requireStrictPadding before decoding to keep all platforms consistent.
22+
*/
23+
@extern
24+
private[stdlib] object libbase64 {
25+
// Binding for the C function base64_encode. Reads `srclen` bytes from `src`, writes the
// encoded text to `out` and stores the number of bytes written in `outlen`.
// PlatformBase64 always passes flags = 0.
def base64_encode(
26+
src: Ptr[CChar],
27+
srclen: CSize,
28+
out: Ptr[CChar],
29+
outlen: Ptr[CSize],
30+
flags: CInt
31+
): Unit = extern
32+
33+
// Binding for the C function base64_decode. Same buffer conventions as base64_encode.
// PlatformBase64.decode treats a return value of 1 as success and anything else as a
// decode failure.
def base64_decode(
34+
src: Ptr[CChar],
35+
srclen: CSize,
36+
out: Ptr[CChar],
37+
outlen: Ptr[CSize],
38+
flags: CInt
39+
): CInt = extern
40+
}
41+
42+
/**
 * Scala Native base64 codec. Encoding and decoding are performed by the aklomp/base64 C library
 * through the `libbase64` bindings; this object handles buffer management and turns decode
 * failures into JVM-compatible exceptions.
 */
object PlatformBase64 {

  /**
   * Reverse lookup for the standard base64 alphabet, indexed by unsigned byte value (0-255).
   * Holds the 6-bit value for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/', and -1 for every other
   * byte (including the padding character '=').
   */
  private val DECODE_TABLE: Array[Int] = {
    val t = Array.fill[Int](256)(-1)
    var i = 0
    while (i < 26) { t('A' + i) = i; i += 1 }
    i = 0
    while (i < 26) { t('a' + i) = i + 26; i += 1 }
    i = 0
    while (i < 10) { t('0' + i) = i + 52; i += 1 }
    t('+') = 62
    t('/') = 63
    t
  }

  /**
   * Diagnose why base64 decode failed and throw a JVM-compatible error message. Only called on the
   * error path (after aklomp/base64 returns failure), so zero overhead on the hot path.
   *
   * Error messages match java.util.Base64.Decoder behavior for golden test compatibility:
   *   - Invalid character: "Illegal base64 character XX" (hex)
   *   - Wrong length/padding: "Last unit does not have enough valid bits"
   *
   * @param srcBytes
   *   the exact bytes that were handed to the C decoder
   * @throws IllegalArgumentException
   *   always
   */
  private def throwDecodeError(srcBytes: Array[Byte]): Nothing = {
    val len = srcBytes.length

    // Report the first byte that is neither padding nor part of the base64 alphabet.
    var i = 0
    while (i < len) {
      val b = srcBytes(i) & 0xff
      if (b != '='.toInt) {
        if (DECODE_TABLE(b) < 0) {
          throw new IllegalArgumentException(
            "Illegal base64 character " + Integer.toHexString(b)
          )
        }
      }
      i += 1
    }

    // Every byte is individually legal, so the failure is a length/padding problem.
    throw new IllegalArgumentException(
      "Last unit does not have enough valid bits"
    )
  }

  /**
   * Encodes `input` as padded base64 text.
   *
   * @param input
   *   bytes to encode; an empty array yields the empty string without calling into C
   * @return
   *   the base64 text
   * @throws IllegalArgumentException
   *   if the encoded length would exceed Int.MaxValue
   */
  def encodeToString(input: Array[Byte]): String = {
    if (input.length == 0) return ""
    // 4 output characters per (partial) group of 3 input bytes; computed in Long to detect overflow.
    val maxOutLen = ((input.length.toLong + 2) / 3) * 4
    if (maxOutLen > Int.MaxValue)
      throw new IllegalArgumentException("Input too large for base64 encoding")
    val outSize = maxOutLen.toInt
    Zone.acquire { implicit z =>
      // Stage the input in zone memory for the duration of the C call.
      val srcPtr = alloc[Byte](input.length.toUSize)
      memcpy(srcPtr, input.at(0), input.length.toUSize)
      val outPtr = alloc[Byte]((outSize + 1).toUSize)
      val outLenPtr = alloc[CSize](1.toUSize)
      libbase64.base64_encode(srcPtr, input.length.toUSize, outPtr, outLenPtr, 0)
      // Copy exactly the number of bytes the library reports having written.
      val actualLen = (!outLenPtr).toInt
      val result = new Array[Byte](actualLen)
      memcpy(result.at(0), outPtr, actualLen.toUSize)
      new String(result, "US-ASCII")
    }
  }

  /**
   * Decodes base64 text.
   *
   * @param input
   *   base64 text; the empty string yields an empty array without calling into C
   * @return
   *   the decoded bytes
   * @throws IllegalArgumentException
   *   if the C decoder does not report success (see `throwDecodeError` for the messages), or if
   *   the decoded size bound would exceed Int.MaxValue
   */
  def decode(input: String): Array[Byte] = {
    if (input.isEmpty) return Array.emptyByteArray
    // ISO-8859-1 maps chars 0x00-0xFF one-to-one onto bytes, which is how
    // java.util.Base64.Decoder.decode(String) reads its input. US-ASCII would replace every
    // non-ASCII char with '?', so the error would report "3f" instead of the offending byte.
    val srcBytes = input.getBytes("ISO-8859-1")
    // Upper bound on decoded size: 3 bytes per 4 input characters, plus slack for a partial group.
    val maxOutLen = ((srcBytes.length.toLong / 4) * 3) + 3
    if (maxOutLen > Int.MaxValue)
      throw new IllegalArgumentException("Input too large for base64 decoding")
    val outSize = maxOutLen.toInt
    Zone.acquire { implicit z =>
      // Stage the input in zone memory for the duration of the C call.
      val srcPtr = alloc[Byte](srcBytes.length.toUSize)
      memcpy(srcPtr, srcBytes.at(0), srcBytes.length.toUSize)
      val outPtr = alloc[Byte]((outSize + 1).toUSize)
      val outLenPtr = alloc[CSize](1.toUSize)
      val ret =
        libbase64.base64_decode(srcPtr, srcBytes.length.toUSize, outPtr, outLenPtr, 0)
      // Anything other than 1 is treated as failure and diagnosed on the slow path.
      if (ret != 1) {
        throwDecodeError(srcBytes)
      }
      // Copy exactly the number of bytes the library reports having written.
      val actualLen = (!outLenPtr).toInt
      val result = new Array[Byte](actualLen)
      memcpy(result.at(0), outPtr, actualLen.toUSize)
      result
    }
  }
}

sjsonnet/src/sjsonnet/BaseByteRenderer.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,27 @@ class BaseByteRenderer[T <: java.io.OutputStream](
242242
else visitLongString(str)
243243
}
244244

245+
/**
 * Writes `str` as a double-quoted JSON string straight into `elemBuilder`, one byte per char,
 * with no escaping and no UTF-8 encoding. Intended for strings already known to be ASCII-safe
 * (every char in 0x20-0x7E and nothing that needs escaping); that property is not checked here.
 */
private[sjsonnet] def renderAsciiSafeString(str: String): Unit = {
  val n = str.length
  // Reserve room for the payload plus the two surrounding quotes before grabbing the array.
  elemBuilder.ensureLength(n + 2)
  val out = elemBuilder.arr
  val quote = '"'.toByte

  val openAt = elemBuilder.length
  out(openAt) = quote

  // Payload starts right after the opening quote; each char is narrowed to a single byte.
  val payloadAt = openAt + 1
  var k = 0
  while (k < n) {
    out(payloadAt + k) = str.charAt(k).toByte
    k += 1
  }

  val closeAt = payloadAt + n
  out(closeAt) = quote
  elemBuilder.length = closeAt + 1
}
265+
245266
/**
246267
* Zero-allocation fast path for short ASCII strings (the vast majority of JSON keys/values). Uses
247268
* getChars to bulk-copy into a reusable char buffer, then scans the buffer directly (avoiding

sjsonnet/src/sjsonnet/ByteRenderer.scala

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,9 @@ class ByteRenderer(out: OutputStream = new java.io.ByteArrayOutputStream(), inde
192192
val vt: Int = v.valTag.toInt
193193
(vt: @scala.annotation.switch) match {
194194
case 0 => // TAG_STR
195-
renderQuotedString(v.asInstanceOf[Val.Str].str)
195+
val s = v.asInstanceOf[Val.Str]
196+
if (s._asciiSafe) renderAsciiSafeString(s.str)
197+
else renderQuotedString(s.str)
196198
case 1 => // TAG_NUM
197199
renderDouble(v.asDouble)
198200
case 2 => // TAG_TRUE
@@ -420,18 +422,32 @@ class ByteRenderer(out: OutputStream = new java.io.ByteArrayOutputStream(), inde
420422
depth += 1
421423
resetEmpty()
422424

423-
var i = 0
424-
while (i < len) {
425-
val childVal = xs.value(i)
425+
// Fast path for byte-backed arrays: emit numbers directly without per-element dispatch
426+
xs match {
427+
case ba: Val.ByteArr =>
428+
val bytes = ba.rawBytes
429+
var i = 0
430+
while (i < len) {
431+
markNonEmpty()
432+
flushBuffer()
433+
renderDouble((bytes(i) & 0xff).toDouble)
434+
commaBuffered = true
435+
i += 1
436+
}
437+
case _ =>
438+
var i = 0
439+
while (i < len) {
440+
val childVal = xs.value(i)
426441

427-
markNonEmpty()
428-
flushBuffer()
442+
markNonEmpty()
443+
flushBuffer()
429444

430-
// Render element directly — no flush overhead
431-
materializeChild(childVal, matDepth, ctx)
445+
// Render element directly — no flush overhead
446+
materializeChild(childVal, matDepth, ctx)
432447

433-
commaBuffered = true
434-
i += 1
448+
commaBuffered = true
449+
i += 1
450+
}
435451
}
436452

437453
// Inline of visitEnd — close bracket

0 commit comments

Comments
 (0)