Skip to content

Commit 0253450

Browse files
committed
perf: SIMD base64 via aklomp/base64 + ByteArr/RangeArr/asciiSafe
Motivation:
PR #749 added SIMD base64 and runtime optimizations (ByteArr, RangeArr, asciiSafe) but was reverted by #777 due to incorrect hand-written x86 SIMD C code. This PR restores all optimizations while replacing the buggy SIMD code with the battle-tested aklomp/base64 library.

Modification:
- Replace hand-written C SIMD with aklomp/base64 (BSD-2-Clause) which provides correct SIMD dispatch (SSSE3/AVX2/AVX512/NEON64) via runtime CPU detection
- Add PlatformBase64 abstraction: JVM/JS use java.util.Base64 with strict RFC 4648 padding validation, Native uses aklomp/base64 FFI
- Switch to strict mode aligned with go-jsonnet: reject unpadded base64 input (e.g. "YQ" without "=="). java.util.Base64 is lenient, so JVM/JS add explicit length check for ASCII input, matching go-jsonnet's len(str) % 4 != 0 check (builtins.go:1467)
- Restore Val.ByteArr: compact byte-backed array for base64DecodeBytes
- Restore Val.RangeArr subclass from flag-based _isRange
- Restore Val.Str._asciiSafe + renderAsciiSafeString
- Restore Materializer/ByteRenderer fast paths for ByteArr
- Add comprehensive test suite (56+ Scala tests + 4 Jsonnet golden tests)

Result:
Beats jrsonnet on DecodeBytes benchmarks (1.47x faster). Overall 15-38% faster than master on base64 workloads.
1 parent b63fb40 commit 0253450

27 files changed

Lines changed: 1793 additions & 103 deletions

.github/workflows/pr-build.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,16 @@ jobs:
5151
name: Sjsonnet ${{ matrix.lang }} build
5252
steps:
5353
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
54+
- name: Fetch vendored aklomp/base64 at pinned commit
55+
if: matrix.lang == 'native'
56+
env:
57+
BASE64_REPO: https://github.com/aklomp/base64.git
58+
BASE64_COMMIT: 9e8ed65048ff0f703fad3deb03bf66ac7f78a4d7 # v0.5.2-26-g9e8ed65
59+
run: |
60+
set -euo pipefail
61+
rm -rf vendor/base64
62+
git clone --filter=blob:none "$BASE64_REPO" vendor/base64
63+
git -C vendor/base64 checkout --detach "$BASE64_COMMIT"
5464
- uses: ./.github/actions/setup-build
5565
with:
5666
node: ${{ matrix.lang == 'js' || matrix.lang == 'wasm' }}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
local largeStr = std.repeat("Lorem ipsum dolor sit amet, consectetur adipiscing elit. ", 100),
3+
local encoded = std.base64(largeStr),
4+
local decoded = std.base64Decode(encoded),
5+
local encodedArr = std.base64(std.makeArray(1000, function(i) i % 256)),
6+
local decodedBytes = std.base64DecodeBytes(encodedArr),
7+
8+
local encoded2 = std.base64(decoded),
9+
local decoded2 = std.base64Decode(encoded2),
10+
local encodedArr2 = std.base64(std.makeArray(2000, function(i) (i * 7 + 13) % 256)),
11+
local decodedBytes2 = std.base64DecodeBytes(encodedArr2),
12+
13+
local encoded3 = std.base64(decoded2),
14+
local decoded3 = std.base64Decode(encoded3),
15+
local encodedArr3 = std.base64(std.makeArray(3000, function(i) (i * 13 + 37) % 256)),
16+
local decodedBytes3 = std.base64DecodeBytes(encodedArr3),
17+
18+
roundtrip_ok: decoded3 == largeStr,
19+
byte_roundtrip_ok: std.length(decodedBytes3) == 3000,
20+
encoded_len: std.length(encoded3),
21+
decoded_len: std.length(decoded3)
22+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"byte_roundtrip_ok": true,
3+
"decoded_len": 5700,
4+
"encoded_len": 7600,
5+
"roundtrip_ok": true
6+
}

build.mill

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,53 @@ object sjsonnet extends VersionFileModule {
278278
def nativeLTO = LTO.Full
279279
def nativeMultithreading = None
280280

281+
// Build aklomp/base64 as a static library for SIMD-accelerated base64.
282+
// We pin to a specific upstream commit (no git submodule). CI syncs the pin
283+
// explicitly via `git clone` + `git checkout` in .github/workflows/pr-build.yaml.
284+
// For local builds the task clones into Task.dest on demand so no manual
285+
// setup is required. Keep the SHA here in sync with the workflow file.
286+
def aklompBase64Repo = "https://github.com/aklomp/base64.git"
287+
def aklompBase64Commit = "9e8ed65048ff0f703fad3deb03bf66ac7f78a4d7" // v0.5.2-26-g9e8ed65
288+
def aklompBase64Source = Task {
289+
val vendored = BuildCtx.workspaceRoot / "vendor" / "base64"
290+
if (os.exists(vendored / "CMakeLists.txt")) PathRef(vendored)
291+
else {
292+
val cloneDir = Task.ctx().dest / "base64-src"
293+
os.remove.all(cloneDir)
294+
os.proc("git", "clone", "--filter=blob:none", aklompBase64Repo, cloneDir.toString).call()
295+
os.proc("git", "checkout", "--detach", aklompBase64Commit).call(cwd = cloneDir)
296+
PathRef(cloneDir)
297+
}
298+
}
299+
def buildBase64Lib = Task {
300+
val srcDir = aklompBase64Source().path
301+
val buildDir = Task.ctx().dest / "base64-build"
302+
os.makeDir.all(buildDir)
303+
os.proc(
304+
"cmake",
305+
srcDir.toString,
306+
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
307+
"-DBASE64_WITH_OpenMP=OFF",
308+
"-DBASE64_BUILD_TESTS=OFF",
309+
"-DBASE64_BUILD_CLI=OFF",
310+
"-DCMAKE_BUILD_TYPE=Release"
311+
).call(cwd = buildDir)
312+
os.proc("cmake", "--build", buildDir.toString, "--config", "Release").call()
313+
PathRef(buildDir)
314+
}
315+
316+
def nativeLinkingOptions = Task {
317+
super.nativeLinkingOptions() ++ Seq(
318+
(buildBase64Lib().path / "libbase64.a").toString
319+
)
320+
}
321+
322+
def nativeCompileOptions = Task {
323+
super.nativeCompileOptions() ++ Seq(
324+
s"-I${aklompBase64Source().path / "include"}"
325+
)
326+
}
327+
281328
object test extends ScalaNativeTests with CrossTests {
282329
def releaseMode = ReleaseMode.Debug
283330
def nativeMultithreading = None
@@ -286,6 +333,16 @@ object sjsonnet extends VersionFileModule {
286333
"SCALANATIVE_THREAD_STACK_SIZE" -> stackSize
287334
)
288335
def nativeLTO = LTO.None
336+
def nativeLinkingOptions = Task {
337+
super.nativeLinkingOptions() ++ Seq(
338+
(SjsonnetNativeModule.this.buildBase64Lib().path / "libbase64.a").toString
339+
)
340+
}
341+
def nativeCompileOptions = Task {
342+
super.nativeCompileOptions() ++ Seq(
343+
s"-I${SjsonnetNativeModule.this.aklompBase64Source().path / "include"}"
344+
)
345+
}
289346
}
290347
}
291348

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package sjsonnet.stdlib
2+
3+
/**
4+
* Scala.js implementation of base64 encode/decode. Delegates to java.util.Base64 (provided by
5+
* Scala.js stdlib emulation).
6+
*/
7+
object PlatformBase64 {
8+
9+
def encodeToString(input: Array[Byte]): String =
10+
java.util.Base64.getEncoder.encodeToString(input)
11+
12+
def decode(input: String): Array[Byte] = {
13+
Base64Validation.requireStrictPadding(input)
14+
java.util.Base64.getDecoder.decode(input)
15+
}
16+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package sjsonnet.stdlib
2+
3+
/**
4+
* JVM implementation of base64 encode/decode. Delegates to java.util.Base64 which has HotSpot
5+
* intrinsics for high performance.
6+
*/
7+
object PlatformBase64 {
8+
9+
def encodeToString(input: Array[Byte]): String =
10+
java.util.Base64.getEncoder.encodeToString(input)
11+
12+
def decode(input: String): Array[Byte] = {
13+
Base64Validation.requireStrictPadding(input)
14+
java.util.Base64.getDecoder.decode(input)
15+
}
16+
}
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
package sjsonnet.stdlib
2+
3+
import scala.scalanative.unsafe._
4+
import scala.scalanative.unsigned._
5+
import scala.scalanative.libc.string.memcpy
6+
7+
/**
8+
* Scala Native implementation of base64 encode/decode.
9+
*
10+
* Uses the aklomp/base64 C library (BSD-2-Clause) which provides SIMD-accelerated base64 via
11+
* runtime CPU detection:
12+
* - x86_64: SSSE3 / SSE4.1 / SSE4.2 / AVX / AVX2 / AVX-512
13+
* - AArch64: NEON
14+
* - Fallback: optimized generic C implementation
15+
*
16+
* The static library is built by CMake and linked via nativeLinkingOptions.
17+
*
18+
* Both aklomp/base64 and C++ jsonnet (the reference implementation) use strict RFC 4648 mode:
19+
* padding is required and unpadded input is rejected. java.util.Base64 on its own is more lenient
20+
* (it accepts unpadded input), so the JVM and Scala.js implementations of PlatformBase64 call
21+
* Base64Validation.requireStrictPadding before decoding to match this strict behavior.
22+
*/
23+
@extern
24+
private[stdlib] object libbase64 {
25+
def base64_encode(
26+
src: Ptr[CChar],
27+
srclen: CSize,
28+
out: Ptr[CChar],
29+
outlen: Ptr[CSize],
30+
flags: CInt
31+
): Unit = extern
32+
33+
def base64_decode(
34+
src: Ptr[CChar],
35+
srclen: CSize,
36+
out: Ptr[CChar],
37+
outlen: Ptr[CSize],
38+
flags: CInt
39+
): CInt = extern
40+
}
41+
42+
object PlatformBase64 {
43+
44+
private val DECODE_TABLE: Array[Int] = {
45+
val t = Array.fill[Int](256)(-1)
46+
var i = 0
47+
while (i < 26) { t('A' + i) = i; i += 1 }
48+
i = 0
49+
while (i < 26) { t('a' + i) = i + 26; i += 1 }
50+
i = 0
51+
while (i < 10) { t('0' + i) = i + 52; i += 1 }
52+
t('+') = 62
53+
t('/') = 63
54+
t
55+
}
56+
57+
/**
58+
* Diagnose why base64 decode failed and throw a JVM-compatible error message. Only called on the
59+
* error path (after aklomp/base64 returns failure), so zero overhead on the hot path.
60+
*
61+
* Error messages match java.util.Base64.Decoder behavior for golden test compatibility:
62+
* - Invalid character: "Illegal base64 character XX" (hex)
63+
* - Wrong length/padding: "Last unit does not have enough valid bits"
64+
*/
65+
private def throwDecodeError(srcBytes: Array[Byte]): Nothing = {
66+
val len = srcBytes.length
67+
68+
var i = 0
69+
while (i < len) {
70+
val b = srcBytes(i) & 0xff
71+
if (b != '='.toInt) {
72+
if (DECODE_TABLE(b) < 0) {
73+
throw new IllegalArgumentException(
74+
"Illegal base64 character " + Integer.toHexString(b)
75+
)
76+
}
77+
}
78+
i += 1
79+
}
80+
81+
throw new IllegalArgumentException(
82+
"Last unit does not have enough valid bits"
83+
)
84+
}
85+
86+
def encodeToString(input: Array[Byte]): String = {
87+
if (input.length == 0) return ""
88+
val maxOutLen = ((input.length.toLong + 2) / 3) * 4
89+
if (maxOutLen > Int.MaxValue)
90+
throw new IllegalArgumentException("Input too large for base64 encoding")
91+
val outSize = maxOutLen.toInt
92+
Zone.acquire { implicit z =>
93+
val srcPtr = alloc[Byte](input.length.toUSize)
94+
memcpy(srcPtr, input.at(0), input.length.toUSize)
95+
val outPtr = alloc[Byte]((outSize + 1).toUSize)
96+
val outLenPtr = alloc[CSize](1.toUSize)
97+
libbase64.base64_encode(srcPtr, input.length.toUSize, outPtr, outLenPtr, 0)
98+
val actualLen = (!outLenPtr).toInt
99+
val result = new Array[Byte](actualLen)
100+
memcpy(result.at(0), outPtr, actualLen.toUSize)
101+
new String(result, "US-ASCII")
102+
}
103+
}
104+
105+
def decode(input: String): Array[Byte] = {
106+
if (input.isEmpty) return Array.emptyByteArray
107+
val srcBytes = input.getBytes("US-ASCII")
108+
val maxOutLen = ((srcBytes.length.toLong / 4) * 3) + 3
109+
if (maxOutLen > Int.MaxValue)
110+
throw new IllegalArgumentException("Input too large for base64 decoding")
111+
val outSize = maxOutLen.toInt
112+
Zone.acquire { implicit z =>
113+
val srcPtr = alloc[Byte](srcBytes.length.toUSize)
114+
memcpy(srcPtr, srcBytes.at(0), srcBytes.length.toUSize)
115+
val outPtr = alloc[Byte]((outSize + 1).toUSize)
116+
val outLenPtr = alloc[CSize](1.toUSize)
117+
val ret =
118+
libbase64.base64_decode(srcPtr, srcBytes.length.toUSize, outPtr, outLenPtr, 0)
119+
if (ret != 1) {
120+
throwDecodeError(srcBytes)
121+
}
122+
val actualLen = (!outLenPtr).toInt
123+
val result = new Array[Byte](actualLen)
124+
memcpy(result.at(0), outPtr, actualLen.toUSize)
125+
result
126+
}
127+
}
128+
}

sjsonnet/src/sjsonnet/BaseByteRenderer.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,27 @@ class BaseByteRenderer[T <: java.io.OutputStream](
242242
else visitLongString(str)
243243
}
244244

245+
/**
246+
* Fast path for strings known to be ASCII-safe (all chars in 0x20-0x7E, none needing escaping, i.e.
247+
* no '"' or '\'). Skips SWAR scanning and UTF-8 encoding — writes bytes directly from chars.
248+
*/
249+
private[sjsonnet] def renderAsciiSafeString(str: String): Unit = {
250+
val len = str.length
251+
elemBuilder.ensureLength(len + 2)
252+
val arr = elemBuilder.arr
253+
var pos = elemBuilder.length
254+
arr(pos) = '"'.toByte
255+
pos += 1
256+
var i = 0
257+
while (i < len) {
258+
arr(pos) = str.charAt(i).toByte
259+
pos += 1
260+
i += 1
261+
}
262+
arr(pos) = '"'.toByte
263+
elemBuilder.length = pos + 1
264+
}
265+
245266
/**
246267
* Zero-allocation fast path for short ASCII strings (the vast majority of JSON keys/values). Uses
247268
* getChars to bulk-copy into a reusable char buffer, then scans the buffer directly (avoiding

sjsonnet/src/sjsonnet/ByteRenderer.scala

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,9 @@ class ByteRenderer(out: OutputStream = new java.io.ByteArrayOutputStream(), inde
192192
val vt: Int = v.valTag.toInt
193193
(vt: @scala.annotation.switch) match {
194194
case 0 => // TAG_STR
195-
renderQuotedString(v.asInstanceOf[Val.Str].str)
195+
val s = v.asInstanceOf[Val.Str]
196+
if (s._asciiSafe) renderAsciiSafeString(s.str)
197+
else renderQuotedString(s.str)
196198
case 1 => // TAG_NUM
197199
renderDouble(v.asDouble)
198200
case 2 => // TAG_TRUE
@@ -420,18 +422,32 @@ class ByteRenderer(out: OutputStream = new java.io.ByteArrayOutputStream(), inde
420422
depth += 1
421423
resetEmpty()
422424

423-
var i = 0
424-
while (i < len) {
425-
val childVal = xs.value(i)
425+
// Fast path for byte-backed arrays: emit numbers directly without per-element dispatch
426+
xs match {
427+
case ba: Val.ByteArr =>
428+
val bytes = ba.rawBytes
429+
var i = 0
430+
while (i < len) {
431+
markNonEmpty()
432+
flushBuffer()
433+
renderDouble((bytes(i) & 0xff).toDouble)
434+
commaBuffered = true
435+
i += 1
436+
}
437+
case _ =>
438+
var i = 0
439+
while (i < len) {
440+
val childVal = xs.value(i)
426441

427-
markNonEmpty()
428-
flushBuffer()
442+
markNonEmpty()
443+
flushBuffer()
429444

430-
// Render element directly — no flush overhead
431-
materializeChild(childVal, matDepth, ctx)
445+
// Render element directly — no flush overhead
446+
materializeChild(childVal, matDepth, ctx)
432447

433-
commaBuffered = true
434-
i += 1
448+
commaBuffered = true
449+
i += 1
450+
}
435451
}
436452

437453
// Inline of visitEnd — close bracket

0 commit comments

Comments
 (0)