diff --git a/bench/resources/go_suite/base64_stress.jsonnet b/bench/resources/go_suite/base64_stress.jsonnet new file mode 100644 index 000000000..dcd152b0b --- /dev/null +++ b/bench/resources/go_suite/base64_stress.jsonnet @@ -0,0 +1,28 @@ +// Stress benchmark: three chained string round-trips plus three byte-array round-trips via std.base64*. +{ + local largeStr = std.repeat("Lorem ipsum dolor sit amet, consectetur adipiscing elit. ", 100), + local encoded = std.base64(largeStr), + local decoded = std.base64Decode(encoded), + local encodedArr = std.base64(std.makeArray(1000, function(i) i % 256)), + local decodedBytes = std.base64DecodeBytes(encodedArr), + + local encoded2 = std.base64(decoded), + local decoded2 = std.base64Decode(encoded2), + local encodedArr2 = std.base64(std.makeArray(2000, function(i) (i * 7 + 13) % 256)), + local decodedBytes2 = std.base64DecodeBytes(encodedArr2), + + local encoded3 = std.base64(decoded2), + local decoded3 = std.base64Decode(encoded3), + local encodedArr3 = std.base64(std.makeArray(3000, function(i) (i * 13 + 37) % 256)), + local decodedBytes3 = std.base64DecodeBytes(encodedArr3), + + roundtrip_ok: decoded3 == largeStr, + // Jsonnet locals are lazy: every decodedBytes* local has to be referenced here, + // otherwise its makeArray/base64/base64DecodeBytes chain is never evaluated. + byte_roundtrip_ok: + std.length(decodedBytes) == 1000 && + std.length(decodedBytes2) == 2000 && + std.length(decodedBytes3) == 3000, + encoded_len: std.length(encoded3), + decoded_len: std.length(decoded3) +} diff --git a/bench/resources/go_suite/base64_stress.jsonnet.golden b/bench/resources/go_suite/base64_stress.jsonnet.golden new file mode 100644 index 000000000..41b0dacae --- /dev/null +++ b/bench/resources/go_suite/base64_stress.jsonnet.golden @@ -0,0 +1,6 @@ +{ + "byte_roundtrip_ok": true, + "decoded_len": 5700, + "encoded_len": 7600, + "roundtrip_ok": true +} diff --git a/build.mill b/build.mill index 83fce1979..773083c53 100644 --- a/build.mill +++ b/build.mill @@ -278,6 +278,48 @@ object sjsonnet extends VersionFileModule { def nativeLTO = LTO.Full def nativeMultithreading = None + // Build aklomp/base64 as a static library for SIMD-accelerated base64. + // We pin to a specific upstream commit (no git submodule) and clone on + // demand into the task's sandboxed dest. 
The pinned SHA below is the single + // source of truth — bump it here when upgrading the upstream library. + def aklompBase64Repo = "https://github.com/aklomp/base64.git" + def aklompBase64Commit = "9e8ed65048ff0f703fad3deb03bf66ac7f78a4d7" // v0.5.2-26-g9e8ed65 + def aklompBase64Source = Task { + val cloneDir = Task.ctx().dest / "base64-src" + os.remove.all(cloneDir) + os.proc("git", "clone", "--filter=blob:none", aklompBase64Repo, cloneDir.toString).call() + os.proc("git", "checkout", "--detach", aklompBase64Commit).call(cwd = cloneDir) + PathRef(cloneDir) + } + def buildBase64Lib = Task { + val srcDir = aklompBase64Source().path + val buildDir = Task.ctx().dest / "base64-build" + os.makeDir.all(buildDir) + os.proc( + "cmake", + srcDir.toString, + "-DCMAKE_POSITION_INDEPENDENT_CODE=ON", + "-DBASE64_WITH_OpenMP=OFF", + "-DBASE64_BUILD_TESTS=OFF", + "-DBASE64_BUILD_CLI=OFF", + "-DCMAKE_BUILD_TYPE=Release" + ).call(cwd = buildDir) + os.proc("cmake", "--build", buildDir.toString, "--config", "Release").call() + PathRef(buildDir) + } + + def nativeLinkingOptions = Task { + super.nativeLinkingOptions() ++ Seq( + (buildBase64Lib().path / "libbase64.a").toString + ) + } + + def nativeCompileOptions = Task { + super.nativeCompileOptions() ++ Seq( + s"-I${aklompBase64Source().path / "include"}" + ) + } + object test extends ScalaNativeTests with CrossTests { def releaseMode = ReleaseMode.Debug def nativeMultithreading = None @@ -286,6 +328,16 @@ object sjsonnet extends VersionFileModule { "SCALANATIVE_THREAD_STACK_SIZE" -> stackSize ) def nativeLTO = LTO.None + def nativeLinkingOptions = Task { + super.nativeLinkingOptions() ++ Seq( + (SjsonnetNativeModule.this.buildBase64Lib().path / "libbase64.a").toString + ) + } + def nativeCompileOptions = Task { + super.nativeCompileOptions() ++ Seq( + s"-I${SjsonnetNativeModule.this.aklompBase64Source().path / "include"}" + ) + } } } diff --git a/sjsonnet/src-js/sjsonnet/stdlib/PlatformBase64.scala 
b/sjsonnet/src-js/sjsonnet/stdlib/PlatformBase64.scala new file mode 100644 index 000000000..d91802a73 --- /dev/null +++ b/sjsonnet/src-js/sjsonnet/stdlib/PlatformBase64.scala @@ -0,0 +1,16 @@ +package sjsonnet.stdlib + +/** + * Scala.js implementation of base64 encode/decode. Delegates to java.util.Base64 (provided by + * Scala.js stdlib emulation). + */ +object PlatformBase64 { + + def encodeToString(input: Array[Byte]): String = + java.util.Base64.getEncoder.encodeToString(input) + + def decode(input: String): Array[Byte] = { + Base64Validation.requireStrictPadding(input) + java.util.Base64.getDecoder.decode(input) + } +} diff --git a/sjsonnet/src-jvm/sjsonnet/stdlib/PlatformBase64.scala b/sjsonnet/src-jvm/sjsonnet/stdlib/PlatformBase64.scala new file mode 100644 index 000000000..64ad666e7 --- /dev/null +++ b/sjsonnet/src-jvm/sjsonnet/stdlib/PlatformBase64.scala @@ -0,0 +1,16 @@ +package sjsonnet.stdlib + +/** + * JVM implementation of base64 encode/decode. Delegates to java.util.Base64 which has HotSpot + * intrinsics for high performance. + */ +object PlatformBase64 { + + def encodeToString(input: Array[Byte]): String = + java.util.Base64.getEncoder.encodeToString(input) + + def decode(input: String): Array[Byte] = { + Base64Validation.requireStrictPadding(input) + java.util.Base64.getDecoder.decode(input) + } +} diff --git a/sjsonnet/src-native/sjsonnet/stdlib/PlatformBase64.scala b/sjsonnet/src-native/sjsonnet/stdlib/PlatformBase64.scala new file mode 100644 index 000000000..c994ab72a --- /dev/null +++ b/sjsonnet/src-native/sjsonnet/stdlib/PlatformBase64.scala @@ -0,0 +1,128 @@ +package sjsonnet.stdlib + +import scala.scalanative.unsafe._ +import scala.scalanative.unsigned._ +import scala.scalanative.libc.string.memcpy + +/** + * Scala Native implementation of base64 encode/decode. 
+ * + * Uses the aklomp/base64 C library (BSD-2-Clause) which provides SIMD-accelerated base64 via + * runtime CPU detection: + * - x86_64: SSSE3 / SSE4.1 / SSE4.2 / AVX / AVX2 / AVX-512 + * - AArch64: NEON + * - Fallback: optimized generic C implementation + * + * The static library is built by CMake and linked via nativeLinkingOptions. + * + * Both aklomp/base64 and C++ jsonnet (the reference implementation) use strict RFC 4648 mode: + * padding is required, unpadded input is rejected. This differs from java.util.Base64 on JVM which + * is more lenient (accepts unpadded input) — that JVM leniency is a pre-existing sjsonnet bug, not + * something we replicate here. + */ +@extern +private[stdlib] object libbase64 { + def base64_encode( + src: Ptr[CChar], + srclen: CSize, + out: Ptr[CChar], + outlen: Ptr[CSize], + flags: CInt + ): Unit = extern + + def base64_decode( + src: Ptr[CChar], + srclen: CSize, + out: Ptr[CChar], + outlen: Ptr[CSize], + flags: CInt + ): CInt = extern +} + +object PlatformBase64 { + + private val DECODE_TABLE: Array[Int] = { + val t = Array.fill[Int](256)(-1) + var i = 0 + while (i < 26) { t('A' + i) = i; i += 1 } + i = 0 + while (i < 26) { t('a' + i) = i + 26; i += 1 } + i = 0 + while (i < 10) { t('0' + i) = i + 52; i += 1 } + t('+') = 62 + t('/') = 63 + t + } + + /** + * Diagnose why base64 decode failed and throw a JVM-compatible error message. Only called on the + * error path (after aklomp/base64 returns failure), so zero overhead on the hot path. 
+ * + * Error messages match java.util.Base64.Decoder behavior for golden test compatibility: + * - Invalid character: "Illegal base64 character XX" (hex) + * - Wrong length/padding: "Last unit does not have enough valid bits" + */ + private def throwDecodeError(srcBytes: Array[Byte]): Nothing = { + val len = srcBytes.length + + var i = 0 + while (i < len) { + val b = srcBytes(i) & 0xff + if (b != '='.toInt) { + if (DECODE_TABLE(b) < 0) { + throw new IllegalArgumentException( + "Illegal base64 character " + Integer.toHexString(b) + ) + } + } + i += 1 + } + + throw new IllegalArgumentException( + "Last unit does not have enough valid bits" + ) + } + + def encodeToString(input: Array[Byte]): String = { + if (input.length == 0) return "" + val maxOutLen = ((input.length.toLong + 2) / 3) * 4 + if (maxOutLen > Int.MaxValue) + throw new IllegalArgumentException("Input too large for base64 encoding") + val outSize = maxOutLen.toInt + Zone.acquire { implicit z => + val srcPtr = alloc[Byte](input.length.toUSize) + memcpy(srcPtr, input.at(0), input.length.toUSize) + val outPtr = alloc[Byte]((outSize + 1).toUSize) + val outLenPtr = alloc[CSize](1.toUSize) + libbase64.base64_encode(srcPtr, input.length.toUSize, outPtr, outLenPtr, 0) + val actualLen = (!outLenPtr).toInt + val result = new Array[Byte](actualLen) + memcpy(result.at(0), outPtr, actualLen.toUSize) + new String(result, "US-ASCII") + } + } + + def decode(input: String): Array[Byte] = { + if (input.isEmpty) return Array.emptyByteArray + val srcBytes = input.getBytes("US-ASCII") + val maxOutLen = ((srcBytes.length.toLong / 4) * 3) + 3 + if (maxOutLen > Int.MaxValue) + throw new IllegalArgumentException("Input too large for base64 decoding") + val outSize = maxOutLen.toInt + Zone.acquire { implicit z => + val srcPtr = alloc[Byte](srcBytes.length.toUSize) + memcpy(srcPtr, srcBytes.at(0), srcBytes.length.toUSize) + val outPtr = alloc[Byte]((outSize + 1).toUSize) + val outLenPtr = alloc[CSize](1.toUSize) + val ret = + 
libbase64.base64_decode(srcPtr, srcBytes.length.toUSize, outPtr, outLenPtr, 0) + if (ret != 1) { + throwDecodeError(srcBytes) + } + val actualLen = (!outLenPtr).toInt + val result = new Array[Byte](actualLen) + memcpy(result.at(0), outPtr, actualLen.toUSize) + result + } + } +} diff --git a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala index a2cde09eb..95a67aef9 100644 --- a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala +++ b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala @@ -242,6 +242,27 @@ class BaseByteRenderer[T <: java.io.OutputStream]( else visitLongString(str) } + /** + * Fast path for strings known to be ASCII-safe (no escaping needed, all chars 0x20-0x7E). Skips + * SWAR scanning and UTF-8 encoding — writes bytes directly from chars. + */ + private[sjsonnet] def renderAsciiSafeString(str: String): Unit = { + val len = str.length + elemBuilder.ensureLength(len + 2) + val arr = elemBuilder.arr + var pos = elemBuilder.length + arr(pos) = '"'.toByte + pos += 1 + var i = 0 + while (i < len) { + arr(pos) = str.charAt(i).toByte + pos += 1 + i += 1 + } + arr(pos) = '"'.toByte + elemBuilder.length = pos + 1 + } + /** * Zero-allocation fast path for short ASCII strings (the vast majority of JSON keys/values). 
Uses * getChars to bulk-copy into a reusable char buffer, then scans the buffer directly (avoiding diff --git a/sjsonnet/src/sjsonnet/ByteRenderer.scala b/sjsonnet/src/sjsonnet/ByteRenderer.scala index 7eb8656f3..46c6175ae 100644 --- a/sjsonnet/src/sjsonnet/ByteRenderer.scala +++ b/sjsonnet/src/sjsonnet/ByteRenderer.scala @@ -192,7 +192,9 @@ class ByteRenderer(out: OutputStream = new java.io.ByteArrayOutputStream(), inde val vt: Int = v.valTag.toInt (vt: @scala.annotation.switch) match { case 0 => // TAG_STR - renderQuotedString(v.asInstanceOf[Val.Str].str) + val s = v.asInstanceOf[Val.Str] + if (s._asciiSafe) renderAsciiSafeString(s.str) + else renderQuotedString(s.str) case 1 => // TAG_NUM renderDouble(v.asDouble) case 2 => // TAG_TRUE @@ -420,18 +422,32 @@ class ByteRenderer(out: OutputStream = new java.io.ByteArrayOutputStream(), inde depth += 1 resetEmpty() - var i = 0 - while (i < len) { - val childVal = xs.value(i) + // Fast path for byte-backed arrays: emit numbers directly without per-element dispatch + xs match { + case ba: Val.ByteArr => + val bytes = ba.rawBytes + var i = 0 + while (i < len) { + markNonEmpty() + flushBuffer() + renderDouble((bytes(i) & 0xff).toDouble) + commaBuffered = true + i += 1 + } + case _ => + var i = 0 + while (i < len) { + val childVal = xs.value(i) - markNonEmpty() - flushBuffer() + markNonEmpty() + flushBuffer() - // Render element directly — no flush overhead - materializeChild(childVal, matDepth, ctx) + // Render element directly — no flush overhead + materializeChild(childVal, matDepth, ctx) - commaBuffered = true - i += 1 + commaBuffered = true + i += 1 + } } // Inline of visitEnd — close bracket diff --git a/sjsonnet/src/sjsonnet/Materializer.scala b/sjsonnet/src/sjsonnet/Materializer.scala index 0742ca6e4..a1c8041a4 100644 --- a/sjsonnet/src/sjsonnet/Materializer.scala +++ b/sjsonnet/src/sjsonnet/Materializer.scala @@ -232,6 +232,25 @@ abstract class Materializer { depth: Int, ctx: 
Materializer.MaterializeContext)(implicit evaluator: EvalScope): T = { storePos(xs.pos) + // Fast path for byte-backed arrays: skip per-element value() + type dispatch + xs match { + case ba: Val.ByteArr => + val bytes = ba.rawBytes + val len = bytes.length + val av = visitor.visitArray(len, -1) + var i = 0 + while (i < len) { + av.visitValue( + av.subVisitor + .asInstanceOf[Visitor[T, T]] + .visitFloat64((bytes(i) & 0xff).toDouble, -1), + -1 + ) + i += 1 + } + return av.visitEnd(-1) + case _ => + } val len = xs.length val av = visitor.visitArray(len, -1) var i = 0 @@ -357,6 +376,22 @@ abstract class Materializer { case frame: Materializer.MaterializeArrFrame[T @unchecked] => val arr = frame.arr val av = frame.arrVisitor + // Fast path for byte-backed arrays: emit all elements directly + if (frame.index == 0) { + arr match { + case ba: Val.ByteArr => + val bytes = ba.rawBytes + val len = bytes.length + var i = 0 + while (i < len) { + val sub = av.subVisitor.asInstanceOf[Visitor[T, T]] + av.visitValue(sub.visitFloat64((bytes(i) & 0xff).toDouble, -1), -1) + i += 1 + } + frame.index = len // mark as done + case _ => + } + } if (frame.index < arr.length) { val childVal = arr.value(frame.index) frame.index += 1 diff --git a/sjsonnet/src/sjsonnet/Val.scala b/sjsonnet/src/sjsonnet/Val.scala index 4b5d76330..db720e206 100644 --- a/sjsonnet/src/sjsonnet/Val.scala +++ b/sjsonnet/src/sjsonnet/Val.scala @@ -310,6 +310,11 @@ object Val { // cold flatten path, which is amortized O(1) per character. private[sjsonnet] var _children: Array[Str] = null + // Flag indicating this string is known to contain only printable ASCII (0x20-0x7E) with no + // characters requiring JSON escaping (no ", \, or control chars). When true, the renderer + // can skip SWAR escape scanning and UTF-8 encoding, writing bytes directly. 
+ private[sjsonnet] var _asciiSafe: Boolean = false + def prettyName = "string" private[sjsonnet] def valTag: Byte = TAG_STR @@ -377,6 +382,13 @@ object Val { /** Create a leaf string node — zero overhead vs the old case class. */ def apply(pos: Position, s: String): Str = new Str(pos, s) + /** Create a leaf string node marked as ASCII-safe (no JSON escaping needed). */ + def asciiSafe(pos: Position, s: String): Str = { + val v = new Str(pos, s) + v._asciiSafe = true + v + } + /** Backward-compatible extractor: `case Val.Str(pos, s) =>` still works. */ def unapply(s: Str): Option[(Position, String)] = Some((s.pos, s.str)) @@ -449,7 +461,7 @@ object Val { private[sjsonnet] def valTag: Byte = TAG_NUM } - final class Arr(var pos: Position, private var arr: Array[? <: Eval]) extends Literal { + class Arr(var pos: Position, private[Val] var arr: Array[? <: Eval]) extends Literal { def prettyName = "array" private[sjsonnet] def valTag: Byte = TAG_ARR @@ -468,20 +480,9 @@ object Val { // The 'arr' field is lazily materialized when bulk access (asLazyArray, etc.) is needed. private var _concatLeft: Arr = _ private var _concatRight: Arr = _ - private var _length: Int = -1 + private[Val] var _length: Int = -1 - // Lazy range state. When _isRange is true, this array represents - // a contiguous integer sequence [from, from+1, ..., from+length-1]. - // Elements are computed on demand via Val.cachedNum, avoiding upfront allocation - // of the full backing array. Inspired by jrsonnet's RangeArray (arr/spec.rs) - // which uses O(1) creation for std.range results. - // Uses a separate boolean flag instead of a sentinel value to avoid collisions - // with valid range start values (e.g. Int.MinValue). 
- private var _isRange: Boolean = false - private var _rangeFrom: Int = 0 - - @inline private def isConcatView: Boolean = _concatLeft ne null - @inline private def isRange: Boolean = _isRange + @inline final private[Val] def isConcatView: Boolean = _concatLeft ne null override def asArr: Arr = this @@ -491,7 +492,7 @@ object Val { else { val computed = if (isConcatView) _concatLeft.length + _concatRight.length - else arr.length // isRange always has _length pre-set, never reaches here + else arr.length // RangeArr/ByteArr always have _length pre-set, never reach here _length = computed computed } @@ -501,10 +502,6 @@ object Val { if (isConcatView) { val leftLen = _concatLeft.length if (i < leftLen) _concatLeft.value(i) else _concatRight.value(i - leftLen) - } else if (isRange) { - // For reversed ranges, _rangeFrom is the last element and we count down - if (_reversed) Val.cachedNum(pos, _rangeFrom - i) - else Val.cachedNum(pos, _rangeFrom + i) } else if (_reversed) { arr(arr.length - 1 - i).value } else { @@ -521,9 +518,6 @@ object Val { if (isConcatView) { val leftLen = _concatLeft.length if (i < leftLen) _concatLeft.eval(i) else _concatRight.eval(i - leftLen) - } else if (isRange) { - if (_reversed) Val.cachedNum(pos, _rangeFrom - i) - else Val.cachedNum(pos, _rangeFrom + i) } else if (_reversed) { arr(arr.length - 1 - i) } else { @@ -549,7 +543,6 @@ object Val { */ def asLazyArray: Array[Eval] = { if (isConcatView) materialize() - if (isRange) materializeRange() if (_reversed) { val len = arr.length val result = new Array[Eval](len) @@ -594,26 +587,6 @@ object Val { _concatRight = null } - /** - * Materialize a lazy range view into a flat array. After this call, `arr` holds the full - * Val.Num elements and the range flag is cleared. Handles both forward and reversed ranges. 
- */ - private def materializeRange(): Unit = { - val len = _length - val from = _rangeFrom - val rev = _reversed - val p = pos - val result = new Array[Eval](len) - var i = 0 - while (i < len) { - result(i) = Val.cachedNum(p, if (rev) from - i else from + i) - i += 1 - } - arr = result - _isRange = false // clear range flag - _reversed = false // range is now materialized in correct order - } - /** * Concatenate two arrays. For large left-side arrays where neither operand is already a concat * view, creates a lazy ConcatView that defers the copy until bulk access is needed. This is @@ -680,41 +653,148 @@ object Val { * Double-reversal cancels out. */ def reversed(newPos: Position): Arr = { - if (isRange) { - // Double-reverse of a range cancels out: return a forward range with original start - if (_reversed) { - // Currently reversed: _rangeFrom is the high end, counting down. - // Reversing again restores the original forward range. - val originalFrom = _rangeFrom - _length + 1 - val result = new Arr(newPos, null) - result._isRange = true - result._rangeFrom = originalFrom - result._length = _length - // _reversed defaults to false — forward range - result - } else { - // Forward range: reverse to [from+len-1, from+len-2, ..., from] - val len = _length - val newFrom = _rangeFrom + len - 1 - val result = new Arr(newPos, null) - result._isRange = true - result._rangeFrom = newFrom - result._length = len - result._reversed = true // signal to compute from-i instead of from+i - result - } + if (isConcatView) materialize() // flatten before reverse + val result = Arr(newPos, arr) + result._reversed = !this._reversed + result + } + } + + /** + * Lazy range array representing the integer sequence [from, from+1, ..., from+size-1]. Separate + * subclass keeps the `rangeFrom` field out of regular `Arr` instances, saving ~9 bytes per + * non-range array (boolean + int + padding). + * + * Elements are computed on demand via `Val.cachedNum`. 
When bulk access is needed (asLazyArray, + * concat eager path), the range materializes into a flat array and subsequent calls delegate to + * the parent `Arr` implementation. + * + * Inspired by jrsonnet's RangeArray (arr/spec.rs). + */ + final class RangeArr(pos0: Position, private val rangeFrom: Int, size: Int) + extends Arr(pos0, null) { + _length = size + + // After materialization arr becomes non-null; delegate to parent Arr logic. + @inline private def isMaterialized: Boolean = arr ne null + + override def value(i: Int): Val = { + if (isMaterialized || isConcatView) super.value(i) + else if (_reversed) Val.cachedNum(pos, rangeFrom - i) + else Val.cachedNum(pos, rangeFrom + i) + } + + override def eval(i: Int): Eval = { + if (isMaterialized || isConcatView) super.eval(i) + else if (_reversed) Val.cachedNum(pos, rangeFrom - i) + else Val.cachedNum(pos, rangeFrom + i) + } + + override def asLazyArray: Array[Eval] = { + if (!isMaterialized && !isConcatView) materializeRange() + super.asLazyArray + } + + override def reversed(newPos: Position): Arr = { + if (isMaterialized || isConcatView) { + super.reversed(newPos) + } else if (_reversed) { + // Double-reverse: restore the original forward range. + val originalFrom = rangeFrom - _length + 1 + new RangeArr(newPos, originalFrom, _length) } else { - if (isConcatView) materialize() // flatten before reverse - val result = Arr(newPos, arr) - result._reversed = !this._reversed + // Forward range → reversed: store high end as rangeFrom, flag as reversed. + val newFrom = rangeFrom + _length - 1 + val result = new RangeArr(newPos, newFrom, _length) + result._reversed = true result } } + + /** Materialize this range into a flat Val.Num array. 
*/ + private def materializeRange(): Unit = { + val len = _length + val from = rangeFrom + val rev = _reversed + val p = pos + val result = new Array[Eval](len) + var i = 0 + while (i < len) { + result(i) = Val.cachedNum(p, if (rev) from - i else from + i) + i += 1 + } + arr = result + _reversed = false // materialized in correct order + } + } + + /** + * Byte-backed array for compact storage of 0-255 integer values (e.g. from base64DecodeBytes). + * Stores `Array[Byte]` directly instead of N `Val.Num` / `Eval` wrappers, serving elements via + * `Val.cachedNum` on demand. + * + * Unlike RangeArr, `byteData` is a `val` — it is never cleared after materialization. This + * guarantees `rawBytes` is always non-null, so fast paths in Materializer/ByteRenderer/ + * EncodingModule can rely on it unconditionally. The extra memory (1 byte per element on top of + * the materialized Eval array) is negligible. + * + * Inspired by jrsonnet's BytesArray which uses compact byte storage for decoded data. + */ + final class ByteArr(pos0: Position, private val byteData: Array[Byte]) extends Arr(pos0, null) { + _length = byteData.length + + // After materialization arr becomes non-null; delegate to parent Arr logic. + @inline private def isMaterialized: Boolean = arr ne null + + /** Raw byte backing data for zero-copy extraction (e.g. base64 encode). Always non-null. */ + def rawBytes: Array[Byte] = byteData + + override def value(i: Int): Val = { + if (isMaterialized || isConcatView) super.value(i) + else Val.cachedNum(pos, (byteData(i) & 0xff).toDouble) + } + + override def eval(i: Int): Eval = { + if (isMaterialized || isConcatView) super.eval(i) + else Val.cachedNum(pos, (byteData(i) & 0xff).toDouble) + } + + override def asLazyArray: Array[Eval] = { + if (!isMaterialized && !isConcatView) materializeByteData() + super.asLazyArray + } + + // DO NOT CHANGE + // WHY: reversed() materializes first instead of flipping a _reversed flag. 
This keeps + // ByteArr simple — value()/eval()/rawBytes never need to handle reversed indexing. + // The returned plain Arr uses the materialized backing array with _reversed=true. + // Reversing byte arrays is rare; the one-time O(n) materialization is acceptable. + override def reversed(newPos: Position): Arr = { + if (!isMaterialized && !isConcatView) materializeByteData() + super.reversed(newPos) + } + + /** Materialize byte data into a flat Val.Num array. Does NOT clear byteData (it is a val). */ + private def materializeByteData(): Unit = { + val bytes = byteData + val len = bytes.length + val p = pos + val result = new Array[Eval](len) + var i = 0 + while (i < len) { + result(i) = Val.cachedNum(p, (bytes(i) & 0xff).toDouble) + i += 1 + } + arr = result + } } object Arr { def apply(pos: Position, arr: Array[? <: Eval]): Arr = new Arr(pos, arr) + /** Create a byte-backed array from raw bytes (e.g. base64DecodeBytes output). */ + def fromBytes(pos: Position, bytes: Array[Byte]): Arr = new ByteArr(pos, bytes) + /** * Create a lazy range array representing the integer sequence [from, from+1, ..., from+size-1]. * Elements are computed on demand via Val.cachedNum, avoiding upfront allocation of the full @@ -722,13 +802,7 @@ object Val { * * Inspired by jrsonnet's RangeArray (arr/spec.rs) which uses the same deferred approach. */ - def range(pos: Position, from: Int, size: Int): Arr = { - val a = new Arr(pos, null) - a._isRange = true - a._rangeFrom = from - a._length = size - a - } + def range(pos: Position, from: Int, size: Int): Arr = new RangeArr(pos, from, size) /** * Threshold for lazy concat. 
Arrays with left.length >= this value use a virtual ConcatView diff --git a/sjsonnet/src/sjsonnet/stdlib/Base64Validation.scala b/sjsonnet/src/sjsonnet/stdlib/Base64Validation.scala new file mode 100644 index 000000000..2fdb8893a --- /dev/null +++ b/sjsonnet/src/sjsonnet/stdlib/Base64Validation.scala @@ -0,0 +1,50 @@ +package sjsonnet.stdlib + +/** + * Shared base64 validation logic used by JVM and JS platforms. The Native platform uses + * aklomp/base64 which enforces strict padding natively. + */ +object Base64Validation { + + /** True when `c` belongs to the standard base64 alphabet or is the '=' padding character. */ + private def isBase64Char(c: Char): Boolean = + (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || + c == '+' || c == '/' || c == '=' + + /** + * Validate strict RFC 4648 padding: reject input whose length is not a multiple of 4. + * + * java.util.Base64 is lenient and accepts unpadded input (e.g. "YQ" instead of "YQ=="), but the + * Jsonnet spec requires strict compliance. Both go-jsonnet and C++ jsonnet reject unpadded input: + * - go-jsonnet: checks `len(str) % 4 != 0` before calling base64.StdEncoding.DecodeString + * - C++ jsonnet: rejects with "Not a base64 encoded string" + * + * The length check only fires when every character is a base64 character (alphabet or '='). + * Anything else (non-ASCII such as "ĀQ=", or ASCII such as "a!b") can never be valid base64, so it + * is left for java.util.Base64 to reject with its more specific "Illegal base64 character" + * message. The Native error path (PlatformBase64.throwDecodeError) also reports an illegal + * character in preference to the padding message, so the platforms report the same kind of error. + * + * @param input + * the candidate base64 text; the empty string is accepted + * @throws IllegalArgumentException + * if input is non-empty, consists only of base64 characters, and its length is not a + * multiple of 4 
+ */ + def requireStrictPadding(input: String): Unit = { + val len = input.length + if (len > 0 && len % 4 != 0) { + var onlyBase64 = true + var i = 0 + while (i < len && onlyBase64) { + if (!isBase64Char(input.charAt(i))) onlyBase64 = false + i += 1 + } + if (onlyBase64) { + throw new IllegalArgumentException( + "Last unit does not have enough valid bits" + ) + } + } + } +} diff --git a/sjsonnet/src/sjsonnet/stdlib/EncodingModule.scala b/sjsonnet/src/sjsonnet/stdlib/EncodingModule.scala index 8fa00f73b..049c6f950 100644 --- a/sjsonnet/src/sjsonnet/stdlib/EncodingModule.scala +++ b/sjsonnet/src/sjsonnet/stdlib/EncodingModule.scala @@ -3,8 +3,6 @@ package sjsonnet.stdlib import sjsonnet._ import sjsonnet.functions.AbstractFunctionModule -import java.util.Base64 - object EncodingModule extends AbstractFunctionModule { def name = "encoding" @@ -15,17 +13,21 @@ object EncodingModule extends AbstractFunctionModule { val functions: Seq[(String, Val.Func)] = Seq( builtin(MD5), - builtin("base64", "input") { (_, _, input: Val) => - input match { + builtin("base64", "input") { (pos, _, input: Val) => + (input match { case Val.Str(_, value) => - Base64.getEncoder.encodeToString(value.getBytes("UTF-8")) + Val.Str.asciiSafe(pos, PlatformBase64.encodeToString(value.getBytes("UTF-8"))) + case ba: Val.ByteArr => + Val.Str.asciiSafe(pos, PlatformBase64.encodeToString(ba.rawBytes)) case arr: Val.Arr => val byteArr = new Array[Byte](arr.length) var i = 0 while (i < arr.length) { val v = arr.value(i) if (!v.isInstanceOf[Val.Num]) { - Error.fail(f"Expected an array of numbers, got a ${v.prettyName} at position $i") + Error.fail( + f"Expected an array of numbers, got a ${v.prettyName} at position $i" + ) } val vInt = v.asInt if (vInt < 0 || vInt > 255) { @@ -36,13 +38,13 @@ object EncodingModule extends AbstractFunctionModule { byteArr(i) = vInt.toByte i += 1 } - Base64.getEncoder.encodeToString(byteArr) + Val.Str.asciiSafe(pos, PlatformBase64.encodeToString(byteArr)) case x => Error.fail("Cannot 
base64 encode " + x.prettyName) - } + }): Val }, builtin("base64Decode", "str") { (_, _, str: String) => try { - new String(Base64.getDecoder.decode(str)) + new String(PlatformBase64.decode(str), "UTF-8") } catch { case e: IllegalArgumentException => Error.fail("Invalid base64 string: " + e.getMessage) @@ -50,14 +52,7 @@ object EncodingModule extends AbstractFunctionModule { }, builtin("base64DecodeBytes", "str") { (pos, _, str: String) => try { - val decoded = Base64.getDecoder.decode(str) - val result = new Array[Eval](decoded.length) - var i = 0 - while (i < decoded.length) { - result(i) = Val.cachedNum(pos, (decoded(i) & 0xff).toDouble) - i += 1 - } - Val.Arr(pos, result) + Val.Arr.fromBytes(pos, PlatformBase64.decode(str)) } catch { case e: IllegalArgumentException => Error.fail("Invalid base64 string: " + e.getMessage) diff --git a/sjsonnet/test/resources/new_test_suite/base64_byte_array_comprehensive.jsonnet b/sjsonnet/test/resources/new_test_suite/base64_byte_array_comprehensive.jsonnet new file mode 100644 index 000000000..4a988db57 --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/base64_byte_array_comprehensive.jsonnet @@ -0,0 +1,112 @@ +// Byte array base64 encode/decode comprehensive tests. 
+// Ported from: +// - aklomp/base64 test_char_table: all 256 byte values, sliding window +// - Additional boundary byte value coverage vectors + +// ================================================================ +// All 256 byte values — encode and roundtrip +// (ported from aklomp/base64 test_char_table offset=0) +// ================================================================ +local all256 = std.makeArray(256, function(i) i); +local encoded256 = std.base64(all256); +local decoded256 = std.base64DecodeBytes(encoded256); +std.assertEqual(std.length(decoded256), 256) && +std.all(std.makeArray(256, function(i) decoded256[i] == i)) && + +// ================================================================ +// Sliding window byte coverage +// (ported from aklomp/base64 test_char_table: loop from offset 0..255) +// Only a sample of starting offsets is tested here, so that byte values land at +// different alignment positions within SIMD processing units. +// ================================================================ + +// Offset 0: full 256 bytes [0,1,2,...,255] +local sw0 = std.makeArray(256, function(i) i % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw0)), sw0) && + +// Offset 1: [1,2,...,255,0] +local sw1 = std.makeArray(256, function(i) (i + 1) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw1)), sw1) && + +// Offset 3: [3,4,...,255,0,1,2] +local sw3 = std.makeArray(256, function(i) (i + 3) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw3)), sw3) && + +// Offset 7 (non-power-of-2 alignment) +local sw7 = std.makeArray(256, function(i) (i + 7) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw7)), sw7) && + +// Offset 15 (SSSE3-1 boundary) +local sw15 = std.makeArray(256, function(i) (i + 15) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw15)), sw15) && + +// Offset 31 (AVX2-1 boundary) +local sw31 = std.makeArray(256, function(i) (i + 31) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw31)), sw31) && + 
+// Offset 63 (NEON/AVX512-1 boundary) +local sw63 = std.makeArray(256, function(i) (i + 63) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw63)), sw63) && + +// Offset 127 (half-way) +local sw127 = std.makeArray(256, function(i) (i + 127) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw127)), sw127) && + +// Offset 128 +local sw128 = std.makeArray(256, function(i) (i + 128) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw128)), sw128) && + +// Offset 200 +local sw200 = std.makeArray(256, function(i) (i + 200) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw200)), sw200) && + +// Offset 255 +local sw255 = std.makeArray(256, function(i) (i + 255) % 256); +std.assertEqual(std.base64DecodeBytes(std.base64(sw255)), sw255) && + +// ================================================================ +// Shortened arrays (variable length, ported from aklomp/base64) +// Tests that shorter-than-256 arrays at each offset also roundtrip +// ================================================================ + +// 1 byte at each interesting offset +std.assertEqual(std.base64DecodeBytes(std.base64([0])), [0]) && +std.assertEqual(std.base64DecodeBytes(std.base64([1])), [1]) && +std.assertEqual(std.base64DecodeBytes(std.base64([127])), [127]) && +std.assertEqual(std.base64DecodeBytes(std.base64([128])), [128]) && +std.assertEqual(std.base64DecodeBytes(std.base64([254])), [254]) && +std.assertEqual(std.base64DecodeBytes(std.base64([255])), [255]) && + +// 2 bytes (boundary vectors) +std.assertEqual(std.base64DecodeBytes(std.base64([0, 0])), [0, 0]) && +std.assertEqual(std.base64DecodeBytes(std.base64([0, 1])), [0, 1]) && +std.assertEqual(std.base64DecodeBytes(std.base64([254, 255])), [254, 255]) && +std.assertEqual(std.base64DecodeBytes(std.base64([255, 254])), [255, 254]) && + +// 3 bytes (no padding case) +std.assertEqual(std.base64DecodeBytes(std.base64([0, 0, 0])), [0, 0, 0]) && +std.assertEqual(std.base64DecodeBytes(std.base64([255, 
255, 255])), [255, 255, 255]) && + +// 5 bytes (multi-byte boundary vector) +std.assertEqual(std.base64DecodeBytes(std.base64([0, 1, 128, 254, 255])), + [0, 1, 128, 254, 255]) && + +// ================================================================ +// Large byte array roundtrip (1000 elements) +// ================================================================ +local large1000 = std.makeArray(1000, function(i) i % 256); +local encodedLarge = std.base64(large1000); +local decodedLarge = std.base64DecodeBytes(encodedLarge); +std.assertEqual(std.length(decodedLarge), 1000) && +std.all(std.makeArray(1000, function(i) decodedLarge[i] == i % 256)) && + +// ================================================================ +// Large byte array roundtrip (4096 elements) +// ================================================================ +local large4096 = std.makeArray(4096, function(i) (i * 3 + 17) % 256); +local encoded4096 = std.base64(large4096); +local decoded4096 = std.base64DecodeBytes(encoded4096); +std.assertEqual(std.length(decoded4096), 4096) && +std.all(std.makeArray(4096, function(i) decoded4096[i] == (i * 3 + 17) % 256)) && + +true diff --git a/sjsonnet/test/resources/new_test_suite/base64_byte_array_comprehensive.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/base64_byte_array_comprehensive.jsonnet.golden new file mode 100644 index 000000000..27ba77dda --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/base64_byte_array_comprehensive.jsonnet.golden @@ -0,0 +1 @@ +true diff --git a/sjsonnet/test/resources/new_test_suite/base64_comprehensive.jsonnet b/sjsonnet/test/resources/new_test_suite/base64_comprehensive.jsonnet new file mode 100644 index 000000000..2c729d72f --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/base64_comprehensive.jsonnet @@ -0,0 +1,197 @@ +// Comprehensive base64 tests ported from: +// - RFC 4648 Section 10 (all 7 test vectors) +// - aklomp/base64 test suite (char table, Moby Dick roundtrip) +// - Additional 
boundary and byte-value coverage vectors +// +// Tests encode (std.base64), decode (std.base64Decode), and +// byte-level decode (std.base64DecodeBytes) for correctness. + +// ================================================================ +// RFC 4648 Section 10 — All official test vectors (encode) +// ================================================================ +std.assertEqual(std.base64(""), "") && +std.assertEqual(std.base64("f"), "Zg==") && +std.assertEqual(std.base64("fo"), "Zm8=") && +std.assertEqual(std.base64("foo"), "Zm9v") && +std.assertEqual(std.base64("foob"), "Zm9vYg==") && +std.assertEqual(std.base64("fooba"), "Zm9vYmE=") && +std.assertEqual(std.base64("foobar"), "Zm9vYmFy") && + +// ================================================================ +// RFC 4648 Section 10 — All official test vectors (decode) +// ================================================================ +std.assertEqual(std.base64Decode(""), "") && +std.assertEqual(std.base64Decode("Zg=="), "f") && +std.assertEqual(std.base64Decode("Zm8="), "fo") && +std.assertEqual(std.base64Decode("Zm9v"), "foo") && +std.assertEqual(std.base64Decode("Zm9vYg=="), "foob") && +std.assertEqual(std.base64Decode("Zm9vYmE="), "fooba") && +std.assertEqual(std.base64Decode("Zm9vYmFy"), "foobar") && + +// ================================================================ +// Padding variants +// ================================================================ +// Input length mod 3 == 0 → no padding +std.assertEqual(std.base64("abc"), "YWJj") && +// Input length mod 3 == 2 → one '=' +std.assertEqual(std.base64("ab"), "YWI=") && +// Input length mod 3 == 1 → two '=' +std.assertEqual(std.base64("a"), "YQ==") && + +// ================================================================ +// Boundary byte value vectors (byte array encode) +// ================================================================ +std.assertEqual(std.base64([0]), "AA==") && +std.assertEqual(std.base64([1]), "AQ==") && 
+std.assertEqual(std.base64([128]), "gA==") && +std.assertEqual(std.base64([255]), "/w==") && +std.assertEqual(std.base64([0, 1]), "AAE=") && +std.assertEqual(std.base64([254, 255]), "/v8=") && +std.assertEqual(std.base64([0, 1, 128, 254, 255]), "AAGA/v8=") && + +// ================================================================ +// Boundary byte value vectors (byte array decode) +// ================================================================ +std.assertEqual(std.base64DecodeBytes("AA=="), [0]) && +std.assertEqual(std.base64DecodeBytes("AQ=="), [1]) && +std.assertEqual(std.base64DecodeBytes("gA=="), [128]) && +std.assertEqual(std.base64DecodeBytes("/w=="), [255]) && +std.assertEqual(std.base64DecodeBytes("AAE="), [0, 1]) && +std.assertEqual(std.base64DecodeBytes("/v8="), [254, 255]) && +std.assertEqual(std.base64DecodeBytes("AAGA/v8="), [0, 1, 128, 254, 255]) && + +// ================================================================ +// Known string encode/decode pairs +// ================================================================ +std.assertEqual(std.base64("hello"), "aGVsbG8=") && +std.assertEqual(std.base64("Hello, World!"), "SGVsbG8sIFdvcmxkIQ==") && +std.assertEqual(std.base64("The quick brown fox jumps over the lazy dog"), + "VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wcyBvdmVyIHRoZSBsYXp5IGRvZw==") && +std.assertEqual(std.base64Decode("aGVsbG8="), "hello") && +std.assertEqual(std.base64Decode("SGVsbG8sIFdvcmxkIQ=="), "Hello, World!") && + +// ================================================================ +// Known byte array encode/decode pairs +// ================================================================ +std.assertEqual(std.base64([0, 1, 2, 3]), "AAECAw==") && +std.assertEqual(std.base64([104, 101, 108, 108, 111]), "aGVsbG8=") && +std.assertEqual(std.base64DecodeBytes("AAECAw=="), [0, 1, 2, 3]) && +std.assertEqual(std.base64DecodeBytes("aGVsbG8="), [104, 101, 108, 108, 111]) && + +// ================================================================ 
+// Base64 alphabet completeness — every char in A-Za-z0-9+/ appears +// ================================================================ +local fullAlphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +local decoded = std.base64DecodeBytes(fullAlphabet); +local reEncoded = std.base64(decoded); +std.assertEqual(reEncoded, fullAlphabet) && + +// ================================================================ +// Progressive padding tests — "Base64" spelled out incrementally +// Each prefix exercises a different padding scenario +// ================================================================ +std.assertEqual(std.base64("B"), "Qg==") && +std.assertEqual(std.base64Decode("Qg=="), "B") && +std.assertEqual(std.base64("Ba"), "QmE=") && +std.assertEqual(std.base64Decode("QmE="), "Ba") && +std.assertEqual(std.base64("Bas"), "QmFz") && +std.assertEqual(std.base64Decode("QmFz"), "Bas") && +std.assertEqual(std.base64("Base"), "QmFzZQ==") && +std.assertEqual(std.base64Decode("QmFzZQ=="), "Base") && +std.assertEqual(std.base64("Base6"), "QmFzZTY=") && +std.assertEqual(std.base64Decode("QmFzZTY="), "Base6") && +std.assertEqual(std.base64("Base64"), "QmFzZTY0") && +std.assertEqual(std.base64Decode("QmFzZTY0"), "Base64") && + +// Long text bidirectional (full "Base64 is..." 
paragraph) +local longText = + "Base64 is a group of similar binary-to-text encoding schemes that " + + "represent binary data in an ASCII string format by translating it " + + "into a radix-64 representation"; +local longTextB64 = + "QmFzZTY0IGlzIGEgZ3JvdXAgb2Ygc2ltaWxhciBiaW5hcnktdG8tdGV4dCBlbmNvZGluZyBzY2hlbWVzIHRoYXQg" + + "cmVwcmVzZW50IGJpbmFyeSBkYXRhIGluIGFuIEFTQ0lJIHN0cmluZyBmb3JtYXQgYnkgdHJhbnNsYXRpbmcgaXQg" + + "aW50byBhIHJhZGl4LTY0IHJlcHJlc2VudGF0aW9u"; +std.assertEqual(std.base64(longText), longTextB64) && +std.assertEqual(std.base64Decode(longTextB64), longText) && + +// Progressive substrings of longText — bidirectional +std.assertEqual(std.base64("Base64 is "), "QmFzZTY0IGlzIA==") && +std.assertEqual(std.base64Decode("QmFzZTY0IGlzIA=="), "Base64 is ") && + +// ================================================================ +// Multi-cycle roundtrip stability +// ================================================================ +local original = "The quick brown fox jumps over the lazy dog"; +local e1 = std.base64(original); +local d1 = std.base64Decode(e1); +local e2 = std.base64(d1); +local d2 = std.base64Decode(e2); +local e3 = std.base64(d2); +local d3 = std.base64Decode(e3); +std.assertEqual(d3, original) && +std.assertEqual(e1, e2) && +std.assertEqual(e2, e3) && + +// ================================================================ +// Unicode string roundtrip (UTF-8) +// ================================================================ +std.assertEqual(std.base64Decode(std.base64("café résumé naïve")), "café résumé naïve") && +std.assertEqual(std.base64Decode(std.base64("你好世界")), "你好世界") && +std.assertEqual(std.base64Decode(std.base64("日本語テスト")), "日本語テスト") && + +// ================================================================ +// String roundtrip for sizes 0..64 (covers SIMD boundaries) +// ================================================================ +local mkStr(len) = std.join("", std.makeArray(len, function(i) std.char(65 + (i % 26)))); 
+std.all(std.makeArray(65, function(n) + std.base64Decode(std.base64(mkStr(n))) == mkStr(n) +)) && + +// ================================================================ +// Moby Dick roundtrip (ported from aklomp/base64 test suite) +// ================================================================ +local mobyDickPlain = + "Call me Ishmael. Some years ago--never mind how long precisely--having\n" + + "little or no money in my purse, and nothing particular to interest me on\n" + + "shore, I thought I would sail about a little and see the watery part of\n" + + "the world. It is a way I have of driving off the spleen and regulating\n" + + "the circulation. Whenever I find myself growing grim about the mouth;\n" + + "whenever it is a damp, drizzly November in my soul; whenever I find\n" + + "myself involuntarily pausing before coffin warehouses, and bringing up\n" + + "the rear of every funeral I meet; and especially whenever my hypos get\n" + + "such an upper hand of me, that it requires a strong moral principle to\n" + + "prevent me from deliberately stepping into the street, and methodically\n" + + "knocking people's hats off--then, I account it high time to get to sea\n" + + "as soon as I can. This is my substitute for pistol and ball. With a\n" + + "philosophical flourish Cato throws himself upon his sword; I quietly\n" + + "take to the ship. There is nothing surprising in this. 
If they but knew\n" + + "it, almost all men in their degree, some time or other, cherish very\n" + + "nearly the same feelings towards the ocean with me.\n"; +local mobyDickBase64 = + "Q2FsbCBtZSBJc2htYWVsLiBTb21lIHllYXJzIGFnby0tbmV2ZXIgbWluZCBob3cgbG9uZ" + + "yBwcmVjaXNlbHktLWhhdmluZwpsaXR0bGUgb3Igbm8gbW9uZXkgaW4gbXkgcHVyc2UsIG" + + "FuZCBub3RoaW5nIHBhcnRpY3VsYXIgdG8gaW50ZXJlc3QgbWUgb24Kc2hvcmUsIEkgdGh" + + "vdWdodCBJIHdvdWxkIHNhaWwgYWJvdXQgYSBsaXR0bGUgYW5kIHNlZSB0aGUgd2F0ZXJ5" + + "IHBhcnQgb2YKdGhlIHdvcmxkLiBJdCBpcyBhIHdheSBJIGhhdmUgb2YgZHJpdmluZyBvZ" + + "mYgdGhlIHNwbGVlbiBhbmQgcmVndWxhdGluZwp0aGUgY2lyY3VsYXRpb24uIFdoZW5ldm" + + "VyIEkgZmluZCBteXNlbGYgZ3Jvd2luZyBncmltIGFib3V0IHRoZSBtb3V0aDsKd2hlbmV" + + "2ZXIgaXQgaXMgYSBkYW1wLCBkcml6emx5IE5vdmVtYmVyIGluIG15IHNvdWw7IHdoZW5l" + + "dmVyIEkgZmluZApteXNlbGYgaW52b2x1bnRhcmlseSBwYXVzaW5nIGJlZm9yZSBjb2Zma" + + "W4gd2FyZWhvdXNlcywgYW5kIGJyaW5naW5nIHVwCnRoZSByZWFyIG9mIGV2ZXJ5IGZ1bm" + + "VyYWwgSSBtZWV0OyBhbmQgZXNwZWNpYWxseSB3aGVuZXZlciBteSBoeXBvcyBnZXQKc3V" + + "jaCBhbiB1cHBlciBoYW5kIG9mIG1lLCB0aGF0IGl0IHJlcXVpcmVzIGEgc3Ryb25nIG1v" + + "cmFsIHByaW5jaXBsZSB0bwpwcmV2ZW50IG1lIGZyb20gZGVsaWJlcmF0ZWx5IHN0ZXBwa" + + "W5nIGludG8gdGhlIHN0cmVldCwgYW5kIG1ldGhvZGljYWxseQprbm9ja2luZyBwZW9wbG" + + "UncyBoYXRzIG9mZi0tdGhlbiwgSSBhY2NvdW50IGl0IGhpZ2ggdGltZSB0byBnZXQgdG8" + + "gc2VhCmFzIHNvb24gYXMgSSBjYW4uIFRoaXMgaXMgbXkgc3Vic3RpdHV0ZSBmb3IgcGlz" + + "dG9sIGFuZCBiYWxsLiBXaXRoIGEKcGhpbG9zb3BoaWNhbCBmbG91cmlzaCBDYXRvIHRoc" + + "m93cyBoaW1zZWxmIHVwb24gaGlzIHN3b3JkOyBJIHF1aWV0bHkKdGFrZSB0byB0aGUgc2" + + "hpcC4gVGhlcmUgaXMgbm90aGluZyBzdXJwcmlzaW5nIGluIHRoaXMuIElmIHRoZXkgYnV" + + "0IGtuZXcKaXQsIGFsbW9zdCBhbGwgbWVuIGluIHRoZWlyIGRlZ3JlZSwgc29tZSB0aW1l" + + "IG9yIG90aGVyLCBjaGVyaXNoIHZlcnkKbmVhcmx5IHRoZSBzYW1lIGZlZWxpbmdzIHRvd" + + "2FyZHMgdGhlIG9jZWFuIHdpdGggbWUuCg=="; +std.assertEqual(std.base64(mobyDickPlain), mobyDickBase64) && +std.assertEqual(std.base64Decode(mobyDickBase64), mobyDickPlain) && + +true diff --git 
a/sjsonnet/test/resources/new_test_suite/base64_comprehensive.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/base64_comprehensive.jsonnet.golden new file mode 100644 index 000000000..27ba77dda --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/base64_comprehensive.jsonnet.golden @@ -0,0 +1 @@ +true diff --git a/sjsonnet/test/resources/new_test_suite/base64_directional.jsonnet b/sjsonnet/test/resources/new_test_suite/base64_directional.jsonnet new file mode 100644 index 000000000..a19b21eba --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/base64_directional.jsonnet @@ -0,0 +1,150 @@ +// Bidirectional base64 tests — every test vector verifies BOTH directions: +// encode(input) == expected_base64 AND decode(expected_base64) == input +// +// This catches bugs where encode and decode are "consistently wrong" +// (i.e., roundtrip passes but both directions produce incorrect output). +// This was the class of bug in PR #749 where hand-written SIMD code +// produced wrong results on x86 that still roundtripped. +// +// All reference values pre-computed via java.util.Base64 reference impl. 
+ +// ================================================================ +// Helpers: assert both encode and decode match the reference +// ================================================================ +local assertBidi(plain, b64) = + std.assertEqual(std.base64(plain), b64) && + std.assertEqual(std.base64Decode(b64), plain); + +local assertBidiBytes(arr, b64) = + std.assertEqual(std.base64(arr), b64) && + std.assertEqual(std.base64DecodeBytes(b64), arr); + +// ================================================================ +// RFC 4648 Section 10 — All 7 official test vectors (bidirectional) +// ================================================================ +assertBidi("", "") && +assertBidi("f", "Zg==") && +assertBidi("fo", "Zm8=") && +assertBidi("foo", "Zm9v") && +assertBidi("foob", "Zm9vYg==") && +assertBidi("fooba", "Zm9vYmE=") && +assertBidi("foobar", "Zm9vYmFy") && + +// ================================================================ +// Wikipedia "pleasure" progression (bidirectional) +// ================================================================ +assertBidi("pleasure.", "cGxlYXN1cmUu") && +assertBidi("leasure.", "bGVhc3VyZS4=") && +assertBidi("easure.", "ZWFzdXJlLg==") && +assertBidi("asure.", "YXN1cmUu") && +assertBidi("sure.", "c3VyZS4=") && +assertBidi("ure.", "dXJlLg==") && +assertBidi("re.", "cmUu") && +assertBidi("e.", "ZS4=") && +assertBidi(".", "Lg==") && + +// ================================================================ +// Single/multi char strings (bidirectional) +// ================================================================ +assertBidi("A", "QQ==") && +assertBidi("AB", "QUI=") && +assertBidi("ABC", "QUJD") && +assertBidi("ABCD", "QUJDRA==") && +assertBidi("Man", "TWFu") && +assertBidi("Ma", "TWE=") && +assertBidi("M", "TQ==") && +assertBidi("hello", "aGVsbG8=") && +assertBidi("Hello, World!", "SGVsbG8sIFdvcmxkIQ==") && +assertBidi("The quick brown fox jumps over the lazy dog", + 
"VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wcyBvdmVyIHRoZSBsYXp5IGRvZw==") && + +// ================================================================ +// Single byte values — boundary values (bidirectional) +// ================================================================ +assertBidiBytes([0], "AA==") && +assertBidiBytes([1], "AQ==") && +assertBidiBytes([63], "Pw==") && +assertBidiBytes([64], "QA==") && +assertBidiBytes([127], "fw==") && +assertBidiBytes([128], "gA==") && +assertBidiBytes([191], "vw==") && +assertBidiBytes([192], "wA==") && +assertBidiBytes([254], "/g==") && +assertBidiBytes([255], "/w==") && + +// ================================================================ +// Two bytes — padding=1 cases (bidirectional) +// ================================================================ +assertBidiBytes([0, 0], "AAA=") && +assertBidiBytes([0, 1], "AAE=") && +assertBidiBytes([0, 255], "AP8=") && +assertBidiBytes([255, 0], "/wA=") && +assertBidiBytes([255, 255], "//8=") && +assertBidiBytes([128, 128], "gIA=") && + +// ================================================================ +// Three bytes — no padding (bidirectional) +// ================================================================ +assertBidiBytes([0, 0, 0], "AAAA") && +assertBidiBytes([0, 0, 1], "AAAB") && +assertBidiBytes([0, 0, 63], "AAA/") && +assertBidiBytes([255, 255, 255], "////") && +assertBidiBytes([0, 16, 131], "ABCD") && +assertBidiBytes([77, 97, 110], "TWFu") && + +// ================================================================ +// Four, five, six bytes (bidirectional) +// ================================================================ +assertBidiBytes([0, 0, 0, 0], "AAAAAA==") && +assertBidiBytes([255, 255, 255, 255], "/////w==") && +assertBidiBytes([0, 0, 0, 0, 0], "AAAAAAA=") && +assertBidiBytes([0, 0, 0, 0, 0, 0], "AAAAAAAA") && +assertBidiBytes([255, 255, 255, 255, 255, 255], "////////") && + +// ================================================================ +// Sequential byte 
patterns (bidirectional) +// ================================================================ +assertBidiBytes([0, 1, 2], "AAEC") && +assertBidiBytes([0, 1, 2, 3], "AAECAw==") && +assertBidiBytes([0, 1, 2, 3, 4], "AAECAwQ=") && +assertBidiBytes([0, 1, 2, 3, 4, 5], "AAECAwQF") && + +// Powers of 2 boundary values +assertBidiBytes([1, 2, 4, 8, 16, 32, 64, 128], "AQIECBAgQIA=") && + +// Vectors that exercise '+' and '/' in output +assertBidiBytes([251, 239, 190], "++++") && +assertBidiBytes([251, 239], "++8=") && +assertBidiBytes([104, 101, 108, 108, 111], "aGVsbG8=") && +assertBidiBytes([0, 1, 128, 254, 255], "AAGA/v8=") && + +// ================================================================ +// SIMD boundary sizes — exact pre-computed values (bidirectional) +// ================================================================ + +// 12 bytes — SSSE3 encode unit boundary +assertBidiBytes(std.makeArray(12, function(i) i), + "AAECAwQFBgcICQoL") && + +// 24 bytes — AVX2 encode unit boundary +assertBidiBytes(std.makeArray(24, function(i) i), + "AAECAwQFBgcICQoLDA0ODxAREhMUFRYX") && + +// 48 bytes — NEON / AVX-512 encode unit boundary +assertBidiBytes(std.makeArray(48, function(i) i), + "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4v") && + +// ================================================================ +// Verify standard base64 alphabet (not URL-safe) +// '+' and '/' must appear, not '-' and '_' +// ================================================================ +local testPlus = std.base64([251, 239]); +local testSlash = std.base64([255]); +std.assertEqual(testPlus, "++8=") && +std.assertEqual(testSlash, "/w==") && +std.assertEqual(std.length(std.findSubstr("+", testPlus)) > 0, true) && +std.assertEqual(std.length(std.findSubstr("/", testSlash)) > 0, true) && +std.assertEqual(std.length(std.findSubstr("-", testPlus)), 0) && +std.assertEqual(std.length(std.findSubstr("_", testSlash)), 0) && + +true diff --git 
a/sjsonnet/test/resources/new_test_suite/base64_directional.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/base64_directional.jsonnet.golden new file mode 100644 index 000000000..27ba77dda --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/base64_directional.jsonnet.golden @@ -0,0 +1 @@ +true diff --git a/sjsonnet/test/resources/new_test_suite/base64_simd_boundaries.jsonnet b/sjsonnet/test/resources/new_test_suite/base64_simd_boundaries.jsonnet new file mode 100644 index 000000000..2994bcbc4 --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/base64_simd_boundaries.jsonnet @@ -0,0 +1,97 @@ +// SIMD boundary size tests for base64 encode/decode. +// Ported from aklomp/base64 test suite (char table tests at various alignments) +// and additional deterministic byte patterns at SIMD unit sizes. +// +// Tests sizes that hit exact SIMD processing unit boundaries: +// SSSE3 encode: 12 bytes (→ 16 chars) +// SSSE3 decode: 16 chars (→ 12 bytes) +// AVX2 encode: 24 bytes (→ 32 chars) +// AVX2 decode: 32 chars (→ 24 bytes) +// NEON/AVX-512 encode: 48 bytes (→ 64 chars) +// NEON/AVX-512 decode: 64 chars (→ 48 bytes) + +// Helper: create a deterministic byte array of given length +local mkArr(len) = std.makeArray(len, function(i) (i * 7 + 13) % 256); + +// Helper: verify byte array roundtrip +local verifyRoundtrip(size) = + local arr = mkArr(size); + local encoded = std.base64(arr); + local decoded = std.base64DecodeBytes(encoded); + std.assertEqual(std.length(decoded), size) && + std.all(std.makeArray(size, function(i) decoded[i] == arr[i])); + +// ================================================================ +// SSSE3 boundaries (12-byte encode unit, 16-byte decode unit) +// ================================================================ +verifyRoundtrip(11) && +verifyRoundtrip(12) && +verifyRoundtrip(13) && +verifyRoundtrip(15) && +verifyRoundtrip(16) && +verifyRoundtrip(17) && + +// ================================================================ 
+// AVX2 boundaries (24-byte encode unit, 32-byte decode unit) +// ================================================================ +verifyRoundtrip(23) && +verifyRoundtrip(24) && +verifyRoundtrip(25) && +verifyRoundtrip(31) && +verifyRoundtrip(32) && +verifyRoundtrip(33) && + +// ================================================================ +// NEON / AVX-512 boundaries (48/64 bytes) +// ================================================================ +verifyRoundtrip(47) && +verifyRoundtrip(48) && +verifyRoundtrip(49) && +verifyRoundtrip(63) && +verifyRoundtrip(64) && +verifyRoundtrip(65) && + +// ================================================================ +// Multi-block sizes (multiple SIMD iterations) +// ================================================================ +verifyRoundtrip(96) && // 2x NEON encode +verifyRoundtrip(128) && // 2x NEON decode +verifyRoundtrip(192) && // 4x NEON encode +verifyRoundtrip(255) && // Max byte value as size +verifyRoundtrip(256) && // All byte values + +// ================================================================ +// Special byte patterns at SIMD boundaries +// ================================================================ + +// All-zeros at SIMD boundary +local zeros48 = std.makeArray(48, function(i) 0); +local zeros48Encoded = std.base64(zeros48); +local zeros48Decoded = std.base64DecodeBytes(zeros48Encoded); +std.assertEqual(std.length(zeros48Decoded), 48) && +std.all(std.makeArray(48, function(i) zeros48Decoded[i] == 0)) && + +// All-0xFF at SIMD boundary +local ff64 = std.makeArray(64, function(i) 255); +local ff64Encoded = std.base64(ff64); +local ff64Decoded = std.base64DecodeBytes(ff64Encoded); +std.assertEqual(std.length(ff64Decoded), 64) && +std.all(std.makeArray(64, function(i) ff64Decoded[i] == 255)) && + +// Alternating 0x00/0xFF at AVX2 boundary +local alt32 = std.makeArray(32, function(i) if i % 2 == 0 then 0 else 255); +local alt32Encoded = std.base64(alt32); +local alt32Decoded = 
std.base64DecodeBytes(alt32Encoded); +std.assertEqual(std.length(alt32Decoded), 32) && +std.all(std.makeArray(32, function(i) + alt32Decoded[i] == (if i % 2 == 0 then 0 else 255) +)) && + +// Sequential 0..47 at NEON boundary +local seq48 = std.makeArray(48, function(i) i); +local seq48Encoded = std.base64(seq48); +local seq48Decoded = std.base64DecodeBytes(seq48Encoded); +std.assertEqual(std.length(seq48Decoded), 48) && +std.all(std.makeArray(48, function(i) seq48Decoded[i] == i)) && + +true diff --git a/sjsonnet/test/resources/new_test_suite/base64_simd_boundaries.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/base64_simd_boundaries.jsonnet.golden new file mode 100644 index 000000000..27ba77dda --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/base64_simd_boundaries.jsonnet.golden @@ -0,0 +1 @@ +true diff --git a/sjsonnet/test/resources/new_test_suite/byte_arr_correctness.jsonnet b/sjsonnet/test/resources/new_test_suite/byte_arr_correctness.jsonnet new file mode 100644 index 000000000..271d87ed0 --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/byte_arr_correctness.jsonnet @@ -0,0 +1,67 @@ +// Regression tests for ByteArr subclass correctness. +// Covers multi-use, reverse, concat, and round-trip scenarios to ensure +// byte-backed arrays behave identically to normal arrays. +local decoded = std.base64DecodeBytes("AQIDBAUG"); // [1,2,3,4,5,6] +local empty = std.base64DecodeBytes(""); // [] + +// 1. Multiple consumption of the same ByteArr +local enc1 = std.base64(decoded); +local enc2 = std.base64(decoded); + +// 2. Reverse of ByteArr +local rev = std.reverse(decoded); + +// 3. Double-reverse of ByteArr (should equal original) +local rev2 = std.reverse(std.reverse(decoded)); + +// 4. Concat then re-encode +local extended = decoded + [7, 8]; +local extEnc = std.base64(extended); + +// 5. 
ByteArr after concat (materialization path) — original still usable +local afterConcat = decoded + [99]; +local encAfterConcat = assert afterConcat == [1, 2, 3, 4, 5, 6, 99]; std.base64(decoded); // locals are lazy: the assert forces the concat before decoded is re-encoded + +// 6. Element access on ByteArr +local elem0 = decoded[0]; +local elem5 = decoded[5]; + +// 7. Length +local len = std.length(decoded); + +// 8. Slice +local sliced = decoded[1:4]; + +// 9. std.map over ByteArr +local mapped = std.map(function(x) x * 2, decoded); + +// 10. Empty ByteArr +local emptyEnc = std.base64(empty); +local emptyRev = std.reverse(empty); +local emptyLen = std.length(empty); + +// 11. Round-trip: decode -> encode -> decode -> encode +local rt = std.base64(std.base64DecodeBytes(std.base64(decoded))); + +// 12. Reverse then encode (reversed bytes should encode differently) +local revEnc = std.base64(std.reverse(std.base64DecodeBytes("AQID"))); + +// Assertions +std.assertEqual(enc1, "AQIDBAUG") && +std.assertEqual(enc2, "AQIDBAUG") && +std.assertEqual(rev, [6, 5, 4, 3, 2, 1]) && +std.assertEqual(rev2, [1, 2, 3, 4, 5, 6]) && +std.assertEqual(std.base64(rev), std.base64([6, 5, 4, 3, 2, 1])) && +std.assertEqual(extEnc, std.base64([1, 2, 3, 4, 5, 6, 7, 8])) && +std.assertEqual(encAfterConcat, "AQIDBAUG") && +std.assertEqual(elem0, 1) && +std.assertEqual(elem5, 6) && +std.assertEqual(len, 6) && +std.assertEqual(sliced, [2, 3, 4]) && +std.assertEqual(mapped, [2, 4, 6, 8, 10, 12]) && +std.assertEqual(emptyEnc, "") && +std.assertEqual(emptyRev, []) && +std.assertEqual(emptyLen, 0) && +std.assertEqual(rt, "AQIDBAUG") && +std.assertEqual(revEnc, "AwIB") && +true diff --git a/sjsonnet/test/resources/new_test_suite/byte_arr_correctness.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/byte_arr_correctness.jsonnet.golden new file mode 100644 index 000000000..27ba77dda --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/byte_arr_correctness.jsonnet.golden @@ -0,0 +1 @@ +true diff --git a/sjsonnet/test/resources/new_test_suite/range_arr_correctness.jsonnet
b/sjsonnet/test/resources/new_test_suite/range_arr_correctness.jsonnet new file mode 100644 index 000000000..47c5c9c82 --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/range_arr_correctness.jsonnet @@ -0,0 +1,61 @@ +// Regression tests for RangeArr subclass correctness. +// Covers multi-use, reverse, concat, and element access scenarios. +local r = std.range(0, 9); // [0,1,2,...,9] + +// 1. Multiple consumption +local arr1 = [x for x in r]; +local arr2 = [x for x in r]; + +// 2. Reverse +local rev = std.reverse(r); + +// 3. Double-reverse +local rev2 = std.reverse(std.reverse(r)); + +// 4. Concat +local extended = r + [10, 11]; + +// 5. Original still usable after concat +local afterConcat = r + [99]; +local arr3 = assert afterConcat == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 99]; [x for x in r]; // locals are lazy: the assert forces the concat before r is re-read + +// 6. Element access +local first = r[0]; +local last = r[9]; + +// 7. Slice +local sliced = r[2:5]; + +// 8. Length +local len = std.length(r); + +// 9. Reversed element access +local revFirst = rev[0]; +local revLast = rev[9]; + +// 10. Large range +local big = std.range(0, 999); +local bigRev = std.reverse(big); +local bigFirst = big[0]; +local bigLast = big[999]; +local bigRevFirst = bigRev[0]; +local bigRevLast = bigRev[999]; + +std.assertEqual(arr1, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) && +std.assertEqual(arr2, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) && +std.assertEqual(arr3, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) && +std.assertEqual(rev, [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) && +std.assertEqual(rev2, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) && +std.assertEqual(extended, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) && +std.assertEqual(first, 0) && +std.assertEqual(last, 9) && +std.assertEqual(sliced, [2, 3, 4]) && +std.assertEqual(len, 10) && +std.assertEqual(revFirst, 9) && +std.assertEqual(revLast, 0) && +std.assertEqual(bigFirst, 0) && +std.assertEqual(bigLast, 999) && +std.assertEqual(bigRevFirst, 999) && +std.assertEqual(bigRevLast, 0) && +std.assertEqual(std.length(big), 1000) && +true diff --git
a/sjsonnet/test/resources/new_test_suite/range_arr_correctness.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/range_arr_correctness.jsonnet.golden new file mode 100644 index 000000000..27ba77dda --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/range_arr_correctness.jsonnet.golden @@ -0,0 +1 @@ +true diff --git a/sjsonnet/test/src/sjsonnet/Base64Tests.scala b/sjsonnet/test/src/sjsonnet/Base64Tests.scala new file mode 100644 index 000000000..e641d4c16 --- /dev/null +++ b/sjsonnet/test/src/sjsonnet/Base64Tests.scala @@ -0,0 +1,543 @@ +package sjsonnet + +import utest._ + +/** + * Comprehensive Base64 tests covering: + * - RFC 4648 Section 10 test vectors (all 7) + * - Padding variants (0, 1, 2 padding chars) + * - SIMD boundary sizes (12, 16, 24, 32, 48, 64 bytes) + * - All 256 byte values coverage + * - Roundtrip encode/decode for various sizes + * - Invalid input handling + * - Byte array encode/decode + * - Large input stress tests + * + * Ported from aklomp/base64 test suite (test_char_table, roundtrip tests) + * and RFC 4648 Section 10. 
+ */
+object Base64Tests extends TestSuite {
+
+  def eval(s: String) = TestUtils.eval(s) // alias for TestUtils.eval; results below are read via .str/.arr or compared to ujson.True
+  def evalErr(s: String) = TestUtils.evalErr(s) // alias for TestUtils.evalErr; presumably returns the error message text (checked with .contains below)
+
+  val tests = Tests {
+
+    // ================================================================
+    // RFC 4648 Section 10 — All official test vectors
+    // ================================================================
+    test("rfc4648") {
+      test("empty") {
+        val r = eval("""std.base64("")""")
+        assert(r.str == "")
+      }
+      test("f") {
+        val r = eval("""std.base64("f")""")
+        assert(r.str == "Zg==")
+      }
+      test("fo") {
+        val r = eval("""std.base64("fo")""")
+        assert(r.str == "Zm8=")
+      }
+      test("foo") {
+        val r = eval("""std.base64("foo")""")
+        assert(r.str == "Zm9v")
+      }
+      test("foob") {
+        val r = eval("""std.base64("foob")""")
+        assert(r.str == "Zm9vYg==")
+      }
+      test("fooba") {
+        val r = eval("""std.base64("fooba")""")
+        assert(r.str == "Zm9vYmE=")
+      }
+      test("foobar") {
+        val r = eval("""std.base64("foobar")""")
+        assert(r.str == "Zm9vYmFy")
+      }
+    }
+
+    // ================================================================
+    // RFC 4648 Section 10 — Decode direction
+    // ================================================================
+    test("rfc4648Decode") {
+      test("empty") {
+        val r = eval("""std.base64Decode("")""")
+        assert(r.str == "")
+      }
+      test("Zg") {
+        val r = eval("""std.base64Decode("Zg==")""")
+        assert(r.str == "f")
+      }
+      test("Zm8") {
+        val r = eval("""std.base64Decode("Zm8=")""")
+        assert(r.str == "fo")
+      }
+      test("Zm9v") {
+        val r = eval("""std.base64Decode("Zm9v")""")
+        assert(r.str == "foo")
+      }
+      test("Zm9vYg") {
+        val r = eval("""std.base64Decode("Zm9vYg==")""")
+        assert(r.str == "foob")
+      }
+      test("Zm9vYmE") {
+        val r = eval("""std.base64Decode("Zm9vYmE=")""")
+        assert(r.str == "fooba")
+      }
+      test("Zm9vYmFy") {
+        val r = eval("""std.base64Decode("Zm9vYmFy")""")
+        assert(r.str == "foobar")
+      }
+    }
+
+    // ================================================================
+    // Padding variants
+    // ================================================================
+    test("padding") {
+      test("noPadding") {
+        // Input length mod 3 == 0 → no padding
+        val r = eval("""std.base64("abc")""")
+        assert(r.str == "YWJj")
+      }
+      test("onePad") {
+        // Input length mod 3 == 2 → one '='
+        val r = eval("""std.base64("ab")""")
+        assert(r.str == "YWI=")
+      }
+      test("twoPads") {
+        // Input length mod 3 == 1 → two '='
+        val r = eval("""std.base64("a")""")
+        assert(r.str == "YQ==")
+      }
+    }
+
+    // ================================================================
+    // String roundtrip tests for various sizes
+    // (covers SIMD processing boundaries)
+    // ================================================================
+    test("roundtripSizes") {
+      // Build a deterministic ASCII string of `len` chars cycling through 'A'..'Z'
+      def makeString(len: Int): String = {
+        val sb = new StringBuilder(len)
+        var i = 0
+        while (i < len) {
+          sb.append(('A' + (i % 26)).toChar)
+          i += 1
+        }
+        sb.toString()
+      }
+
+      val sizes = Seq(
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+        12, // SSSE3 encode unit
+        13, 14, 15,
+        16, // SSSE3 decode unit
+        17, 23,
+        24, // AVX2 encode unit
+        25, 31,
+        32, // AVX2 decode unit
+        33, 47,
+        48, // NEON encode unit / AVX-512 encode unit
+        49, 63,
+        64, // NEON decode unit / AVX-512 decode unit
+        100, 128, 255, 256, 500, 1000, 4096
+      )
+
+      for (size <- sizes) {
+        val s = makeString(size)
+        val escaped = s.replace("\\", "\\\\").replace("\"", "\\\"") // no-op for makeString's A-Z output; escapes \ and " for the Jsonnet literal
+        val result =
+          eval(s"""std.base64Decode(std.base64("$escaped"))""")
+        assert(result.str == s)
+      }
+    }
+
+    // ================================================================
+    // Byte array roundtrip (ported from aklomp/base64 test_char_table)
+    // All 256 byte values
+    // ================================================================
+    test("byteValueCoverage") {
+      // Encode array [0, 1, 2, ..., 255] and decode back
+      val r = eval(
+        """local arr = std.makeArray(256, function(i) i);
+          |local encoded = std.base64(arr);
+          |local decoded = std.base64DecodeBytes(encoded);
+          |std.assertEqual(std.length(decoded), 256) &&
+          |std.all(std.makeArray(256, function(i) decoded[i] == i))
+          |""".stripMargin
+      )
+      assert(r == ujson.True)
+    }
+
+    // ================================================================
+    // Sliding window byte coverage
+    // (ported from aklomp/base64 test_char_table: loop from offset 0..255)
+    // ================================================================
+    test("slidingWindowBytes") {
+      // For each sampled starting offset, encode the bytes offset..255 and roundtrip
+      val offsets = Seq(0, 1, 2, 3, 7, 15, 31, 63, 127, 128, 200, 254, 255)
+      for (offset <- offsets) {
+        val len = 256 - offset
+        val r = eval(
+          s"""local arr = std.makeArray($len, function(i) (i + $offset) % 256);
+             |local encoded = std.base64(arr);
+             |local decoded = std.base64DecodeBytes(encoded);
+             |std.assertEqual(std.length(decoded), $len) &&
+             |std.all(std.makeArray($len, function(i) decoded[i] == (i + $offset) % 256))
+             |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+    }
+
+    // ================================================================
+    // Special byte patterns
+    // ================================================================
+    test("specialPatterns") {
+      test("allZeros") {
+        val r = eval(
+          """local arr = std.makeArray(64, function(i) 0);
+            |local encoded = std.base64(arr);
+            |local decoded = std.base64DecodeBytes(encoded);
+            |std.assertEqual(std.length(decoded), 64) &&
+            |std.all(std.makeArray(64, function(i) decoded[i] == 0))
+            |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+      test("allFF") {
+        val r = eval(
+          """local arr = std.makeArray(64, function(i) 255);
+            |local encoded = std.base64(arr);
+            |local decoded = std.base64DecodeBytes(encoded);
+            |std.assertEqual(std.length(decoded), 64) &&
+            |std.all(std.makeArray(64, function(i) decoded[i] == 255))
+            |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+      test("alternating") {
+        val r = eval(
+          """local arr = std.makeArray(64, function(i) if i % 2 == 0 then 0 else 255);
+            |local encoded = std.base64(arr);
+            |local decoded = std.base64DecodeBytes(encoded);
+            |std.assertEqual(std.length(decoded), 64) &&
+            |std.all(std.makeArray(64, function(i) decoded[i] == (if i % 2 == 0 then 0 else 255)))
+            |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+    }
+
+    // ================================================================
+    // Base64 alphabet completeness
+    // ================================================================
+    test("alphabetCompleteness") {
+      // The full base64 alphabet: A-Za-z0-9+/
+      // Decode a string containing every base64 character once, re-encode it, and compare
+      val r = eval(
+        """local fullAlphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+          |local decoded = std.base64DecodeBytes(fullAlphabet);
+          |local reEncoded = std.base64(decoded);
+          |reEncoded == fullAlphabet
+          |""".stripMargin
+      )
+      assert(r == ujson.True)
+    }
+
+    // ================================================================
+    // Large text roundtrip (modeled on the aklomp/base64 Moby Dick test; input here is Lorem ipsum x100)
+    // ================================================================
+    test("largeTextRoundtrip") {
+      val r = eval(
+        """local largeStr = std.repeat("Lorem ipsum dolor sit amet, consectetur adipiscing elit. ", 100);
+          |local encoded = std.base64(largeStr);
+          |local decoded = std.base64Decode(encoded);
+          |decoded == largeStr
+          |""".stripMargin
+      )
+      assert(r == ujson.True)
+    }
+
+    // ================================================================
+    // Large byte array roundtrip
+    // ================================================================
+    test("largeByteArrayRoundtrip") {
+      val r = eval(
+        """local arr = std.makeArray(1000, function(i) i % 256);
+          |local encoded = std.base64(arr);
+          |local decoded = std.base64DecodeBytes(encoded);
+          |std.assertEqual(std.length(decoded), 1000) &&
+          |std.all(std.makeArray(1000, function(i) decoded[i] == i % 256))
+          |""".stripMargin
+      )
+      assert(r == ujson.True)
+    }
+
+    // ================================================================
+    // Multiple encode/decode cycles (stability test)
+    // ================================================================
+    test("multiCycleRoundtrip") {
+      val r = eval(
+        """local original = "The quick brown fox jumps over the lazy dog";
+          |local e1 = std.base64(original);
+          |local d1 = std.base64Decode(e1);
+          |local e2 = std.base64(d1);
+          |local d2 = std.base64Decode(e2);
+          |local e3 = std.base64(d2);
+          |local d3 = std.base64Decode(e3);
+          |d3 == original && e1 == e2 && e2 == e3
+          |""".stripMargin
+      )
+      assert(r == ujson.True)
+    }
+
+    // ================================================================
+    // Known encode/decode pairs (additional test vectors)
+    // All tests verify BOTH encode AND decode directions.
+    // ================================================================
+    test("knownPairs") {
+      test("hello") {
+        assert(eval("""std.base64("hello")""").str == "aGVsbG8=")
+        assert(eval("""std.base64Decode("aGVsbG8=")""").str == "hello")
+      }
+      test("helloWorld") {
+        assert(eval("""std.base64("Hello, World!")""").str == "SGVsbG8sIFdvcmxkIQ==")
+        assert(
+          eval("""std.base64Decode("SGVsbG8sIFdvcmxkIQ==")""").str == "Hello, World!"
+        )
+      }
+      test("binaryData") {
+        assert(eval("""std.base64([0, 1, 2, 3])""").str == "AAECAw==")
+        assert(
+          eval("""std.base64DecodeBytes("AAECAw==")""").arr.toSeq
+            .map(_.num.toInt) == Seq(0, 1, 2, 3)
+        )
+      }
+      test("singleByte0") {
+        assert(eval("""std.base64([0])""").str == "AA==")
+        assert(
+          eval("""std.base64DecodeBytes("AA==")""").arr.toSeq.map(_.num.toInt) == Seq(0)
+        )
+      }
+      test("singleByte255") {
+        assert(eval("""std.base64([255])""").str == "/w==")
+        assert(
+          eval("""std.base64DecodeBytes("/w==")""").arr.toSeq
+            .map(_.num.toInt) == Seq(255)
+        )
+      }
+      test("helloBytes") {
+        assert(eval("""std.base64([104, 101, 108, 108, 111])""").str == "aGVsbG8=")
+        assert(
+          eval("""std.base64DecodeBytes("aGVsbG8=")""").arr.toSeq
+            .map(_.num.toInt) == Seq(104, 101, 108, 108, 111)
+        )
+      }
+    }
+
+    // ================================================================
+    // Progressive padding tests — "Base64" spelled out incrementally
+    // Each prefix exercises a different padding scenario.
+    // All tests verify BOTH encode AND decode directions.
+    // ================================================================
+    test("progressivePadding") {
+      val pairs = Seq(
+        ("B", "Qg=="),
+        ("Ba", "QmE="),
+        ("Bas", "QmFz"),
+        ("Base", "QmFzZQ=="),
+        ("Base6", "QmFzZTY="),
+        ("Base64", "QmFzZTY0"),
+        ("Base64 is ", "QmFzZTY0IGlzIA==")
+      )
+      for ((plain, b64) <- pairs) {
+        val escaped = plain.replace("\\", "\\\\").replace("\"", "\\\"")
+        val encResult = eval(s"""std.base64("$escaped")""")
+        assert(encResult.str == b64)
+        val decResult = eval(s"""std.base64Decode("$b64")""")
+        assert(decResult.str == plain)
+      }
+    }
+
+    // ================================================================
+    // Long text bidirectional test
+    // ================================================================
+    test("longTextBidirectional") {
+      val r = eval(
+        """local longText =
+          | "Base64 is a group of similar binary-to-text encoding schemes that " +
+          | "represent binary data in an ASCII string format by translating it " +
+          | "into a radix-64 representation";
+          |local expectedB64 =
+          | "QmFzZTY0IGlzIGEgZ3JvdXAgb2Ygc2ltaWxhciBiaW5hcnktdG8tdGV4dCBlbmNvZGluZyBzY2hlbWVzIHRoYXQg" +
+          | "cmVwcmVzZW50IGJpbmFyeSBkYXRhIGluIGFuIEFTQ0lJIHN0cmluZyBmb3JtYXQgYnkgdHJhbnNsYXRpbmcgaXQg" +
+          | "aW50byBhIHJhZGl4LTY0IHJlcHJlc2VudGF0aW9u";
+          |std.assertEqual(std.base64(longText), expectedB64) &&
+          |std.assertEqual(std.base64Decode(expectedB64), longText)
+          |""".stripMargin
+      )
+      assert(r == ujson.True)
+    }
+
+    // ================================================================
+    // DecodeBytes verification
+    // ================================================================
+    test("decodeBytes") {
+      test("hello") {
+        val r = eval("""std.base64DecodeBytes("aGVsbG8=")""")
+        val arr = r.arr.toSeq.map(_.num.toInt)
+        assert(arr == Seq(104, 101, 108, 108, 111))
+      }
+      test("empty") {
+        val r = eval("""std.base64DecodeBytes("")""")
+        assert(r.arr.toSeq.isEmpty)
+      }
+      test("singleByte") {
+        val r = eval("""std.base64DecodeBytes("AA==")""")
+        val arr = r.arr.toSeq.map(_.num.toInt)
+        assert(arr == Seq(0))
+      }
+    }
+
+    // ================================================================
+    // Unicode string base64 (UTF-8 encoding)
+    // ================================================================
+    test("unicode") {
+      test("chineseRoundtrip") {
+        val r = eval(
+          """local s = "你好世界";
+            |std.base64Decode(std.base64(s)) == s
+            |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+      test("emojiRoundtrip") {
+        val r = eval(
+          """local s = "Hello 🌍!";
+            |std.base64Decode(std.base64(s)) == s
+            |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+      test("mixedRoundtrip") {
+        val r = eval(
+          """local s = "café résumé naïve";
+            |std.base64Decode(std.base64(s)) == s
+            |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+    }
+
+    // ================================================================
+    // SIMD boundary size tests (byte array variant)
+    // Tests byte-array sizes one below, at, and one above each SIMD unit size listed
+    // ================================================================
+    test("simdBoundaries") {
+      val boundaries = Seq(
+        11, 12, 13, // SSSE3 encode boundary (12 bytes → 16 chars)
+        15, 16, 17, // SSSE3 decode boundary
+        23, 24, 25, // AVX2 encode boundary (24 bytes → 32 chars)
+        31, 32, 33, // AVX2 decode boundary
+        47, 48, 49, // NEON/AVX-512 encode boundary
+        63, 64, 65 // NEON/AVX-512 decode boundary
+      )
+      for (size <- boundaries) {
+        val r = eval(
+          s"""local arr = std.makeArray($size, function(i) (i * 7 + 13) % 256);
+             |local encoded = std.base64(arr);
+             |local decoded = std.base64DecodeBytes(encoded);
+             |std.assertEqual(std.length(decoded), $size) &&
+             |std.all(std.makeArray($size, function(i) decoded[i] == (i * 7 + 13) % 256))
+             |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+    }
+
+    // ================================================================
+    // Error handling
+    // ================================================================
+    test("errors") {
+      test("invalidBase64Char") {
+        val err = evalErr("""std.base64Decode("!!!!")""")
+        assert(err.contains("Invalid base64"))
+      }
+      test("invalidByteArrayValue") {
+        val err = evalErr("""std.base64([256])""")
+        assert(err.contains("invalid codepoint"))
+      }
+      test("negativeByteValue") {
+        val err = evalErr("""std.base64([-1])""")
+        assert(err.contains("invalid codepoint"))
+      }
+      test("nonNumberInArray") {
+        val err = evalErr("""std.base64(["a"])""")
+        assert(err.contains("Expected an array of numbers"))
+      }
+      test("wrongTypeEncode") {
+        val err = evalErr("""std.base64(true)""")
+        assert(err.contains("Cannot base64 encode"))
+      }
+      test("wrongTypeDecode") {
+        val err = evalErr("""std.base64Decode(123)""")
+        assert(err.toLowerCase.contains("expected")) // deliberately loose, case-insensitive match on the message
+      }
+    }
+
+    // ================================================================
+    // Strict padding enforcement — aligns with go-jsonnet and C++ jsonnet
+    //
+    // Both official implementations reject unpadded base64 input:
+    // - go-jsonnet: len(str) % 4 != 0 check (UTF-8 byte count)
+    // - C++ jsonnet: std.length(str) % 4 != 0 check (char count)
+    //
+    // java.util.Base64 on JVM/JS is lenient and accepts unpadded input,
+    // so PlatformBase64 adds explicit length validation.
+    // ================================================================
+    test("strictPadding") {
+      // Unpadded variants that java.util.Base64 would accept but Jsonnet spec rejects
+      test("missingTwoPads") {
+        // "YQ" should be "YQ==" — both go-jsonnet and C++ jsonnet reject this
+        val err = evalErr("""std.base64Decode("YQ")""")
+        assert(err.contains("Invalid base64"))
+      }
+      test("missingOnePad") {
+        // "YWI" should be "YWI=" — rejected by both official implementations
+        val err = evalErr("""std.base64Decode("YWI")""")
+        assert(err.contains("Invalid base64"))
+      }
+      test("singleChar") {
+        // Single character is never valid base64
+        val err = evalErr("""std.base64Decode("A")""")
+        assert(err.contains("Invalid base64"))
+      }
+      test("fiveChars") {
+        // "wrong" (5 chars) — the go_test_suite canonical test case
+        val err = evalErr("""std.base64Decode("wrong")""")
+        assert(err.contains("Invalid base64"))
+      }
+      test("validPaddedStillWorks") {
+        // Properly padded input must still work
+        val r = eval("""std.base64Decode("YQ==")""")
+        assert(r.str == "a")
+      }
+      test("validPaddedOnePad") {
+        val r = eval("""std.base64Decode("YWI=")""")
+        assert(r.str == "ab")
+      }
+      test("validNoPadNeeded") {
+        // Length is multiple of 4, no padding needed
+        val r = eval("""std.base64Decode("YWJj")""")
+        assert(r.str == "abc")
+      }
+      // DecodeBytes also enforces strict padding
+      test("decodeBytesUnpadded") {
+        val err = evalErr("""std.base64DecodeBytes("YQ")""")
+        assert(err.contains("Invalid base64"))
+      }
+    }
+  }
+}