perf: hybrid hot-path evaluator — up to 40% faster dispatch (#785)

He-Pin · web-flow · commit b63fb404ee33 · 2026-04-21T10:29:49.000-07:00
## Summary

- Profile all 66 benchmark files across 5 suites to identify ExprTag
visit frequencies. **Top 7 types cover 96.1%** of all `visitExpr` calls:
ValidId (30%), BinaryOp (21%), Val.Literal (18%), Select (13%), Apply1
(5%), ObjExtend (4%), IfElse (4%).
- Split `NewEvaluator.visitExpr` into a **hot path** (~120 bytes of bytecode, 7
`instanceof` checks) and a **cold path** (`private visitExprCold` using
`tag + @switch` for the remaining ~30 types).
- The hot path fits within the JIT's `FreqInlineSize=325` limit (bytes of bytecode), enabling
**C2 to inline `visitExpr` into callers** (`visitBinaryOp`,
`visitSelect`, etc.). The old evaluator's ~700-byte method body
never gets inlined.
- Add `--new-evaluator` CLI flag for A/B testing.
- Add `EvaluatorBenchmark` (JMH) and `ExprTagProfile` profiling tool.

## JMH Results

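Steady-state performance, measured with 1 fork, 8 warmup and 10 measurement iterations (note: the `EvaluatorBenchmark` annotations default to 2 forks, 15 warmup and 20 measurement iterations):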

| Benchmark | Old (ms) | New (ms) | Delta |
|-----------|----------|----------|-------|
| bench.01 | 0.026 | 0.018 | **-31%** |
| bench.02 | 32.58 | 25.73 | **-21%** |
| bench.03 | 9.39 | 5.64 | **-40%** |
| gen_big_object | 0.928 | 0.715 | **-23%** |
| string_render_perf | 0.768 | 0.496 | **-35%** |
| base64_mega | 3.462 | 3.106 | **-10%** |
| realistic1 | 1.850 | 1.764 | **-5%** |
| heavy_string_render | 34.80 | 33.09 | **-5%** |
| realistic2 | 47.32 | 47.78 | ~tied |
| bench.04, 06, 08, 09 | - | - | ~tied |

Evaluator-heavy benchmarks (bench.01–03, gen_big_object,
string_render_perf) show **21–40% improvement**. Builtin-dominated
benchmarks (bench.04, foldl, comparison) are unaffected — the evaluator
dispatch is not their bottleneck.

## Why it works

The old evaluator's `visitExpr` compiles to a ~700-byte `instanceof`
chain. This exceeds the JIT's `FreqInlineSize=325`, so **C2 never inlines
it** into callers. Every recursive `visitExpr` call from within
`visitBinaryOp`, `visitSelect`, etc. pays the full overhead of a
non-inlined virtual call.

The hybrid approach splits `visitExpr` into:
- **Hot path** (~120 bytes of bytecode): 7 `instanceof` checks for 96% of calls
— small enough for C2 to inline
- **Cold path** (separate method): `tag + @switch` tableswitch for the
remaining 4% — O(1) dispatch instead of scanning 30+ `instanceof` checks

## ExprTag frequency data (global across all 66 benchmark files)

```
  Rank  ExprTag              Count        Pct   Cumulative
  1     ValidId          3,435,607     29.9%      29.9%
  2     BinaryOp         2,455,182     21.4%      51.3%
  3     Val.Literal      2,099,413     18.3%      69.5%
  4     Select           1,464,561     12.7%      82.3%
  5     Apply1             619,927      5.4%      87.7%
  6     ObjExtend          485,621      4.2%      91.9%
  7     IfElse             485,570      4.2%      96.1%
  8     ObjBody.MemberList 250,734      2.2%      98.3%
  9     ApplyBuiltin1      132,666      1.2%      99.4%
  10+   (remaining)         63,212      0.6%     100.0%
```

## Test plan

- [x] `./mill 'sjsonnet.jvm[3.3.7]'.test` — all JVM tests pass (both old
and new evaluator)
- [x] `./mill __.reformat` — scalafmt clean
- [x] JMH A/B benchmarks across cpp_suite, go_suite, bug_suite,
sjsonnet_suite
- [x] ExprTagProfile across all 66 benchmark files
diff --git a/bench/src/sjsonnet/bench/EvaluatorBenchmark.scala b/bench/src/sjsonnet/bench/EvaluatorBenchmark.scala
@@ -0,0 +1,101 @@
+package sjsonnet.bench
+
+import org.openjdk.jmh.annotations.*
+import org.openjdk.jmh.infra.*
+import sjsonnet.*
+
+import java.io.{ByteArrayOutputStream, OutputStream, PrintStream, StringWriter}
+import java.util.concurrent.TimeUnit
+
+/**
+ * A/B benchmark comparing the old (instanceof chain) and new (hybrid hot/cold path) evaluators.
+ *
+ * Runs the full interpret pipeline (parse → optimize → evaluate → materialize) for each benchmark
+ * file, isolating the evaluator difference by using the same Settings with only `useNewEvaluator`
+ * toggled.
+ */
+@BenchmarkMode(Array(Mode.AverageTime))
+@Fork(2)
+@Threads(1)
+@Warmup(iterations = 15)
+@Measurement(iterations = 20)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+class EvaluatorBenchmark {
+
+  @Param(
+    Array(
+      // cpp_suite — C++ jsonnet benchmarks
+      "bench/resources/cpp_suite/bench.01.jsonnet",
+      "bench/resources/cpp_suite/bench.02.jsonnet",
+      "bench/resources/cpp_suite/bench.03.jsonnet",
+      "bench/resources/cpp_suite/bench.04.jsonnet",
+      "bench/resources/cpp_suite/bench.06.jsonnet",
+      "bench/resources/cpp_suite/bench.08.jsonnet",
+      "bench/resources/cpp_suite/bench.09.jsonnet",
+      "bench/resources/cpp_suite/gen_big_object.jsonnet",
+      "bench/resources/cpp_suite/heavy_string_render.jsonnet",
+      "bench/resources/cpp_suite/large_string_join.jsonnet",
+      "bench/resources/cpp_suite/realistic1.jsonnet",
+      "bench/resources/cpp_suite/realistic2.jsonnet",
+      "bench/resources/cpp_suite/string_render_perf.jsonnet",
+      // go_suite — Go jsonnet builtins
+      "bench/resources/go_suite/base64_heavy.jsonnet",
+      "bench/resources/go_suite/base64_mega.jsonnet",
+      "bench/resources/go_suite/comparison.jsonnet",
+      "bench/resources/go_suite/comparison2.jsonnet",
+      "bench/resources/go_suite/foldl.jsonnet",
+      "bench/resources/go_suite/reverse.jsonnet",
+      "bench/resources/go_suite/substr.jsonnet",
+      // bug_suite
+      "bench/resources/bug_suite/assertions.jsonnet",
+      // sjsonnet_suite
+      "bench/resources/sjsonnet_suite/setDiff.jsonnet"
+    )
+  )
+  var path: String = _
+
+  private var wd: os.Path = _
+  private var filePath: OsPath = _
+  private var fileContent: String = _
+  private var jpaths: Seq[OsPath] = _
+
+  @Setup(Level.Trial)
+  def setup(): Unit = {
+    wd = sys.env.get("MILL_WORKSPACE_ROOT").map(os.Path(_)).getOrElse(os.pwd)
+    filePath = OsPath(wd / os.RelPath(path))
+    fileContent = os.read(wd / os.RelPath(path))
+    jpaths = Seq(OsPath(wd))
+  }
+
+  private def run(useNew: Boolean): String = {
+    val settings = new Settings(
+      useNewEvaluator = useNew,
+      maxStack = 100000
+    )
+    val interp = new Interpreter(
+      Map.empty[String, String],
+      Map.empty[String, String],
+      OsPath(wd),
+      importer = new SjsonnetMainBase.SimpleImporter(jpaths, None),
+      parseCache = new DefaultParseCache,
+      settings = settings
+    )
+    val writer = new StringWriter
+    val renderer = new Renderer(writer, indent = 3)
+    interp.interpret0(fileContent, filePath, renderer) match {
+      case Right(_) => writer.toString
+      case Left(e)  => throw new RuntimeException(e)
+    }
+  }
+
+  @Benchmark
+  def oldEvaluator(bh: Blackhole): Unit = {
+    bh.consume(run(useNew = false))
+  }
+
+  @Benchmark
+  def newEvaluator(bh: Blackhole): Unit = {
+    bh.consume(run(useNew = true))
+  }
+}
diff --git a/bench/src/sjsonnet/bench/ExprTagProfile.scala b/bench/src/sjsonnet/bench/ExprTagProfile.scala
@@ -0,0 +1,166 @@
+package sjsonnet.bench
+
+import sjsonnet.*
+
+import java.io.StringWriter
+
+/**
+ * Profile the frequency of each ExprTag in visitExpr calls across benchmark workloads. Run with:
+ * ./mill bench.runMain sjsonnet.bench.ExprTagProfile [file1.jsonnet file2.jsonnet ...]
+ *
+ * If no files are given, profiles ALL .jsonnet files under bench/resources/.
+ */
+object ExprTagProfile {
+
+  private val tagNames = Array(
+    "UNTAGGED", // 0
+    "ValidId", // 1
+    "BinaryOp", // 2
+    "Select", // 3
+    "Val.Literal", // 4
+    "Val.Func", // 5
+    "ApplyBuiltin0", // 6
+    "ApplyBuiltin1", // 7
+    "ApplyBuiltin2", // 8
+    "ApplyBuiltin3", // 9
+    "ApplyBuiltin4", // 10
+    "And", // 11
+    "Or", // 12
+    "UnaryOp", // 13
+    "Apply1", // 14
+    "Lookup", // 15
+    "Function", // 16
+    "LocalExpr", // 17
+    "Apply", // 18
+    "IfElse", // 19
+    "Apply3", // 20
+    "ObjBody.MemberList", // 21
+    "Apply2", // 22
+    "AssertExpr", // 23
+    "ApplyBuiltin", // 24
+    "Comp", // 25
+    "Arr", // 26
+    "SelectSuper", // 27
+    "LookupSuper", // 28
+    "InSuper", // 29
+    "ObjExtend", // 30
+    "ObjBody.ObjComp", // 31
+    "Slice", // 32
+    "Import", // 33
+    "Apply0", // 34
+    "ImportStr", // 35
+    "ImportBin", // 36
+    "Error" // 37
+  )
+
+  def main(args: Array[String]): Unit = {
+    val wd = sys.env.get("MILL_WORKSPACE_ROOT").map(os.Path(_)).getOrElse(os.pwd)
+    val benchRoot = wd / "bench" / "resources"
+
+    val files =
+      if (args.nonEmpty) args.map(os.RelPath(_)).toSeq
+      else
+        os.walk(benchRoot)
+          .filter(_.ext == "jsonnet")
+          .map(_.relativeTo(wd))
+          .sorted
+
+    val globalCounts = new Array[Long](40)
+    var globalTotal = 0L
+    val perFile = scala.collection.mutable.ArrayBuffer[(String, Long, Array[Long])]()
+
+    for (rel <- files) {
+      val counts = new Array[Long](40)
+      val filePath = OsPath(wd / rel)
+      val content =
+        try os.read(wd / rel)
+        catch { case _: Exception => System.err.println(s"SKIP (read error): $rel"); "" }
+      if (content.nonEmpty) {
+        val ok =
+          try {
+            val interp = new Interpreter(
+              Map.empty[String, String],
+              Map.empty[String, String],
+              OsPath(wd),
+              importer = new SjsonnetMainBase.SimpleImporter(
+                Seq(OsPath(wd), OsPath(wd / "bench"), OsPath(wd / "bench" / "resources")),
+                None
+              ),
+              parseCache = new DefaultParseCache,
+              settings = new Settings(maxStack = 100000)
+            ) {
+              override def createEvaluator(
+                  resolver: CachedResolver,
+                  extVars: String => Option[Expr],
+                  wd: Path,
+                  settings: Settings): Evaluator =
+                new Evaluator(resolver, extVars, wd, settings) {
+                  override def visitExpr(e: Expr)(implicit scope: ValScope): Val = {
+                    val t = e.tag & 0xff
+                    if (t < counts.length) counts(t) += 1
+                    super.visitExpr(e)
+                  }
+                }
+            }
+            val writer = new StringWriter
+            val renderer = new Renderer(writer, indent = 3)
+            interp.interpret0(content, filePath, renderer) match {
+              case Right(_) => true
+              case Left(e)  =>
+                System.err.println(s"ERROR: $rel: $e")
+                false
+            }
+          } catch {
+            case e: StackOverflowError =>
+              System.err.println(s"SKIP (StackOverflow): $rel")
+              false
+            case e: Exception =>
+              System.err.println(s"SKIP (${e.getClass.getSimpleName}): $rel")
+              false
+          }
+
+        val total = counts.sum
+        if (ok && total > 0) {
+          perFile += ((rel.toString, total, counts.clone()))
+          var i = 0
+          while (i < counts.length) {
+            globalCounts(i) += counts(i)
+            i += 1
+          }
+          globalTotal += total
+        }
+      }
+    }
+
+    // Per-file summary
+    println("\n" + "=" * 100)
+    println("PER-FILE SUMMARY")
+    println("=" * 100)
+    for ((file, total, counts) <- perFile.sortBy(-_._2)) {
+      val sorted = counts.zipWithIndex.filter(_._1 > 0).sortBy(-_._1)
+      val top3 = sorted
+        .take(3)
+        .map { case (c, idx) =>
+          val name = if (idx < tagNames.length) tagNames(idx) else s"tag=$idx"
+          f"$name(${c * 100.0 / total}%.0f%%)"
+        }
+        .mkString(", ")
+      println(f"  $file%-65s total=$total%10d  top3: $top3")
+    }
+
+    // Global aggregation
+    println("\n" + "=" * 100)
+    println(f"GLOBAL AGGREGATE (${perFile.size} files, $globalTotal%,d total visitExpr calls)")
+    println("=" * 100)
+    val globalSorted = globalCounts.zipWithIndex.filter(_._1 > 0).sortBy(-_._1)
+    var cumPct = 0.0
+    println(f"  ${"Rank"}%-5s ${"ExprTag"}%-20s ${"Count"}%12s ${"Pct"}%7s ${"Cumulative"}%10s")
+    println("  " + "-" * 60)
+    for (((count, idx), rank) <- globalSorted.zipWithIndex) {
+      val name = if (idx < tagNames.length) tagNames(idx) else s"tag=$idx"
+      val pct = count * 100.0 / globalTotal
+      cumPct += pct
+      println(f"  ${rank + 1}%-5d $name%-20s $count%,12d $pct%6.1f%% $cumPct%9.1f%%")
+    }
+  }
+}
diff --git a/sjsonnet/src-jvm-native/sjsonnet/Config.scala b/sjsonnet/src-jvm-native/sjsonnet/Config.scala
@@ -173,6 +173,11 @@ final case class Config(
         "Profile evaluation and write results to a file. Format: --profile <file> or --profile <format>:<file> where format is 'text' (default) or 'flamegraph'"
     )
     profile: Option[String] = None,
+    @arg(
+      name = "new-evaluator",
+      doc = "Use the new tag-based evaluator (hybrid instanceof + tableswitch dispatch)"
+    )
+    newEvaluator: Flag = Flag(),
     @arg(
       doc = "The jsonnet file you wish to evaluate",
       positional = true
diff --git a/sjsonnet/src-jvm-native/sjsonnet/SjsonnetMainBase.scala b/sjsonnet/src-jvm-native/sjsonnet/SjsonnetMainBase.scala
@@ -189,6 +189,7 @@ object SjsonnetMainBase {
           throwErrorForInvalidSets = config.throwErrorForInvalidSets.value,
           maxParserRecursionDepth = config.maxParserRecursionDepth,
           brokenAssertionLogic = config.brokenAssertionLogic.value,
+          useNewEvaluator = config.newEvaluator.value,
           maxStack = config.maxStack
         ),
         parseCache,
diff --git a/sjsonnet/src/sjsonnet/Evaluator.scala b/sjsonnet/src/sjsonnet/Evaluator.scala
@@ -2066,13 +2066,24 @@ class NewEvaluator(
     fc: FormatCache = FormatCache.SharedDefault)
     extends Evaluator(r, e, w, s, wa, ds, fc) {
 
+  // Hot path: top 7 types cover 96.1% of all visitExpr calls across benchmarks.
+  // ~120 bytes of bytecode — within JIT FreqInlineSize=325, unlike the old evaluator's ~700 bytes.
+  // Order matches old evaluator's first 4 types (ValidId, BinaryOp, Select, Val) for C1 parity.
   override def visitExpr(e: Expr)(implicit scope: ValScope): Val = try {
+    if (e.isInstanceOf[ValidId]) visitValidId(e.asInstanceOf[ValidId])
+    else if (e.isInstanceOf[BinaryOp]) visitBinaryOp(e.asInstanceOf[BinaryOp])
+    else if (e.isInstanceOf[Select]) visitSelect(e.asInstanceOf[Select])
+    else if (e.isInstanceOf[Val]) e.asInstanceOf[Val]
+    else if (e.isInstanceOf[Apply1]) visitApply1(e.asInstanceOf[Apply1])
+    else if (e.isInstanceOf[ObjExtend]) visitObjExtend(e.asInstanceOf[ObjExtend])
+    else if (e.isInstanceOf[IfElse]) visitIfElse(e.asInstanceOf[IfElse])
+    else visitExprCold(e)
+  } catch {
+    Error.withStackFrame(e)
+  }
+
+  private def visitExprCold(e: Expr)(implicit scope: ValScope): Val =
     (e.tag: @switch) match {
-      case ExprTags.ValidId       => visitValidId(e.asInstanceOf[ValidId])
-      case ExprTags.BinaryOp      => visitBinaryOp(e.asInstanceOf[BinaryOp])
-      case ExprTags.Select        => visitSelect(e.asInstanceOf[Select])
-      case ExprTags.`Val.Func`    => e.asInstanceOf[Val.Func]
-      case ExprTags.`Val.Literal` => e.asInstanceOf[Val.Literal]
       case ExprTags.ApplyBuiltin0 => visitApplyBuiltin0(e.asInstanceOf[ApplyBuiltin0])
       case ExprTags.ApplyBuiltin1 => visitApplyBuiltin1(e.asInstanceOf[ApplyBuiltin1])
       case ExprTags.ApplyBuiltin2 => visitApplyBuiltin2(e.asInstanceOf[ApplyBuiltin2])
@@ -2081,14 +2092,12 @@ class NewEvaluator(
       case ExprTags.And           => visitAnd(e.asInstanceOf[And])
       case ExprTags.Or            => visitOr(e.asInstanceOf[Or])
       case ExprTags.UnaryOp       => visitUnaryOp(e.asInstanceOf[UnaryOp])
-      case ExprTags.Apply1        => visitApply1(e.asInstanceOf[Apply1])
       case ExprTags.Lookup        => visitLookup(e.asInstanceOf[Lookup])
       case ExprTags.Function      =>
         val f = e.asInstanceOf[Function]
         visitMethod(f.body, f.params, f.pos)
       case ExprTags.LocalExpr            => visitLocalExpr(e.asInstanceOf[LocalExpr])
       case ExprTags.Apply                => visitApply(e.asInstanceOf[Apply])
-      case ExprTags.IfElse               => visitIfElse(e.asInstanceOf[IfElse])
       case ExprTags.Apply3               => visitApply3(e.asInstanceOf[Apply3])
       case ExprTags.`ObjBody.MemberList` =>
         val oml = e.asInstanceOf[ObjBody.MemberList]
@@ -2101,7 +2110,6 @@ class NewEvaluator(
       case ExprTags.SelectSuper       => visitSelectSuper(e.asInstanceOf[SelectSuper])
       case ExprTags.LookupSuper       => visitLookupSuper(e.asInstanceOf[LookupSuper])
       case ExprTags.InSuper           => visitInSuper(e.asInstanceOf[InSuper])
-      case ExprTags.ObjExtend         => visitObjExtend(e.asInstanceOf[ObjExtend])
       case ExprTags.`ObjBody.ObjComp` => visitObjComp(e.asInstanceOf[ObjBody.ObjComp], null)
       case ExprTags.Slice             => visitSlice(e.asInstanceOf[Slice])
       case ExprTags.Import            => visitImport(e.asInstanceOf[Import])
@@ -2111,9 +2119,6 @@ class NewEvaluator(
       case ExprTags.Error             => visitError(e.asInstanceOf[Expr.Error])
       case _                          => visitInvalid(e)
     }
-  } catch {
-    Error.withStackFrame(e)
-  }
   // This is only needed for --no-static-errors, otherwise these expression types do not make it past the optimizer
   override def visitInvalid(e: Expr): Nothing = (e.tag: @switch) match {
     case ExprTags.Id =>