databricks
diff --git a/examples/plot_comparison.py b/examples/plot_comparison.py
Lines changed: 82 additions & 30 deletions
diff --git a/examples/profile_column_tags.py b/examples/profile_column_tags.py
Lines changed: 24 additions & 12 deletions
diff --git a/examples/profile_read_then_write_table_tags.py b/examples/profile_read_then_write_table_tags.py
Lines changed: 19 additions & 9 deletions
@@ -38,7 +38,7 @@ def parse_report(filepath):
 
     m = re.search(r"\*\*(ALTERs/sec|SELECTs/sec|Operations/sec)\*\*:\s*([\d.]+)", content)
     if m:
-        metrics["throughput"] = float(m.group(2))
+        metrics["throughput_ops"] = float(m.group(2))
 
     for pct in ["p50", "p90", "p95", "p99"]:
         m = re.search(rf"\|\s*{pct}\s*\|\s*([\d.]+)\s*\|", content)
@@ -65,6 +65,17 @@ def parse_report(filepath):
     if m:
         metrics["columns"] = int(m.group(1))
 
+    m = re.search(r"\*\*Tables per iteration\*\*:\s*(\d+)", content)
+    if m:
+        metrics["tables_per_iteration"] = int(m.group(1))
+
+    # Fallback for older reports without that field: derive it as "Total SELECTs" // iterations
+    if "tables_per_iteration" not in metrics:
+        m = re.search(r"\*\*Total SELECTs\*\*:\s*(\d+)", content)
+        iters = metrics.get("iterations", 1)
+        if m and iters:
+            metrics["tables_per_iteration"] = int(float(m.group(1))) // iters
+
     m = re.search(r"\*\*Tags per ALTER\*\*:\s*(\d+)", content)
     if m:
         metrics["tags"] = int(m.group(1))
@@ -111,17 +122,19 @@ def discover_reports():
 
             threads = metrics["threads"]
 
+            tbl = metrics.get("tables_per_iteration", "?")
+
             if report_type == "alter" and category == "column":
                 cols = metrics.get("columns", "?")
                 tags = metrics.get("tags", "?")
-                label = f"ALTER column tags (c={cols}, t={tags})"
+                label = f"ALTER column tags (columns={cols}, tags_per_column={tags}, tables={tbl})"
             elif report_type == "alter" and category == "table":
                 tags = metrics.get("tags", "?")
-                label = f"ALTER table tags (t={tags})"
+                label = f"ALTER table tags (tags={tags}, tables={tbl})"
             elif report_type == "info_schema" and category == "column":
-                label = "info_schema column_tags SELECT"
+                label = f"info_schema column_tags SELECT (tables={tbl})"
             elif report_type == "info_schema" and category == "table":
-                label = "info_schema table_tags SELECT"
+                label = f"info_schema table_tags SELECT (tables={tbl})"
             else:
                 continue
 
@@ -130,23 +143,24 @@ def discover_reports():
             if existing and metrics.get("iterations", 0) <= existing.get("iterations", 0):
                 continue
 
+            # tables/sec = tables_per_iteration / wall_clock_s (NOTE(review): only correct if wall_clock_s covers one iteration, not the whole run — confirm)
+            tpi = metrics.get("tables_per_iteration")
+            wc = metrics.get("wall_clock_s")
+            if tpi and wc and wc > 0:
+                metrics["tables_per_sec"] = round(tpi / wc, 2)
+
             categories[category][label][threads] = metrics
             print(f"  [{category}] {label} threads={threads}: "
                   f"wall={metrics.get('wall_clock_s', '?')}s, "
                   f"p50={metrics.get('p50', '?')}ms, "
-                  f"throughput={metrics.get('throughput', '?')} ops/s "
+                  f"tables/s={metrics.get('tables_per_sec', '?')} "
                   f"[{fname}]")
 
     return categories
 
 
-def plot_category(category_name, series, output_path):
-    """Generate a 2x2 chart PNG for one category (column or table)."""
-    if not series:
-        print(f"  No data for {category_name}, skipping.")
-        return
-
-    # Color/style assignment
+def build_style_map(series):
+    """Assign colors and styles to series labels."""
     colors_info = ["#d62728", "#ff7f0e"]
     colors_alter = ["#1f77b4", "#2ca02c", "#9467bd", "#17becf", "#8c564b"]
     info_idx = 0
@@ -161,16 +175,20 @@ def plot_category(category_name, series, output_path):
             style_map[label] = {"color": colors_alter[alter_idx % len(colors_alter)], "marker": "s", "linestyle": "-"}
             alter_idx += 1
 
-    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
+    return style_map
 
-    chart_configs = [
-        (axes[0][0], "wall_clock_s", "Wall-Clock Time (seconds)", "Wall-Clock Time vs Thread Count"),
-        (axes[0][1], "throughput", "Operations / second", "Throughput vs Thread Count"),
-        (axes[1][0], "p50", "P50 Latency (ms)", "P50 Latency vs Thread Count"),
-        (axes[1][1], "p99", "P99 Latency (ms)", "P99 Latency vs Thread Count"),
-    ]
 
-    for ax, metric_key, ylabel, title in chart_configs:
+def plot_charts(series, style_map, chart_configs, suptitle, output_path):
+    """Generate a chart PNG with len(chart_configs) subplots."""
+    n = len(chart_configs)
+    cols = 2
+    rows = (n + 1) // 2
+    fig, axes = plt.subplots(rows, cols, figsize=(16, 6 * rows))
+    if rows == 1:
+        axes = [axes]
+
+    for idx, (metric_key, ylabel, title) in enumerate(chart_configs):
+        ax = axes[idx // cols][idx % cols]
         for label, thread_data in sorted(series.items()):
             threads = sorted(thread_data.keys())
             values = [thread_data[t].get(metric_key) for t in threads]
@@ -184,15 +202,51 @@ def plot_category(category_name, series, output_path):
         ax.legend(fontsize=8)
         ax.grid(True, alpha=0.3)
 
-    title_label = "Column Tags" if category_name == "column" else "Table Tags"
-    plt.suptitle(f"SET TAGS Profiling: {title_label} — info_schema SELECT vs Direct ALTER",
-                 fontsize=14, fontweight="bold")
+    # Hide unused subplot if odd number of charts
+    if n % 2 == 1:
+        axes[rows - 1][1].set_visible(False)
+
+    plt.suptitle(suptitle, fontsize=14, fontweight="bold")
     plt.tight_layout()
     plt.savefig(output_path, dpi=150, bbox_inches="tight")
     plt.close(fig)
     print(f"  Chart saved to: {output_path}")
 
 
+def plot_category(category_name, series, output_dir):
+    """Generate two PNGs per category: table-level comparison + individual operation detail."""
+    if not series:
+        print(f"  No data for {category_name}, skipping.")
+        return
+
+    style_map = build_style_map(series)
+    title_label = "Column Tags" if category_name == "column" else "Table Tags"
+
+    # Chart 1: Table-level comparison (apples-to-apples across approaches)
+    table_charts = [
+        ("wall_clock_s", "Wall-Clock Time (seconds)", "Wall-Clock Time vs Thread Count (Lower is better)"),
+        ("tables_per_sec", "Tables / second", "Tables Processed per Second vs Thread Count (Higher is better)"),
+    ]
+    plot_charts(
+        series, style_map, table_charts,
+        f"{title_label}: Table-Level Comparison — info_schema SELECT vs Direct ALTER",
+        os.path.join(output_dir, f"comparison_{category_name}_tags_tables.png"),
+    )
+
+    # Chart 2: Individual operation detail (per-op throughput and p50/p99/max latency)
+    op_charts = [
+        ("throughput_ops", "Individual Operations / second", "Individual Op Throughput vs Thread Count (Higher is better)"),
+        ("p50", "P50 Latency per Op (ms)", "P50 Latency vs Thread Count (Lower is better)"),
+        ("p99", "P99 Latency per Op (ms)", "P99 Latency vs Thread Count (Lower is better)"),
+        ("max", "Max Latency per Op (ms)", "Max Latency vs Thread Count (Lower is better)"),
+    ]
+    plot_charts(
+        series, style_map, op_charts,
+        f"{title_label}: Individual Operation Detail",
+        os.path.join(output_dir, f"comparison_{category_name}_tags_ops.png"),
+    )
+
+
 if __name__ == "__main__":
     print("Discovering results...\n")
     categories = discover_reports()
@@ -202,14 +256,12 @@ def plot_category(category_name, series, output_path):
     print(f"\nFound {total_series} series across {total_points} data points.\n")
 
     if "column" in categories:
-        print("Generating column tags chart...")
-        plot_category("column", categories["column"],
-                      os.path.join(RESULTS_DIR, "comparison_column_tags.png"))
+        print("Generating column tags charts...")
+        plot_category("column", categories["column"], RESULTS_DIR)
 
     if "table" in categories:
-        print("Generating table tags chart...")
-        plot_category("table", categories["table"],
-                      os.path.join(RESULTS_DIR, "comparison_table_tags.png"))
+        print("Generating table tags charts...")
+        plot_category("table", categories["table"], RESULTS_DIR)
 
     if not categories:
         print("No results found. Run experiments first.")
 
@@ -55,7 +55,7 @@
 SCHEMA = _creds["SCHEMA"]
 # ============================================================
 
-NUM_TABLES = 64
+NUM_TABLES = 128  # total tables available (table1..table128)
 MAX_COLUMNS = 128  # tables always created with this many columns
 RESULTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", "column_tags")
 
@@ -279,11 +279,14 @@ def run_iteration(
     num_columns: int,
     num_tags: int,
     num_threads: int,
+    tables_per_iteration: int,
 ) -> tuple:
-    """Run a single iteration: distribute 20 tables across threads."""
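+    """Run a single iteration: tables_per_iteration tables, starting at an offset that advances each iteration (mod NUM_TABLES), distributed across num_threads threads. NOTE(review): table_idx is not wrapped, so it exceeds NUM_TABLES when start + tables_per_iteration > NUM_TABLES; consider (start + i) % NUM_TABLES + 1."""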
     table_queue = Queue()
-    for t in range(1, NUM_TABLES + 1):
-        table_queue.put(f"table{t}")
+    start = ((iteration - 1) * tables_per_iteration) % NUM_TABLES
+    for i in range(tables_per_iteration):
+        table_idx = start + i + 1
+        table_queue.put(f"table{table_idx}")
 
     alter_results: list = []
     table_results: list = []
@@ -353,7 +356,7 @@ def w(text=""):
     w(f"- **Server**: `{SERVER_HOSTNAME}`")
     w(f"- **HTTP Path**: `{HTTP_PATH}`")
     w(f"- **Catalog.Schema**: `{CATALOG}.{SCHEMA}`")
-    w(f"- **Tables**: {NUM_TABLES}")
+    w(f"- **Tables per iteration**: {args.tables_per_iteration}")
     w(f"- **Columns tagged per table**: {args.columns}")
     w(f"- **Tags per ALTER**: {args.tags}")
     w(f"- **Threads**: {args.threads}")
@@ -583,18 +586,26 @@ def w(text=""):
 
 def main():
     parser = argparse.ArgumentParser(description="Profile SET COLUMN TAGS performance")
-    parser.add_argument("--columns", type=int, required=True, help="Number of columns to tag per table (1, 2, 4)")
-    parser.add_argument("--tags", type=int, required=True, help="Number of tags per ALTER command (1, 2, 4)")
-    parser.add_argument("--threads", type=int, required=True, help="Number of concurrent threads (1, 2, 4, 8, 16)")
-    parser.add_argument("--iterations", type=int, required=True, help="Number of times to repeat the full sweep")
+    parser.add_argument("--columns", type=int, required=True, help="Number of columns to tag per table")
+    parser.add_argument("--tags", type=int, required=True, help="Number of tags per ALTER command")
+    parser.add_argument("--threads", type=int, required=True, help="Number of concurrent threads")
+    parser.add_argument("--iterations", type=int, required=True, help="Number of iterations")
+    parser.add_argument("--tables-per-iteration", type=int, default=None, help="Tables to process per iteration (default = --threads, i.e. 1 table per thread)")
     parser.add_argument("--validate", action="store_true", help="Quick validation: override to 1 iteration, print result")
     parser.add_argument("--skip-setup", action="store_true", help="Skip table creation (tables already exist)")
     args = parser.parse_args()
 
+    if args.tables_per_iteration is None:
+        args.tables_per_iteration = args.threads
+
     if args.columns > MAX_COLUMNS:
         print(f"Error: --columns {args.columns} exceeds MAX_COLUMNS={MAX_COLUMNS}")
         sys.exit(1)
 
+    if args.tables_per_iteration > NUM_TABLES:
+        print(f"Error: --tables-per-iteration {args.tables_per_iteration} exceeds NUM_TABLES={NUM_TABLES}")
+        sys.exit(1)
+
     if args.validate:
         args.iterations = 1
         print("=== VALIDATION MODE: 1 iteration only ===\n")
@@ -609,9 +620,9 @@ def main():
     # Logging
     profile_handler = setup_logging(log_path)
 
-    print(f"Profile: columns={args.columns}, tags={args.tags}, threads={args.threads}, iterations={args.iterations}")
-    print(f"ALTERs per iteration: {NUM_TABLES * args.columns}")
-    print(f"Total ALTERs: {NUM_TABLES * args.columns * args.iterations}")
+    print(f"Profile: columns={args.columns}, tags={args.tags}, threads={args.threads}, iterations={args.iterations}, tables_per_iteration={args.tables_per_iteration}")
+    print(f"ALTERs per iteration: {args.tables_per_iteration * args.columns} ({args.tables_per_iteration} tables x {args.columns} columns)")
+    print(f"Total ALTERs: {args.tables_per_iteration * args.columns * args.iterations}")
     print(f"Output: {report_path}")
     print()
 
@@ -631,6 +642,7 @@ def main():
             num_columns=args.columns,
             num_tags=args.tags,
             num_threads=args.threads,
+            tables_per_iteration=args.tables_per_iteration,
         )
         all_alter_results.extend(alter_results)
         all_table_results.extend(table_results)
 
@@ -45,7 +45,7 @@
 SCHEMA = _creds["SCHEMA"]
 # ============================================================
 
-NUM_TABLES = 64
+NUM_TABLES = 128
 RESULTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", "read_then_write_table_tags")
 
 SELECT_TEMPLATE = """SELECT tag_name, tag_value
@@ -215,10 +215,12 @@ def worker(
 # Run one iteration
 # ---------------------------------------------------------------------------
 
-def run_iteration(iteration: int, num_threads: int) -> tuple:
+def run_iteration(iteration: int, num_threads: int, tables_per_iteration: int) -> tuple:
     table_queue = Queue()
-    for t in range(1, NUM_TABLES + 1):
-        table_queue.put(f"table{t}")
+    start = ((iteration - 1) * tables_per_iteration) % NUM_TABLES
+    for i in range(tables_per_iteration):
+        table_idx = start + i + 1
+        table_queue.put(f"table{table_idx}")
 
     results: list = []
     results_lock = threading.Lock()
@@ -484,10 +486,18 @@ def main():
         description="Profile read-from-information_schema then write-table-tag pattern"
     )
     parser.add_argument("--threads", type=int, required=True, help="Number of concurrent threads")
-    parser.add_argument("--iterations", type=int, required=True, help="Number of times to repeat the full sweep")
+    parser.add_argument("--iterations", type=int, required=True, help="Number of iterations")
+    parser.add_argument("--tables-per-iteration", type=int, default=None, help="Tables per iteration (default = --threads)")
     parser.add_argument("--validate", action="store_true", help="Quick validation: override to 1 iteration")
     args = parser.parse_args()
 
+    if args.tables_per_iteration is None:
+        args.tables_per_iteration = args.threads
+
+    if args.tables_per_iteration > NUM_TABLES:
+        print(f"Error: --tables-per-iteration {args.tables_per_iteration} exceeds NUM_TABLES={NUM_TABLES}")
+        sys.exit(1)
+
     if args.validate:
         args.iterations = 1
         print("=== VALIDATION MODE: 1 iteration only ===\n")
@@ -500,9 +510,9 @@ def main():
 
     profile_handler = setup_logging(log_path)
 
-    print(f"Profile (information_schema.table_tags): threads={args.threads}, iterations={args.iterations}")
-    print(f"SELECTs per iteration: {NUM_TABLES} (1 per table)")
-    print(f"Total SELECTs: {NUM_TABLES * args.iterations}")
+    print(f"Profile (information_schema.table_tags): threads={args.threads}, iterations={args.iterations}, tables_per_iteration={args.tables_per_iteration}")
+    print(f"SELECTs per iteration: {args.tables_per_iteration} (1 per table)")
+    print(f"Total SELECTs: {args.tables_per_iteration * args.iterations}")
     print(f"Output: {report_path}")
     print()
 
@@ -511,7 +521,7 @@ def main():
 
     for i in range(1, args.iterations + 1):
         print(f"Iteration {i}/{args.iterations}...", end=" ", flush=True)
-        results, duration = run_iteration(iteration=i, num_threads=args.threads)
+        results, duration = run_iteration(iteration=i, num_threads=args.threads, tables_per_iteration=args.tables_per_iteration)
         all_results.extend(results)
         iteration_durations.append(duration)