
Commit 91f96cb

timsaucer and claude committed
tpch examples: rewrite non-idiomatic queries in idiomatic DataFrame form
Rewrite the seven TPC-H example queries that did not demonstrate the idiomatic DataFrame pattern. The remaining queries (Q02/Q11/Q15/Q17/Q22, which use window functions in place of correlated subqueries) already are idiomatic and are left unchanged. - Q04: replace `.aggregate([col("l_orderkey")], [])` with `.select("l_orderkey").distinct()`, which is the natural way to express "reduce to one row per order" on a DataFrame. - Q07: remove the CASE-as-filter on `n_name` and use `F.in_list(col("n_name"), [nation_1, nation_2])` instead. Drops a comment block that admitted the filter form was simpler. - Q08: rewrite the switched CASE `F.case(...).when(lit(False), ...)` as a searched `F.when(col(...).is_not_null(), ...).otherwise(...)`. That mirrors the reference SQL's `case when ... then ... else 0 end` shape. - Q12: replace `array_position(make_array(...), col)` with `F.in_list(col("l_shipmode"), [...])`. Same semantics, without routing through array construction / array search. - Q19: remove the pyarrow UDF that re-implemented a disjunctive predicate in Python. Build the same predicate in DataFusion by OR-combining one `in_list` + range-filter expression per brand. Keeps the per-brand constants in the existing `items_of_interest` dict. - Q20: use `F.starts_with` instead of an explicit substring slice. Replace the inner-join + `select(...).distinct()` tail with a semi join against a precomputed set of excess-quantity suppliers so the supplier columns are preserved without deduplication after the fact. - Q21: replace the `array_agg` / `array_length` / `array_element` pipeline with two semi joins. One semi join keeps orders with more than one distinct supplier (stand-in for the reference SQL's `exists` subquery), the other keeps orders with exactly one late supplier (stand-in for the `not exists` subquery). All 22 answer-file comparisons and 22 plan-comparison diagnostics still pass (`pytest examples/tpch/_tests.py`: 44 passed). 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e808db8 commit 91f96cb

7 files changed

Lines changed: 131 additions & 172 deletions

examples/tpch/q04_order_priority_checking.py

Lines changed: 7 additions & 7 deletions

@@ -77,13 +77,13 @@
 
 interval = pa.scalar((0, INTERVAL_DAYS, 0), type=pa.month_day_nano_interval())
 
-# Limit results to cases where commitment date before receipt date
-# Aggregate the results so we only get one row to join with the order table.
-# Alternately, and likely more idiomatic is instead of `.aggregate` you could
-# do `.select("l_orderkey").distinct()`. The goal here is to show
-# multiple examples of how to use Data Fusion.
-df_lineitem = df_lineitem.filter(col("l_commitdate") < col("l_receiptdate")).aggregate(
-    [col("l_orderkey")], []
+# Limit results to cases where commitment date before receipt date, then
+# reduce to a single row per order so the join with the orders table is a
+# semantic EXISTS rather than a fan-out.
+df_lineitem = (
+    df_lineitem.filter(col("l_commitdate") < col("l_receiptdate"))
+    .select("l_orderkey")
+    .distinct()
 )
 
 # Limit orders to date range of interest
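The distinct-then-join shape above is an EXISTS in disguise: deduplicating the keys before the join prevents one order from being matched once per late line item. A minimal pure-Python sketch of that effect (toy rows and values, not the DataFusion API):

```python
# Toy line items: order 1 has two late rows, order 2 has none (hypothetical data).
lineitems = [
    {"l_orderkey": 1, "l_commitdate": 5, "l_receiptdate": 7},
    {"l_orderkey": 1, "l_commitdate": 3, "l_receiptdate": 9},
    {"l_orderkey": 2, "l_commitdate": 8, "l_receiptdate": 6},
]
orders = [{"o_orderkey": 1}, {"o_orderkey": 2}]

# filter(...).select("l_orderkey").distinct(): one key per late order.
late_keys = {
    r["l_orderkey"] for r in lineitems if r["l_commitdate"] < r["l_receiptdate"]
}

# Joining orders against the deduplicated keys yields at most one row per
# order (EXISTS semantics); joining the raw filtered rows would emit order 1
# twice.
matched = [o for o in orders if o["o_orderkey"] in late_keys]
```

The same reasoning is why the commit message calls the old `.aggregate([col("l_orderkey")], [])` form a workaround: it also deduplicated, just less obviously.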

examples/tpch/q07_volume_shipping.py

Lines changed: 2 additions & 14 deletions

@@ -116,20 +116,8 @@
 )
 
 
-# A simpler way to do the following operation is to use a filter, but we also want to demonstrate
-# how to use case statements. Here we are assigning `n_name` to be itself when it is either of
-# the two nations of interest. Since there is no `otherwise()` statement, any values that do
-# not match these will result in a null value and then get filtered out.
-#
-# To do the same using a simple filter would be:
-# df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2)) # noqa: ERA001
-df_nation = df_nation.with_column(
-    "n_name",
-    F.case(col("n_name"))
-    .when(nation_1, col("n_name"))
-    .when(nation_2, col("n_name"))
-    .end(),
-).filter(~col("n_name").is_null())
+# Limit the nation table to the two nations of interest.
+df_nation = df_nation.filter(F.in_list(col("n_name"), [nation_1, nation_2]))
 
 
 # Limit suppliers to either nation
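The deleted CASE form relied on null propagation (no `otherwise()` means unmatched rows become null, then get filtered out); `in_list` states the membership test directly. A pure-Python sketch of why the two forms select the same rows (toy values, not the DataFusion API):

```python
nations = ["FRANCE", "GERMANY", "BRAZIL"]
nation_1, nation_2 = "FRANCE", "GERMANY"

# Old form: CASE with no otherwise() -> None for non-matches, then drop None.
cased = [n if n in (nation_1, nation_2) else None for n in nations]
old_result = [n for n in cased if n is not None]

# New form: direct membership test, same surviving rows in one step.
new_result = [n for n in nations if n in (nation_1, nation_2)]
```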

examples/tpch/q08_market_share.py

Lines changed: 5 additions & 4 deletions

@@ -186,12 +186,13 @@
     df_national_suppliers, left_on=["l_suppkey"], right_on=["s_suppkey"], how="left"
 )
 
-# Use a case statement to compute the volume sold by suppliers in the nation of interest
+# Use a searched CASE (``F.when(...).otherwise(...)``) to keep only the
+# volume attributable to suppliers in the nation of interest. This mirrors
+# the ``case when nation = '...' then volume else 0 end`` form of the
+# reference SQL rather than dispatching on a boolean subject.
 df = df.with_column(
     "national_volume",
-    F.case(col("s_suppkey").is_null())
-    .when(lit(value=False), col("volume"))
-    .otherwise(lit(0.0)),
+    F.when(col("s_suppkey").is_not_null(), col("volume")).otherwise(lit(0.0)),
 )
 
 df = df.with_column(
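Row by row, the searched CASE reduces to a plain conditional on the left join's null marker. A pure-Python sketch of the same null check (toy rows, hypothetical values, not the DataFusion API):

```python
# Rows from a left join: s_suppkey is None when no national supplier matched.
rows = [
    {"s_suppkey": 10, "volume": 100.0},
    {"s_suppkey": None, "volume": 40.0},
]

# Searched CASE: "when s_suppkey is not null then volume else 0.0".
national_volume = [
    (r["volume"] if r["s_suppkey"] is not None else 0.0) for r in rows
]
```

The old switched form dispatched on the boolean subject `s_suppkey is null` matched against `False`, which computes the same thing but reads backwards.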

examples/tpch/q12_ship_mode_order_priority.py

Lines changed: 3 additions & 14 deletions

@@ -91,20 +91,9 @@
     col("l_receiptdate") < lit(date) + lit(interval)
 )
 
-# Note: It is not recommended to use array_has because it treats the second argument as an argument
-# so if you pass it col("l_shipmode") it will pass the entire array to process which is very slow.
-# Instead check the position of the entry is not null.
-df = df.filter(
-    ~F.array_position(
-        F.make_array(lit(SHIP_MODE_1), lit(SHIP_MODE_2)), col("l_shipmode")
-    ).is_null()
-)
-
-# Since we have only two values, it's much easier to do this as a filter where the l_shipmode
-# matches either of the two values, but we want to show doing some array operations in this
-# example. If you want to see this done with filters, comment out the above line and uncomment
-# this one.
-# df = df.filter((col("l_shipmode") == lit(SHIP_MODE_1)) | (col("l_shipmode") == lit(SHIP_MODE_2))) # noqa: ERA001
+# Restrict to the two ship modes of interest. ``in_list`` maps directly to
+# the ``l_shipmode in ('FOB', 'SHIP')`` clause of the reference SQL.
+df = df.filter(F.in_list(col("l_shipmode"), [lit(SHIP_MODE_1), lit(SHIP_MODE_2)]))
 
 
 # We need order priority, so join order df to line item

examples/tpch/q19_discounted_revenue.py

Lines changed: 27 additions & 53 deletions

@@ -64,8 +64,7 @@
 );
 """
 
-import pyarrow as pa
-from datafusion import SessionContext, col, lit, udf
+from datafusion import SessionContext, col, lit
 from datafusion import functions as F
 from util import get_data_path
 
@@ -114,59 +113,34 @@
 df = df.join(df_part, left_on=["l_partkey"], right_on=["p_partkey"], how="inner")
 
 
-# Create the user defined function (UDF) definition that does the work
-def is_of_interest(
-    brand_arr: pa.Array,
-    container_arr: pa.Array,
-    quantity_arr: pa.Array,
-    size_arr: pa.Array,
-) -> pa.Array:
-    """
-    The purpose of this function is to demonstrate how a UDF works, taking as input a pyarrow Array
-    and generating a resultant Array. The length of the inputs should match and there should be the
-    same number of rows in the output.
-    """
-    result = []
-    for idx, brand_val in enumerate(brand_arr):
-        brand = brand_val.as_py()
-        if brand in items_of_interest:
-            values_of_interest = items_of_interest[brand]
-
-            container_matches = (
-                container_arr[idx].as_py() in values_of_interest["containers"]
-            )
-
-            quantity = quantity_arr[idx].as_py()
-            quantity_matches = (
-                values_of_interest["min_quantity"]
-                <= quantity
-                <= values_of_interest["min_quantity"] + 10
-            )
-
-            size = size_arr[idx].as_py()
-            size_matches = 1 <= size <= values_of_interest["max_size"]
-
-            result.append(container_matches and quantity_matches and size_matches)
-        else:
-            result.append(False)
-
-    return pa.array(result)
-
-
-# Turn the above function into a UDF that DataFusion can understand
-is_of_interest_udf = udf(
-    is_of_interest,
-    [pa.utf8(), pa.utf8(), pa.decimal128(15, 2), pa.int32()],
-    pa.bool_(),
-    "stable",
-)
-
-# Filter results using the above UDF
-df = df.filter(
-    is_of_interest_udf(
-        col("p_brand"), col("p_container"), col("l_quantity"), col("p_size")
-    )
-)
+# Build one OR-combined predicate per brand. Each disjunct encodes the
+# brand-specific container list, quantity window, and size range from the
+# reference SQL. This mirrors the SQL ``where (... brand A ...) or (... brand
+# B ...) or (... brand C ...)`` form directly, without a UDF.
+def _brand_predicate(
+    brand: str, min_quantity: int, containers: list[str], max_size: int
+):
+    return (
+        (col("p_brand") == lit(brand))
+        & F.in_list(col("p_container"), [lit(c) for c in containers])
+        & (col("l_quantity") >= lit(min_quantity))
+        & (col("l_quantity") <= lit(min_quantity + 10))
+        & (col("p_size") >= lit(1))
+        & (col("p_size") <= lit(max_size))
+    )
+
+
+predicate = None
+for brand, params in items_of_interest.items():
+    part_predicate = _brand_predicate(
+        brand,
+        params["min_quantity"],
+        params["containers"],
+        params["max_size"],
+    )
+    predicate = part_predicate if predicate is None else predicate | part_predicate
+
+df = df.filter(predicate)
 
 df = df.aggregate(
     [],
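The `predicate = part_predicate if predicate is None else predicate | part_predicate` accumulation is a fold over OR. A pure-Python sketch of the same shape using `functools.reduce`, with plain row-predicate functions standing in for DataFusion expressions (toy brand constants, hypothetical names):

```python
import operator
from functools import reduce

# Per-brand constants, mirroring the shape of the items_of_interest dict.
items_of_interest = {
    "Brand#12": {"min_quantity": 1, "containers": {"SM CASE", "SM BOX"}, "max_size": 5},
    "Brand#23": {"min_quantity": 10, "containers": {"MED BAG"}, "max_size": 10},
}


def brand_predicate(brand, params):
    # Returns a row -> bool function; each clause mirrors one conjunct of
    # the DataFusion expression built per brand in the diff above.
    def pred(row):
        return (
            row["p_brand"] == brand
            and row["p_container"] in params["containers"]
            and params["min_quantity"] <= row["l_quantity"] <= params["min_quantity"] + 10
            and 1 <= row["p_size"] <= params["max_size"]
        )

    return pred


preds = [brand_predicate(b, p) for b, p in items_of_interest.items()]


def combined(row):
    # OR-combine: a row qualifies if any brand's disjunct accepts it.
    return reduce(operator.or_, (p(row) for p in preds))
```

With DataFusion `Expr` objects, `reduce(operator.or_, exprs)` works the same way because `Expr` overloads `|`; the explicit loop in the diff just avoids assuming a non-empty dict.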

examples/tpch/q20_potential_part_promotion.py

Lines changed: 35 additions & 31 deletions

@@ -100,42 +100,46 @@
 
 interval = pa.scalar((0, 365, 0), type=pa.month_day_nano_interval())
 
-# Filter down dataframes
+# Filter down dataframes. ``starts_with`` reads more naturally than an
+# explicit substring slice and maps directly to the reference SQL's
+# ``p_name like 'forest%'`` clause.
 df_nation = df_nation.filter(col("n_name") == lit(NATION_OF_INTEREST))
-df_part = df_part.filter(
-    F.substring(col("p_name"), lit(0), lit(len(COLOR_OF_INTEREST) + 1))
-    == lit(COLOR_OF_INTEREST)
+df_part = df_part.filter(F.starts_with(col("p_name"), lit(COLOR_OF_INTEREST)))
+
+# Compute the total quantity of interesting parts shipped by each (part,
+# supplier) pair within the year of interest.
+totals = (
+    df_lineitem.filter(col("l_shipdate") >= lit(date))
+    .filter(col("l_shipdate") < lit(date) + lit(interval))
+    .join(df_part, left_on="l_partkey", right_on="p_partkey", how="inner")
+    .aggregate(
+        [col("l_partkey"), col("l_suppkey")],
+        [F.sum(col("l_quantity")).alias("total_sold")],
+    )
 )
 
-df = df_lineitem.filter(col("l_shipdate") >= lit(date)).filter(
-    col("l_shipdate") < lit(date) + lit(interval)
+# Keep only (part, supplier) pairs whose available quantity exceeds 50% of
+# the total shipped. The result already contains one row per supplier of
+# interest, so we can semi-join the supplier table rather than inner-join
+# and deduplicate afterwards.
+excess_suppliers = (
+    df_partsupp.join(
+        totals,
+        left_on=["ps_partkey", "ps_suppkey"],
+        right_on=["l_partkey", "l_suppkey"],
+        how="inner",
+    )
+    .filter(col("ps_availqty") > lit(0.5) * col("total_sold"))
+    .select(col("ps_suppkey").alias("suppkey"))
+    .distinct()
 )
 
-# This will filter down the line items to the parts of interest
-df = df.join(df_part, left_on="l_partkey", right_on="p_partkey", how="inner")
+# Limit to suppliers in the nation of interest and pick out the two
+# requested columns.
+df = df_supplier.join(
+    df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner"
+).join(excess_suppliers, left_on="s_suppkey", right_on="suppkey", how="semi")
 
-# Compute the total sold and limit ourselves to individual supplier/part combinations
-df = df.aggregate(
-    [col("l_partkey"), col("l_suppkey")], [F.sum(col("l_quantity")).alias("total_sold")]
-)
-
-df = df.join(
-    df_partsupp,
-    left_on=["l_partkey", "l_suppkey"],
-    right_on=["ps_partkey", "ps_suppkey"],
-    how="inner",
-)
-
-# Find cases of excess quantity
-df = df.filter(col("ps_availqty") > lit(0.5) * col("total_sold"))
-
-# We could do these joins earlier, but now limit to the nation of interest suppliers
-df = df.join(df_supplier, left_on=["ps_suppkey"], right_on=["s_suppkey"], how="inner")
-df = df.join(df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner")
-
-# Restrict to the requested data per the problem statement
-df = df.select("s_name", "s_address").distinct()
-
-df = df.sort(col("s_name").sort())
+df = df.select("s_name", "s_address").sort(col("s_name").sort())
 
 df.show()
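The semi join here carries the whole argument: it keeps each left-side supplier at most once when any match exists on the right, and never pulls in right-side columns, which is why the trailing `.distinct()` became unnecessary. A pure-Python sketch of those semantics (toy rows, hypothetical names):

```python
suppliers = [
    {"s_suppkey": 1, "s_name": "A", "s_address": "addr1"},
    {"s_suppkey": 2, "s_name": "B", "s_address": "addr2"},
]
# The right side of the semi join; duplicates here model what an inner join
# would otherwise fan out on.
excess = [{"suppkey": 1}, {"suppkey": 1}]

# Semi join: membership test against the right side's key set. Each left row
# appears at most once, and no right-side columns leak into the result.
right_keys = {r["suppkey"] for r in excess}
semi = [s for s in suppliers if s["s_suppkey"] in right_keys]
```

An inner join against `excess` would have produced supplier A twice, forcing the `select(...).distinct()` cleanup the commit removes.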

examples/tpch/q21_suppliers_kept_orders_waiting.py

Lines changed: 52 additions & 49 deletions

@@ -92,65 +92,68 @@
 )
 
 # Limit to suppliers in the nation of interest
-df_suppliers_of_interest = df_nation.filter(col("n_name") == lit(NATION_OF_INTEREST))
-
-df_suppliers_of_interest = df_suppliers_of_interest.join(
-    df_supplier, left_on="n_nationkey", right_on="s_nationkey", how="inner"
+df_suppliers_of_interest = df_nation.filter(
+    col("n_name") == lit(NATION_OF_INTEREST)
+).join(df_supplier, left_on="n_nationkey", right_on="s_nationkey", how="inner")
+
+# Line items for orders that have status 'F'. This is the candidate set of
+# (order, supplier) pairs we reason about below.
+failed_order_lineitems = df_lineitem.join(
+    df_orders.filter(col("o_orderstatus") == lit("F")),
+    left_on="l_orderkey",
+    right_on="o_orderkey",
+    how="inner",
 )
 
-# Find the failed orders and all their line items
-df = df_orders.filter(col("o_orderstatus") == lit("F"))
-
-df = df_lineitem.join(df, left_on="l_orderkey", right_on="o_orderkey", how="inner")
-
-# Identify the line items for which the order is failed due to.
-df = df.with_column(
-    "failed_supp",
-    F.case(col("l_receiptdate") > col("l_commitdate"))
-    .when(lit(value=True), col("l_suppkey"))
-    .end(),
+# Line items whose receipt was late. This corresponds to ``l1`` in the
+# reference SQL.
+late_lineitems = failed_order_lineitems.filter(
+    col("l_receiptdate") > col("l_commitdate")
 )
 
-# There are other ways we could do this but the purpose of this example is to work with rows where
-# an element is an array of values. In this case, we will create two columns of arrays. One will be
-# an array of all of the suppliers who made up this order. That way we can filter the dataframe for
-# only orders where this array is larger than one for multiple supplier orders. The second column
-# is all of the suppliers who failed to make their commitment. We can filter the second column for
-# arrays with size one. That combination will give us orders that had multiple suppliers where only
-# one failed. Use distinct=True in the blow aggregation so we don't get multiple line items from the
-# same supplier reported in either array.
-df = df.aggregate(
-    [col("o_orderkey")],
-    [
-        F.array_agg(col("l_suppkey"), distinct=True).alias("all_suppliers"),
-        F.array_agg(
-            col("failed_supp"), filter=col("failed_supp").is_not_null(), distinct=True
-        ).alias("failed_suppliers"),
-    ],
+# Orders that had more than one distinct supplier. Expressed as
+# ``count(distinct l_suppkey) > 1``. Stands in for the reference SQL's
+# ``exists (... l2.l_suppkey <> l1.l_suppkey ...)`` subquery.
+multi_supplier_orders = (
+    failed_order_lineitems.select("l_orderkey", "l_suppkey")
+    .distinct()
+    .aggregate([col("l_orderkey")], [F.count(col("l_suppkey")).alias("n_suppliers")])
+    .filter(col("n_suppliers") > lit(1))
+    .select("l_orderkey")
 )
 
-# This is the check described above which will identify single failed supplier in a multiple
-# supplier order.
-df = df.filter(F.array_length(col("failed_suppliers")) == lit(1)).filter(
-    F.array_length(col("all_suppliers")) > lit(1)
+# Orders where exactly one distinct supplier was late. Stands in for the
+# reference SQL's ``not exists (... l3.l_suppkey <> l1.l_suppkey and l3 is
+# also late ...)`` subquery: if only one supplier on the order was late,
+# nobody else on the same order was late.
+single_late_supplier_orders = (
+    late_lineitems.select("l_orderkey", "l_suppkey")
+    .distinct()
+    .aggregate(
+        [col("l_orderkey")], [F.count(col("l_suppkey")).alias("n_late_suppliers")]
+    )
+    .filter(col("n_late_suppliers") == lit(1))
+    .select("l_orderkey")
 )
 
-# Since we have an array we know is exactly one element long, we can extract that single value.
-df = df.select(
-    col("o_orderkey"), F.array_element(col("failed_suppliers"), lit(1)).alias("suppkey")
+# Keep late line items whose order qualifies on both counts. Semi joins
+# preserve the left-side columns without fanning out on the right.
+df = late_lineitems.join(multi_supplier_orders, on="l_orderkey", how="semi").join(
+    single_late_supplier_orders, on="l_orderkey", how="semi"
 )
 
-# Join to the supplier of interest list for the nation of interest
-df = df.join(
-    df_suppliers_of_interest, left_on=["suppkey"], right_on=["s_suppkey"], how="inner"
+# Attach the supplier name for suppliers in the nation of interest, count
+# one row per qualifying order, and return the top 100.
+df = (
+    df.join(
+        df_suppliers_of_interest,
+        left_on="l_suppkey",
+        right_on="s_suppkey",
+        how="inner",
+    )
+    .aggregate([col("s_name")], [F.count(col("l_orderkey")).alias("numwait")])
+    .sort(col("numwait").sort(ascending=False), col("s_name").sort())
+    .limit(100)
 )
 
-# Count how many orders that supplier is the only failed supplier for
-df = df.aggregate([col("s_name")], [F.count(col("o_orderkey")).alias("numwait")])
-
-# Return in descending order
-df = df.sort(col("numwait").sort(ascending=False), col("s_name").sort())
-
-df = df.limit(100)
-
 df.show()
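The two semi joins hinge on the claim that distinct-supplier counts can stand in for the SQL `exists` / `not exists` subqueries. A pure-Python sketch of that equivalence on toy (order, supplier) pairs (hypothetical data, not the DataFusion API):

```python
from collections import Counter

# (l_orderkey, l_suppkey) pairs for 'F'-status orders; order 1 has two
# suppliers, order 2 has one. Duplicates model repeated line items.
pairs = [(1, 10), (1, 11), (1, 10), (2, 20)]
late_pairs = [(1, 10), (2, 20)]  # pairs whose receipt was late

# count(distinct l_suppkey) per order: dedupe pairs first, then count,
# mirroring the select(...).distinct().aggregate(...) chains in the diff.
suppliers_per_order = Counter(o for o, _ in set(pairs))
late_per_order = Counter(o for o, _ in set(late_pairs))

multi_supplier = {o for o, n in suppliers_per_order.items() if n > 1}  # EXISTS
single_late = {o for o, n in late_per_order.items() if n == 1}  # NOT EXISTS

# Late pairs whose order qualifies on both counts: the two semi joins.
qualifying = [p for p in late_pairs if p[0] in multi_supplier and p[0] in single_late]
```

Order 2 is excluded because it has only one supplier overall; order 1 survives because two suppliers participated but only supplier 10 was late, so supplier 10 alone kept that order waiting.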
