Skip to content

Commit a0c0fb9

Browse files
timsaucer and claude
committed
tpch examples: apply SKILL.md idioms across all 22 queries
Sweep every q01..q22 example for idiomatic DataFrame style as described in the repo-root SKILL.md: - ``col("x") == "s"`` in place of ``col("x") == lit("s")`` on comparison right-hand sides (auto-wrap applies). - Plain-name strings in ``select``/``aggregate``/``sort`` group/sort key lists when the key is a bare column. - Drop redundant ``how="inner"`` and single-element ``left_on``/``right_on`` list wrapping on equi-joins. - Collapse chained ``.filter(a).filter(b)`` runs into ``.filter(a, b)`` and chained ``.with_column`` runs into ``.with_columns(a=..., b=...)``. - ``df.sort_by(...)`` or plain-name ``df.sort(...)`` when no null-placement override is needed. - ``F.count_star()`` in place of ``F.count(col("x"))`` whenever the SQL reads ``count(*)``. - ``F.starts_with(col, lit(prefix))`` and ``~F.starts_with(...)`` in place of substring-prefix equality/inequality tricks. - ``F.in_list(col, [lit(...)])`` in place of ``~F.array_position(...).is_null()`` and in place of disjunctions of equality comparisons. - Searched ``F.when(cond, x).otherwise(y)`` in place of switched ``F.case(bool_expr).when(lit(True/False), x).end()`` forms. - Semi-joins as the DataFrame form of ``EXISTS`` (Q04); anti-joins as ``NOT EXISTS`` (Q22 was already using this idiom). - Whole-frame window aggregates as the DataFrame stand-in for a SQL scalar subquery (Q11/Q15/Q17/Q22). Individual query fixes of note: - Q16 — add the secondary sort keys (``p_brand``, ``p_type``, ``p_size``) that the TPC-H spec requires but the original DataFrame omitted. - Q22 — drop a stray ``df.show()`` mid-pipeline; replace the 0-based substring slice with ``F.left(col("c_phone"), lit(2))``. - Q14 — rewrite the promo/non-promo factor split as a searched CASE inside ``F.sum(...)`` so the DataFrame expression matches the reference SQL shape exactly. All 22 answer-file comparisons still pass at scale factor 1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 1878a46 commit a0c0fb9

22 files changed

Lines changed: 374 additions & 522 deletions

examples/tpch/q01_pricing_summary_report.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -82,31 +82,25 @@
8282

8383
# Aggregate the results
8484

85+
disc_price = col("l_extendedprice") * (lit(1) - col("l_discount"))
86+
8587
df = df.aggregate(
86-
[col("l_returnflag"), col("l_linestatus")],
88+
["l_returnflag", "l_linestatus"],
8789
[
8890
F.sum(col("l_quantity")).alias("sum_qty"),
8991
F.sum(col("l_extendedprice")).alias("sum_base_price"),
90-
F.sum(col("l_extendedprice") * (lit(1) - col("l_discount"))).alias(
91-
"sum_disc_price"
92-
),
93-
F.sum(
94-
col("l_extendedprice")
95-
* (lit(1) - col("l_discount"))
96-
* (lit(1) + col("l_tax"))
97-
).alias("sum_charge"),
92+
F.sum(disc_price).alias("sum_disc_price"),
93+
F.sum(disc_price * (lit(1) + col("l_tax"))).alias("sum_charge"),
9894
F.avg(col("l_quantity")).alias("avg_qty"),
9995
F.avg(col("l_extendedprice")).alias("avg_price"),
10096
F.avg(col("l_discount")).alias("avg_disc"),
101-
F.count(col("l_returnflag")).alias(
102-
"count_order"
103-
), # Counting any column should return same result
97+
F.count_star().alias("count_order"),
10498
],
10599
)
106100

107101
# Sort per the expected result
108102

109-
df = df.sort(col("l_returnflag").sort(), col("l_linestatus").sort())
103+
df = df.sort_by("l_returnflag", "l_linestatus")
110104

111105
# Note: There appears to be a discrepancy between what is returned here and what is in the generated
112106
# answers file for the case of return flag N and line status O, but I did not investigate further.

examples/tpch/q02_minimum_cost_supplier.py

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -118,30 +118,25 @@
118118
# in the string where it is located.
119119

120120
df_part = df_part.filter(
121-
F.strpos(col("p_type"), lit(TYPE_OF_INTEREST)) > lit(0)
122-
).filter(col("p_size") == lit(SIZE_OF_INTEREST))
121+
F.strpos(col("p_type"), lit(TYPE_OF_INTEREST)) > 0,
122+
col("p_size") == SIZE_OF_INTEREST,
123+
)
123124

124125
# Filter regions down to the one of interest
125126

126-
df_region = df_region.filter(col("r_name") == lit(REGION_OF_INTEREST))
127+
df_region = df_region.filter(col("r_name") == REGION_OF_INTEREST)
127128

128129
# Now that we have the region, find suppliers in that region. Suppliers are tied to their nation
129130
# and nations are tied to the region.
130131

131-
df_nation = df_nation.join(
132-
df_region, left_on=["n_regionkey"], right_on=["r_regionkey"], how="inner"
133-
)
134-
df_supplier = df_supplier.join(
135-
df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner"
136-
)
132+
df_nation = df_nation.join(df_region, left_on="n_regionkey", right_on="r_regionkey")
133+
df_supplier = df_supplier.join(df_nation, left_on="s_nationkey", right_on="n_nationkey")
137134

138135
# Now that we know who the potential suppliers are for the part, we can limit out part
139136
# supplies table down. We can further join down to the specific parts we've identified
140137
# as matching the request
141138

142-
df = df_partsupp.join(
143-
df_supplier, left_on=["ps_suppkey"], right_on=["s_suppkey"], how="inner"
144-
)
139+
df = df_partsupp.join(df_supplier, left_on="ps_suppkey", right_on="s_suppkey")
145140

146141
# Locate the minimum cost across all suppliers. There are multiple ways you could do this,
147142
# but one way is to create a window function across all suppliers, find the minimum, and
@@ -158,9 +153,9 @@
158153
),
159154
)
160155

161-
df = df.filter(col("min_cost") == col("ps_supplycost"))
162-
163-
df = df.join(df_part, left_on=["ps_partkey"], right_on=["p_partkey"], how="inner")
156+
df = df.filter(col("min_cost") == col("ps_supplycost")).join(
157+
df_part, left_on="ps_partkey", right_on="p_partkey"
158+
)
164159

165160
# From the problem statement, these are the values we wish to output
166161

@@ -178,12 +173,10 @@
178173
# Sort and display 100 entries
179174
df = df.sort(
180175
col("s_acctbal").sort(ascending=False),
181-
col("n_name").sort(),
182-
col("s_name").sort(),
183-
col("p_partkey").sort(),
184-
)
185-
186-
df = df.limit(100)
176+
"n_name",
177+
"s_name",
178+
"p_partkey",
179+
).limit(100)
187180

188181
# Show results
189182

examples/tpch/q03_shipping_priority.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -75,38 +75,34 @@
7575

7676
# Limit dataframes to the rows of interest
7777

78-
df_customer = df_customer.filter(col("c_mktsegment") == lit(SEGMENT_OF_INTEREST))
78+
df_customer = df_customer.filter(col("c_mktsegment") == SEGMENT_OF_INTEREST)
7979
df_orders = df_orders.filter(col("o_orderdate") < lit(DATE_OF_INTEREST))
8080
df_lineitem = df_lineitem.filter(col("l_shipdate") > lit(DATE_OF_INTEREST))
8181

8282
# Join all 3 dataframes
8383

84-
df = df_customer.join(
85-
df_orders, left_on=["c_custkey"], right_on=["o_custkey"], how="inner"
86-
).join(df_lineitem, left_on=["o_orderkey"], right_on=["l_orderkey"], how="inner")
84+
df = df_customer.join(df_orders, left_on="c_custkey", right_on="o_custkey").join(
85+
df_lineitem, left_on="o_orderkey", right_on="l_orderkey"
86+
)
8787

8888
# Compute the revenue
8989

9090
df = df.aggregate(
91-
[col("l_orderkey")],
91+
["l_orderkey"],
9292
[
9393
F.first_value(col("o_orderdate")).alias("o_orderdate"),
9494
F.first_value(col("o_shippriority")).alias("o_shippriority"),
9595
F.sum(col("l_extendedprice") * (lit(1.0) - col("l_discount"))).alias("revenue"),
9696
],
9797
)
9898

99-
# Sort by priority
100-
101-
df = df.sort(col("revenue").sort(ascending=False), col("o_orderdate").sort())
102-
103-
# Only return 10 results
99+
# Sort by priority, take 10, and project in the order expected by the spec.
104100

105-
df = df.limit(10)
106-
107-
# Change the order that the columns are reported in just to match the spec
108-
109-
df = df.select("l_orderkey", "revenue", "o_orderdate", "o_shippriority")
101+
df = (
102+
df.sort(col("revenue").sort(ascending=False), "o_orderdate")
103+
.limit(10)
104+
.select("l_orderkey", "revenue", "o_orderdate", "o_shippriority")
105+
)
110106

111107
# Show result
112108

examples/tpch/q04_order_priority_checking.py

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -77,31 +77,23 @@
7777

7878
interval = pa.scalar((0, INTERVAL_DAYS, 0), type=pa.month_day_nano_interval())
7979

80-
# Limit results to cases where commitment date before receipt date, then
81-
# reduce to a single row per order so the join with the orders table is a
82-
# semantic EXISTS rather than a fan-out.
83-
df_lineitem = (
84-
df_lineitem.filter(col("l_commitdate") < col("l_receiptdate"))
85-
.select("l_orderkey")
86-
.distinct()
80+
# Keep only orders in the quarter of interest, then restrict to those that
81+
# have at least one late lineitem via a semi join (the DataFrame form of
82+
# ``EXISTS`` from the reference SQL).
83+
df_orders = df_orders.filter(
84+
col("o_orderdate") >= lit(date),
85+
col("o_orderdate") < lit(date) + lit(interval),
8786
)
8887

89-
# Limit orders to date range of interest
90-
df_orders = df_orders.filter(col("o_orderdate") >= lit(date)).filter(
91-
col("o_orderdate") < lit(date) + lit(interval)
92-
)
88+
late_lineitems = df_lineitem.filter(col("l_commitdate") < col("l_receiptdate"))
9389

94-
# Perform the join to find only orders for which there are lineitems outside of expected range
9590
df = df_orders.join(
96-
df_lineitem, left_on=["o_orderkey"], right_on=["l_orderkey"], how="inner"
91+
late_lineitems, left_on="o_orderkey", right_on="l_orderkey", how="semi"
9792
)
9893

99-
# Based on priority, find the number of entries
100-
df = df.aggregate(
101-
[col("o_orderpriority")], [F.count(col("o_orderpriority")).alias("order_count")]
94+
# Count the number of orders in each priority group and sort.
95+
df = df.aggregate(["o_orderpriority"], [F.count_star().alias("order_count")]).sort_by(
96+
"o_orderpriority"
10297
)
10398

104-
# Sort the results
105-
df = df.sort(col("o_orderpriority").sort())
106-
10799
df.show()

examples/tpch/q05_local_supplier_volume.py

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -95,38 +95,32 @@
9595
)
9696

9797
# Restrict dataframes to cases of interest
98-
df_orders = df_orders.filter(col("o_orderdate") >= lit(date)).filter(
99-
col("o_orderdate") < lit(date) + lit(interval)
98+
df_orders = df_orders.filter(
99+
col("o_orderdate") >= lit(date),
100+
col("o_orderdate") < lit(date) + lit(interval),
100101
)
101102

102-
df_region = df_region.filter(col("r_name") == lit(REGION_OF_INTEREST))
103+
df_region = df_region.filter(col("r_name") == REGION_OF_INTEREST)
103104

104105
# Join all the dataframes
105106

106107
df = (
107-
df_customer.join(
108-
df_orders, left_on=["c_custkey"], right_on=["o_custkey"], how="inner"
109-
)
110-
.join(df_lineitem, left_on=["o_orderkey"], right_on=["l_orderkey"], how="inner")
108+
df_customer.join(df_orders, left_on="c_custkey", right_on="o_custkey")
109+
.join(df_lineitem, left_on="o_orderkey", right_on="l_orderkey")
111110
.join(
112111
df_supplier,
113112
left_on=["l_suppkey", "c_nationkey"],
114113
right_on=["s_suppkey", "s_nationkey"],
115-
how="inner",
116114
)
117-
.join(df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner")
118-
.join(df_region, left_on=["n_regionkey"], right_on=["r_regionkey"], how="inner")
115+
.join(df_nation, left_on="s_nationkey", right_on="n_nationkey")
116+
.join(df_region, left_on="n_regionkey", right_on="r_regionkey")
119117
)
120118

121-
# Compute the final result
119+
# Compute the final result, then sort in descending order.
122120

123121
df = df.aggregate(
124-
[col("n_name")],
122+
["n_name"],
125123
[F.sum(col("l_extendedprice") * (lit(1.0) - col("l_discount"))).alias("revenue")],
126-
)
127-
128-
# Sort in descending order
129-
130-
df = df.sort(col("revenue").sort(ascending=False))
124+
).sort(col("revenue").sort(ascending=False))
131125

132126
df.show()

examples/tpch/q06_forecasting_revenue_change.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,11 @@
7171

7272
# Filter down to lineitems of interest
7373

74-
df = (
75-
df_lineitem.filter(col("l_shipdate") >= lit(date))
76-
.filter(col("l_shipdate") < lit(date) + lit(interval))
77-
.filter(col("l_discount") >= lit(DISCOUT) - lit(DELTA))
78-
.filter(col("l_discount") <= lit(DISCOUT) + lit(DELTA))
79-
.filter(col("l_quantity") < lit(QUANTITY))
74+
df = df_lineitem.filter(
75+
col("l_shipdate") >= lit(date),
76+
col("l_shipdate") < lit(date) + lit(interval),
77+
col("l_discount").between(lit(DISCOUT - DELTA), lit(DISCOUT + DELTA)),
78+
col("l_quantity") < QUANTITY,
8079
)
8180

8281
# Add up all the "lost" revenue

examples/tpch/q07_volume_shipping.py

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@
111111

112112

113113
# Filter to time of interest
114-
df_lineitem = df_lineitem.filter(col("l_shipdate") >= start_date).filter(
115-
col("l_shipdate") <= end_date
114+
df_lineitem = df_lineitem.filter(
115+
col("l_shipdate") >= start_date, col("l_shipdate") <= end_date
116116
)
117117

118118

@@ -122,37 +122,33 @@
122122

123123
# Limit suppliers to either nation
124124
df_supplier = df_supplier.join(
125-
df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner"
126-
).select(col("s_suppkey"), col("n_name").alias("supp_nation"))
125+
df_nation, left_on="s_nationkey", right_on="n_nationkey"
126+
).select("s_suppkey", col("n_name").alias("supp_nation"))
127127

128128
# Limit customers to either nation
129129
df_customer = df_customer.join(
130-
df_nation, left_on=["c_nationkey"], right_on=["n_nationkey"], how="inner"
131-
).select(col("c_custkey"), col("n_name").alias("cust_nation"))
130+
df_nation, left_on="c_nationkey", right_on="n_nationkey"
131+
).select("c_custkey", col("n_name").alias("cust_nation"))
132132

133133
# Join up all the data frames from line items, and make sure the supplier and customer are in
134134
# different nations.
135135
df = (
136-
df_lineitem.join(
137-
df_orders, left_on=["l_orderkey"], right_on=["o_orderkey"], how="inner"
138-
)
139-
.join(df_customer, left_on=["o_custkey"], right_on=["c_custkey"], how="inner")
140-
.join(df_supplier, left_on=["l_suppkey"], right_on=["s_suppkey"], how="inner")
136+
df_lineitem.join(df_orders, left_on="l_orderkey", right_on="o_orderkey")
137+
.join(df_customer, left_on="o_custkey", right_on="c_custkey")
138+
.join(df_supplier, left_on="l_suppkey", right_on="s_suppkey")
141139
.filter(col("cust_nation") != col("supp_nation"))
142140
)
143141

144142
# Extract out two values for every line item
145-
df = df.with_column(
146-
"l_year", F.datepart(lit("year"), col("l_shipdate")).cast(pa.int32())
147-
).with_column("volume", col("l_extendedprice") * (lit(1.0) - col("l_discount")))
143+
df = df.with_columns(
144+
l_year=F.datepart(lit("year"), col("l_shipdate")).cast(pa.int32()),
145+
volume=col("l_extendedprice") * (lit(1.0) - col("l_discount")),
146+
)
148147

149-
# Aggregate the results
148+
# Aggregate and sort per the spec.
150149
df = df.aggregate(
151-
[col("supp_nation"), col("cust_nation"), col("l_year")],
150+
["supp_nation", "cust_nation", "l_year"],
152151
[F.sum(col("volume")).alias("revenue")],
153-
)
154-
155-
# Sort based on problem statement requirements
156-
df = df.sort(col("supp_nation").sort(), col("cust_nation").sort(), col("l_year").sort())
152+
).sort_by("supp_nation", "cust_nation", "l_year")
157153

158154
df.show()

0 commit comments

Comments
 (0)