Skip to content

Commit 0d16d17

Browse files
timsaucer and claude
committed
tpch examples: more idiomatic aggregate FILTER, string funcs, date handling
Additional sweep of the TPC-H DataFrame examples informed by comparing against a fresh set of SKILL.md-only generations under ``examples/tpch/agentic_queries/``: - Q02: ``F.ends_with(col("p_type"), lit(TYPE_OF_INTEREST))`` in place of ``F.strpos(col, lit) > 0``. The reference SQL is ``p_type like '%BRASS'``, which is an ends_with check, not contains. ``F.strpos > 0`` returned the correct rows on TPC-H data by coincidence but is semantically wrong. - Q09: ``F.contains(col("p_name"), lit(part_color))`` in place of ``F.strpos(col, lit) > 0``. The SQL is ``p_name like '%green%'``. - Q08, Q12, Q14: use the ``filter`` keyword on ``F.sum`` / ``F.count`` — the DataFrame form of SQL ``sum(...) FILTER (WHERE ...)`` — instead of wrapping the aggregate input in ``F.when(cond, x).otherwise(0)``. Q08 also reorganises to inner-join the supplier's nation onto the regional sales, which removes the previous left-join + ``F.when(is_not_null, ...)`` dance. - Q15: compute the grand maximum revenue as a separate scalar aggregate and ``join_on(...)`` on equality, instead of the whole-frame window ``F.max`` + filter shape. Simpler plan, same result. - Q16: ``F.regexp_like(col, pattern)`` in place of ``F.regexp_match(col, pattern).is_not_null()``. - Q04, Q05, Q06, Q07, Q08, Q10, Q12, Q14, Q15, Q20: store both the start and the end of the date window as plain ``datetime.date`` objects and compare with ``lit(end_date)``, instead of carrying the start date + ``pa.month_day_nano_interval`` and adding them at query-build time. Drops unused ``pyarrow`` imports from the files that no longer need Arrow scalars. All 22 answer-file comparisons still pass at scale factor 1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a0c0fb9 commit 0d16d17

13 files changed

Lines changed: 128 additions & 190 deletions

examples/tpch/q02_minimum_cost_supplier.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,12 @@
113113
"r_regionkey", "r_name"
114114
)
115115

116-
# Filter down parts. Part names contain the type of interest, so we can use strpos to find where
117-
# in the p_type column the word is. `strpos` will return 0 if not found, otherwise the position
118-
# in the string where it is located.
116+
# Filter down parts. The reference SQL uses ``p_type like '%BRASS'`` which
117+
# is an ``ends_with`` check; use the dedicated string function rather than
118+
# a manual substring match.
119119

120120
df_part = df_part.filter(
121-
F.strpos(col("p_type"), lit(TYPE_OF_INTEREST)) > 0,
121+
F.ends_with(col("p_type"), lit(TYPE_OF_INTEREST)),
122122
col("p_size") == SIZE_OF_INTEREST,
123123
)
124124

examples/tpch/q04_order_priority_checking.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,14 @@
5050
o_orderpriority;
5151
"""
5252

53-
from datetime import datetime
53+
from datetime import date
5454

55-
import pyarrow as pa
5655
from datafusion import SessionContext, col, lit
5756
from datafusion import functions as F
5857
from util import get_data_path
5958

60-
# Ideally we could put 3 months into the interval. See note below.
61-
INTERVAL_DAYS = 92
62-
DATE_OF_INTEREST = "1993-07-01"
59+
QUARTER_START = date(1993, 7, 1)
60+
QUARTER_END = date(1993, 10, 1)
6361

6462
# Load the dataframes we need
6563

@@ -72,17 +70,12 @@
7270
"l_orderkey", "l_commitdate", "l_receiptdate"
7371
)
7472

75-
# Create a date object from the string
76-
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
77-
78-
interval = pa.scalar((0, INTERVAL_DAYS, 0), type=pa.month_day_nano_interval())
79-
8073
# Keep only orders in the quarter of interest, then restrict to those that
8174
# have at least one late lineitem via a semi join (the DataFrame form of
8275
# ``EXISTS`` from the reference SQL).
8376
df_orders = df_orders.filter(
84-
col("o_orderdate") >= lit(date),
85-
col("o_orderdate") < lit(date) + lit(interval),
77+
col("o_orderdate") >= lit(QUARTER_START),
78+
col("o_orderdate") < lit(QUARTER_END),
8679
)
8780

8881
late_lineitems = df_lineitem.filter(col("l_commitdate") < col("l_receiptdate"))

examples/tpch/q05_local_supplier_volume.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,21 +56,16 @@
5656
revenue desc;
5757
"""
5858

59-
from datetime import datetime
59+
from datetime import date
6060

61-
import pyarrow as pa
6261
from datafusion import SessionContext, col, lit
6362
from datafusion import functions as F
6463
from util import get_data_path
6564

66-
DATE_OF_INTEREST = "1994-01-01"
67-
INTERVAL_DAYS = 365
65+
YEAR_START = date(1994, 1, 1)
66+
YEAR_END = date(1995, 1, 1)
6867
REGION_OF_INTEREST = "ASIA"
6968

70-
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
71-
72-
interval = pa.scalar((0, INTERVAL_DAYS, 0), type=pa.month_day_nano_interval())
73-
7469
# Load the dataframes we need
7570

7671
ctx = SessionContext()
@@ -96,8 +91,8 @@
9691

9792
# Restrict dataframes to cases of interest
9893
df_orders = df_orders.filter(
99-
col("o_orderdate") >= lit(date),
100-
col("o_orderdate") < lit(date) + lit(interval),
94+
col("o_orderdate") >= lit(YEAR_START),
95+
col("o_orderdate") < lit(YEAR_END),
10196
)
10297

10398
df_region = df_region.filter(col("r_name") == REGION_OF_INTEREST)

examples/tpch/q06_forecasting_revenue_change.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,26 +41,20 @@
4141
and l_quantity < 24;
4242
"""
4343

44-
from datetime import datetime
44+
from datetime import date
4545

46-
import pyarrow as pa
4746
from datafusion import SessionContext, col, lit
4847
from datafusion import functions as F
4948
from util import get_data_path
5049

5150
# Variables from the example query
5251

53-
DATE_OF_INTEREST = "1994-01-01"
52+
YEAR_START = date(1994, 1, 1)
53+
YEAR_END = date(1995, 1, 1)
5454
DISCOUT = 0.06
5555
DELTA = 0.01
5656
QUANTITY = 24
5757

58-
INTERVAL_DAYS = 365
59-
60-
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
61-
62-
interval = pa.scalar((0, INTERVAL_DAYS, 0), type=pa.month_day_nano_interval())
63-
6458
# Load the dataframes we need
6559

6660
ctx = SessionContext()
@@ -72,8 +66,8 @@
7266
# Filter down to lineitems of interest
7367

7468
df = df_lineitem.filter(
75-
col("l_shipdate") >= lit(date),
76-
col("l_shipdate") < lit(date) + lit(interval),
69+
col("l_shipdate") >= lit(YEAR_START),
70+
col("l_shipdate") < lit(YEAR_END),
7771
col("l_discount").between(lit(DISCOUT - DELTA), lit(DISCOUT + DELTA)),
7872
col("l_quantity") < QUANTITY,
7973
)

examples/tpch/q07_volume_shipping.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
l_year;
7171
"""
7272

73-
from datetime import datetime
73+
from datetime import date
7474

7575
import pyarrow as pa
7676
from datafusion import SessionContext, col, lit
@@ -82,11 +82,8 @@
8282
nation_1 = lit("FRANCE")
8383
nation_2 = lit("GERMANY")
8484

85-
START_DATE = "1995-01-01"
86-
END_DATE = "1996-12-31"
87-
88-
start_date = lit(datetime.strptime(START_DATE, "%Y-%m-%d").date())
89-
end_date = lit(datetime.strptime(END_DATE, "%Y-%m-%d").date())
85+
START_DATE = date(1995, 1, 1)
86+
END_DATE = date(1996, 12, 31)
9087

9188

9289
# Load the dataframes we need
@@ -112,7 +109,7 @@
112109

113110
# Filter to time of interest
114111
df_lineitem = df_lineitem.filter(
115-
col("l_shipdate") >= start_date, col("l_shipdate") <= end_date
112+
col("l_shipdate") >= lit(START_DATE), col("l_shipdate") <= lit(END_DATE)
116113
)
117114

118115

examples/tpch/q08_market_share.py

Lines changed: 36 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -67,22 +67,19 @@
6767
o_year;
6868
"""
6969

70-
from datetime import datetime
70+
from datetime import date
7171

7272
import pyarrow as pa
7373
from datafusion import SessionContext, col, lit
7474
from datafusion import functions as F
7575
from util import get_data_path
7676

77-
supplier_nation = lit("BRAZIL")
78-
customer_region = lit("AMERICA")
79-
part_of_interest = lit("ECONOMY ANODIZED STEEL")
77+
supplier_nation = "BRAZIL"
78+
customer_region = "AMERICA"
79+
part_of_interest = "ECONOMY ANODIZED STEEL"
8080

81-
START_DATE = "1995-01-01"
82-
END_DATE = "1996-12-31"
83-
84-
start_date = lit(datetime.strptime(START_DATE, "%Y-%m-%d").date())
85-
end_date = lit(datetime.strptime(END_DATE, "%Y-%m-%d").date())
81+
START_DATE = date(1995, 1, 1)
82+
END_DATE = date(1996, 12, 31)
8683

8784

8885
# Load the dataframes we need
@@ -115,67 +112,55 @@
115112
# Limit orders to those in the specified range
116113

117114
df_orders = df_orders.filter(
118-
col("o_orderdate") >= start_date, col("o_orderdate") <= end_date
115+
col("o_orderdate") >= lit(START_DATE), col("o_orderdate") <= lit(END_DATE)
119116
)
120117

121-
# Part 1: Find customers in the region
118+
# Pair each supplier with its nation name so every regional-customer row
119+
# below carries the supplier's nation and can be filtered inside the
120+
# aggregate with ``F.sum(..., filter=...)``.
122121

123-
# We want customers in region specified by region_of_interest. This will be used to compute
124-
# the total sales of the part of interest. We want to know of those sales what fraction
125-
# was supplied by the nation of interest. There is no guarantee that the nation of
126-
# interest is within the region of interest.
122+
df_supplier_with_nation = df_supplier.join(
123+
df_nation, left_on="s_nationkey", right_on="n_nationkey"
124+
).select("s_suppkey", col("n_name").alias("supp_nation"))
127125

128-
# First we find all the sales that make up the basis.
126+
# Build every (part, lineitem, order, customer) row for customers in the
127+
# target region ordering the target part. Each row carries the supplier's
128+
# nation so we can aggregate on it below.
129129

130-
df_regional_customers = (
130+
df = (
131131
df_region.filter(col("r_name") == customer_region)
132132
.join(df_nation, left_on="r_regionkey", right_on="n_regionkey")
133133
.join(df_customer, left_on="n_nationkey", right_on="c_nationkey")
134134
.join(df_orders, left_on="c_custkey", right_on="o_custkey")
135135
.join(df_lineitem, left_on="o_orderkey", right_on="l_orderkey")
136136
.join(df_part, left_on="l_partkey", right_on="p_partkey")
137-
.with_column("volume", col("l_extendedprice") * (lit(1.0) - col("l_discount")))
138-
)
139-
140-
# Part 2: Find suppliers from the nation
141-
142-
# Now that we have all of the sales of that part in the specified region, we need
143-
# to determine which of those came from suppliers in the nation we are interested in.
144-
145-
df_national_suppliers = (
146-
df_nation.filter(col("n_name") == supplier_nation)
147-
.join(df_supplier, left_on="n_nationkey", right_on="s_nationkey")
148-
.select("s_suppkey")
149-
)
150-
151-
152-
# Part 3: Combine suppliers and customers and compute the market share
153-
154-
# Left-outer join the national suppliers onto the regional sales. Rows from
155-
# other suppliers get a NULL ``s_suppkey``, which the CASE expression uses
156-
# to zero out the non-national volume.
157-
158-
df = df_regional_customers.join(
159-
df_national_suppliers, left_on="l_suppkey", right_on="s_suppkey", how="left"
160-
).with_columns(
161-
national_volume=F.when(col("s_suppkey").is_not_null(), col("volume")).otherwise(
162-
lit(0.0)
163-
),
164-
o_year=F.datepart(lit("year"), col("o_orderdate")).cast(pa.int32()),
137+
.join(df_supplier_with_nation, left_on="l_suppkey", right_on="s_suppkey")
138+
.with_columns(
139+
volume=col("l_extendedprice") * (lit(1.0) - col("l_discount")),
140+
o_year=F.datepart(lit("year"), col("o_orderdate")).cast(pa.int32()),
141+
)
165142
)
166143

167-
168-
# Aggregate, compute the share, and sort.
169-
144+
# Aggregate the total and national volumes per year via the ``filter``
145+
# kwarg on ``F.sum`` (DataFrame form of SQL ``sum(... ) FILTER (WHERE ...)``).
146+
# ``coalesce`` handles the case where no sale came from the target nation
147+
# for a given year.
170148
df = (
171149
df.aggregate(
172150
["o_year"],
173151
[
174-
F.sum(col("volume")).alias("volume"),
175-
F.sum(col("national_volume")).alias("national_volume"),
152+
F.sum(col("volume"), filter=col("supp_nation") == supplier_nation).alias(
153+
"national_volume"
154+
),
155+
F.sum(col("volume")).alias("total_volume"),
176156
],
177157
)
178-
.select("o_year", (col("national_volume") / col("volume")).alias("mkt_share"))
158+
.select(
159+
"o_year",
160+
(F.coalesce(col("national_volume"), lit(0.0)) / col("total_volume")).alias(
161+
"mkt_share"
162+
),
163+
)
179164
.sort_by("o_year")
180165
)
181166

examples/tpch/q09_product_type_profit_measure.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
from datafusion import functions as F
7070
from util import get_data_path
7171

72-
part_color = lit("green")
72+
part_color = "green"
7373

7474
# Load the dataframes we need
7575

@@ -98,9 +98,10 @@
9898
)
9999

100100
# Limit possible parts to the color specified, then walk the joins down to the
101-
# line-item rows we need and attach the supplier's nation.
101+
# line-item rows we need and attach the supplier's nation. ``F.contains``
102+
# maps directly to the reference SQL's ``p_name like '%green%'``.
102103
df = (
103-
df_part.filter(F.strpos(col("p_name"), part_color) > 0)
104+
df_part.filter(F.contains(col("p_name"), lit(part_color)))
104105
.join(df_lineitem, left_on="p_partkey", right_on="l_partkey")
105106
.join(df_supplier, left_on="l_suppkey", right_on="s_suppkey")
106107
.join(df_orders, left_on="l_orderkey", right_on="o_orderkey")

examples/tpch/q10_returned_item_reporting.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -63,18 +63,14 @@
6363
revenue desc limit 20;
6464
"""
6565

66-
from datetime import datetime
66+
from datetime import date
6767

68-
import pyarrow as pa
6968
from datafusion import SessionContext, col, lit
7069
from datafusion import functions as F
7170
from util import get_data_path
7271

73-
DATE_START_OF_QUARTER = "1993-10-01"
74-
75-
date_start_of_quarter = lit(datetime.strptime(DATE_START_OF_QUARTER, "%Y-%m-%d").date())
76-
77-
interval_one_quarter = lit(pa.scalar((0, 92, 0), type=pa.month_day_nano_interval()))
72+
QUARTER_START = date(1993, 10, 1)
73+
QUARTER_END = date(1994, 1, 1)
7874

7975
# Load the dataframes we need
8076

@@ -108,8 +104,8 @@
108104

109105
df = (
110106
df_orders.filter(
111-
col("o_orderdate") >= date_start_of_quarter,
112-
col("o_orderdate") < date_start_of_quarter + interval_one_quarter,
107+
col("o_orderdate") >= lit(QUARTER_START),
108+
col("o_orderdate") < lit(QUARTER_END),
113109
)
114110
.join(df_lineitem, left_on="o_orderkey", right_on="l_orderkey")
115111
.aggregate(

0 commit comments

Comments
 (0)