Skip to content

Commit e808db8

Browse files
timsaucer and claude committed
tpch examples: add reference SQL to each query, fix Q20
- Append the canonical TPC-H reference SQL (from benchmarks/tpch/queries/) to each q01..q22 module docstring so readers can compare the DataFrame translation against the SQL at a glance.
- Fix Q20: the result of the `ps_availqty` filter was never assigned back to `df`, so the filter was silently dropped from the pipeline. The statement now reads `df = df.filter(col("ps_availqty") > lit(0.5) * col("total_sold"))`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c8bb9f7 commit e808db8

22 files changed

Lines changed: 692 additions & 1 deletion

examples/tpch/q01_pricing_summary_report.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,30 @@
2727
2828
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2929
as part of their TPC Benchmark H Specification revision 2.18.0.
30+
31+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
32+
33+
select
34+
l_returnflag,
35+
l_linestatus,
36+
sum(l_quantity) as sum_qty,
37+
sum(l_extendedprice) as sum_base_price,
38+
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
39+
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
40+
avg(l_quantity) as avg_qty,
41+
avg(l_extendedprice) as avg_price,
42+
avg(l_discount) as avg_disc,
43+
count(*) as count_order
44+
from
45+
lineitem
46+
where
47+
l_shipdate <= date '1998-12-01' - interval '68 days'
48+
group by
49+
l_returnflag,
50+
l_linestatus
51+
order by
52+
l_returnflag,
53+
l_linestatus;
3054
"""
3155

3256
import pyarrow as pa

examples/tpch/q02_minimum_cost_supplier.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,52 @@
2727
2828
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2929
as part of their TPC Benchmark H Specification revision 2.18.0.
30+
31+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
32+
33+
select
34+
s_acctbal,
35+
s_name,
36+
n_name,
37+
p_partkey,
38+
p_mfgr,
39+
s_address,
40+
s_phone,
41+
s_comment
42+
from
43+
part,
44+
supplier,
45+
partsupp,
46+
nation,
47+
region
48+
where
49+
p_partkey = ps_partkey
50+
and s_suppkey = ps_suppkey
51+
and p_size = 48
52+
and p_type like '%TIN'
53+
and s_nationkey = n_nationkey
54+
and n_regionkey = r_regionkey
55+
and r_name = 'ASIA'
56+
and ps_supplycost = (
57+
select
58+
min(ps_supplycost)
59+
from
60+
partsupp,
61+
supplier,
62+
nation,
63+
region
64+
where
65+
p_partkey = ps_partkey
66+
and s_suppkey = ps_suppkey
67+
and s_nationkey = n_nationkey
68+
and n_regionkey = r_regionkey
69+
and r_name = 'ASIA'
70+
)
71+
order by
72+
s_acctbal desc,
73+
n_name,
74+
s_name,
75+
p_partkey limit 100;
3076
"""
3177

3278
import datafusion

examples/tpch/q03_shipping_priority.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,31 @@
2525
2626
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2727
as part of their TPC Benchmark H Specification revision 2.18.0.
28+
29+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
30+
31+
select
32+
l_orderkey,
33+
sum(l_extendedprice * (1 - l_discount)) as revenue,
34+
o_orderdate,
35+
o_shippriority
36+
from
37+
customer,
38+
orders,
39+
lineitem
40+
where
41+
c_mktsegment = 'BUILDING'
42+
and c_custkey = o_custkey
43+
and l_orderkey = o_orderkey
44+
and o_orderdate < date '1995-03-15'
45+
and l_shipdate > date '1995-03-15'
46+
group by
47+
l_orderkey,
48+
o_orderdate,
49+
o_shippriority
50+
order by
51+
revenue desc,
52+
o_orderdate limit 10;
2853
"""
2954

3055
from datafusion import SessionContext, col, lit

examples/tpch/q04_order_priority_checking.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,30 @@
2424
2525
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2626
as part of their TPC Benchmark H Specification revision 2.18.0.
27+
28+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
29+
30+
select
31+
o_orderpriority,
32+
count(*) as order_count
33+
from
34+
orders
35+
where
36+
o_orderdate >= date '1995-04-01'
37+
and o_orderdate < date '1995-04-01' + interval '3' month
38+
and exists (
39+
select
40+
*
41+
from
42+
lineitem
43+
where
44+
l_orderkey = o_orderkey
45+
and l_commitdate < l_receiptdate
46+
)
47+
group by
48+
o_orderpriority
49+
order by
50+
o_orderpriority;
2751
"""
2852

2953
from datetime import datetime

examples/tpch/q05_local_supplier_volume.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,33 @@
2727
2828
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2929
as part of their TPC Benchmark H Specification revision 2.18.0.
30+
31+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
32+
33+
select
34+
n_name,
35+
sum(l_extendedprice * (1 - l_discount)) as revenue
36+
from
37+
customer,
38+
orders,
39+
lineitem,
40+
supplier,
41+
nation,
42+
region
43+
where
44+
c_custkey = o_custkey
45+
and l_orderkey = o_orderkey
46+
and l_suppkey = s_suppkey
47+
and c_nationkey = s_nationkey
48+
and s_nationkey = n_nationkey
49+
and n_regionkey = r_regionkey
50+
and r_name = 'AFRICA'
51+
and o_orderdate >= date '1994-01-01'
52+
and o_orderdate < date '1994-01-01' + interval '1' year
53+
group by
54+
n_name
55+
order by
56+
revenue desc;
3057
"""
3158

3259
from datetime import datetime

examples/tpch/q06_forecasting_revenue_change.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@
2727
2828
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2929
as part of their TPC Benchmark H Specification revision 2.18.0.
30+
31+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
32+
33+
select
34+
sum(l_extendedprice * l_discount) as revenue
35+
from
36+
lineitem
37+
where
38+
l_shipdate >= date '1994-01-01'
39+
and l_shipdate < date '1994-01-01' + interval '1' year
40+
and l_discount between 0.04 - 0.01 and 0.04 + 0.01
41+
and l_quantity < 24;
3042
"""
3143

3244
from datetime import datetime

examples/tpch/q07_volume_shipping.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,48 @@
2626
2727
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2828
as part of their TPC Benchmark H Specification revision 2.18.0.
29+
30+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
31+
32+
select
33+
supp_nation,
34+
cust_nation,
35+
l_year,
36+
sum(volume) as revenue
37+
from
38+
(
39+
select
40+
n1.n_name as supp_nation,
41+
n2.n_name as cust_nation,
42+
extract(year from l_shipdate) as l_year,
43+
l_extendedprice * (1 - l_discount) as volume
44+
from
45+
supplier,
46+
lineitem,
47+
orders,
48+
customer,
49+
nation n1,
50+
nation n2
51+
where
52+
s_suppkey = l_suppkey
53+
and o_orderkey = l_orderkey
54+
and c_custkey = o_custkey
55+
and s_nationkey = n1.n_nationkey
56+
and c_nationkey = n2.n_nationkey
57+
and (
58+
(n1.n_name = 'GERMANY' and n2.n_name = 'IRAQ')
59+
or (n1.n_name = 'IRAQ' and n2.n_name = 'GERMANY')
60+
)
61+
and l_shipdate between date '1995-01-01' and date '1996-12-31'
62+
) as shipping
63+
group by
64+
supp_nation,
65+
cust_nation,
66+
l_year
67+
order by
68+
supp_nation,
69+
cust_nation,
70+
l_year;
2971
"""
3072

3173
from datetime import datetime

examples/tpch/q08_market_share.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,46 @@
2525
2626
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2727
as part of their TPC Benchmark H Specification revision 2.18.0.
28+
29+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
30+
31+
select
32+
o_year,
33+
sum(case
34+
when nation = 'IRAQ' then volume
35+
else 0
36+
end) / sum(volume) as mkt_share
37+
from
38+
(
39+
select
40+
extract(year from o_orderdate) as o_year,
41+
l_extendedprice * (1 - l_discount) as volume,
42+
n2.n_name as nation
43+
from
44+
part,
45+
supplier,
46+
lineitem,
47+
orders,
48+
customer,
49+
nation n1,
50+
nation n2,
51+
region
52+
where
53+
p_partkey = l_partkey
54+
and s_suppkey = l_suppkey
55+
and l_orderkey = o_orderkey
56+
and o_custkey = c_custkey
57+
and c_nationkey = n1.n_nationkey
58+
and n1.n_regionkey = r_regionkey
59+
and r_name = 'MIDDLE EAST'
60+
and s_nationkey = n2.n_nationkey
61+
and o_orderdate between date '1995-01-01' and date '1996-12-31'
62+
and p_type = 'LARGE PLATED STEEL'
63+
) as all_nations
64+
group by
65+
o_year
66+
order by
67+
o_year;
2868
"""
2969

3070
from datetime import datetime

examples/tpch/q09_product_type_profit_measure.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,41 @@
2727
2828
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2929
as part of their TPC Benchmark H Specification revision 2.18.0.
30+
31+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
32+
33+
select
34+
nation,
35+
o_year,
36+
sum(amount) as sum_profit
37+
from
38+
(
39+
select
40+
n_name as nation,
41+
extract(year from o_orderdate) as o_year,
42+
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
43+
from
44+
part,
45+
supplier,
46+
lineitem,
47+
partsupp,
48+
orders,
49+
nation
50+
where
51+
s_suppkey = l_suppkey
52+
and ps_suppkey = l_suppkey
53+
and ps_partkey = l_partkey
54+
and p_partkey = l_partkey
55+
and o_orderkey = l_orderkey
56+
and s_nationkey = n_nationkey
57+
and p_name like '%moccasin%'
58+
) as profit
59+
group by
60+
nation,
61+
o_year
62+
order by
63+
nation,
64+
o_year desc;
3065
"""
3166

3267
import pyarrow as pa

examples/tpch/q10_returned_item_reporting.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,40 @@
2727
2828
The above problem statement text is copyrighted by the Transaction Processing Performance Council
2929
as part of their TPC Benchmark H Specification revision 2.18.0.
30+
31+
Reference SQL (from TPC-H specification, used by the benchmark suite)::
32+
33+
select
34+
c_custkey,
35+
c_name,
36+
sum(l_extendedprice * (1 - l_discount)) as revenue,
37+
c_acctbal,
38+
n_name,
39+
c_address,
40+
c_phone,
41+
c_comment
42+
from
43+
customer,
44+
orders,
45+
lineitem,
46+
nation
47+
where
48+
c_custkey = o_custkey
49+
and l_orderkey = o_orderkey
50+
and o_orderdate >= date '1993-07-01'
51+
and o_orderdate < date '1993-07-01' + interval '3' month
52+
and l_returnflag = 'R'
53+
and c_nationkey = n_nationkey
54+
group by
55+
c_custkey,
56+
c_name,
57+
c_acctbal,
58+
c_phone,
59+
n_name,
60+
c_address,
61+
c_comment
62+
order by
63+
revenue desc limit 20;
3064
"""
3165

3266
from datetime import datetime

0 commit comments

Comments
 (0)