|
92 | 92 | ) |
93 | 93 |
|
94 | 94 | # Limit to suppliers in the nation of interest |
95 | | -df_suppliers_of_interest = df_nation.filter(col("n_name") == lit(NATION_OF_INTEREST)) |
96 | | - |
97 | | -df_suppliers_of_interest = df_suppliers_of_interest.join( |
98 | | - df_supplier, left_on="n_nationkey", right_on="s_nationkey", how="inner" |
| 95 | +df_suppliers_of_interest = df_nation.filter( |
| 96 | + col("n_name") == lit(NATION_OF_INTEREST) |
| 97 | +).join(df_supplier, left_on="n_nationkey", right_on="s_nationkey", how="inner") |
| 98 | + |
| 99 | +# Line items for orders that have status 'F'. This is the candidate set of |
| 100 | +# (order, supplier) pairs we reason about below. |
| 101 | +failed_order_lineitems = df_lineitem.join( |
| 102 | + df_orders.filter(col("o_orderstatus") == lit("F")), |
| 103 | + left_on="l_orderkey", |
| 104 | + right_on="o_orderkey", |
| 105 | + how="inner", |
99 | 106 | ) |
100 | 107 |
|
101 | | -# Find the failed orders and all their line items |
102 | | -df = df_orders.filter(col("o_orderstatus") == lit("F")) |
103 | | - |
104 | | -df = df_lineitem.join(df, left_on="l_orderkey", right_on="o_orderkey", how="inner") |
105 | | - |
106 | | -# Identify the line items that caused the order to fail. |
107 | | -df = df.with_column( |
108 | | - "failed_supp", |
109 | | - F.case(col("l_receiptdate") > col("l_commitdate")) |
110 | | - .when(lit(value=True), col("l_suppkey")) |
111 | | - .end(), |
| 108 | +# Line items whose receipt was late. This corresponds to ``l1`` in the |
| 109 | +# reference SQL. |
| 110 | +late_lineitems = failed_order_lineitems.filter( |
| 111 | + col("l_receiptdate") > col("l_commitdate") |
112 | 112 | ) |
113 | 113 |
|
114 | | -# There are other ways we could do this but the purpose of this example is to work with rows where |
115 | | -# an element is an array of values. In this case, we will create two columns of arrays. One will be |
116 | | -# an array of all of the suppliers who made up this order. That way we can filter the dataframe for |
117 | | -# only orders where this array is larger than one for multiple supplier orders. The second column |
118 | | -# is all of the suppliers who failed to make their commitment. We can filter the second column for |
119 | | -# arrays with size one. That combination will give us orders that had multiple suppliers where only |
120 | | -# one failed. Use distinct=True in the aggregation below so we don't get multiple line items from the |
121 | | -# same supplier reported in either array. |
122 | | -df = df.aggregate( |
123 | | - [col("o_orderkey")], |
124 | | - [ |
125 | | - F.array_agg(col("l_suppkey"), distinct=True).alias("all_suppliers"), |
126 | | - F.array_agg( |
127 | | - col("failed_supp"), filter=col("failed_supp").is_not_null(), distinct=True |
128 | | - ).alias("failed_suppliers"), |
129 | | - ], |
| 114 | +# Orders that had more than one distinct supplier. Expressed as |
| 115 | +# ``count(distinct l_suppkey) > 1``. Stands in for the reference SQL's |
| 116 | +# ``exists (... l2.l_suppkey <> l1.l_suppkey ...)`` subquery. |
| 117 | +multi_supplier_orders = ( |
| 118 | + failed_order_lineitems.select("l_orderkey", "l_suppkey") |
| 119 | + .distinct() |
| 120 | + .aggregate([col("l_orderkey")], [F.count(col("l_suppkey")).alias("n_suppliers")]) |
| 121 | + .filter(col("n_suppliers") > lit(1)) |
| 122 | + .select("l_orderkey") |
130 | 123 | ) |
131 | 124 |
|
132 | | -# This is the check described above which will identify single failed supplier in a multiple |
133 | | -# supplier order. |
134 | | -df = df.filter(F.array_length(col("failed_suppliers")) == lit(1)).filter( |
135 | | - F.array_length(col("all_suppliers")) > lit(1) |
| 125 | +# Orders where exactly one distinct supplier was late. Stands in for the |
| 126 | +# reference SQL's ``not exists (... l3.l_suppkey <> l1.l_suppkey and l3 is |
| 127 | +# also late ...)`` subquery: if only one supplier on the order was late, |
| 128 | +# nobody else on the same order was late. |
| 129 | +single_late_supplier_orders = ( |
| 130 | + late_lineitems.select("l_orderkey", "l_suppkey") |
| 131 | + .distinct() |
| 132 | + .aggregate( |
| 133 | + [col("l_orderkey")], [F.count(col("l_suppkey")).alias("n_late_suppliers")] |
| 134 | + ) |
| 135 | + .filter(col("n_late_suppliers") == lit(1)) |
| 136 | + .select("l_orderkey") |
136 | 137 | ) |
137 | 138 |
|
138 | | -# Since we have an array we know is exactly one element long, we can extract that single value. |
139 | | -df = df.select( |
140 | | - col("o_orderkey"), F.array_element(col("failed_suppliers"), lit(1)).alias("suppkey") |
| 139 | +# Keep late line items whose order qualifies on both counts. Semi joins |
| 140 | +# preserve the left-side columns without fanning out on the right. |
| 141 | +df = late_lineitems.join(multi_supplier_orders, on="l_orderkey", how="semi").join( |
| 142 | + single_late_supplier_orders, on="l_orderkey", how="semi" |
141 | 143 | ) |
142 | 144 |
|
143 | | -# Join to the supplier of interest list for the nation of interest |
144 | | -df = df.join( |
145 | | - df_suppliers_of_interest, left_on=["suppkey"], right_on=["s_suppkey"], how="inner" |
| 145 | +# Attach the supplier name for suppliers in the nation of interest, count |
| 146 | +# one row per qualifying order, and return the top 100. |
| 147 | +df = ( |
| 148 | + df.join( |
| 149 | + df_suppliers_of_interest, |
| 150 | + left_on="l_suppkey", |
| 151 | + right_on="s_suppkey", |
| 152 | + how="inner", |
| 153 | + ) |
| 154 | + .aggregate([col("s_name")], [F.count(col("l_orderkey")).alias("numwait")]) |
| 155 | + .sort(col("numwait").sort(ascending=False), col("s_name").sort()) |
| 156 | + .limit(100) |
146 | 157 | ) |
147 | 158 |
|
148 | | -# Count how many orders that supplier is the only failed supplier for |
149 | | -df = df.aggregate([col("s_name")], [F.count(col("o_orderkey")).alias("numwait")]) |
150 | | - |
151 | | -# Return in descending order |
152 | | -df = df.sort(col("numwait").sort(ascending=False), col("s_name").sort()) |
153 | | - |
154 | | -df = df.limit(100) |
155 | | - |
156 | 159 | df.show() |
0 commit comments