@@ -127,7 +127,7 @@ df.aggregate(["a"], [F.sum(col("b")), F.count(col("a"))])
127127# FILTER (WHERE ...) on an aggregate: use the filter keyword on the aggregate function (for a true HAVING, call .filter() on the aggregated result)
128128df.aggregate(
129129 [" region" ],
130- [F.sum(col(" sales" ), filter = col(" sales" ) > lit( 1000 ) ).alias(" large_sales" )],
130+ [F.sum(col(" sales" ), filter = col(" sales" ) > 1000 ).alias(" large_sales" )],
131131)
132132```
133133
@@ -378,8 +378,8 @@ status_label = (
378378
379379# Searched CASE (each branch has its own predicate)
380380severity = (
381- F.when(col(" value" ) > lit( 100 ) , lit(" high" ))
382- .when(col(" value" ) > lit( 50 ) , lit(" medium" ))
381+ F.when(col(" value" ) > 100 , lit(" high" ))
382+ .when(col(" value" ) > 50 , lit(" medium" ))
383383 .otherwise(lit(" low" ))
384384)
385385```
@@ -423,9 +423,9 @@ col("array_col")[1:3] # array slice (0-indexed)
423423| ` SELECT a, b ` | ` df.select("a", "b") ` |
424424| ` SELECT a, b + 1 AS c ` | ` df.select(col("a"), (col("b") + lit(1)).alias("c")) ` |
425425| ` SELECT *, a + 1 AS c ` | ` df.with_column("c", col("a") + lit(1)) ` |
426- | ` WHERE a > 10 ` | ` df.filter(col("a") > lit(10) ) ` |
426+ | ` WHERE a > 10 ` | ` df.filter(col("a") > 10 ) ` |
427427| ` GROUP BY a ` with ` SUM(b) ` | ` df.aggregate(["a"], [F.sum(col("b"))]) ` |
428- | ` SUM(b) FILTER (WHERE b > 100) ` | ` F.sum(col("b"), filter=col("b") > lit( 100) ) ` |
428+ | ` SUM(b) FILTER (WHERE b > 100) ` | ` F.sum(col("b"), filter=col("b") > 100) ` |
429429| ` ORDER BY a DESC ` | ` df.sort(col("a").sort(ascending=False)) ` |
430430| ` LIMIT 10 OFFSET 5 ` | ` df.limit(10, offset=5) ` |
431431| ` DISTINCT ` | ` df.distinct() ` |
@@ -440,7 +440,7 @@ col("array_col")[1:3] # array slice (0-indexed)
440440| ` EXCEPT ALL ` | ` df1.except_all(df2) ` |
441441| ` EXCEPT ` (distinct) | ` df1.except_all(df2, distinct=True) ` |
442442| ` CASE x WHEN 1 THEN 'a' END ` | ` F.case(col("x")).when(lit(1), lit("a")).end() ` |
443- | ` CASE WHEN x > 1 THEN 'a' END ` | ` F.when(col("x") > lit(1) , lit("a")).end() ` |
443+ | ` CASE WHEN x > 1 THEN 'a' END ` | ` F.when(col("x") > 1 , lit("a")).end() ` |
444444| ` x IN (1, 2, 3) ` | ` F.in_list(col("x"), [lit(1), lit(2), lit(3)]) ` |
445445| ` x BETWEEN 1 AND 10 ` | ` col("x").between(lit(1), lit(10)) ` |
446446| ` CAST(x AS DOUBLE) ` | ` col("x").cast(pa.float64()) ` |
@@ -452,7 +452,7 @@ col("array_col")[1:3] # array slice (0-indexed)
452452## Common Pitfalls
453453
4544541. **Boolean operators**: Use `&`, `|`, `~` -- not Python's `and`, `or`, `not`.
455- Always parenthesize: ` (col("a") > lit(1)) & (col("b") < lit(2) ) ` .
455+ Always parenthesize: ` (col("a") > 1) & (col("b") < 2 ) ` .
456456
4574572. **Wrapping scalars with `lit()`**: Prefer raw Python values on the
458458 right-hand side of comparisons — ` col("a") > 10 ` , ` col("name") == "Alice" `
@@ -525,7 +525,7 @@ col("array_col")[1:3] # array slice (0-indexed)
525525``` python
526526result = (
527527 ctx.read_parquet(" data.parquet" )
528- .filter(col(" year" ) >= lit( 2020 ) )
528+ .filter(col(" year" ) >= 2020 )
529529 .select(col(" region" ), col(" sales" ))
530530 .aggregate([" region" ], [F.sum(col(" sales" )).alias(" total" )])
531531 .sort(col(" total" ).sort(ascending = False ))
@@ -540,9 +540,9 @@ Instead of SQL CTEs (`WITH ... AS`), assign intermediate DataFrames to
540540variables:
541541
542542``` python
543- base = ctx.read_parquet(" orders.parquet" ).filter(col(" status" ) == lit( " shipped" ) )
543+ base = ctx.read_parquet(" orders.parquet" ).filter(col(" status" ) == " shipped" )
544544by_region = base.aggregate([" region" ], [F.sum(col(" amount" )).alias(" total" )])
545- top_regions = by_region.filter(col(" total" ) > lit( 10000 ) )
545+ top_regions = by_region.filter(col(" total" ) > 10000 )
546546```
547547
548548### Reusing Expressions as Variables
0 commit comments