|
91 | 91 | Extension = expr_internal.Extension |
92 | 92 | FileType = expr_internal.FileType |
93 | 93 | Filter = expr_internal.Filter |
94 | | -GroupingSet = expr_internal.GroupingSet |
| 94 | +_GroupingSetInternal = expr_internal.GroupingSet |
95 | 95 | Join = expr_internal.Join |
96 | 96 | ILike = expr_internal.ILike |
97 | 97 | InList = expr_internal.InList |
@@ -1430,3 +1430,135 @@ def __repr__(self) -> str: |
1430 | 1430 |
|
1431 | 1431 |
|
1432 | 1432 | SortKey = Expr | SortExpr | str |
| 1433 | + |
| 1434 | + |
class GroupingSet:
    """Namespace of factories that build grouping set expressions.

    A grouping set expression is passed as a group-by entry to
    :py:meth:`~datafusion.dataframe.DataFrame.aggregate`. Rather than
    grouping at one level only, it yields several grouping levels in a
    single pass: subtotals, cross-tabulations, or arbitrary subsets of
    columns.

    Add :py:func:`~datafusion.functions.grouping` to the aggregate list
    to find out, per result row, which columns were aggregated across.
    """

    @staticmethod
    def rollup(*exprs: Expr) -> Expr:
        """Build a ``ROLLUP`` grouping set to pass to ``aggregate()``.

        ``ROLLUP`` expands to every prefix of the supplied column list.
        ``rollup(a, b)``, for instance, yields the grouping sets
        ``(a, b)``, ``(a)``, and ``()`` (the grand total).

        The SQL counterpart is ``GROUP BY ROLLUP(a, b)``.

        Args:
            *exprs: Column expressions that make up the rollup.

        Examples:
            >>> import pyarrow as pa
            >>> import datafusion as dfn
            >>> from datafusion.expr import GroupingSet
            >>> ctx = dfn.SessionContext()
            >>> df = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 20, 30]})
            >>> result = df.aggregate(
            ...     [GroupingSet.rollup(dfn.col("a"))],
            ...     [dfn.functions.sum(dfn.col("b")).alias("s"),
            ...      dfn.functions.grouping(dfn.col("a"))],
            ... ).sort(dfn.col("a").sort(nulls_first=False))
            >>> batches = result.collect()
            >>> pa.concat_arrays([b.column("s") for b in batches]).to_pylist()
            [30, 30, 60]

        See Also:
            :py:meth:`cube`, :py:meth:`grouping_sets`,
            :py:func:`~datafusion.functions.grouping`
        """
        # Unwrap each public Expr to the raw object the internal API expects.
        raw_exprs = tuple(item.expr for item in exprs)
        rollup_set = _GroupingSetInternal.rollup(*raw_exprs)
        return Expr(rollup_set)

    @staticmethod
    def cube(*exprs: Expr) -> Expr:
        """Build a ``CUBE`` grouping set to pass to ``aggregate()``.

        ``CUBE`` expands to every subset of the supplied column list.
        ``cube(a, b)``, for instance, yields the grouping sets
        ``(a, b)``, ``(a)``, ``(b)``, and ``()`` (the grand total).

        The SQL counterpart is ``GROUP BY CUBE(a, b)``.

        Args:
            *exprs: Column expressions that make up the cube.

        Examples:
            Given only one column, ``cube`` gives the same result as
            :py:meth:`rollup`:

            >>> import pyarrow as pa
            >>> import datafusion as dfn
            >>> from datafusion.expr import GroupingSet
            >>> ctx = dfn.SessionContext()
            >>> df = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 20, 30]})
            >>> result = df.aggregate(
            ...     [GroupingSet.cube(dfn.col("a"))],
            ...     [dfn.functions.sum(dfn.col("b")).alias("s"),
            ...      dfn.functions.grouping(dfn.col("a"))],
            ... ).sort(dfn.col("a").sort(nulls_first=False))
            >>> batches = result.collect()
            >>> pa.concat_arrays([b.column(2) for b in batches]).to_pylist()
            [0, 0, 1]

        See Also:
            :py:meth:`rollup`, :py:meth:`grouping_sets`,
            :py:func:`~datafusion.functions.grouping`
        """
        # Unwrap each public Expr to the raw object the internal API expects.
        raw_exprs = tuple(item.expr for item in exprs)
        cube_set = _GroupingSetInternal.cube(*raw_exprs)
        return Expr(cube_set)

    @staticmethod
    def grouping_sets(*expr_lists: list[Expr]) -> Expr:
        """Build explicit grouping sets to pass to ``aggregate()``.

        Every positional argument is one grouping set, given as a list
        of column expressions. ``grouping_sets([a], [b])``, for
        instance, groups by ``a`` on its own and by ``b`` on its own
        within a single query.

        The SQL counterpart is ``GROUP BY GROUPING SETS ((a), (b))``.

        Args:
            *expr_lists: One list of expressions per grouping set.

        Examples:
            >>> import pyarrow as pa
            >>> import datafusion as dfn
            >>> from datafusion.expr import GroupingSet
            >>> ctx = dfn.SessionContext()
            >>> df = ctx.from_pydict(
            ...     {"a": ["x", "x", "y"], "b": ["m", "n", "m"],
            ...      "c": [1, 2, 3]})
            >>> result = df.aggregate(
            ...     [GroupingSet.grouping_sets(
            ...         [dfn.col("a")], [dfn.col("b")])],
            ...     [dfn.functions.sum(dfn.col("c")).alias("s"),
            ...      dfn.functions.grouping(dfn.col("a")),
            ...      dfn.functions.grouping(dfn.col("b"))],
            ... ).sort(
            ...     dfn.col("a").sort(nulls_first=False),
            ...     dfn.col("b").sort(nulls_first=False),
            ... )
            >>> batches = result.collect()
            >>> pa.concat_arrays(
            ...     [b.column("s") for b in batches]).to_pylist()
            [3, 3, 4, 2]

        See Also:
            :py:meth:`rollup`, :py:meth:`cube`,
            :py:func:`~datafusion.functions.grouping`
        """
        # Unwrap set by set, keeping each grouping set as its own list.
        raw_sets = []
        for grouping in expr_lists:
            raw_sets.append([member.expr for member in grouping])
        explicit_sets = _GroupingSetInternal.grouping_sets(*raw_sets)
        return Expr(explicit_sets)
0 commit comments