docs: enrich module docstrings and add doctest examples (#1498)

timsaucer · claude · web-flow · commit 8741d30cd812 · 2026-04-23T22:01:01.000-04:00
* Enrich module docstrings and add doctest examples

Expands the module docstrings for `functions.py`, `dataframe.py`,
`expr.py`, and `context.py` so each module opens with a concept summary,
cross-references to related APIs, and a small executable example.

Adds doctest examples to the high-traffic `DataFrame` methods that
previously lacked them: `select`, `aggregate`, `sort`, `limit`, `join`,
and `union`. Optional parameters are demonstrated with keyword syntax,
and examples reuse the same input data across variants so the effect of
each option is easy to see.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Use distinct group sums in aggregate docstring example

Change the score data from [1, 2, 3] to [1, 2, 5] so the grouped
result produces [3, 5] instead of [3, 3], removing ambiguity about
which total belongs to which team.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Align module-docstring examples with SKILL.md idioms

Drop the redundant lit() in the dataframe.py module-docstring filter
example and use a plain string group key in the aggregate() doctest, so
both examples model the style SKILL.md recommends. Also document the
sort("a") string form and sort_by() shortcut in SKILL.md's sorting
section.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/SKILL.md b/SKILL.md
@@ -128,14 +128,22 @@ aggregate.
 ### Sorting
 
 ```python
-df.sort(col("a"))                            # ascending (default)
+df.sort("a")                                 # ascending (plain name, preferred)
+df.sort(col("a"))                            # ascending via col()
 df.sort(col("a").sort(ascending=False))      # descending
 df.sort(col("a").sort(nulls_first=False))    # override null placement
+
+df.sort_by("a", "b")                         # ascending-only shortcut
 ```
 
-A plain expression passed to `sort()` is already treated as ascending. Only
-reach for `col(...).sort(...)` when you need to override a default (descending
-order or null placement). Writing `col("a").sort(ascending=True)` is redundant.
+As with `select()` and `aggregate()`, bare column references can be passed as
+plain name strings. A plain expression passed to `sort()` is already treated
+as ascending, so reach for `col(...).sort(...)` only when you need to override
+a default (descending order or null placement). Writing
+`col("a").sort(ascending=True)` is redundant.
+
+For ascending-only sorts with no null-placement override, `df.sort_by(...)` is
+a shorter alias for `df.sort(...)`.
 
 ### Joining
 
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -15,7 +15,32 @@
 # specific language governing permissions and limitations
 # under the License.
 
-"""Session Context and it's associated configuration."""
+""":py:class:`SessionContext` — entry point for running DataFusion queries.
+
+A :py:class:`SessionContext` holds registered tables, catalogs, and
+configuration for the current session. It is the first object most programs
+create: from it you register data, run SQL strings
+(:py:meth:`SessionContext.sql`), read files
+(:py:meth:`SessionContext.read_csv`,
+:py:meth:`SessionContext.read_parquet`, ...), and construct
+:py:class:`~datafusion.dataframe.DataFrame` objects in memory
+(:py:meth:`SessionContext.from_pydict`,
+:py:meth:`SessionContext.from_arrow`).
+
+Session behavior (memory limits, batch size, configured optimizer passes,
+...) is controlled by :py:class:`SessionConfig` and
+:py:class:`RuntimeEnvBuilder`; SQL dialect limits are controlled by
+:py:class:`SQLOptions`.
+
+Examples:
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+    >>> ctx.sql("SELECT 1 AS n").to_pydict()
+    {'n': [1]}
+
+See :ref:`user_guide_concepts` in the online documentation for the broader
+execution model.
+"""
 
 from __future__ import annotations
 
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -14,9 +14,32 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-""":py:class:`DataFrame` is one of the core concepts in DataFusion.
-
-See :ref:`user_guide_concepts` in the online documentation for more information.
+""":py:class:`DataFrame` — lazy, chainable query representation.
+
+A :py:class:`DataFrame` is a logical plan over one or more data sources.
+Methods that reshape the plan (:py:meth:`DataFrame.select`,
+:py:meth:`DataFrame.filter`, :py:meth:`DataFrame.aggregate`,
+:py:meth:`DataFrame.sort`, :py:meth:`DataFrame.join`,
+:py:meth:`DataFrame.limit`, the set-operation methods, ...) return a new
+:py:class:`DataFrame` and do no work until a terminal method such as
+:py:meth:`DataFrame.collect`, :py:meth:`DataFrame.to_pydict`,
+:py:meth:`DataFrame.show`, or one of the ``write_*`` methods is called.
+
+DataFrames are produced from a
+:py:class:`~datafusion.context.SessionContext`, typically via
+:py:meth:`~datafusion.context.SessionContext.sql`,
+:py:meth:`~datafusion.context.SessionContext.read_csv`,
+:py:meth:`~datafusion.context.SessionContext.read_parquet`, or
+:py:meth:`~datafusion.context.SessionContext.from_pydict`.
+
+Examples:
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
+    >>> df.filter(col("a") > 1).select("b").to_pydict()
+    {'b': [20, 30]}
+
+See :ref:`user_guide_concepts` in the online documentation for a high-level
+overview of the execution model.
 """
 
 from __future__ import annotations
@@ -503,21 +526,29 @@ def select_exprs(self, *args: str) -> DataFrame:
     def select(self, *exprs: Expr | str) -> DataFrame:
         """Project arbitrary expressions into a new :py:class:`DataFrame`.
 
+        String arguments are treated as column names; :py:class:`~datafusion.expr.Expr`
+        arguments can reshape, rename, or compute new columns.
+
         Args:
             exprs: Either column names or :py:class:`~datafusion.expr.Expr` to select.
 
         Returns:
             DataFrame after projection. It has one column for each expression.
 
-        Example usage:
+        Examples:
+            Select columns by name:
 
-        The following example will return 3 columns from the original dataframe.
-        The first two columns will be the original column ``a`` and ``b`` since the
-        string "a" is assumed to refer to column selection. Also a duplicate of
-        column ``a`` will be returned with the column name ``alternate_a``::
+            >>> ctx = dfn.SessionContext()
+            >>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
+            >>> df.select("a").to_pydict()
+            {'a': [1, 2, 3]}
 
-            df = df.select("a", col("b"), col("a").alias("alternate_a"))
+            Mix column names, expressions, and aliases. The string ``"a"`` selects
+            column ``a`` directly; ``col("a").alias("alternate_a")`` returns a
+            duplicate under a new name:
 
+            >>> df.select("a", col("b"), col("a").alias("alternate_a")).to_pydict()
+            {'a': [1, 2, 3], 'b': [10, 20, 30], 'alternate_a': [1, 2, 3]}
         """
         exprs_internal = expr_list_to_raw_expr_list(exprs)
         return DataFrame(self.df.select(*exprs_internal))
@@ -766,6 +797,24 @@ def aggregate(
 
         Returns:
             DataFrame after aggregation.
+
+        Examples:
+            Aggregate without grouping — an empty ``group_by`` produces a
+            single row:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df = ctx.from_pydict(
+            ...     {"team": ["x", "x", "y"], "score": [1, 2, 5]}
+            ... )
+            >>> df.aggregate([], [F.sum(col("score")).alias("total")]).to_pydict()
+            {'total': [8]}
+
+            Group by a column and produce one row per group:
+
+            >>> df.aggregate(
+            ...     ["team"], [F.sum(col("score")).alias("total")]
+            ... ).sort("team").to_pydict()
+            {'team': ['x', 'y'], 'total': [3, 5]}
         """
         group_by_list = (
             list(group_by)
@@ -786,13 +835,27 @@ def sort(self, *exprs: SortKey) -> DataFrame:
         """Sort the DataFrame by the specified sorting expressions or column names.
 
         Note that any expression can be turned into a sort expression by
-        calling its ``sort`` method.
+        calling its ``sort`` method. For ascending-only sorts, the shorter
+        :py:meth:`sort_by` is usually more convenient.
 
         Args:
             exprs: Sort expressions or column names, applied in order.
 
         Returns:
             DataFrame after sorting.
+
+        Examples:
+            Sort ascending by a column name:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df = ctx.from_pydict({"a": [3, 1, 2], "b": [10, 20, 30]})
+            >>> df.sort("a").to_pydict()
+            {'a': [1, 2, 3], 'b': [20, 30, 10]}
+
+            Sort descending using :py:meth:`Expr.sort`:
+
+            >>> df.sort(col("a").sort(ascending=False)).to_pydict()
+            {'a': [3, 2, 1], 'b': [10, 30, 20]}
         """
         exprs_raw = sort_list_to_raw_sort_list(exprs)
         return DataFrame(self.df.sort(*exprs_raw))
@@ -812,12 +875,28 @@ def cast(self, mapping: dict[str, pa.DataType[Any]]) -> DataFrame:
     def limit(self, count: int, offset: int = 0) -> DataFrame:
         """Return a new :py:class:`DataFrame` with a limited number of rows.
 
+        Results are returned in unspecified order unless the DataFrame is
+        explicitly sorted first via :py:meth:`sort` or :py:meth:`sort_by`.
+
         Args:
             count: Number of rows to limit the DataFrame to.
             offset: Number of rows to skip.
 
         Returns:
             DataFrame after limiting.
+
+        Examples:
+            Take the first two rows:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df = ctx.from_pydict({"a": [1, 2, 3, 4]}).sort("a")
+            >>> df.limit(2).to_pydict()
+            {'a': [1, 2]}
+
+            Skip the first row then take two (paging):
+
+            >>> df.limit(2, offset=1).to_pydict()
+            {'a': [2, 3]}
         """
         return DataFrame(self.df.limit(count, offset))
 
@@ -972,6 +1051,28 @@ def join(
 
         Returns:
             DataFrame after join.
+
+        Examples:
+            Inner-join two DataFrames on a shared column:
+
+            >>> ctx = dfn.SessionContext()
+            >>> left = ctx.from_pydict({"id": [1, 2, 3], "val": [10, 20, 30]})
+            >>> right = ctx.from_pydict({"id": [2, 3, 4], "label": ["b", "c", "d"]})
+            >>> left.join(right, on="id").sort("id").to_pydict()
+            {'id': [2, 3], 'val': [20, 30], 'label': ['b', 'c']}
+
+            Left join to keep all rows from the left side:
+
+            >>> left.join(right, on="id", how="left").sort("id").to_pydict()
+            {'id': [1, 2, 3], 'val': [10, 20, 30], 'label': [None, 'b', 'c']}
+
+            Use ``left_on`` / ``right_on`` when the key columns differ in name:
+
+            >>> right2 = ctx.from_pydict({"rid": [2, 3], "label": ["b", "c"]})
+            >>> left.join(
+            ...     right2, left_on="id", right_on="rid"
+            ... ).sort("id").to_pydict()
+            {'id': [2, 3], 'val': [20, 30], 'rid': [2, 3], 'label': ['b', 'c']}
         """
         if join_keys is not None:
             warnings.warn(
@@ -1165,6 +1266,20 @@ def union(self, other: DataFrame, distinct: bool = False) -> DataFrame:
 
         Returns:
             DataFrame after union.
+
+        Examples:
+            Stack rows from both DataFrames, preserving duplicates:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df1 = ctx.from_pydict({"a": [1, 2]})
+            >>> df2 = ctx.from_pydict({"a": [2, 3]})
+            >>> df1.union(df2).sort("a").to_pydict()
+            {'a': [1, 2, 2, 3]}
+
+            Deduplicate the combined result with ``distinct=True``:
+
+            >>> df1.union(df2, distinct=True).sort("a").to_pydict()
+            {'a': [1, 2, 3]}
         """
         return DataFrame(self.df.union(other.df, distinct))
 
diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
@@ -15,9 +15,31 @@
 # specific language governing permissions and limitations
 # under the License.
 
-"""This module supports expressions, one of the core concepts in DataFusion.
-
-See :ref:`Expressions` in the online documentation for more details.
+""":py:class:`Expr` — the logical expression type used to build DataFusion queries.
+
+An :py:class:`Expr` represents a computation over columns or literals: a
+column reference (``col("a")``), a literal (``lit(5)``), an operator
+combination (``col("a") + lit(1)``), or the output of a function from
+:py:mod:`datafusion.functions`. Expressions are passed to
+:py:class:`~datafusion.dataframe.DataFrame` methods such as
+:py:meth:`~datafusion.dataframe.DataFrame.select`,
+:py:meth:`~datafusion.dataframe.DataFrame.filter`,
+:py:meth:`~datafusion.dataframe.DataFrame.aggregate`, and
+:py:meth:`~datafusion.dataframe.DataFrame.sort`.
+
+Convenience constructors are re-exported at the package level:
+:py:func:`datafusion.col` / :py:func:`datafusion.column` for column references
+and :py:func:`datafusion.lit` / :py:func:`datafusion.literal` for scalar
+literals.
+
+Examples:
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+    >>> df.select((col("a") * lit(10)).alias("ten_a")).to_pydict()
+    {'ten_a': [10, 20, 30]}
+
+See :ref:`expressions` in the online documentation for details on available
+operators and helpers.
 """
 
 # ruff: noqa: PLC0415
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
@@ -14,7 +14,27 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""User functions for operating on :py:class:`~datafusion.expr.Expr`."""
+"""Scalar, aggregate, and window functions for :py:class:`~datafusion.expr.Expr`.
+
+Each function returns an :py:class:`~datafusion.expr.Expr` that can be combined
+with other expressions and passed to
+:py:class:`~datafusion.dataframe.DataFrame` methods such as
+:py:meth:`~datafusion.dataframe.DataFrame.select`,
+:py:meth:`~datafusion.dataframe.DataFrame.filter`,
+:py:meth:`~datafusion.dataframe.DataFrame.aggregate`, and
+:py:meth:`~datafusion.dataframe.DataFrame.window`. The module is conventionally
+imported as ``F`` so calls read like ``F.sum(col("price"))``.
+
+Examples:
+    >>> from datafusion import functions as F
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2, 3, 4]})
+    >>> df.aggregate([], [F.sum(col("a")).alias("total")]).to_pydict()
+    {'total': [10]}
+
+See :ref:`aggregation` and :ref:`window_functions` in the online documentation
+for categorized catalogs of aggregate and window functions.
+"""
 
 from __future__ import annotations