1414# KIND, either express or implied. See the License for the
1515# specific language governing permissions and limitations
1616# under the License.
17- """:py:class:`DataFrame` is one of the core concepts in DataFusion.
18-
19- See :ref:`user_guide_concepts` in the online documentation for more information.
17+ """:py:class:`DataFrame` — lazy, chainable query representation.
18+
19+ A :py:class:`DataFrame` is a logical plan over one or more data sources.
20+ Methods that reshape the plan (:py:meth:`DataFrame.select`,
21+ :py:meth:`DataFrame.filter`, :py:meth:`DataFrame.aggregate`,
22+ :py:meth:`DataFrame.sort`, :py:meth:`DataFrame.join`,
23+ :py:meth:`DataFrame.limit`, the set-operation methods, ...) return a new
24+ :py:class:`DataFrame` and do no work until a terminal method such as
25+ :py:meth:`DataFrame.collect`, :py:meth:`DataFrame.to_pydict`,
26+ :py:meth:`DataFrame.show`, or one of the ``write_*`` methods is called.
27+
28+ DataFrames are produced from a
29+ :py:class:`~datafusion.context.SessionContext`, typically via
30+ :py:meth:`~datafusion.context.SessionContext.sql`,
31+ :py:meth:`~datafusion.context.SessionContext.read_csv`,
32+ :py:meth:`~datafusion.context.SessionContext.read_parquet`, or
33+ :py:meth:`~datafusion.context.SessionContext.from_pydict`.
34+
35+ Examples:
36+ >>> ctx = dfn.SessionContext()
37+ >>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
38+ >>> df.filter(col("a") > 1).select("b").to_pydict()
39+ {'b': [20, 30]}
40+
41+ See :ref:`user_guide_concepts` in the online documentation for a high-level
42+ overview of the execution model.
2043"""
2144
2245from __future__ import annotations
@@ -503,21 +526,29 @@ def select_exprs(self, *args: str) -> DataFrame:
def select(self, *exprs: Expr | str) -> DataFrame:
    """Build a new :py:class:`DataFrame` holding only the given projections.

    A plain string is read as the name of an existing column, whereas an
    :py:class:`~datafusion.expr.Expr` argument can reshape, rename, or
    compute a new column.

    Args:
        exprs: Column names and/or :py:class:`~datafusion.expr.Expr`
            objects to project.

    Returns:
        The projected DataFrame, with one column per supplied expression.

    Examples:
        Pick a column by its name:

        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
        >>> df.select("a").to_pydict()
        {'a': [1, 2, 3]}

        Names, expressions, and aliases may be combined. The string ``"a"``
        picks column ``a`` as-is, while ``col("a").alias("alternate_a")``
        yields a copy of it under a different name:

        >>> df.select("a", col("b"), col("a").alias("alternate_a")).to_pydict()
        {'a': [1, 2, 3], 'b': [10, 20, 30], 'alternate_a': [1, 2, 3]}
    """
    # Every argument goes through expr_list_to_raw_expr_list before the
    # projection is delegated to the wrapped ``self.df`` object.
    return DataFrame(self.df.select(*expr_list_to_raw_expr_list(exprs)))
@@ -766,6 +797,24 @@ def aggregate(
766797
767798 Returns:
768799 DataFrame after aggregation.
800+
801+ Examples:
802+ Aggregate without grouping — an empty ``group_by`` produces a
803+ single row:
804+
805+ >>> ctx = dfn.SessionContext()
806+ >>> df = ctx.from_pydict(
807+ ... {"team": ["x", "x", "y"], "score": [1, 2, 5]}
808+ ... )
809+ >>> df.aggregate([], [F.sum(col("score")).alias("total")]).to_pydict()
810+ {'total': [8]}
811+
812+ Group by a column and produce one row per group:
813+
814+ >>> df.aggregate(
815+ ... ["team"], [F.sum(col("score")).alias("total")]
816+ ... ).sort("team").to_pydict()
817+ {'team': ['x', 'y'], 'total': [3, 5]}
769818 """
770819 group_by_list = (
771820 list (group_by )
@@ -786,13 +835,27 @@ def sort(self, *exprs: SortKey) -> DataFrame:
786835 """Sort the DataFrame by the specified sorting expressions or column names.
787836
788837 Note that any expression can be turned into a sort expression by
789- calling its ``sort`` method.
838+ calling its ``sort`` method. For ascending-only sorts, the shorter
839+ :py:meth:`sort_by` is usually more convenient.
790840
791841 Args:
792842 exprs: Sort expressions or column names, applied in order.
793843
794844 Returns:
795845 DataFrame after sorting.
846+
847+ Examples:
848+ Sort ascending by a column name:
849+
850+ >>> ctx = dfn.SessionContext()
851+ >>> df = ctx.from_pydict({"a": [3, 1, 2], "b": [10, 20, 30]})
852+ >>> df.sort("a").to_pydict()
853+ {'a': [1, 2, 3], 'b': [20, 30, 10]}
854+
855+ Sort descending using :py:meth:`~datafusion.expr.Expr.sort`:
856+
857+ >>> df.sort(col("a").sort(ascending=False)).to_pydict()
858+ {'a': [3, 2, 1], 'b': [10, 30, 20]}
796859 """
797860 exprs_raw = sort_list_to_raw_sort_list (exprs )
798861 return DataFrame (self .df .sort (* exprs_raw ))
@@ -812,12 +875,28 @@ def cast(self, mapping: dict[str, pa.DataType[Any]]) -> DataFrame:
def limit(self, count: int, offset: int = 0) -> DataFrame:
    """Produce a new :py:class:`DataFrame` capped at ``count`` rows.

    Unless the DataFrame has been explicitly ordered beforehand with
    :py:meth:`sort` or :py:meth:`sort_by`, the order of the returned rows
    is unspecified.

    Args:
        count: Number of rows to limit the DataFrame to.
        offset: Number of rows to skip.

    Returns:
        The DataFrame with the limit applied.

    Examples:
        Keep only the first two rows:

        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": [1, 2, 3, 4]}).sort("a")
        >>> df.limit(2).to_pydict()
        {'a': [1, 2]}

        Page through results by skipping one row and then taking two:

        >>> df.limit(2, offset=1).to_pydict()
        {'a': [2, 3]}
    """
    # Delegate to the wrapped dataframe object, then re-wrap the result.
    limited = self.df.limit(count, offset)
    return DataFrame(limited)
823902
@@ -972,6 +1051,28 @@ def join(
9721051
9731052 Returns:
9741053 DataFrame after join.
1054+
1055+ Examples:
1056+ Inner-join two DataFrames on a shared column:
1057+
1058+ >>> ctx = dfn.SessionContext()
1059+ >>> left = ctx.from_pydict({"id": [1, 2, 3], "val": [10, 20, 30]})
1060+ >>> right = ctx.from_pydict({"id": [2, 3, 4], "label": ["b", "c", "d"]})
1061+ >>> left.join(right, on="id").sort("id").to_pydict()
1062+ {'id': [2, 3], 'val': [20, 30], 'label': ['b', 'c']}
1063+
1064+ Left join to keep all rows from the left side:
1065+
1066+ >>> left.join(right, on="id", how="left").sort("id").to_pydict()
1067+ {'id': [1, 2, 3], 'val': [10, 20, 30], 'label': [None, 'b', 'c']}
1068+
1069+ Use ``left_on`` / ``right_on`` when the key columns differ in name:
1070+
1071+ >>> right2 = ctx.from_pydict({"rid": [2, 3], "label": ["b", "c"]})
1072+ >>> left.join(
1073+ ... right2, left_on="id", right_on="rid"
1074+ ... ).sort("id").to_pydict()
1075+ {'id': [2, 3], 'val': [20, 30], 'rid': [2, 3], 'label': ['b', 'c']}
9751076 """
9761077 if join_keys is not None :
9771078 warnings .warn (
@@ -1165,6 +1266,20 @@ def union(self, other: DataFrame, distinct: bool = False) -> DataFrame:
11651266
11661267 Returns:
11671268 DataFrame after union.
1269+
1270+ Examples:
1271+ Stack rows from both DataFrames, preserving duplicates:
1272+
1273+ >>> ctx = dfn.SessionContext()
1274+ >>> df1 = ctx.from_pydict({"a": [1, 2]})
1275+ >>> df2 = ctx.from_pydict({"a": [2, 3]})
1276+ >>> df1.union(df2).sort("a").to_pydict()
1277+ {'a': [1, 2, 2, 3]}
1278+
1279+ Deduplicate the combined result with ``distinct=True``:
1280+
1281+ >>> df1.union(df2, distinct=True).sort("a").to_pydict()
1282+ {'a': [1, 2, 3]}
11681283 """
11691284 return DataFrame (self .df .union (other .df , distinct ))
11701285
0 commit comments