@@ -1164,6 +1164,42 @@ def register_arrow(
11641164 20,
11651165 30
11661166 ]
1167+
1168+ Provide an explicit ``schema`` to override schema inference:
1169+
1170+ >>> with tempfile.TemporaryDirectory() as tmpdir:
1171+ ... path = os.path.join(tmpdir, "data.arrow")
1172+ ... with pa.ipc.new_file(path, table.schema) as writer:
1173+ ... writer.write_table(table)
1174+ ... ctx.register_arrow(
1175+ ... "arrow_schema",
1176+ ... path,
1177+ ... schema=pa.schema([("x", pa.int64())]),
1178+ ... )
1179+ ... ctx.sql("SELECT * FROM arrow_schema").collect()[0].column(0)
1180+ <pyarrow.lib.Int64Array object at ...>
1181+ [
1182+ 10,
1183+ 20,
1184+ 30
1185+ ]
1186+
1187+ Use ``file_extension`` to read files with a non-default extension:
1188+
1189+ >>> with tempfile.TemporaryDirectory() as tmpdir:
1190+ ... path = os.path.join(tmpdir, "data.ipc")
1191+ ... with pa.ipc.new_file(path, table.schema) as writer:
1192+ ... writer.write_table(table)
1193+ ... ctx.register_arrow(
1194+ ... "arrow_ipc", path, file_extension=".ipc"
1195+ ... )
1196+ ... ctx.sql("SELECT * FROM arrow_ipc").collect()[0].column(0)
1197+ <pyarrow.lib.Int64Array object at ...>
1198+ [
1199+ 10,
1200+ 20,
1201+ 30
1202+ ]
11671203 """
11681204 if table_partition_cols is None :
11691205 table_partition_cols = []
@@ -1465,6 +1501,36 @@ def read_arrow(
14651501 2,
14661502 3
14671503 ]
1504+
1505+ Provide an explicit ``schema`` to override schema inference:
1506+
1507+ >>> with tempfile.TemporaryDirectory() as tmpdir:
1508+ ... path = os.path.join(tmpdir, "data.arrow")
1509+ ... with pa.ipc.new_file(path, table.schema) as writer:
1510+ ... writer.write_table(table)
1511+ ... df = ctx.read_arrow(path, schema=pa.schema([("a", pa.int64())]))
1512+ ... df.collect()[0].column(0)
1513+ <pyarrow.lib.Int64Array object at ...>
1514+ [
1515+ 1,
1516+ 2,
1517+ 3
1518+ ]
1519+
1520+ Use ``file_extension`` to read files with a non-default extension:
1521+
1522+ >>> with tempfile.TemporaryDirectory() as tmpdir:
1523+ ... path = os.path.join(tmpdir, "data.ipc")
1524+ ... with pa.ipc.new_file(path, table.schema) as writer:
1525+ ... writer.write_table(table)
1526+ ... df = ctx.read_arrow(path, file_extension=".ipc")
1527+ ... df.collect()[0].column(0)
1528+ <pyarrow.lib.Int64Array object at ...>
1529+ [
1530+ 1,
1531+ 2,
1532+ 3
1533+ ]
14681534 """
14691535 if file_partition_cols is None :
14701536 file_partition_cols = []
0 commit comments