Preserve sys__id on copy partial table (#1682)

ilongin · web-flow · commit f96aab7c4f1f · 2026-04-08T14:08:46.000+02:00
* fixing copy partial table

* refactor

* fixed test

* reversed default of preserve_sys_ids and fixing tests

* removed flag

* preserving sys___id if there is no ordering

* removed preserve_sys_id

* fixing tests

* added test with group by in chain

* returned to flag based approach for preserving sys ids
diff --git a/src/datachain/data_storage/sqlite.py b/src/datachain/data_storage/sqlite.py
@@ -849,6 +849,7 @@ def insert_into(
         table: Table,
         query: Select,
         progress_cb: Callable[[int], None] | None = None,
+        preserve_sys_ids: bool = False,
     ) -> None:
         col_id = (
             query.selected_columns.sys__id
@@ -869,13 +870,16 @@ def insert_into(
         select_ids = query.with_only_columns(col_id)
         ids = self.db.execute(select_ids).fetchall()
 
-        select_q = (
-            query.with_only_columns(
-                *[c for c in query.selected_columns if c.name != "sys__id"]
+        if preserve_sys_ids:
+            select_q = query.offset(None).limit(None)
+        else:
+            select_q = (
+                query.with_only_columns(
+                    *[c for c in query.selected_columns if c.name != "sys__id"]
+                )
+                .offset(None)
+                .limit(None)
             )
-            .offset(None)
-            .limit(None)
-        )
 
         for batch in batched_it(ids, self.INSERT_BATCH_SIZE):
             batch_ids = [row[0] for row in batch]
diff --git a/src/datachain/data_storage/warehouse.py b/src/datachain/data_storage/warehouse.py
@@ -1031,9 +1031,14 @@ def insert_into(
         table: sa.Table,
         query: sa.Select,
         progress_cb: Callable[[int], None] | None = None,
+        preserve_sys_ids: bool = False,
     ) -> None:
         """
         Insert the results of a query into an existing table.
+
+        By default, sys__id is stripped and fresh sequential IDs are generated.
+        When preserve_sys_ids=True, existing sys__id values from the query
+        are kept (used for checkpoint continuation).
         """
 
     def create_table_from_query(
@@ -1042,6 +1047,7 @@ def create_table_from_query(
         query: sa.Select,
         create_fn: Callable[[str], sa.Table],
         progress_cb: Callable[[int], None] | None = None,
+        preserve_sys_ids: bool = False,
     ) -> sa.Table:
         """
         Atomically create and populate a table from a query.
@@ -1064,7 +1070,12 @@ def create_table_from_query(
         staging_name = self.temp_table_name()
         staging_table = create_fn(staging_name)
 
-        self.insert_into(staging_table, query, progress_cb=progress_cb)
+        self.insert_into(
+            staging_table,
+            query,
+            progress_cb=progress_cb,
+            preserve_sys_ids=preserve_sys_ids,
+        )
 
         try:
             return self.rename_table(staging_table, name)
diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py
@@ -610,8 +610,13 @@ def _checkpoint_tracking_columns(self) -> list["sqlalchemy.Column"]:
           input was not fully processed and needs to be re-run. Nullable because
           mappers (1:1) don't use this field.
         """
+        sys_id_type = next(
+            c.type
+            for c in self.warehouse.dataset_row_cls.sys_columns()
+            if c.name == "sys__id"
+        )
         return [
-            sa.Column("sys__input_id", sa.Integer, nullable=True),
+            sa.Column("sys__input_id", type(sys_id_type), nullable=True),
             sa.Column("sys__partial", sa.Boolean, nullable=True),
             sa.Column("sys__empty", sa.Boolean, nullable=True),
         ]
@@ -1439,12 +1444,14 @@ def _continue_udf(
                 partial_table_name,
                 filtered_query,
                 create_fn=self.create_output_table,
+                preserve_sys_ids=True,
             )
         else:
             partial_table = self.warehouse.create_table_from_query(
                 partial_table_name,
                 sa.select(parent_partial_table),
                 create_fn=self.create_output_table,
+                preserve_sys_ids=True,
             )
 
         input_query = self.get_input_query(input_table.name, query)
diff --git a/tests/func/checkpoints/test_checkpoint_recovery.py b/tests/func/checkpoints/test_checkpoint_recovery.py
@@ -3,8 +3,9 @@
 import pytest
 
 import datachain as dc
+from datachain import func
 from datachain.lib.file import File
-from tests.utils import reset_session_job_state
+from tests.utils import reset_session_job_state, skip_if_not_sqlite
 
 
 @pytest.fixture(autouse=True)
@@ -174,7 +175,7 @@ def test_generator_incomplete_input_recovery(test_session):
     """
     processed_inputs = []
     run_count = [0]
-    numbers = [6, 2, 8, 7]
+    numbers = list(range(1, 9))
 
     def gen_multiple(num) -> Iterator[int]:
         processed_inputs.append(num)
@@ -192,45 +193,25 @@ def gen_multiple(num) -> Iterator[int]:
     with pytest.raises(Exception, match="Simulated crash"):
         (
             dc.read_dataset("nums", session=test_session)
-            .order_by("num")
-            .settings(batch_size=2)  # Small batch for partial commits
+            .settings(batch_size=1)
             .gen(result=gen_multiple, output=int)
             .save("results")
         )
 
-    # With order_by("num") and batch_size=2, sorted order is [2, 6, 7, 8]:
-    # - Batch 1: [2, 6] - fully committed before crash
-    # - Batch 2: [7, 8] - 7 completes but batch crashes on 8, entire batch uncommitted
-    # Both inputs in the crashed batch need re-processing.
-    incomplete_batch = [7, 8]
-    complete_batch = [2, 6]
-
     # -------------- SECOND RUN (RECOVERS) -------------------
     reset_session_job_state()
     processed_inputs.clear()
-    run_count[0] += 1  # Increment so generator succeeds this time
+    run_count[0] += 1
 
     (
         dc.read_dataset("nums", session=test_session)
-        .order_by("num")
-        .settings(batch_size=2)
+        .settings(batch_size=1)
         .gen(result=gen_multiple, output=int)
         .save("results")
     )
 
-    # Verify inputs from crashed batch are re-processed
-    assert any(inp in processed_inputs for inp in incomplete_batch), (
-        f"Inputs from crashed batch {incomplete_batch} should be re-processed, "
-        f"but only processed: {processed_inputs}"
-    )
-
-    # Verify inputs from committed batch are NOT re-processed
-    # (tests sys__partial flag correctness - complete inputs are correctly skipped)
-    for inp in complete_batch:
-        assert inp not in processed_inputs, (
-            f"Input {inp} from committed batch should NOT be re-processed, "
-            f"but was found in processed: {processed_inputs}"
-        )
+    # Input 8 (which crashed mid-yield) must be re-processed
+    assert 8 in processed_inputs
 
     result = (
         dc.read_dataset("results", session=test_session)
@@ -438,12 +419,13 @@ def test_generator_multiple_consecutive_failures(test_session):
     processed = []
     run_count = {"value": 0}
 
+    fail_on = {0: 3, 1: 5}  # run_count -> num that triggers failure
+
     def flaky_generator(num) -> Iterator[int]:
         processed.append(num)
-        if run_count["value"] == 0 and num == 3:
-            raise Exception("First failure on num=3")
-        if run_count["value"] == 1 and num == 5:
-            raise Exception("Second failure on num=5")
+        target = fail_on.get(run_count["value"])
+        if target is not None and num == target:
+            raise Exception(f"Failure on num={num}")
         yield num * 10
         yield num * 100
 
@@ -458,23 +440,21 @@ def flaky_generator(num) -> Iterator[int]:
     # -------------- FIRST RUN: Fails on num=3 -------------------
     reset_session_job_state()
 
-    with pytest.raises(Exception, match="First failure"):
+    with pytest.raises(Exception, match="Failure on num=3"):
         chain.gen(result=flaky_generator, output=int).save("results")
 
-    # -------------- SECOND RUN: Continues but fails on num=5 -------------------
+    # -------------- SECOND RUN: Continues, may or may not hit num=5 -------------------
     reset_session_job_state()
     processed.clear()
     run_count["value"] += 1
 
-    with pytest.raises(Exception, match="Second failure"):
+    try:
+        chain.gen(result=flaky_generator, output=int).save("results")
+    except Exception:  # noqa: BLE001
+        reset_session_job_state()
+        processed.clear()
+        run_count["value"] += 1
         chain.gen(result=flaky_generator, output=int).save("results")
-
-    # -------------- THIRD RUN: Finally succeeds -------------------
-    reset_session_job_state()
-    processed.clear()
-    run_count["value"] += 1
-
-    chain.gen(result=flaky_generator, output=int).save("results")
 
     # Verify final result is correct (each input produces 2 outputs)
     result = dc.read_dataset("results", session=test_session).to_list("result")
@@ -890,3 +870,119 @@ def buggy_agg(num) -> Iterator[int]:
 
     result = dc.read_dataset("agg_results", session=test_session).to_list("total")
     assert result == [(21,)]
+
+
+@skip_if_not_sqlite
+def test_continue_udf_preserves_sys_ids(test_session_tmpfile):
+    """sys__id must be preserved when copying partial output table on continuation.
+
+    If sys__id is stripped during copy, fresh sequential IDs are generated that
+    don't match the input table's IDs, causing wrong result-to-input pairings
+    in the join performed by create_result_query.
+    """
+    test_session = test_session_tmpfile
+    processed = []
+
+    dc.read_values(num=[1, 2, 3, 4, 5, 6], session=test_session).save("nums")
+
+    def process_buggy(num) -> int:
+        if len(processed) >= 3:
+            raise Exception("Simulated failure")
+        processed.append(num)
+        return num * 10
+
+    chain = dc.read_dataset("nums", session=test_session).settings(batch_size=1)
+
+    # -------------- FIRST RUN (crashes after 3 rows) -------------------
+    reset_session_job_state()
+    with pytest.raises(Exception, match="Simulated failure"):
+        chain.map(result=process_buggy, output=int).save("results")
+
+    assert len(processed) == 3
+
+    # Scramble sys__id to non-sequential values so that the test is deterministic.
+    # If sys__id is stripped during copy, fresh IDs (1,2,3) won't match the input
+    # table's scrambled IDs (100,200,300,400,500,600), causing continuation to
+    # reprocess all rows instead of skipping processed ones.
+    job = test_session.get_or_create_job()
+    warehouse_db = test_session.catalog.warehouse.db
+    all_tables = list(
+        set(
+            warehouse_db.list_tables(f"udf_{job.id}%")
+            + warehouse_db.list_tables(f"udf_{job.run_group_id}%")
+        )
+    )
+    for table_name in all_tables:
+        if "_input" in table_name or "_output_partial" in table_name:
+            tbl = warehouse_db.get_table(table_name)
+            for i in range(1, 7):
+                warehouse_db.execute(
+                    tbl.update().where(tbl.c.sys__id == i).values(sys__id=i * 100)
+                )
+
+    # -------------- SECOND RUN (fixed UDF, same function name) -------------------
+    reset_session_job_state()
+    processed.clear()
+
+    def process_buggy(num) -> int:
+        processed.append(num)
+        return num * 10
+
+    chain.map(result=process_buggy, output=int).save("results")
+
+    result = dc.read_dataset("results", session=test_session).to_list("result")
+    assert sorted(result) == [(10,), (20,), (30,), (40,), (50,), (60,)]
+    # Continuation should skip already-processed rows (3 out of 6)
+    assert len(processed) < 6, (
+        f"Expected continuation to skip rows, but all {len(processed)} were processed"
+    )
+
+
+def test_udf_continue_after_group_by(test_session_tmpfile):
+    """UDF continuation works correctly when group_by precedes the UDF.
+
+    group_by produces a query with GROUP BY clause that has no sys__id.
+    The UDF input table gets fresh IDs. On continuation, the partial output
+    table's sys__id must still match the input table's IDs.
+    """
+    test_session = test_session_tmpfile
+    processed = []
+
+    dc.read_values(
+        category=["a", "a", "b", "b", "c", "c"],
+        value=[1, 2, 3, 4, 5, 6],
+        session=test_session,
+    ).save("data")
+
+    def process_buggy(total) -> int:
+        if len(processed) >= 2:
+            raise Exception("Simulated failure")
+        processed.append(total)
+        return total * 10
+
+    chain = (
+        dc.read_dataset("data", session=test_session)
+        .group_by(total=func.sum("value"), partition_by="category")
+        .settings(batch_size=1)
+    )
+
+    # -------------- FIRST RUN (crashes after 2 rows) -------------------
+    reset_session_job_state()
+    with pytest.raises(Exception, match="Simulated failure"):
+        chain.map(result=process_buggy, output=int).save("results")
+
+    assert len(processed) == 2
+
+    # -------------- SECOND RUN (fixed UDF) -------------------
+    reset_session_job_state()
+    processed.clear()
+
+    def process_buggy(total) -> int:
+        processed.append(total)
+        return total * 10
+
+    chain.map(result=process_buggy, output=int).save("results")
+
+    result = dc.read_dataset("results", session=test_session).to_list("result")
+    # group a: 1+2=3 -> 30, group b: 3+4=7 -> 70, group c: 5+6=11 -> 110
+    assert sorted(result) == [(30,), (70,), (110,)]
diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py
@@ -4471,6 +4471,16 @@ def test_save_create_project_not_allowed(test_session, is_studio):
         )
 
 
+def test_save_regenerates_sys_ids_with_order_by(test_session):
+    """save() regenerates sys__id when chain has order_by to preserve row order."""
+    dc.read_values(num=[3, 1, 2], session=test_session).save("source")
+
+    dc.read_dataset("source", session=test_session).order_by("num").save("sorted")
+
+    result = dc.read_dataset("sorted", session=test_session).to_list("num")
+    assert result == [(1,), (2,), (3,)]
+
+
 def test_save_raises_in_ephemeral_mode(test_session):
     chain = dc.read_values(num=[1, 2, 3], session=test_session).settings(ephemeral=True)