Skip to content

Commit 314ba71

Browse files
committed
graph, node, store: Add incremental dump support
When a dump directory already contains a previous dump (metadata.json), subsequent dumps only write new rows (VID-append) and generate clamp files for mutable tables whose block_range was closed since the last dump. On restore, clamps are loaded into memory and applied to rows before DB insertion.

Key changes:
- Add clamp Arrow schema (vid + block_range_end)
- Add #[serde(default)] clamps field to TableInfo for backward compat
- Make dump_entity_table and dump_data_sources incremental via prev param
- Layout::dump detects previous dumps, validates deployment match and guards against reorgs
- Add load_clamps() to read clamp parquet files into a vid->end map
- Apply clamps during restore in import_entity_table and import_data_sources
- Add DumpReporter::start_clamps/finish_clamps with CLI progress output
1 parent b712185 commit 314ba71

6 files changed

Lines changed: 739 additions & 35 deletions

File tree

graph/src/components/store/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,12 @@ pub trait DumpReporter: Send + 'static {
10011001
/// Called after data_sources$ has been dumped.
10021002
fn finish_data_sources(&mut self, rows: usize) {}
10031003

1004+
/// Called before dumping clamps for a table.
///
/// `_rows_approx` is an estimate of how many clamp rows will be
/// written (reporters may display it as "~N rows"). The default
/// implementation is a no-op so existing reporters keep compiling.
fn start_clamps(&mut self, _table: &str, _rows_approx: usize) {}

/// Called after clamps have been dumped for a table.
///
/// `_rows` is the number of clamp rows actually written. The default
/// implementation is a no-op so existing reporters keep compiling.
fn finish_clamps(&mut self, _table: &str, _rows: usize) {}
1009+
10041010
/// Called when the entire dump has completed.
10051011
fn finish(&mut self) {}
10061012
}

node/src/manager/commands/dump.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,26 @@ impl DumpReporter for DumpProgress {
8787
self.spinner.suspend(|| println!("{line}"));
8888
}
8989

90+
fn start_clamps(&mut self, table: &str, rows_approx: usize) {
91+
self.spinner.set_message(format!(
92+
"{:<32} clamps ~{} rows",
93+
table,
94+
format_count(rows_approx),
95+
));
96+
}
97+
98+
fn finish_clamps(&mut self, table: &str, rows: usize) {
99+
if rows > 0 {
100+
let line = format!(
101+
" {} {:<32} {:>10} clamps",
102+
style("\u{2714}").green(),
103+
table,
104+
format_count(rows),
105+
);
106+
self.spinner.suspend(|| println!("{line}"));
107+
}
108+
}
109+
90110
fn finish(&mut self) {
91111
let elapsed = self.start.elapsed().as_secs();
92112
self.spinner.finish_with_message(format!(

store/postgres/src/parquet/convert.rs

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use std::collections::HashMap;
2+
use std::path::Path;
13
use std::str::FromStr;
24
use std::sync::Arc;
35

@@ -412,6 +414,31 @@ pub fn record_batch_to_data_source_rows(
412414
Ok(rows)
413415
}
414416

417+
/// Load clamp files and build a `vid → block_range_end` map.
418+
///
419+
/// Each clamp parquet file contains rows with `(vid: i64, block_range_end: i32)`.
420+
/// Later clamp files overwrite earlier ones for the same vid (only the
421+
/// latest value matters).
422+
pub fn load_clamps(
423+
dir: &Path,
424+
clamps: &[crate::parquet::writer::ChunkInfo],
425+
) -> Result<HashMap<i64, i32>, StoreError> {
426+
let mut map = HashMap::new();
427+
for chunk_info in clamps {
428+
let path = dir.join(&chunk_info.file);
429+
let batches = crate::parquet::reader::read_batches(&path)?;
430+
for batch in batches {
431+
let batch = batch?;
432+
let vid_arr = downcast_i64(batch.column(0), "vid")?;
433+
let bre_arr = downcast_i32(batch.column(1), "block_range_end")?;
434+
for row in 0..batch.num_rows() {
435+
map.insert(vid_arr.value(row), bre_arr.value(row));
436+
}
437+
}
438+
}
439+
Ok(map)
440+
}
441+
415442
// -- Downcasting helpers --
416443

417444
fn downcast_i64<'a>(array: &'a ArrayRef, name: &str) -> Result<&'a Int64Array, StoreError> {
@@ -1134,4 +1161,66 @@ mod tests {
11341161
let restore_rows = record_batch_to_restore_rows(&batch, table).unwrap();
11351162
assert!(restore_rows.is_empty());
11361163
}
1164+
1165+
/// Round-trip test for `load_clamps`: writes two clamp parquet files
/// and checks that later files overwrite earlier ones for the same vid.
#[test]
fn load_clamps_roundtrip() {
    use crate::parquet::schema::clamp_arrow_schema;
    use crate::parquet::writer::ParquetChunkWriter;

    // Scratch directory; the pid suffix keeps concurrent test binaries apart.
    let tmp_dir =
        std::env::temp_dir().join(format!("graph_node_clamp_test_{}", std::process::id()));
    std::fs::create_dir_all(&tmp_dir).unwrap();

    let schema = clamp_arrow_schema();

    // Clamp file 1: vids 10 and 20.
    let path1 = tmp_dir.join("clamp_000000.parquet");
    let mut writer1 =
        ParquetChunkWriter::new(path1, "clamp_000000.parquet".into(), &schema).unwrap();
    let batch1 = arrow::array::RecordBatch::try_new(
        Arc::new(schema.clone()),
        vec![
            Arc::new(Int64Array::from(vec![10, 20])),
            Arc::new(Int32Array::from(vec![500, 600])),
        ],
    )
    .unwrap();
    writer1.write_batch(&batch1, 10, 20).unwrap();
    let chunk1 = writer1.finish().unwrap();

    // Clamp file 2: re-clamps vid 20 with a different end and adds vid 30.
    // Since it comes later, its value for vid 20 must win.
    let path2 = tmp_dir.join("clamp_000001.parquet");
    let mut writer2 =
        ParquetChunkWriter::new(path2, "clamp_000001.parquet".into(), &schema).unwrap();
    let batch2 = arrow::array::RecordBatch::try_new(
        Arc::new(schema.clone()),
        vec![
            Arc::new(Int64Array::from(vec![20, 30])),
            Arc::new(Int32Array::from(vec![700, 800])),
        ],
    )
    .unwrap();
    writer2.write_batch(&batch2, 20, 30).unwrap();
    let chunk2 = writer2.finish().unwrap();

    // The finished chunks can be used directly; rebuilding them with
    // `ChunkInfo { file: chunk.file, ..chunk }` was an identity no-op.
    let clamps = vec![chunk1, chunk2];

    let map = load_clamps(&tmp_dir, &clamps).unwrap();
    assert_eq!(map.len(), 3);
    assert_eq!(map[&10], 500);
    assert_eq!(map[&20], 700); // overwritten by second clamp file
    assert_eq!(map[&30], 800);

    // Cleanup is best-effort; a leftover dir must not fail the test.
    let _ = std::fs::remove_dir_all(&tmp_dir);
}
11371226
}

store/postgres/src/parquet/schema.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,15 @@ pub fn data_sources_arrow_schema() -> Schema {
6262
])
6363
}
6464

65+
/// Arrow schema for clamp files: records which rows had their
66+
/// `block_range` upper bound set (clamped) since a previous dump.
67+
pub fn clamp_arrow_schema() -> Schema {
68+
Schema::new(vec![
69+
Field::new("vid", DataType::Int64, false),
70+
Field::new("block_range_end", DataType::Int32, false),
71+
])
72+
}
73+
6574
fn column_type_to_arrow(ct: &ColumnType) -> DataType {
6675
match ct {
6776
ColumnType::Boolean => DataType::Boolean,
@@ -361,6 +370,31 @@ mod tests {
361370
);
362371
}
363372

373+
/// Verifies the clamp schema: field names, order, types, and
/// non-nullability of both columns.
#[test]
fn clamp_schema() {
    let schema = clamp_arrow_schema();

    let names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
    assert_eq!(names, &["vid", "block_range_end"]);

    let vid = schema.field_with_name("vid").unwrap();
    assert_eq!(vid.data_type(), &DataType::Int64);
    assert!(!vid.is_nullable());

    let end = schema.field_with_name("block_range_end").unwrap();
    assert_eq!(end.data_type(), &DataType::Int32);
    assert!(!end.is_nullable());
}
397+
364398
#[test]
365399
fn enum_columns_map_to_utf8() {
366400
let layout = test_layout(

0 commit comments

Comments
 (0)