Skip to content

Commit a9ffd5d

Browse files
author
thexia
committed
ORC-2619: Fix estimateRgEndOffset slop calculation for incompressible data
The stretchFactor calculation in estimateRgEndOffset did not account for the 2-byte RLEv2 DIRECT run header. This caused insufficient buffer allocation when data is incompressible, leading to 'Buffer size too small' errors. Fix: Include RLE_V2_HEADER_SIZE in the worst-case payload calculation. Add test demonstrating the issue with the old formula.
1 parent 3563ee5 commit a9ffd5d

2 files changed

Lines changed: 73 additions & 1 deletion

File tree

java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,9 @@ public static long estimateRgEndOffset(boolean isCompressed,
217217
// Stretch the slop by a factor to safely accommodate following compression blocks.
218218
// We need to calculate the maximum number of blocks(stretchFactor) by bufferSize accordingly.
219219
if (isCompressed) {
220-
int stretchFactor = 2 + (MAX_VALUES_LENGTH * MAX_BYTE_WIDTH - 1) / bufferSize;
220+
// RLEv2 DIRECT runs can need a 2-byte header in addition to their value payload.
221+
int maxRleDirectRunSize = MAX_VALUES_LENGTH * MAX_BYTE_WIDTH + RLE_V2_HEADER_SIZE;
222+
int stretchFactor = 2 + (maxRleDirectRunSize - 1) / bufferSize;
221223
slop = stretchFactor * (OutStream.HEADER_SIZE + bufferSize);
222224
}
223225
return isLast ? streamLength : Math.min(streamLength, nextGroupOffset + slop);
@@ -300,6 +302,8 @@ public static int getIndexPosition(OrcProto.ColumnEncoding.Kind columnEncoding,
300302
// the maximum byte width for each value
301303
static final int MAX_BYTE_WIDTH =
302304
SerializationUtils.decodeBitWidth(SerializationUtils.FixedBitSizes.SIXTYFOUR.ordinal()) / 8;
305+
// RLEv2 DIRECT run header size in bytes
306+
public static final int RLE_V2_HEADER_SIZE = 2;
303307

304308
/**
305309
* Is this stream part of a dictionary?

java/core/src/test/org/apache/orc/impl/TestInStream.java

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,16 @@
3535
import java.util.ArrayList;
3636
import java.util.Arrays;
3737
import java.util.List;
38+
import java.util.Random;
3839

40+
import static org.apache.orc.impl.RecordReaderUtils.MAX_BYTE_WIDTH;
41+
import static org.apache.orc.impl.RecordReaderUtils.MAX_VALUES_LENGTH;
3942
import static org.junit.jupiter.api.Assertions.assertEquals;
4043
import static org.junit.jupiter.api.Assertions.assertNotEquals;
4144
import static org.junit.jupiter.api.Assertions.assertNotSame;
4245
import static org.junit.jupiter.api.Assertions.assertSame;
46+
import static org.junit.jupiter.api.Assertions.assertThrows;
47+
import static org.junit.jupiter.api.Assertions.assertTrue;
4348
import static org.junit.jupiter.api.Assertions.fail;
4449

4550
public class TestInStream {
@@ -1000,4 +1005,67 @@ public void testStreamResetWithoutIncreasedLength() throws IOException {
10001005
byte[] inBuffer = new byte[5];
10011006
assertEquals(5, inStream.read(inBuffer));
10021007
}
1008+
1009+
/**
1010+
* Demonstrates that the old estimateRgEndOffset slop calculation is insufficient.
1011+
* When a compressed stream is truncated at the old estimated end offset,
1012+
* reading a full RLE v2 DIRECT run fails because the estimated slop doesn't
1013+
* account for enough compressed blocks.
1014+
*/
1015+
@Test
1016+
public void testTruncatedRleV2DirectRunAtEstimatedEndFails() throws Exception {
1017+
final int bufferSize = 1024;
1018+
final int chunkSize = OutStream.HEADER_SIZE + bufferSize;
1019+
final int nextGroupOffset = bufferSize;
1020+
final int oldStretchFactor =
1021+
2 + (MAX_VALUES_LENGTH * MAX_BYTE_WIDTH - 1) / bufferSize;
1022+
final int oldEstimatedEnd = nextGroupOffset + oldStretchFactor * chunkSize;
1023+
1024+
TestInStream.OutputCollector receiver = new TestInStream.OutputCollector();
1025+
CompressionCodec codec = new ZlibCodec();
1026+
StreamOptions streamOptions = new StreamOptions(bufferSize)
1027+
.withCodec(codec, codec.getDefaultOptions());
1028+
byte[] data = new byte[bufferSize * 6];
1029+
new Random(42).nextBytes(data);
1030+
try (OutStream out = new OutStream("test", streamOptions, receiver)) {
1031+
out.write(data);
1032+
out.flush();
1033+
}
1034+
1035+
byte[] encoded = receiver.buffer.get();
1036+
assertEquals(nextGroupOffset + 5 * chunkSize, oldEstimatedEnd);
1037+
assertTrue(encoded.length > oldEstimatedEnd);
1038+
1039+
InStream stream = InStream.create("test",
1040+
new BufferChunk(ByteBuffer.wrap(encoded, 0, oldEstimatedEnd), 0),
1041+
0, oldEstimatedEnd,
1042+
InStream.options().withCodec(codec).withBufferSize(bufferSize));
1043+
byte[] rleDirectRun = new byte[MAX_VALUES_LENGTH * MAX_BYTE_WIDTH
1044+
+ RecordReaderUtils.RLE_V2_HEADER_SIZE];
1045+
1046+
stream.seek(new SimplePositionProvider(nextGroupOffset, 0));
1047+
IllegalArgumentException error = assertThrows(
1048+
IllegalArgumentException.class, () -> {
1049+
int offset = 0;
1050+
while (offset < rleDirectRun.length) {
1051+
offset += stream.read(
1052+
rleDirectRun, offset, rleDirectRun.length - offset);
1053+
}
1054+
});
1055+
assertTrue(error.getMessage().contains("Buffer size too small"));
1056+
}
1057+
1058+
private static class SimplePositionProvider implements PositionProvider {
1059+
private final long[] positions;
1060+
private int index = 0;
1061+
1062+
SimplePositionProvider(long... positions) {
1063+
this.positions = positions;
1064+
}
1065+
1066+
@Override
1067+
public long getNext() {
1068+
return positions[index++];
1069+
}
1070+
}
10031071
}

0 commit comments

Comments
 (0)