Skip to content

Commit f345865

Browse files
committed
[CUDA] PagedAttention: fix int32 overflow + heuristic max_query_len (tianleiwu review)
Two silent-corruption bugs flagged in the PR #28200 review: 1. Int32 overflow in GatherAndExpandPagedKVCache / LaunchGatherAndExpandPagedKVCache. `total_kv_tokens * num_heads * head_size` was computed in int; for realistic large- context GQA configs (e.g. 2M tokens * 64 heads * 128 head_size = 16.4B) this overflows INT32_MAX, producing a wrong element count, wrong block count, and wrong `tid` bound — silent corruption or OOB reads. Kernel now takes total_elems as int64_t and uses a grid-stride loop instead of a per-thread (tid >= total_elems) early-exit. Launcher computes total_elems in int64_t and caps the grid at kMaxBlocks = 65535 (grid-stride loop covers the rest). paged_idx, page_stride, and the outer stride are all int64_t so no intermediate multiplication overflows. 2. max_query_len heuristic (token_count - batch_size + 1) silently drops query tokens in the MEA path. CUTLASS MEA uses `p.sequence_length` directly as grid_x (ceil_div(sequence_length, kQueriesPerBlock)); missing blocks are never launched, so if any batch has 0 new query tokens the heuristic underestimates the actual max and query tokens from larger batches are silently unprocessed. Same issue affects the rotary grid. The FA path is unaffected — max_query_len is a hint there. Added an int max_query_len field to PagedAttentionData. paged_attention.cc now D->H syncs the full cumulative_seqlens_q (and cumulative_seqlens_kv) — both are batch_size+1 ints so the extra copy is cheap and avoids a second sync. The host computes max per-batch new-query length and propagates via data.max_query_len; EfficientAttention uses data.max_query_len instead of the heuristic.
1 parent 93aff52 commit f345865

3 files changed

Lines changed: 93 additions & 51 deletions

File tree

onnxruntime/contrib_ops/cuda/bert/attention_data.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,13 @@ struct PagedAttentionData {
233233
// Populated by the caller after a D->H sync on cumulative_seqlens_kv[batch_size].
234234
int total_kv_tokens = 0;
235235

236+
// Actual max of per-batch new-query lengths (cumulative_seqlens_q[i+1] - cumulative_seqlens_q[i]).
237+
// Populated by the caller via the same D->H sync so the MEA path's rotary grid and MEA's
238+
// grid_x (ceil_div(sequence_length, kQueriesPerBlock)) cover every query token. The previous
239+
// heuristic `token_count - batch_size + 1` underestimates when any batch has 0 new tokens,
240+
// producing silent dropping of query tokens in the MEA and rotary paths.
241+
int max_query_len = 0;
242+
236243
// Output Tensors
237244
T* output = nullptr;
238245

onnxruntime/contrib_ops/cuda/bert/paged_attention.cc

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,18 +221,38 @@ Status PagedAttention<T>::ComputeInternal(OpKernelContext* context) const {
221221
parameters.batch_size, cuda_stream));
222222

223223
int total_kv_tokens = 0;
224+
int max_query_len = 0;
224225
IAllocatorUniquePtr<void> gathered_key_buffer;
225226
IAllocatorUniquePtr<void> gathered_value_buffer;
226227
IAllocatorUniquePtr<void> fmha_buffer;
227228

228229
#if USE_MEMORY_EFFICIENT_ATTENTION
229230
if (use_memory_efficient_attention) {
230-
auto total_kv_pinned = this->AllocateBufferOnCPUPinned<int>(1);
231-
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(total_kv_pinned.get(),
232-
cumulative_seqlens_kv_ptr + parameters.batch_size,
233-
sizeof(int), cudaMemcpyDeviceToHost, cuda_stream));
231+
// MEA needs two host-side quantities:
232+
// - total_kv_tokens (= cumulative_seqlens_kv[batch_size]) to size tight gather buffers.
233+
// - max_query_len (= max per-batch new-query length) to size the rotary and MEA grids
234+
// correctly. The heuristic `token_count - batch_size + 1` underestimates when any
235+
// batch has 0 new tokens (valid input), silently dropping query tokens from those
236+
// larger-than-average batches.
237+
// Both come from cumulative_seqlens_q / cumulative_seqlens_kv, which are tiny (batch+1
238+
// ints each), so one D->H copy of the full arrays is cheaper than issuing an extra
239+
// reduction kernel and avoids a second sync.
240+
const int kCumulativeCount = parameters.batch_size + 1;
241+
auto cum_q_pinned = this->AllocateBufferOnCPUPinned<int>(kCumulativeCount);
242+
auto cum_kv_pinned = this->AllocateBufferOnCPUPinned<int>(kCumulativeCount);
243+
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cum_q_pinned.get(),
244+
reinterpret_cast<const int*>(cumulative_seqlens_q->Data<int>()),
245+
sizeof(int) * kCumulativeCount, cudaMemcpyDeviceToHost, cuda_stream));
246+
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cum_kv_pinned.get(), cumulative_seqlens_kv_ptr,
247+
sizeof(int) * kCumulativeCount, cudaMemcpyDeviceToHost, cuda_stream));
234248
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(cuda_stream));
235-
total_kv_tokens = total_kv_pinned.get()[0];
249+
total_kv_tokens = cum_kv_pinned.get()[parameters.batch_size];
250+
for (int i = 0; i < parameters.batch_size; ++i) {
251+
const int q_len_i = cum_q_pinned.get()[i + 1] - cum_q_pinned.get()[i];
252+
if (q_len_i > max_query_len) {
253+
max_query_len = q_len_i;
254+
}
255+
}
236256
if (total_kv_tokens == 0) {
237257
// Legal empty-input case: token_count == 0 and all past_seqlens == 0 — nothing to do.
238258
// The paged key/value caches are alias-outputs already bound to the input caches
@@ -305,6 +325,7 @@ Status PagedAttention<T>::ComputeInternal(OpKernelContext* context) const {
305325
data.fmha_buffer = reinterpret_cast<CudaT*>(fmha_buffer.get());
306326
}
307327
data.total_kv_tokens = total_kv_tokens;
328+
data.max_query_len = max_query_len;
308329
}
309330

310331
cublasHandle_t cublas = GetCublasHandle(context);

onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu

Lines changed: 60 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,9 @@ Status LaunchReshapeAndCache(const T* key, const T* value, T* key_cache, T* valu
239239
}
240240

241241
// Gather paged KV into packed-varlen [total_kv_tokens, num_heads, head_size], expanding GQA heads.
242+
// total_elems = total_kv_tokens * num_heads * head_size can exceed INT32_MAX for realistic
243+
// large-context GQA configs (e.g., 2M tokens * 64 * 128 = 16.4B), so the linear index is int64_t
244+
// and the kernel uses a grid-stride loop instead of a single (tid >= total_elems) early-exit.
242245
template <typename T>
243246
__global__ void GatherAndExpandPagedKVCache(const T* __restrict__ key_cache,
244247
const T* __restrict__ value_cache,
@@ -252,52 +255,54 @@ __global__ void GatherAndExpandPagedKVCache(const T* __restrict__ key_cache,
252255
const int head_size,
253256
const int block_size,
254257
const int max_num_blocks_per_seq,
255-
const int total_kv_tokens) {
256-
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
257-
const int total_elems = total_kv_tokens * num_heads * head_size;
258-
if (tid >= total_elems) {
259-
return;
260-
}
261-
262-
const int h = tid % head_size;
263-
const int head_id = (tid / head_size) % num_heads;
264-
const int token_id = tid / (num_heads * head_size);
265-
266-
// cumulative_seqlens_kv is a prefix sum of non-negative per-batch KV lengths
267-
// (past_seqlens[i] + new_tokens[i]), so it is monotonically non-decreasing for
268-
// any valid op input — the same assumption the previous linear scan made.
269-
// Binary-search for the batch this token belongs to: log2(batch_size) is strictly
270-
// better than the linear scan, which ran once per (token, head, h) element and
271-
// multiplied its cost by num_heads * head_size.
272-
int left = 0;
273-
int right = batch_size;
274-
while (left < right) {
275-
const int mid = left + (right - left) / 2;
276-
if (token_id < cumulative_seqlens_kv[mid + 1]) {
277-
right = mid;
278-
} else {
279-
left = mid + 1;
258+
const int64_t total_elems) {
259+
const int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
260+
const int64_t num_heads_times_head = static_cast<int64_t>(num_heads) * head_size;
261+
const int q_kv_head_ratio = num_heads / kv_num_heads;
262+
const int64_t page_stride = static_cast<int64_t>(block_size) * kv_num_heads * head_size;
263+
264+
for (int64_t tid = threadIdx.x + static_cast<int64_t>(blockIdx.x) * blockDim.x;
265+
tid < total_elems;
266+
tid += stride) {
267+
const int h = static_cast<int>(tid % head_size);
268+
const int head_id = static_cast<int>((tid / head_size) % num_heads);
269+
const int token_id = static_cast<int>(tid / num_heads_times_head);
270+
271+
// cumulative_seqlens_kv is a prefix sum of non-negative per-batch KV lengths
272+
// (past_seqlens[i] + new_tokens[i]), so it is monotonically non-decreasing for
273+
// any valid op input — the same assumption the previous linear scan made.
274+
// Binary-search for the batch this token belongs to: log2(batch_size) is strictly
275+
// better than the linear scan, which ran once per (token, head, h) element and
276+
// multiplied its cost by num_heads * head_size.
277+
int left = 0;
278+
int right = batch_size;
279+
while (left < right) {
280+
const int mid = left + (right - left) / 2;
281+
if (token_id < cumulative_seqlens_kv[mid + 1]) {
282+
right = mid;
283+
} else {
284+
left = mid + 1;
285+
}
280286
}
281-
}
282-
const int batch_id = left;
287+
const int batch_id = left;
283288

284-
const int pos = token_id - cumulative_seqlens_kv[batch_id];
285-
const int block_idx_in_seq = pos / block_size;
286-
const int block_offset = pos % block_size;
287-
const int block_id = block_table[batch_id * max_num_blocks_per_seq + block_idx_in_seq];
289+
const int pos = token_id - cumulative_seqlens_kv[batch_id];
290+
const int block_idx_in_seq = pos / block_size;
291+
const int block_offset = pos % block_size;
292+
const int block_id = block_table[batch_id * max_num_blocks_per_seq + block_idx_in_seq];
288293

289-
// GQA expansion: each output head maps to kv_head_id = head_id / (num_heads / kv_num_heads).
290-
// For MHA (num_heads == kv_num_heads) this is the identity.
291-
const int q_kv_head_ratio = num_heads / kv_num_heads;
292-
const int kv_head_id = head_id / q_kv_head_ratio;
294+
// GQA expansion: each output head maps to kv_head_id = head_id / (num_heads / kv_num_heads).
295+
// For MHA (num_heads == kv_num_heads) this is the identity.
296+
const int kv_head_id = head_id / q_kv_head_ratio;
293297

294-
const int paged_idx = block_id * block_size * kv_num_heads * head_size +
295-
block_offset * kv_num_heads * head_size +
296-
kv_head_id * head_size +
297-
h;
298+
const int64_t paged_idx = static_cast<int64_t>(block_id) * page_stride +
299+
static_cast<int64_t>(block_offset) * kv_num_heads * head_size +
300+
kv_head_id * head_size +
301+
h;
298302

299-
gathered_key[tid] = key_cache[paged_idx];
300-
gathered_value[tid] = value_cache[paged_idx];
303+
gathered_key[tid] = key_cache[paged_idx];
304+
gathered_value[tid] = value_cache[paged_idx];
305+
}
301306
}
302307

303308
template <typename T>
@@ -309,17 +314,22 @@ Status LaunchGatherAndExpandPagedKVCache(const T* key_cache, const T* value_cach
309314
const int block_size, const int max_num_blocks_per_seq,
310315
const int total_kv_tokens, cudaStream_t stream,
311316
const int max_threads_per_block) {
312-
const int total_elems = total_kv_tokens * num_heads * head_size;
317+
const int64_t total_elems = static_cast<int64_t>(total_kv_tokens) * num_heads * head_size;
313318
if (total_elems == 0) {
314319
return Status::OK();
315320
}
316-
const int threads = std::min(total_elems, max_threads_per_block);
317-
const int blocks = (total_elems + threads - 1) / threads;
321+
// With the op's batch_size <= 256 precondition (paged_attention.cc) and MEA's
322+
// head_size <= 1024 cap, blocks_needed = ceil(total_elems / threads) stays comfortably
323+
// within int range for any realistic input, so no explicit clamp is needed. The kernel
324+
// uses a grid-stride loop so launching fewer blocks than total_elems / threads would
325+
// also be correct — we don't need an artificial "keep SMs busy" cap.
326+
const int threads = static_cast<int>(std::min<int64_t>(max_threads_per_block, total_elems));
327+
const int blocks = static_cast<int>((total_elems + threads - 1) / threads);
318328
GatherAndExpandPagedKVCache<T><<<blocks, threads, 0, stream>>>(
319329
key_cache, value_cache, gathered_key, gathered_value,
320330
block_table, cumulative_seqlens_kv,
321331
batch_size, num_heads, kv_num_heads, head_size,
322-
block_size, max_num_blocks_per_seq, total_kv_tokens);
332+
block_size, max_num_blocks_per_seq, total_elems);
323333
return CUDA_CALL(cudaGetLastError());
324334
}
325335

@@ -445,7 +455,11 @@ Status EfficientAttention(
445455
const int max_num_blocks_per_seq = parameters.max_num_blocks_per_seq;
446456
const int local_window_size = parameters.local_window_size;
447457
const int total_kv_tokens = data.total_kv_tokens;
448-
const int max_query_len = token_count - batch_size + 1;
458+
// Use the caller-computed actual max of per-batch new-query lengths, not the
459+
// `token_count - batch_size + 1` heuristic: the heuristic assumes >=1 new token per batch
460+
// and underestimates otherwise, which would silently drop query tokens from the
461+
// rotary grid and from MEA's `grid_x = ceil_div(sequence_length, kQueriesPerBlock)`.
462+
const int max_query_len = data.max_query_len;
449463

450464
T* query = const_cast<T*>(data.query);
451465
T* key;

0 commit comments

Comments (0)