Skip to content

Commit a208df8

Browse files
tianleiwu and Copilot authored
Use SafeInt for size arithmetic in CPU tensor operators to prevent overflow (#28060)
### Description Replace unchecked `int64_t` size/offset arithmetic with `SafeInt<size_t>` across several CPU operator implementations to prevent silent integer overflow when computing buffer offsets and allocation sizes. All changed expressions compute non-negative element counts or byte offsets used in pointer arithmetic, `memset`, `std::copy_n`, `std::fill_n`, or allocator calls. On models with large tensor dimensions the intermediate products (e.g., `N * C * H * W`) can overflow `int64_t` before the result is used. Wrapping the leading factor in `SafeInt<size_t>()` ensures every intermediate multiplication is overflow-checked and produces a `size_t` result. ### Motivation and Context Integer overflow in size calculations can lead to undersized allocations, out-of-bounds memory access, or incorrect pointer offsets — all of which are security-sensitive. This change hardens the affected code paths against such overflow. ### Key Changes | File | Change | |---|---| | `core/providers/cpu/tensor/grid_sample.cc` | Wrap grid/input/output offset computations with `SafeInt<size_t>`, chain all factors through SafeInt instead of parenthesized sub-expressions | | `core/providers/cpu/tensor/affine_grid.cc` | Wrap batch offset and Eigen map size computations with `SafeInt<size_t>` | | `core/providers/cpu/tensor/upsample_antialias.h` | Replace `narrow<size_t>(a * b)` and `static_cast<size_t>(a * b)` with `SafeInt<size_t>(a) * b` for temp buffer sizes, span extents, and copy counts | | `core/providers/cpu/nn/tfidfvectorizer.cc` | Wrap `memset` byte-count computation with `SafeInt` | | `core/providers/cpu/quantization/qlinearconv.cc` | Wrap `Alloc()` / `MakeUniquePtr` size computation with `SafeInt` | | `core/providers/cpu/quantization/quantize_linear.cc` | Wrap sub-byte quantization total-size computation with `SafeInt` | | `core/providers/cpu/sequence/sequence_ops.cc` | Wrap `SplitToSequence` offset and copy-count computations with `SafeInt` | ### Testing Existing unit tests 
cover the functional behavior of all affected operators. The change is purely defensive — it makes previously unchecked arithmetic throw on overflow instead of silently wrapping, with no change to behavior for in-range inputs. --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent 3f74b3c commit a208df8

File tree

7 files changed

+101
-63
lines changed

7 files changed

+101
-63
lines changed

onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ Status TfIdfVectorizer::Compute(OpKernelContext* ctx) const {
382382
// TfidfVectorizer returns a zero tensor of shape
383383
// {b_dim, output_size} when b_dim is the number of received observations
384384
// and output_size the is the maximum value in ngram_indexes attribute plus 1.
385-
memset(output_data, 0, static_cast<size_t>(output_shape.Size() * sizeof(float)));
385+
memset(output_data, 0, static_cast<size_t>(SafeInt<size_t>(output_shape.Size()) * sizeof(float)));
386386
return Status::OK();
387387
}
388388

onnxruntime/core/providers/cpu/quantization/qlinearconv.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,8 @@ class QLinearConv : public OpKernel {
251251
//
252252
// Note: The size of this buffer is less than or equal to the size of the original
253253
// weight tensor, so the allocation size is guaranteed to fit inside size_t.
254-
auto* group_reordered_W = static_cast<int8_t*>(alloc->Alloc(group_output_channels * group_input_channels * kernel_size));
254+
auto* group_reordered_W = static_cast<int8_t*>(alloc->Alloc(
255+
static_cast<size_t>(SafeInt<size_t>(group_output_channels) * group_input_channels * kernel_size)));
255256
BufferUniquePtr group_reordered_W_buffer(group_reordered_W, BufferDeleter(alloc));
256257

257258
const size_t W_offset = group_output_channels * kernel_dim;
@@ -439,7 +440,9 @@ Status QLinearConv<ActType>::PrePack(const Tensor& tensor, int input_idx, Alloca
439440
//
440441
// Note: The size of this buffer is less than or equal to the size of the original
441442
// weight tensor, so the allocation size is guaranteed to fit inside size_t.
442-
auto group_reordered_W_buffer = IAllocator::MakeUniquePtr<void>(alloc, group_output_channels * group_input_channels * kernel_size, true);
443+
auto group_reordered_W_buffer = IAllocator::MakeUniquePtr<void>(
444+
alloc, static_cast<size_t>(SafeInt<size_t>(group_output_channels) * group_input_channels * kernel_size),
445+
true);
443446
auto* group_reordered_W = static_cast<uint8_t*>(group_reordered_W_buffer.get());
444447

445448
const size_t W_offset = group_output_channels * kernel_dim;

onnxruntime/core/providers/cpu/quantization/quantize_linear.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -872,7 +872,7 @@ void ComputeLoop(OpKernelContext* ctx, const InT* input, const InT* scale, const
872872
output_index += static_cast<size_t>(N); \
873873
} \
874874
} \
875-
assert(output_index == static_cast<size_t>(M * K * N)); \
875+
assert(output_index == static_cast<size_t>(SafeInt<size_t>(M) * K * N)); \
876876
}
877877

878878
DEFINE_COMPUTE_LOOP_FP32_TO_SUB_BYTE(Int4x2, ParQuantizeLinearStdS4, 2)
@@ -890,7 +890,7 @@ DEFINE_COMPUTE_LOOP_FP32_TO_SUB_BYTE(UInt2x4, ParQuantizeLinearStdU2, 4)
890890
int64_t K, int64_t N, bool saturate) { \
891891
ORT_UNUSED_PARAMETER(saturate); \
892892
\
893-
size_t total_size = static_cast<size_t>(M * K * N); \
893+
size_t total_size = static_cast<size_t>(SafeInt<size_t>(M) * K * N); \
894894
auto tmp_buf = std::make_unique<SUB_BYTE_TYPE::UnpackedType[]>(total_size); \
895895
size_t tmp_buf_index = 0; \
896896
constexpr size_t shift_bits = (ELEMENTS_PER_BYTE == 2) ? 1 : 2; /* log2(ELEMENTS_PER_BYTE) */ \

onnxruntime/core/providers/cpu/sequence/sequence_ops.cc

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "core/providers/cpu/sequence/sequence_ops.h"
55

66
#include "core/common/narrow.h"
7+
#include "core/common/safeint.h"
78
#include "core/framework/tensorprotoutils.h"
89
#include "core/framework/TensorSeq.h"
910
#include "core/framework/op_kernel_type_control_utils.h"
@@ -517,7 +518,8 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu
517518
void* output_data = output_tensor.MutableDataRaw();
518519

519520
const auto M = before_dims;
520-
const auto* A = static_cast<const char*>(input_data) + static_cast<size_t>(input_offset * element_size);
521+
const auto* A =
522+
static_cast<const char*>(input_data) + static_cast<size_t>(SafeInt<size_t>(input_offset) * element_size);
521523
const auto lda = after_dims_including_split_axis;
522524
auto* B = output_data;
523525

@@ -528,7 +530,7 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu
528530
const auto* src = reinterpret_cast<const std::string*>(A);
529531
auto* dst = reinterpret_cast<std::string*>(B);
530532
if (lda == N) {
531-
copy_data<std::string>(src, dst, static_cast<size_t>(M * N));
533+
copy_data<std::string>(src, dst, static_cast<size_t>(SafeInt<size_t>(M) * N));
532534
} else {
533535
size_t lda_offset = 0;
534536
size_t ldb_offset = 0;
@@ -540,13 +542,13 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu
540542
} else {
541543
if (lda == N) {
542544
// if the data is contiguous, we can just copy the data
543-
const size_t bytes_to_copy = static_cast<size_t>(N) * static_cast<size_t>(M) * element_size;
545+
const size_t bytes_to_copy = static_cast<size_t>(SafeInt<size_t>(N) * M * element_size);
544546
memcpy(B, A, bytes_to_copy);
545547
} else {
546548
// otherwise we need to copy each row
547-
const size_t row_bytes = SafeInt<size_t>(N) * element_size;
548-
const auto lda_bytes_inc = SafeInt<size_t>(lda) * element_size;
549-
const auto ldb_bytes_inc = SafeInt<size_t>(ldb) * element_size;
549+
const size_t row_bytes = static_cast<size_t>(SafeInt<size_t>(N) * element_size);
550+
const auto lda_bytes_inc = static_cast<size_t>(SafeInt<size_t>(lda) * element_size);
551+
const auto ldb_bytes_inc = static_cast<size_t>(SafeInt<size_t>(ldb) * element_size);
550552
SafeInt<size_t> lda_bytes_offset = 0;
551553
SafeInt<size_t> ldb_bytes_offset = 0;
552554
for (size_t idx = 0; idx < static_cast<size_t>(M); ++idx,

onnxruntime/core/providers/cpu/tensor/affine_grid.cc

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "core/providers/cpu/tensor/affine_grid.h"
55

66
#include "core/common/common.h"
7+
#include "core/common/safeint.h"
78
#include "core/providers/op_kernel_type_control.h"
89
#include "core/util/math_cpuonly.h"
910
#include <Eigen/Dense>
@@ -78,9 +79,10 @@ void affine_grid_generator_2d(const Tensor* theta, const Eigen::Matrix<T, 2, Eig
7879
const Eigen::Matrix<T, 2, 2, option> theta_R{{theta_data[0], theta_data[1]}, {theta_data[3], theta_data[4]}}; // 2x2
7980
const Eigen::Array<T, 2, 1> theta_T(theta_data[2], theta_data[5]); // 2x1
8081

81-
auto grid_batch_offset = batch_num * H * W * 2;
82+
const auto grid_batch_offset = static_cast<size_t>(SafeInt<size_t>(batch_num) * H * W * 2);
8283
T* grid_data = grid->MutableData<T>() + grid_batch_offset;
83-
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 2, option>> grid_matrix(grid_data, narrow<size_t>(H * W), 2);
84+
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 2, option>> grid_matrix(
85+
grid_data, static_cast<size_t>(SafeInt<size_t>(H) * W), 2);
8486
grid_matrix = ((theta_R * base_grid_transposed).array().colwise() + theta_T).matrix().transpose(); // ((2x2 * 2xN).array().colwise() + 2x1).matrix().transpose() => Nx2
8587
}
8688

@@ -97,9 +99,10 @@ void affine_grid_generator_3d(const Tensor* theta, const Eigen::Matrix<T, 3, Eig
9799

98100
const Eigen::Array<T, 3, 1> theta_T(theta_data[3], theta_data[7], theta_data[11]); // 3x1
99101

100-
auto grid_batch_offset = batch_num * D * H * W * 3;
102+
const auto grid_batch_offset = static_cast<size_t>(SafeInt<size_t>(batch_num) * D * H * W * 3);
101103
T* grid_data = grid->MutableData<T>() + grid_batch_offset;
102-
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 3, option>> grid_matrix(grid_data, narrow<size_t>(D * H * W), 3);
104+
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 3, option>> grid_matrix(
105+
grid_data, static_cast<size_t>(SafeInt<size_t>(D) * H * W), 3);
103106
grid_matrix = ((theta_R * base_grid_transposed).array().colwise() + theta_T).matrix().transpose();
104107
}
105108

onnxruntime/core/providers/cpu/tensor/grid_sample.cc

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <vector>
55

6+
#include "core/common/safeint.h"
67
#include "core/providers/cpu/tensor/grid_sample.h"
78
#include "core/framework/element_type_lists.h"
89
#include "core/framework/TensorSeq.h"
@@ -379,12 +380,14 @@ Status GridSample<T>::Compute(OpKernelContext* context) const {
379380
}
380381
} else {
381382
for (int64_t n = 0; n < N; n++) {
382-
const T* grid_data = grid->Data<T>() + n * (H_out * W_out) * 2;
383+
const T* grid_data = grid->Data<T>() + static_cast<size_t>(SafeInt<size_t>(n) * H_out * W_out * 2);
383384
concurrency::ThreadPool::TrySimpleParallelFor(
384385
tp, onnxruntime::narrow<std::ptrdiff_t>(C),
385386
[&](std::ptrdiff_t c) {
386-
const T* X_data = input->Data<T>() + (n * C + c) * (H_in * W_in);
387-
T* Y_data = Y.MutableData<T>() + (n * C + c) * (H_out * W_out);
387+
const SafeInt<size_t> nc = SafeInt<size_t>(n) * SafeInt<size_t>(C) + SafeInt<size_t>(c);
388+
const T* X_data =
389+
input->Data<T>() + static_cast<size_t>(nc * H_in * W_in);
390+
T* Y_data = Y.MutableData<T>() + static_cast<size_t>(nc * H_out * W_out);
388391

389392
for (int64_t oy = 0; oy < H_out; oy++) {
390393
for (int64_t ox = 0; ox < W_out; ox++) {
@@ -469,12 +472,17 @@ Status GridSample<T>::Compute(OpKernelContext* context) const {
469472

470473
concurrency::ThreadPool* tp = D_out * H_out * W_out > 64 ? context->GetOperatorThreadPool() : nullptr;
471474
for (int64_t n = 0; n < N; n++) {
472-
const T* grid_data = grid->Data<T>() + n * (D_out * H_out * W_out) * 3;
475+
const T* grid_data = grid->Data<T>() + static_cast<size_t>(SafeInt<size_t>(n) * D_out * H_out * W_out * 3);
473476
concurrency::ThreadPool::TrySimpleParallelFor(
474477
tp, onnxruntime::narrow<std::ptrdiff_t>(C),
475478
[&](std::ptrdiff_t c) {
476-
const T* X_data = input->Data<T>() + (n * C + c) * (D_in * H_in * W_in);
477-
T* Y_data = Y.MutableData<T>() + (n * C + c) * (D_out * H_out * W_out);
479+
const SafeInt<size_t> nc = SafeInt<size_t>(n) * SafeInt<size_t>(C) + SafeInt<size_t>(c);
480+
const SafeInt<size_t> input_plane_offset = nc * SafeInt<size_t>(D_in) * SafeInt<size_t>(H_in) * SafeInt<size_t>(W_in);
481+
const SafeInt<size_t> output_plane_offset = nc * SafeInt<size_t>(D_out) * SafeInt<size_t>(H_out) * SafeInt<size_t>(W_out);
482+
const T* X_data =
483+
input->Data<T>() + static_cast<size_t>(input_plane_offset);
484+
T* Y_data =
485+
Y.MutableData<T>() + static_cast<size_t>(output_plane_offset);
478486

479487
for (int64_t oz = 0; oz < D_out; oz++) {
480488
for (int64_t oy = 0; oy < H_out; oy++) {

0 commit comments

Comments
 (0)