Skip to content

Commit a208df8

Browse files
tianleiwu and Copilot authored
Use SafeInt for size arithmetic in CPU tensor operators to prevent overflow (#28060)
### Description Replace unchecked `int64_t` size/offset arithmetic with `SafeInt<size_t>` across several CPU operator implementations to prevent silent integer overflow when computing buffer offsets and allocation sizes. All changed expressions compute non-negative element counts or byte offsets used in pointer arithmetic, `memset`, `std::copy_n`, `std::fill_n`, or allocator calls. On models with large tensor dimensions the intermediate products (e.g., `N * C * H * W`) can overflow `int64_t` before the result is used. Wrapping the leading factor in `SafeInt<size_t>()` ensures every intermediate multiplication is overflow-checked and produces a `size_t` result. ### Motivation and Context Integer overflow in size calculations can lead to undersized allocations, out-of-bounds memory access, or incorrect pointer offsets — all of which are security-sensitive. This change hardens the affected code paths against such overflow. ### Key Changes | File | Change | |---|---| | `core/providers/cpu/tensor/grid_sample.cc` | Wrap grid/input/output offset computations with `SafeInt<size_t>`, chain all factors through SafeInt instead of parenthesized sub-expressions | | `core/providers/cpu/tensor/affine_grid.cc` | Wrap batch offset and Eigen map size computations with `SafeInt<size_t>` | | `core/providers/cpu/tensor/upsample_antialias.h` | Replace `narrow<size_t>(a * b)` and `static_cast<size_t>(a * b)` with `SafeInt<size_t>(a) * b` for temp buffer sizes, span extents, and copy counts | | `core/providers/cpu/nn/tfidfvectorizer.cc` | Wrap `memset` byte-count computation with `SafeInt` | | `core/providers/cpu/quantization/qlinearconv.cc` | Wrap `Alloc()` / `MakeUniquePtr` size computation with `SafeInt` | | `core/providers/cpu/quantization/quantize_linear.cc` | Wrap sub-byte quantization total-size computation with `SafeInt` | | `core/providers/cpu/sequence/sequence_ops.cc` | Wrap `SplitToSequence` offset and copy-count computations with `SafeInt` | ### Testing Existing unit tests 
cover the functional behavior of all affected operators. The change is purely defensive — it makes previously unchecked arithmetic throw on overflow instead of silently wrapping, with no change to behavior for in-range inputs. --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent 3f74b3c commit a208df8

File tree

7 files changed

+101
-63
lines changed

7 files changed

+101
-63
lines changed

onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ Status TfIdfVectorizer::Compute(OpKernelContext* ctx) const {
382382
// TfidfVectorizer returns a zero tensor of shape
383383
// {b_dim, output_size} when b_dim is the number of received observations
384384
// and output_size the is the maximum value in ngram_indexes attribute plus 1.
385-
memset(output_data, 0, static_cast<size_t>(output_shape.Size() * sizeof(float)));
385+
memset(output_data, 0, static_cast<size_t>(SafeInt<size_t>(output_shape.Size()) * sizeof(float)));
386386
return Status::OK();
387387
}
388388

onnxruntime/core/providers/cpu/quantization/qlinearconv.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,8 @@ class QLinearConv : public OpKernel {
251251
//
252252
// Note: The size of this buffer is less than or equal to the size of the original
253253
// weight tensor, so the allocation size is guaranteed to fit inside size_t.
254-
auto* group_reordered_W = static_cast<int8_t*>(alloc->Alloc(group_output_channels * group_input_channels * kernel_size));
254+
auto* group_reordered_W = static_cast<int8_t*>(alloc->Alloc(
255+
static_cast<size_t>(SafeInt<size_t>(group_output_channels) * group_input_channels * kernel_size)));
255256
BufferUniquePtr group_reordered_W_buffer(group_reordered_W, BufferDeleter(alloc));
256257

257258
const size_t W_offset = group_output_channels * kernel_dim;
@@ -439,7 +440,9 @@ Status QLinearConv<ActType>::PrePack(const Tensor& tensor, int input_idx, Alloca
439440
//
440441
// Note: The size of this buffer is less than or equal to the size of the original
441442
// weight tensor, so the allocation size is guaranteed to fit inside size_t.
442-
auto group_reordered_W_buffer = IAllocator::MakeUniquePtr<void>(alloc, group_output_channels * group_input_channels * kernel_size, true);
443+
auto group_reordered_W_buffer = IAllocator::MakeUniquePtr<void>(
444+
alloc, static_cast<size_t>(SafeInt<size_t>(group_output_channels) * group_input_channels * kernel_size),
445+
true);
443446
auto* group_reordered_W = static_cast<uint8_t*>(group_reordered_W_buffer.get());
444447

445448
const size_t W_offset = group_output_channels * kernel_dim;

onnxruntime/core/providers/cpu/quantization/quantize_linear.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -872,7 +872,7 @@ void ComputeLoop(OpKernelContext* ctx, const InT* input, const InT* scale, const
872872
output_index += static_cast<size_t>(N); \
873873
} \
874874
} \
875-
assert(output_index == static_cast<size_t>(M * K * N)); \
875+
assert(output_index == static_cast<size_t>(SafeInt<size_t>(M) * K * N)); \
876876
}
877877

878878
DEFINE_COMPUTE_LOOP_FP32_TO_SUB_BYTE(Int4x2, ParQuantizeLinearStdS4, 2)
@@ -890,7 +890,7 @@ DEFINE_COMPUTE_LOOP_FP32_TO_SUB_BYTE(UInt2x4, ParQuantizeLinearStdU2, 4)
890890
int64_t K, int64_t N, bool saturate) { \
891891
ORT_UNUSED_PARAMETER(saturate); \
892892
\
893-
size_t total_size = static_cast<size_t>(M * K * N); \
893+
size_t total_size = static_cast<size_t>(SafeInt<size_t>(M) * K * N); \
894894
auto tmp_buf = std::make_unique<SUB_BYTE_TYPE::UnpackedType[]>(total_size); \
895895
size_t tmp_buf_index = 0; \
896896
constexpr size_t shift_bits = (ELEMENTS_PER_BYTE == 2) ? 1 : 2; /* log2(ELEMENTS_PER_BYTE) */ \

onnxruntime/core/providers/cpu/sequence/sequence_ops.cc

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "core/providers/cpu/sequence/sequence_ops.h"
55

66
#include "core/common/narrow.h"
7+
#include "core/common/safeint.h"
78
#include "core/framework/tensorprotoutils.h"
89
#include "core/framework/TensorSeq.h"
910
#include "core/framework/op_kernel_type_control_utils.h"
@@ -517,7 +518,8 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu
517518
void* output_data = output_tensor.MutableDataRaw();
518519

519520
const auto M = before_dims;
520-
const auto* A = static_cast<const char*>(input_data) + static_cast<size_t>(input_offset * element_size);
521+
const auto* A =
522+
static_cast<const char*>(input_data) + static_cast<size_t>(SafeInt<size_t>(input_offset) * element_size);
521523
const auto lda = after_dims_including_split_axis;
522524
auto* B = output_data;
523525

@@ -528,7 +530,7 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu
528530
const auto* src = reinterpret_cast<const std::string*>(A);
529531
auto* dst = reinterpret_cast<std::string*>(B);
530532
if (lda == N) {
531-
copy_data<std::string>(src, dst, static_cast<size_t>(M * N));
533+
copy_data<std::string>(src, dst, static_cast<size_t>(SafeInt<size_t>(M) * N));
532534
} else {
533535
size_t lda_offset = 0;
534536
size_t ldb_offset = 0;
@@ -540,13 +542,13 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu
540542
} else {
541543
if (lda == N) {
542544
// if the data is contiguous, we can just copy the data
543-
const size_t bytes_to_copy = static_cast<size_t>(N) * static_cast<size_t>(M) * element_size;
545+
const size_t bytes_to_copy = static_cast<size_t>(SafeInt<size_t>(N) * M * element_size);
544546
memcpy(B, A, bytes_to_copy);
545547
} else {
546548
// otherwise we need to copy each row
547-
const size_t row_bytes = SafeInt<size_t>(N) * element_size;
548-
const auto lda_bytes_inc = SafeInt<size_t>(lda) * element_size;
549-
const auto ldb_bytes_inc = SafeInt<size_t>(ldb) * element_size;
549+
const size_t row_bytes = static_cast<size_t>(SafeInt<size_t>(N) * element_size);
550+
const auto lda_bytes_inc = static_cast<size_t>(SafeInt<size_t>(lda) * element_size);
551+
const auto ldb_bytes_inc = static_cast<size_t>(SafeInt<size_t>(ldb) * element_size);
550552
SafeInt<size_t> lda_bytes_offset = 0;
551553
SafeInt<size_t> ldb_bytes_offset = 0;
552554
for (size_t idx = 0; idx < static_cast<size_t>(M); ++idx,

onnxruntime/core/providers/cpu/tensor/affine_grid.cc

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "core/providers/cpu/tensor/affine_grid.h"
55

66
#include "core/common/common.h"
7+
#include "core/common/safeint.h"
78
#include "core/providers/op_kernel_type_control.h"
89
#include "core/util/math_cpuonly.h"
910
#include <Eigen/Dense>
@@ -78,9 +79,10 @@ void affine_grid_generator_2d(const Tensor* theta, const Eigen::Matrix<T, 2, Eig
7879
const Eigen::Matrix<T, 2, 2, option> theta_R{{theta_data[0], theta_data[1]}, {theta_data[3], theta_data[4]}}; // 2x2
7980
const Eigen::Array<T, 2, 1> theta_T(theta_data[2], theta_data[5]); // 2x1
8081

81-
auto grid_batch_offset = batch_num * H * W * 2;
82+
const auto grid_batch_offset = static_cast<size_t>(SafeInt<size_t>(batch_num) * H * W * 2);
8283
T* grid_data = grid->MutableData<T>() + grid_batch_offset;
83-
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 2, option>> grid_matrix(grid_data, narrow<size_t>(H * W), 2);
84+
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 2, option>> grid_matrix(
85+
grid_data, static_cast<size_t>(SafeInt<size_t>(H) * W), 2);
8486
grid_matrix = ((theta_R * base_grid_transposed).array().colwise() + theta_T).matrix().transpose(); // ((2x2 * 2xN).array().colwise() + 2x1).matrix().transpose() => Nx2
8587
}
8688

@@ -97,9 +99,10 @@ void affine_grid_generator_3d(const Tensor* theta, const Eigen::Matrix<T, 3, Eig
9799

98100
const Eigen::Array<T, 3, 1> theta_T(theta_data[3], theta_data[7], theta_data[11]); // 3x1
99101

100-
auto grid_batch_offset = batch_num * D * H * W * 3;
102+
const auto grid_batch_offset = static_cast<size_t>(SafeInt<size_t>(batch_num) * D * H * W * 3);
101103
T* grid_data = grid->MutableData<T>() + grid_batch_offset;
102-
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 3, option>> grid_matrix(grid_data, narrow<size_t>(D * H * W), 3);
104+
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 3, option>> grid_matrix(
105+
grid_data, static_cast<size_t>(SafeInt<size_t>(D) * H * W), 3);
103106
grid_matrix = ((theta_R * base_grid_transposed).array().colwise() + theta_T).matrix().transpose();
104107
}
105108

onnxruntime/core/providers/cpu/tensor/grid_sample.cc

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <vector>
55

6+
#include "core/common/safeint.h"
67
#include "core/providers/cpu/tensor/grid_sample.h"
78
#include "core/framework/element_type_lists.h"
89
#include "core/framework/TensorSeq.h"
@@ -379,12 +380,14 @@ Status GridSample<T>::Compute(OpKernelContext* context) const {
379380
}
380381
} else {
381382
for (int64_t n = 0; n < N; n++) {
382-
const T* grid_data = grid->Data<T>() + n * (H_out * W_out) * 2;
383+
const T* grid_data = grid->Data<T>() + static_cast<size_t>(SafeInt<size_t>(n) * H_out * W_out * 2);
383384
concurrency::ThreadPool::TrySimpleParallelFor(
384385
tp, onnxruntime::narrow<std::ptrdiff_t>(C),
385386
[&](std::ptrdiff_t c) {
386-
const T* X_data = input->Data<T>() + (n * C + c) * (H_in * W_in);
387-
T* Y_data = Y.MutableData<T>() + (n * C + c) * (H_out * W_out);
387+
const SafeInt<size_t> nc = SafeInt<size_t>(n) * SafeInt<size_t>(C) + SafeInt<size_t>(c);
388+
const T* X_data =
389+
input->Data<T>() + static_cast<size_t>(nc * H_in * W_in);
390+
T* Y_data = Y.MutableData<T>() + static_cast<size_t>(nc * H_out * W_out);
388391

389392
for (int64_t oy = 0; oy < H_out; oy++) {
390393
for (int64_t ox = 0; ox < W_out; ox++) {
@@ -469,12 +472,17 @@ Status GridSample<T>::Compute(OpKernelContext* context) const {
469472

470473
concurrency::ThreadPool* tp = D_out * H_out * W_out > 64 ? context->GetOperatorThreadPool() : nullptr;
471474
for (int64_t n = 0; n < N; n++) {
472-
const T* grid_data = grid->Data<T>() + n * (D_out * H_out * W_out) * 3;
475+
const T* grid_data = grid->Data<T>() + static_cast<size_t>(SafeInt<size_t>(n) * D_out * H_out * W_out * 3);
473476
concurrency::ThreadPool::TrySimpleParallelFor(
474477
tp, onnxruntime::narrow<std::ptrdiff_t>(C),
475478
[&](std::ptrdiff_t c) {
476-
const T* X_data = input->Data<T>() + (n * C + c) * (D_in * H_in * W_in);
477-
T* Y_data = Y.MutableData<T>() + (n * C + c) * (D_out * H_out * W_out);
479+
const SafeInt<size_t> nc = SafeInt<size_t>(n) * SafeInt<size_t>(C) + SafeInt<size_t>(c);
480+
const SafeInt<size_t> input_plane_offset = nc * SafeInt<size_t>(D_in) * SafeInt<size_t>(H_in) * SafeInt<size_t>(W_in);
481+
const SafeInt<size_t> output_plane_offset = nc * SafeInt<size_t>(D_out) * SafeInt<size_t>(H_out) * SafeInt<size_t>(W_out);
482+
const T* X_data =
483+
input->Data<T>() + static_cast<size_t>(input_plane_offset);
484+
T* Y_data =
485+
Y.MutableData<T>() + static_cast<size_t>(output_plane_offset);
478486

479487
for (int64_t oz = 0; oz < D_out; oz++) {
480488
for (int64_t oy = 0; oy < H_out; oy++) {

0 commit comments

Comments
 (0)