Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 95 additions & 16 deletions internal/core/src/index/BitmapIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,21 @@ BitmapIndex<T>::SerializeIndexData(uint8_t* data_ptr) {
}
}

// Packs valid_bitset_ into a freshly allocated byte buffer, one bit per row.
//
// Layout: row i is stored in byte (i / 8) at bit position (i % 8), i.e.
// least-significant bit first within each byte. The buffer is
// (total_num_rows_ + 7) / 8 bytes long and is zero-filled before any bit is
// set, so unused padding bits in the last byte stay 0.
//
// Returns the buffer together with its size in bytes.
template <typename T>
std::pair<std::shared_ptr<uint8_t[]>, size_t>
BitmapIndex<T>::SerializeValidBitsetData() const {
// Round up so a trailing group of fewer than 8 rows still gets a byte.
size_t valid_bitset_size = (total_num_rows_ + 7) / 8;
std::shared_ptr<uint8_t[]> valid_bitset_data(
new uint8_t[valid_bitset_size]);
// Start from all-zero; only bits for valid rows are turned on below.
memset(valid_bitset_data.get(), 0, valid_bitset_size);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should be able to directly copy from valid_bitset_.data(), instead of copy bit by bit?

Copy link
Copy Markdown
Contributor Author

@SpadeA-Tang SpadeA-Tang Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can copy directly. But in this way, we assume that the underlying format of TargetBitmap would not change. Otherwise, the format will be corrupt.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, that is fair. we may migrate to roaring one day

// Bits are read one at a time through operator[] rather than copied from the
// bitmap's backing storage, so the serialized layout does not depend on how
// TargetBitmap stores its bits internally.
for (size_t i = 0; i < total_num_rows_; ++i) {
if (valid_bitset_[i]) {
valid_bitset_data[i / 8] |= (1 << (i % 8));
}
}
return std::make_pair(valid_bitset_data, valid_bitset_size);
}

template <typename T>
std::pair<std::shared_ptr<uint8_t[]>, size_t>
BitmapIndex<T>::SerializeIndexMeta() {
Expand Down Expand Up @@ -263,6 +278,24 @@ BitmapIndex<std::string>::SerializeIndexData(uint8_t* data_ptr) {
}
}

// Rebuilds valid_bitset_ from a packed byte buffer, replacing whatever the
// member held before.
//
// The buffer is read with row i in byte (i / 8) at bit position (i % 8).
// total_num_rows_ must already be set before this is called, because it
// determines both the expected buffer size and the number of rows restored.
//
// data_ptr:  start of the packed validity bytes.
// data_size: length of that buffer in bytes; checked via AssertInfo to equal
//            (total_num_rows_ + 7) / 8.
template <typename T>
void
BitmapIndex<T>::DeserializeValidBitsetData(const uint8_t* data_ptr,
size_t data_size) {
auto expected_size = (total_num_rows_ + 7) / 8;
// Check the size before the loop below indexes into data_ptr.
AssertInfo(data_size == expected_size,
"bitmap valid_bitset size mismatch, expect {}, got {}",
expected_size,
data_size);
// Reset to total_num_rows_ entries; presumably the second argument is the
// initial value of every bit, so only rows flagged in the buffer become valid.
valid_bitset_ = TargetBitmap(total_num_rows_, false);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we memcpy into valid_bitset_?

// Bits are applied one at a time through set() instead of copying raw bytes
// into the bitmap.
for (size_t i = 0; i < total_num_rows_; ++i) {
uint8_t byte = data_ptr[i / 8];
if (byte & (1 << (i % 8))) {
valid_bitset_.set(i);
}
}
}

template <typename T>
BinarySet
BitmapIndex<T>::Serialize(const Config& config) {
Expand All @@ -279,6 +312,11 @@ BitmapIndex<T>::Serialize(const Config& config) {
BinarySet ret_set;
ret_set.Append(BITMAP_INDEX_DATA, index_data, index_data_size);
ret_set.Append(BITMAP_INDEX_META, index_meta.first, index_meta.second);
if (schema_.nullable()) {
auto valid_bitset = SerializeValidBitsetData();
ret_set.Append(
BITMAP_INDEX_VALID_BITSET, valid_bitset.first, valid_bitset.second);
}

LOG_INFO("build bitmap index with cardinality = {}, num_rows = {}",
data_.size(),
Expand Down Expand Up @@ -354,7 +392,8 @@ BitmapIndex<T>::ChooseIndexLoadMode(int64_t index_length) {
template <typename T>
void
BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
size_t index_length) {
size_t index_length,
bool rebuild_validity_from_postings) {
ChooseIndexLoadMode(index_length);
for (size_t i = 0; i < index_length; ++i) {
T key;
Expand All @@ -370,8 +409,10 @@ BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
} else {
data_[key] = value;
}
for (const auto& v : value) {
valid_bitset_.set(v);
if (rebuild_validity_from_postings) {
for (const auto& v : value) {
valid_bitset_.set(v);
}
}
}
}
Expand Down Expand Up @@ -413,8 +454,10 @@ BitmapIndex<T>::BuildOffsetCache() {

template <>
void
BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
size_t index_length) {
BitmapIndex<std::string>::DeserializeIndexData(
const uint8_t* data_ptr,
size_t index_length,
bool rebuild_validity_from_postings) {
ChooseIndexLoadMode(index_length);
for (size_t i = 0; i < index_length; ++i) {
size_t key_size;
Expand All @@ -433,8 +476,10 @@ BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
} else {
data_[key] = value;
}
for (const auto& v : value) {
valid_bitset_.set(v);
if (rebuild_validity_from_postings) {
for (const auto& v : value) {
valid_bitset_.set(v);
}
}
}
}
Expand Down Expand Up @@ -468,7 +513,8 @@ BitmapIndex<T>::MMapIndexData(const std::string& file_name,
const uint8_t* data_ptr,
size_t data_size,
size_t index_length,
milvus::proto::common::LoadPriority priority) {
milvus::proto::common::LoadPriority priority,
bool rebuild_validity_from_postings) {
std::filesystem::create_directories(
std::filesystem::path(file_name).parent_path());

Expand All @@ -483,8 +529,10 @@ BitmapIndex<T>::MMapIndexData(const std::string& file_name,
roaring::Roaring value;
value =
roaring::Roaring::read(reinterpret_cast<const char*>(data_ptr));
for (const auto& v : value) {
valid_bitset_.set(v);
if (rebuild_validity_from_postings) {
for (const auto& v : value) {
valid_bitset_.set(v);
}
}

// convert roaring value to frozen mode
Expand Down Expand Up @@ -537,7 +585,15 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
index_meta_buffer->size);
auto index_length = index_meta.first;
total_num_rows_ = index_meta.second;
valid_bitset_ = TargetBitmap(total_num_rows_, false);
valid_bitset_ = TargetBitmap(total_num_rows_, !schema_.nullable());
bool rebuild_validity_from_postings = schema_.nullable();

auto valid_bitset_buffer = binary_set.GetByName(BITMAP_INDEX_VALID_BITSET);
if (valid_bitset_buffer != nullptr) {
DeserializeValidBitsetData(valid_bitset_buffer->data.get(),
valid_bitset_buffer->size);
rebuild_validity_from_postings = false;
}

auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA);

Expand All @@ -558,9 +614,12 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
index_data_buffer->data.get(),
index_data_buffer->size,
index_length,
priority);
priority,
rebuild_validity_from_postings);
} else {
DeserializeIndexData(index_data_buffer->data.get(), index_length);
DeserializeIndexData(index_data_buffer->data.get(),
index_length,
rebuild_validity_from_postings);
}

if (enable_offset_cache.has_value() && enable_offset_cache.value()) {
Expand Down Expand Up @@ -1313,6 +1372,12 @@ BitmapIndex<T>::WriteEntries(storage::IndexEntryWriter* writer) {
uint8_t* data_ptr = index_data.get();
SerializeIndexData(data_ptr);
writer->WriteEntry(BITMAP_INDEX_DATA, index_data.get(), index_data_size);
if (schema_.nullable()) {
auto valid_bitset = SerializeValidBitsetData();
writer->WriteEntry(BITMAP_INDEX_VALID_BITSET,
valid_bitset.first.get(),
valid_bitset.second);
}

LOG_INFO("write bitmap index entries with cardinality = {}, num_rows = {}",
data_.size(),
Expand All @@ -1329,7 +1394,18 @@ BitmapIndex<T>::LoadEntries(storage::IndexEntryReader& reader,
// V3 format: meta is in __meta__ entry
auto index_length = reader.GetMeta<size_t>(BITMAP_INDEX_LENGTH);
total_num_rows_ = reader.GetMeta<size_t>(BITMAP_INDEX_NUM_ROWS);
valid_bitset_ = TargetBitmap(total_num_rows_, false);
valid_bitset_ = TargetBitmap(total_num_rows_, !schema_.nullable());
bool rebuild_validity_from_postings = schema_.nullable();

auto entry_names = reader.GetEntryNames();
if (std::find(entry_names.begin(),
entry_names.end(),
BITMAP_INDEX_VALID_BITSET) != entry_names.end()) {
auto valid_bitset_entry = reader.ReadEntry(BITMAP_INDEX_VALID_BITSET);
DeserializeValidBitsetData(valid_bitset_entry.data.data(),
valid_bitset_entry.data.size());
rebuild_validity_from_postings = false;
}

auto data_entry = reader.ReadEntry(BITMAP_INDEX_DATA);

Expand All @@ -1349,9 +1425,12 @@ BitmapIndex<T>::LoadEntries(storage::IndexEntryReader& reader,
data_entry.data.data(),
data_entry.data.size(),
index_length,
priority);
priority,
rebuild_validity_from_postings);
} else {
DeserializeIndexData(data_entry.data.data(), index_length);
DeserializeIndexData(data_entry.data.data(),
index_length,
rebuild_validity_from_postings);
}

if (enable_offset_cache.has_value() && enable_offset_cache.value()) {
Expand Down
35 changes: 33 additions & 2 deletions internal/core/src/index/BitmapIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,17 +298,36 @@ class BitmapIndex : public ScalarIndex<T> {
void
SerializeIndexData(uint8_t* index_data_ptr);

std::pair<std::shared_ptr<uint8_t[]>, size_t>
SerializeValidBitsetData() const;

std::pair<std::shared_ptr<uint8_t[]>, size_t>
SerializeIndexMeta();

std::pair<size_t, size_t>
DeserializeIndexMeta(const uint8_t* data_ptr, size_t data_size);

void
DeserializeValidBitsetData(const uint8_t* data_ptr, size_t data_size);

T
ParseKey(const uint8_t** ptr);

// Deserialize posting data.
//
// New bitmap index formats persist valid_bitset_, which is the
// authoritative source of row validity. Legacy formats do not, so we may
// rebuild validity from postings as a backward-compatibility fallback for
// nullable fields. Non-nullable fields do not persist valid_bitset_ and
// are treated as all-valid on load.
//
// Rebuilding validity from postings is lossy for ARRAY fields: empty
// arrays have no element postings, so they cannot be distinguished from
// null arrays during reconstruction.
void
DeserializeIndexData(const uint8_t* data_ptr, size_t index_length);
DeserializeIndexData(const uint8_t* data_ptr,
size_t index_length,
bool rebuild_validity_from_postings);

void
BuildOffsetCache();
Expand Down Expand Up @@ -352,12 +371,24 @@ class BitmapIndex : public ScalarIndex<T> {
const T& upper_bound_value,
bool ub_inclusive);

// Build mmap-backed posting storage from serialized bitmap index data.
//
// New bitmap index formats persist valid_bitset_, which is the
// authoritative source of row validity. Legacy formats do not, so we may
// rebuild validity from postings as a backward-compatibility fallback for
// nullable fields. Non-nullable fields do not persist valid_bitset_ and
// are treated as all-valid on load.
//
// Rebuilding validity from postings is lossy for ARRAY fields: empty
// arrays have no element postings, so they cannot be distinguished from
// null arrays during reconstruction.
void
MMapIndexData(const std::string& filepath,
const uint8_t* data,
size_t data_size,
size_t index_length,
milvus::proto::common::LoadPriority priority);
milvus::proto::common::LoadPriority priority,
bool rebuild_validity_from_postings);

void
UnmapIndexData();
Expand Down
Loading
Loading