Skip to content

Commit ddd1409

Browse files
authored
fix: persist bitmap index valid bitset for nullable array fields (#49008)
issue: #48901 --------- Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
1 parent 90c5608 commit ddd1409

File tree

4 files changed

+408
-18
lines changed

4 files changed

+408
-18
lines changed

internal/core/src/index/BitmapIndex.cpp

Lines changed: 95 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,21 @@ BitmapIndex<T>::SerializeIndexData(uint8_t* data_ptr) {
231231
}
232232
}
233233

234+
// Packs the per-row validity flags held in valid_bitset_ into a byte buffer.
//
// Layout: one bit per row, row i is stored in bit (i % 8) of byte (i / 8),
// i.e. the lowest-order bit of each byte corresponds to the lowest row offset.
//
// Returns a pair of (buffer, buffer size in bytes), where the size is
// total_num_rows_ rounded up to a whole number of bytes.
template <typename T>
235+
std::pair<std::shared_ptr<uint8_t[]>, size_t>
236+
BitmapIndex<T>::SerializeValidBitsetData() const {
237+
// Number of bytes needed to hold total_num_rows_ bits (ceil division by 8).
size_t valid_bitset_size = (total_num_rows_ + 7) / 8;
238+
std::shared_ptr<uint8_t[]> valid_bitset_data(
239+
new uint8_t[valid_bitset_size]);
240+
// Start from all-zero so that only valid rows end up with a set bit; the
// unused padding bits in the last byte therefore stay 0.
memset(valid_bitset_data.get(), 0, valid_bitset_size);
241+
// NOTE(review): indexes valid_bitset_ up to total_num_rows_; assumes
// valid_bitset_ holds at least that many entries - confirm against callers.
for (size_t i = 0; i < total_num_rows_; ++i) {
242+
if (valid_bitset_[i]) {
243+
valid_bitset_data[i / 8] |= (1 << (i % 8));
244+
}
245+
}
246+
return std::make_pair(valid_bitset_data, valid_bitset_size);
247+
}
248+
234249
template <typename T>
235250
std::pair<std::shared_ptr<uint8_t[]>, size_t>
236251
BitmapIndex<T>::SerializeIndexMeta() {
@@ -263,6 +278,24 @@ BitmapIndex<std::string>::SerializeIndexData(uint8_t* data_ptr) {
263278
}
264279
}
265280

281+
// Rebuilds valid_bitset_ from a packed byte buffer in which row i is stored
// in bit (i % 8) of byte (i / 8).
//
// total_num_rows_ is read here, so it must already hold the row count of the
// index being loaded before this function is called.
//
// data_ptr:  start of the packed validity bits.
// data_size: size of the buffer in bytes; must equal total_num_rows_ rounded
//            up to a whole number of bytes.
template <typename T>
282+
void
283+
BitmapIndex<T>::DeserializeValidBitsetData(const uint8_t* data_ptr,
284+
size_t data_size) {
285+
auto expected_size = (total_num_rows_ + 7) / 8;
286+
// Check the buffer length before reading from it.
// NOTE(review): AssertInfo is defined elsewhere; presumably it fails the
// load when the condition is false - confirm.
AssertInfo(data_size == expected_size,
287+
"bitmap valid_bitset size mismatch, expect {}, got {}",
288+
expected_size,
289+
data_size);
290+
// Replace any previous contents with an all-false bitmap, then set only the
// rows whose bit is set in the buffer.
valid_bitset_ = TargetBitmap(total_num_rows_, false);
291+
for (size_t i = 0; i < total_num_rows_; ++i) {
292+
uint8_t byte = data_ptr[i / 8];
293+
if (byte & (1 << (i % 8))) {
294+
valid_bitset_.set(i);
295+
}
296+
}
297+
}
298+
266299
template <typename T>
267300
BinarySet
268301
BitmapIndex<T>::Serialize(const Config& config) {
@@ -279,6 +312,11 @@ BitmapIndex<T>::Serialize(const Config& config) {
279312
BinarySet ret_set;
280313
ret_set.Append(BITMAP_INDEX_DATA, index_data, index_data_size);
281314
ret_set.Append(BITMAP_INDEX_META, index_meta.first, index_meta.second);
315+
if (schema_.nullable()) {
316+
auto valid_bitset = SerializeValidBitsetData();
317+
ret_set.Append(
318+
BITMAP_INDEX_VALID_BITSET, valid_bitset.first, valid_bitset.second);
319+
}
282320

283321
LOG_INFO("build bitmap index with cardinality = {}, num_rows = {}",
284322
data_.size(),
@@ -354,7 +392,8 @@ BitmapIndex<T>::ChooseIndexLoadMode(int64_t index_length) {
354392
template <typename T>
355393
void
356394
BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
357-
size_t index_length) {
395+
size_t index_length,
396+
bool rebuild_validity_from_postings) {
358397
ChooseIndexLoadMode(index_length);
359398
for (size_t i = 0; i < index_length; ++i) {
360399
T key;
@@ -370,8 +409,10 @@ BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
370409
} else {
371410
data_[key] = value;
372411
}
373-
for (const auto& v : value) {
374-
valid_bitset_.set(v);
412+
if (rebuild_validity_from_postings) {
413+
for (const auto& v : value) {
414+
valid_bitset_.set(v);
415+
}
375416
}
376417
}
377418
}
@@ -413,8 +454,10 @@ BitmapIndex<T>::BuildOffsetCache() {
413454

414455
template <>
415456
void
416-
BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
417-
size_t index_length) {
457+
BitmapIndex<std::string>::DeserializeIndexData(
458+
const uint8_t* data_ptr,
459+
size_t index_length,
460+
bool rebuild_validity_from_postings) {
418461
ChooseIndexLoadMode(index_length);
419462
for (size_t i = 0; i < index_length; ++i) {
420463
size_t key_size;
@@ -433,8 +476,10 @@ BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
433476
} else {
434477
data_[key] = value;
435478
}
436-
for (const auto& v : value) {
437-
valid_bitset_.set(v);
479+
if (rebuild_validity_from_postings) {
480+
for (const auto& v : value) {
481+
valid_bitset_.set(v);
482+
}
438483
}
439484
}
440485
}
@@ -468,7 +513,8 @@ BitmapIndex<T>::MMapIndexData(const std::string& file_name,
468513
const uint8_t* data_ptr,
469514
size_t data_size,
470515
size_t index_length,
471-
milvus::proto::common::LoadPriority priority) {
516+
milvus::proto::common::LoadPriority priority,
517+
bool rebuild_validity_from_postings) {
472518
std::filesystem::create_directories(
473519
std::filesystem::path(file_name).parent_path());
474520

@@ -483,8 +529,10 @@ BitmapIndex<T>::MMapIndexData(const std::string& file_name,
483529
roaring::Roaring value;
484530
value =
485531
roaring::Roaring::read(reinterpret_cast<const char*>(data_ptr));
486-
for (const auto& v : value) {
487-
valid_bitset_.set(v);
532+
if (rebuild_validity_from_postings) {
533+
for (const auto& v : value) {
534+
valid_bitset_.set(v);
535+
}
488536
}
489537

490538
// convert roaring value to frozen mode
@@ -537,7 +585,15 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
537585
index_meta_buffer->size);
538586
auto index_length = index_meta.first;
539587
total_num_rows_ = index_meta.second;
540-
valid_bitset_ = TargetBitmap(total_num_rows_, false);
588+
valid_bitset_ = TargetBitmap(total_num_rows_, !schema_.nullable());
589+
bool rebuild_validity_from_postings = schema_.nullable();
590+
591+
auto valid_bitset_buffer = binary_set.GetByName(BITMAP_INDEX_VALID_BITSET);
592+
if (valid_bitset_buffer != nullptr) {
593+
DeserializeValidBitsetData(valid_bitset_buffer->data.get(),
594+
valid_bitset_buffer->size);
595+
rebuild_validity_from_postings = false;
596+
}
541597

542598
auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA);
543599

@@ -558,9 +614,12 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
558614
index_data_buffer->data.get(),
559615
index_data_buffer->size,
560616
index_length,
561-
priority);
617+
priority,
618+
rebuild_validity_from_postings);
562619
} else {
563-
DeserializeIndexData(index_data_buffer->data.get(), index_length);
620+
DeserializeIndexData(index_data_buffer->data.get(),
621+
index_length,
622+
rebuild_validity_from_postings);
564623
}
565624

566625
if (enable_offset_cache.has_value() && enable_offset_cache.value()) {
@@ -1313,6 +1372,12 @@ BitmapIndex<T>::WriteEntries(storage::IndexEntryWriter* writer) {
13131372
uint8_t* data_ptr = index_data.get();
13141373
SerializeIndexData(data_ptr);
13151374
writer->WriteEntry(BITMAP_INDEX_DATA, index_data.get(), index_data_size);
1375+
if (schema_.nullable()) {
1376+
auto valid_bitset = SerializeValidBitsetData();
1377+
writer->WriteEntry(BITMAP_INDEX_VALID_BITSET,
1378+
valid_bitset.first.get(),
1379+
valid_bitset.second);
1380+
}
13161381

13171382
LOG_INFO("write bitmap index entries with cardinality = {}, num_rows = {}",
13181383
data_.size(),
@@ -1329,7 +1394,18 @@ BitmapIndex<T>::LoadEntries(storage::IndexEntryReader& reader,
13291394
// V3 format: meta is in __meta__ entry
13301395
auto index_length = reader.GetMeta<size_t>(BITMAP_INDEX_LENGTH);
13311396
total_num_rows_ = reader.GetMeta<size_t>(BITMAP_INDEX_NUM_ROWS);
1332-
valid_bitset_ = TargetBitmap(total_num_rows_, false);
1397+
valid_bitset_ = TargetBitmap(total_num_rows_, !schema_.nullable());
1398+
bool rebuild_validity_from_postings = schema_.nullable();
1399+
1400+
auto entry_names = reader.GetEntryNames();
1401+
if (std::find(entry_names.begin(),
1402+
entry_names.end(),
1403+
BITMAP_INDEX_VALID_BITSET) != entry_names.end()) {
1404+
auto valid_bitset_entry = reader.ReadEntry(BITMAP_INDEX_VALID_BITSET);
1405+
DeserializeValidBitsetData(valid_bitset_entry.data.data(),
1406+
valid_bitset_entry.data.size());
1407+
rebuild_validity_from_postings = false;
1408+
}
13331409

13341410
auto data_entry = reader.ReadEntry(BITMAP_INDEX_DATA);
13351411

@@ -1349,9 +1425,12 @@ BitmapIndex<T>::LoadEntries(storage::IndexEntryReader& reader,
13491425
data_entry.data.data(),
13501426
data_entry.data.size(),
13511427
index_length,
1352-
priority);
1428+
priority,
1429+
rebuild_validity_from_postings);
13531430
} else {
1354-
DeserializeIndexData(data_entry.data.data(), index_length);
1431+
DeserializeIndexData(data_entry.data.data(),
1432+
index_length,
1433+
rebuild_validity_from_postings);
13551434
}
13561435

13571436
if (enable_offset_cache.has_value() && enable_offset_cache.value()) {

internal/core/src/index/BitmapIndex.h

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,17 +298,36 @@ class BitmapIndex : public ScalarIndex<T> {
298298
void
299299
SerializeIndexData(uint8_t* index_data_ptr);
300300

301+
// Pack valid_bitset_ into a byte buffer, one bit per row.
// Returns (buffer, buffer size in bytes).
std::pair<std::shared_ptr<uint8_t[]>, size_t>
302+
SerializeValidBitsetData() const;
303+
301304
std::pair<std::shared_ptr<uint8_t[]>, size_t>
302305
SerializeIndexMeta();
303306

304307
std::pair<size_t, size_t>
305308
DeserializeIndexMeta(const uint8_t* data_ptr, size_t data_size);
306309

310+
// Restore valid_bitset_ from a packed byte buffer (one bit per row).
// data_size is the buffer length in bytes.
void
311+
DeserializeValidBitsetData(const uint8_t* data_ptr, size_t data_size);
312+
307313
T
308314
ParseKey(const uint8_t** ptr);
309315

316+
// Deserialize posting data.
317+
//
318+
// New bitmap index formats persist valid_bitset_, which is the
319+
// authoritative source of row validity. Legacy formats do not, so we may
320+
// rebuild validity from postings as a backward-compatibility fallback for
321+
// nullable fields. Non-nullable fields do not persist valid_bitset_ and
322+
// are treated as all-valid on load.
323+
//
324+
// Rebuilding validity from postings is lossy for ARRAY fields: empty
325+
// arrays have no element postings, so they cannot be distinguished from
326+
// null arrays during reconstruction.
310327
void
311-
DeserializeIndexData(const uint8_t* data_ptr, size_t index_length);
328+
DeserializeIndexData(const uint8_t* data_ptr,
329+
size_t index_length,
330+
bool rebuild_validity_from_postings);
312331

313332
void
314333
BuildOffsetCache();
@@ -352,12 +371,24 @@ class BitmapIndex : public ScalarIndex<T> {
352371
const T& upper_bound_value,
353372
bool ub_inclusive);
354373

374+
// Build mmap-backed posting storage from serialized bitmap index data.
375+
//
376+
// New bitmap index formats persist valid_bitset_, which is the
377+
// authoritative source of row validity. Legacy formats do not, so we may
378+
// rebuild validity from postings as a backward-compatibility fallback for
379+
// nullable fields. Non-nullable fields do not persist valid_bitset_ and
380+
// are treated as all-valid on load.
381+
//
382+
// Rebuilding validity from postings is lossy for ARRAY fields: empty
383+
// arrays have no element postings, so they cannot be distinguished from
384+
// null arrays during reconstruction.
355385
void
356386
MMapIndexData(const std::string& filepath,
357387
const uint8_t* data,
358388
size_t data_size,
359389
size_t index_length,
360-
milvus::proto::common::LoadPriority priority);
390+
milvus::proto::common::LoadPriority priority,
391+
bool rebuild_validity_from_postings);
361392

362393
void
363394
UnmapIndexData();

0 commit comments

Comments
 (0)