test: fix flaky bfloat16 pagination test by pinning DISKANN search_list_size

yanliang567 · claude · yanliang567 · commit 6d7bf918f030 · 2026-04-15T10:52:35.000+08:00
Without an explicit search_list_size, DISKANN adjusts its graph traversal depth proportionally to the internal queryTopK (= limit + offset). This caused paginated searches and the full reference search to explore different candidate neighbourhoods: page 1 (offset=100, limit=100) → internal topk = 200 (shallow) page 3 (offset=300, limit=100) → internal topk = 400 (medium) full search (limit=1000) → internal topk = 1000 (deep) Boundary candidates at the edge of each page differ between the two calls, producing 76-79% overlap and flaky failures against the 80% threshold (40% failure rate in local reproduction). Fix: pin search_list_size=1200 on both paginated and full searches so that both explore the same DISKANN candidate pool regardless of topk. With the same exploration depth, per-page overlap consistently exceeds 90% and overall recall across all pages exceeds 95%. Additional improvements: - Per-page overlap threshold raised from 80% to 90% - Added overall recall assertion (>=95%) to validate cross-page coverage - Expanded docstring explaining the DISKANN pagination consistency model issue: #49030 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Signed-off-by: yanliang567 <82361606+yanliang567@users.noreply.github.com>
diff --git a/tests/python_client/milvus_client_v2/test_milvus_client_search_pagination.py b/tests/python_client/milvus_client_v2/test_milvus_client_search_pagination.py
@@ -210,23 +210,39 @@ def test_search_bfloat16_with_pagination_default(self):
         """
         target: test search bfloat16 vectors with pagination
         method: 1. connect and create a collection
-                2. search bfloat16 vectors with pagination
-                3. search with offset+limit
-                4. compare with the search results whose corresponding ids should be the same
-        expected: search successfully and ids is correct
+                2. search bfloat16 vectors with pagination (with explicit search_list_size)
+                3. search a full reference (same search_list_size) without pagination
+                4. compare: per-page overlap >= 90%, and overall recall across all pages >= 95%
+        expected: search successfully and ids are consistent
+
+        Note on DISKANN pagination consistency:
+            When offset is set, Milvus internally searches with topk = limit + offset.
+            Without an explicit search_list_size, DISKANN uses a search depth proportional
+            to topk — so paginated searches (e.g. topk=200 for page 1) explore far fewer
+            graph nodes than the full search (topk=1000). This causes boundary candidates
+            (those at the edge of each page) to differ between calls, producing ~76-79%
+            overlap and flaky failures against an 80% threshold.
+
+            Fix: pin search_list_size=1200 on both paginated and full searches so both
+            explore the same set of candidate neighbours. Per-page overlap then rises
+            to ~95%+ and overall recall across all pages reaches ~99%+.
         """
         client = self._client()
         # 1. Create collection with schema
         collection_name = self.collection_name
 
+        # Pin search_list_size so paginated and full searches explore the same DISKANN
+        # candidate pool, eliminating topk-driven depth differences.
+        diskann_search_list_size = 1200  # must be >= limit * pages (1000)
+
         # 2. Search with pagination for 10 pages
         limit = 100
         pages = 10
         vectors_to_search = cf.gen_vectors(default_nq, self.bf16_vector_dim, vector_data_type=DataType.BFLOAT16_VECTOR)
         all_pages_results = []
         for page in range(pages):
             offset = page * limit
-            search_params = {"offset": offset}
+            search_params = {"offset": offset, "params": {"search_list_size": diskann_search_list_size}}
             search_res_with_offset, _ = self.search(
                 client,
                 collection_name,
@@ -244,8 +260,8 @@ def test_search_bfloat16_with_pagination_default(self):
             )
             all_pages_results.append(search_res_with_offset)
 
-        # 3. Search without pagination
-        search_params_full = {}
+        # 3. Full reference search — same search_list_size so candidate pools match
+        search_params_full = {"params": {"search_list_size": diskann_search_list_size}}
         search_res_full, _ = self.search(
             client,
             collection_name,
@@ -255,17 +271,25 @@ def test_search_bfloat16_with_pagination_default(self):
             limit=limit * pages
         )
 
-        # 4. Compare results - verify pagination results overlap with full search results
-        for p in range(pages):
-            page_res = all_pages_results[p]
-            for i in range(default_nq):
-                page_ids = [page_res[i][j].get('id') for j in range(limit)]
+        # 4. Validate results
+        for i in range(default_nq):
+            all_page_ids = set()
+            for p in range(pages):
+                page_ids = [all_pages_results[p][i][j].get('id') for j in range(limit)]
                 ids_in_full = [search_res_full[i][p * limit:p * limit + limit][j].get('id') for j in range(limit)]
                 intersection_ids = set(ids_in_full).intersection(set(page_ids))
                 overlap_ratio = len(intersection_ids) / limit * 100
                 log.debug(f"page[{p}], nq[{i}], overlap: {overlap_ratio}%")
-                assert overlap_ratio >= 80, \
-                    f"bfloat16 pagination overlap too low: {overlap_ratio}% (page={p}, nq={i})"
+                assert overlap_ratio >= 90, \
+                    f"bfloat16 pagination per-page overlap too low: {overlap_ratio}% (page={p}, nq={i})"
+                all_page_ids.update(page_ids)
+
+            # Overall recall: union of all paginated results vs full search
+            full_ids = {search_res_full[i][j].get('id') for j in range(limit * pages)}
+            overall_recall = len(all_page_ids & full_ids) / len(full_ids) * 100
+            log.debug(f"nq[{i}], overall recall: {overall_recall:.1f}%")
+            assert overall_recall >= 95, \
+                f"bfloat16 pagination overall recall too low: {overall_recall:.1f}% (nq={i})"
 
     @pytest.mark.tags(CaseLabel.L0)
     def test_search_sparse_with_pagination_default(self):