|
7 | 7 | import moto |
8 | 8 | import pytest |
9 | 9 |
|
10 | | -from tests.utils import generate_sample_embeddings, generate_sample_records |
| 10 | +from tests.utils import ( |
| 11 | + generate_sample_embeddings, |
| 12 | + generate_sample_embeddings_for_run, |
| 13 | + generate_sample_records, |
| 14 | +) |
11 | 15 | from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata |
12 | 16 | from timdex_dataset_api.dataset import TIMDEXDatasetConfig |
13 | 17 | from timdex_dataset_api.embeddings import ( |
@@ -294,12 +298,112 @@ def timdex_metadata_merged_deltas( |
294 | 298 | # Dataset Embeddings Fixtures |
295 | 299 | # ================================================================================ |
296 | 300 | @pytest.fixture |
297 | | -def timdex_embeddings_with_runs(timdex_dataset_empty): |
298 | | - """TIMDEXEmbeddings with multiple runs for single strategy.""" |
299 | | - embeddings = TIMDEXEmbeddings(timdex_dataset_empty) |
300 | | - embeddings.write(generate_sample_embeddings(100, run_id="abc123")) # run 1 |
301 | | - embeddings.write(generate_sample_embeddings(50, run_id="def456")) # run 2 |
302 | | - return TIMDEXEmbeddings(timdex_dataset_empty) |
| 301 | +def timdex_embeddings_with_runs(timdex_dataset_empty) -> TIMDEXEmbeddings: |
| 302 | + """TIMDEXEmbeddings with multiple runs for a single strategy. |
| 303 | +
|
| 304 | + Also writes matching records and rebuilds metadata so embeddings queries |
| 305 | + can join to metadata.records. |
| 306 | + """ |
| 307 | + timdex_dataset = timdex_dataset_empty |
| 308 | + |
| 309 | + # write records for both runs (100 for "abc123", 50 for "def456") for the embeddings to match |
| 310 | + timdex_dataset.write( |
| 311 | + generate_sample_records(100, source="alma", run_id="abc123"), |
| 312 | + write_append_deltas=False, |
| 313 | + ) |
| 314 | + timdex_dataset.write( |
| 315 | + generate_sample_records(50, source="alma", run_id="def456"), |
| 316 | + write_append_deltas=False, |
| 317 | + ) |
| 318 | + |
| 319 | + # rebuild dataset metadata, then reload the TIMDEXDataset instance |
| 320 | + timdex_dataset.metadata.rebuild_dataset_metadata() |
| 321 | + timdex_dataset = TIMDEXDataset(timdex_dataset.location) |
| 322 | + |
| 323 | + # write embeddings for each run, one write per run_id |
| 324 | + timdex_dataset.embeddings.write( |
| 325 | + generate_sample_embeddings_for_run(timdex_dataset, run_id="abc123") |
| 326 | + ) |
| 327 | + timdex_dataset.embeddings.write( |
| 328 | + generate_sample_embeddings_for_run(timdex_dataset, run_id="def456") |
| 329 | + ) |
| 330 | + |
| 331 | + # reload TIMDEXDataset instance once more and return its embeddings attribute |
| 332 | + return TIMDEXDataset(timdex_dataset_empty.location).embeddings |
| 333 | + |
| 334 | + |
| 335 | +@pytest.fixture |
| 336 | +def timdex_dataset_for_embeddings_views(timdex_dataset_empty) -> TIMDEXDataset: |
| 337 | + """TIMDEXDataset with records for testing embeddings views. |
| 338 | +
|
| 339 | + Creates three scenarios to test DuckDB views: |
| 340 | + - apple: single full run with 10 records |
| 341 | + - orange: full run with 10 records + daily run with 5 records |
| 342 | + - lemon: full run with 10 records + daily run with 5 records (daily to be embedded twice) |
| 343 | + """ |
| 344 | + timdex_dataset_dataset = timdex_dataset_empty |
| 345 | + |
| 346 | + # scenario 1: apple - single full run |
| 347 | + timdex_dataset_dataset.write( |
| 348 | + generate_sample_records( |
| 349 | + num_records=10, |
| 350 | + source="apple", |
| 351 | + run_date="2025-06-01", |
| 352 | + run_type="full", |
| 353 | + run_id="apple-1", |
| 354 | + ), |
| 355 | + write_append_deltas=False, |
| 356 | + ) |
| 357 | + |
| 358 | + # scenario 2: orange - full run + daily run |
| 359 | + timdex_dataset_dataset.write( |
| 360 | + generate_sample_records( |
| 361 | + num_records=10, |
| 362 | + source="orange", |
| 363 | + run_date="2025-07-01", |
| 364 | + run_type="full", |
| 365 | + run_id="orange-1", |
| 366 | + ), |
| 367 | + write_append_deltas=False, |
| 368 | + ) |
| 369 | + timdex_dataset_dataset.write( |
| 370 | + generate_sample_records( |
| 371 | + num_records=5, |
| 372 | + source="orange", |
| 373 | + run_date="2025-07-02", |
| 374 | + run_type="daily", |
| 375 | + run_id="orange-2", |
| 376 | + ), |
| 377 | + write_append_deltas=False, |
| 378 | + ) |
| 379 | + |
| 380 | + # scenario 3: lemon - full run + daily run (daily will be embedded twice) |
| 381 | + timdex_dataset_dataset.write( |
| 382 | + generate_sample_records( |
| 383 | + num_records=10, |
| 384 | + source="lemon", |
| 385 | + run_date="2025-08-01", |
| 386 | + run_type="full", |
| 387 | + run_id="lemon-1", |
| 388 | + ), |
| 389 | + write_append_deltas=False, |
| 390 | + ) |
| 391 | + timdex_dataset_dataset.write( |
| 392 | + generate_sample_records( |
| 393 | + num_records=5, |
| 394 | + source="lemon", |
| 395 | + run_date="2025-08-02", |
| 396 | + run_type="daily", |
| 397 | + run_id="lemon-2", |
| 398 | + ), |
| 399 | + write_append_deltas=False, |
| 400 | + ) |
| 401 | + |
| 402 | + # rebuild metadata once, after all five runs are written, so records can be queried |
| 403 | + timdex_dataset_dataset.metadata.rebuild_dataset_metadata() |
| 404 | + |
| 405 | + # NOTE(review): reload dataset to work around a bug (unspecified here; presumably stale state after the metadata rebuild - confirm and link the issue) |
| 406 | + return TIMDEXDataset(timdex_dataset_dataset.location) |
303 | 407 |
|
304 | 408 |
|
305 | 409 | # ================================================================================ |
|
0 commit comments