Skip to content

Commit a1d8ad7

Browse files
authored
Merge pull request #179 from MITLibraries/USE-143-record-metadata-join
Join embeddings queries on record metadata
2 parents 184d87f + b125772 commit a1d8ad7

7 files changed

Lines changed: 487 additions & 50 deletions

File tree

.DS_Store

8 KB
Binary file not shown.

Pipfile.lock

Lines changed: 8 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/conftest.py

Lines changed: 111 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
import moto
88
import pytest
99

10-
from tests.utils import generate_sample_embeddings, generate_sample_records
10+
from tests.utils import (
11+
generate_sample_embeddings,
12+
generate_sample_embeddings_for_run,
13+
generate_sample_records,
14+
)
1115
from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
1216
from timdex_dataset_api.dataset import TIMDEXDatasetConfig
1317
from timdex_dataset_api.embeddings import (
@@ -294,12 +298,112 @@ def timdex_metadata_merged_deltas(
294298
# Dataset Embeddings Fixtures
295299
# ================================================================================
296300
@pytest.fixture
297-
def timdex_embeddings_with_runs(timdex_dataset_empty):
298-
"""TIMDEXEmbeddings with multiple runs for single strategy."""
299-
embeddings = TIMDEXEmbeddings(timdex_dataset_empty)
300-
embeddings.write(generate_sample_embeddings(100, run_id="abc123")) # run 1
301-
embeddings.write(generate_sample_embeddings(50, run_id="def456")) # run 2
302-
return TIMDEXEmbeddings(timdex_dataset_empty)
def timdex_embeddings_with_runs(timdex_dataset_empty) -> TIMDEXEmbeddings:
    """TIMDEXEmbeddings with multiple runs for single strategy.

    Before any embeddings are written, "alma" records are written for the same
    run ids and dataset metadata is rebuilt, so that embeddings queries can
    join to metadata.records.
    """
    # run_id -> number of sample records written for that run
    record_counts_by_run = {"abc123": 100, "def456": 50}

    # records first: one write per run, without append deltas
    for run_id, num_records in record_counts_by_run.items():
        timdex_dataset_empty.write(
            generate_sample_records(num_records, source="alma", run_id=run_id),
            write_append_deltas=False,
        )

    # build metadata, then continue with a freshly loaded TIMDEXDataset
    timdex_dataset_empty.metadata.rebuild_dataset_metadata()
    reloaded_dataset = TIMDEXDataset(timdex_dataset_empty.location)

    # embeddings second: one write per run, in the same order as the records
    for run_id in record_counts_by_run:
        reloaded_dataset.embeddings.write(
            generate_sample_embeddings_for_run(reloaded_dataset, run_id=run_id)
        )

    # hand back the embeddings of yet another freshly loaded TIMDEXDataset
    return TIMDEXDataset(timdex_dataset_empty.location).embeddings
333+
334+
@pytest.fixture
def timdex_dataset_for_embeddings_views(timdex_dataset_empty) -> TIMDEXDataset:
    """TIMDEXDataset with records for testing embeddings views.

    Three sources are written, each as its own scenario for the DuckDB views:
        - apple: a single full run of 10 records
        - orange: a full run of 10 records, then a daily run of 5 records
        - lemon: a full run of 10 records, then a daily run of 5 records
    """
    # (num_records, source, run_date, run_type, run_id), in write order
    scenario_runs = (
        # scenario 1: apple - single full run
        (10, "apple", "2025-06-01", "full", "apple-1"),
        # scenario 2: orange - full run + daily run
        (10, "orange", "2025-07-01", "full", "orange-1"),
        (5, "orange", "2025-07-02", "daily", "orange-2"),
        # scenario 3: lemon - full run + daily run (daily will be embedded twice)
        (10, "lemon", "2025-08-01", "full", "lemon-1"),
        (5, "lemon", "2025-08-02", "daily", "lemon-2"),
    )

    for num_records, source, run_date, run_type, run_id in scenario_runs:
        sample_records = generate_sample_records(
            num_records=num_records,
            source=source,
            run_date=run_date,
            run_type=run_type,
            run_id=run_id,
        )
        timdex_dataset_empty.write(sample_records, write_append_deltas=False)

    # rebuild metadata so records can be queried
    timdex_dataset_empty.metadata.rebuild_dataset_metadata()

    # return a freshly loaded dataset (original note: works around a bug)
    return TIMDEXDataset(timdex_dataset_empty.location)
303407

304408

305409
# ================================================================================

0 commit comments

Comments
 (0)