Skip to content

Commit 66bd9ef

Browse files
committed
Update dependencies and pin to pandas 2.x
Why these changes are being introduced:
There are some breaking changes in pandas 3.x that will require some modifications to how we register dataframes in DuckDB contexts.

How this addresses that need:
In the short term, pin to pandas 2.x for this library.

Side effects of this change:
* Applications that use this library will be restricted to pandas 2.x until the pin is removed.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/USE-342
* discovered while installing for timdex-embeddings
1 parent 03521ef commit 66bd9ef

10 files changed

Lines changed: 715 additions & 762 deletions

File tree

Pipfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ attrs = "*"
88
boto3 = "*"
99
duckdb = "*"
1010
duckdb-engine = "*"
11-
pandas = "*"
11+
pandas = "<3.0.0"
1212
pyarrow = "*"
1313
sqlalchemy = "==2.0.44"
1414

Pipfile.lock

Lines changed: 654 additions & 613 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ dependencies = [
2626
"boto3",
2727
"duckdb",
2828
"duckdb_engine",
29-
"pandas",
29+
"pandas<3.0.0",
3030
"pyarrow",
3131
"sqlalchemy==2.0.44"
3232
]

tests/test_dataset.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -248,17 +248,12 @@ def test_dataset_duckdb_context_created_on_init(timdex_dataset):
248248

249249

250250
def test_dataset_duckdb_context_creates_data_schema(timdex_dataset):
251-
assert (
252-
timdex_dataset.conn.query(
253-
"""
251+
assert timdex_dataset.conn.query("""
254252
select count(*)
255253
from information_schema.schemata
256254
where catalog_name = 'memory'
257255
and schema_name = 'data';
258-
"""
259-
).fetchone()[0]
260-
== 1
261-
)
256+
""").fetchone()[0] == 1
262257

263258

264259
def test_dataset_preload_current_records_default_false(timdex_dataset):

tests/test_metadata.py

Lines changed: 22 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -55,17 +55,12 @@ def test_tdm_init_metadata_file_found_success(timdex_metadata):
5555

5656

5757
def test_tdm_duckdb_context_creates_metadata_schema(timdex_metadata):
58-
assert (
59-
timdex_metadata.conn.query(
60-
"""
58+
assert timdex_metadata.conn.query("""
6159
select count(*)
6260
from information_schema.schemata
6361
where catalog_name = 'memory'
6462
and schema_name = 'metadata';
65-
"""
66-
).fetchone()[0]
67-
== 1
68-
)
63+
""").fetchone()[0] == 1
6964

7065

7166
def test_tdm_connection_has_static_database_attached(timdex_metadata):
@@ -241,38 +236,32 @@ def test_tdm_current_records_with_deltas_logic(timdex_metadata_with_deltas):
241236

242237
def test_tdm_current_records_most_recent_version(timdex_metadata_with_deltas):
243238
# check that for records with multiple versions, only the most recent is returned
244-
multi_version_records = timdex_metadata_with_deltas.conn.query(
245-
"""
239+
multi_version_records = timdex_metadata_with_deltas.conn.query("""
246240
select timdex_record_id, count(*) as version_count
247241
from metadata.records
248242
group by timdex_record_id
249243
having count(*) > 1
250244
limit 1;
251-
"""
252-
).to_df()
245+
""").to_df()
253246

254247
if len(multi_version_records) > 0:
255248
record_id = multi_version_records.iloc[0]["timdex_record_id"]
256249

257250
# get most recent timestamp for this record
258-
most_recent = timdex_metadata_with_deltas.conn.query(
259-
f"""
251+
most_recent = timdex_metadata_with_deltas.conn.query(f"""
260252
select run_timestamp, run_id
261253
from metadata.records
262254
where timdex_record_id = '{record_id}'
263255
order by run_timestamp desc
264256
limit 1;
265-
"""
266-
).to_df()
257+
""").to_df()
267258

268259
# verify current_records contains this version
269-
current_version = timdex_metadata_with_deltas.conn.query(
270-
f"""
260+
current_version = timdex_metadata_with_deltas.conn.query(f"""
271261
select run_timestamp, run_id
272262
from metadata.current_records
273263
where timdex_record_id = '{record_id}';
274-
"""
275-
).to_df()
264+
""").to_df()
276265

277266
assert len(current_version) == 1
278267
assert (
@@ -294,21 +283,17 @@ def test_tdm_merge_append_deltas_static_counts_match_records_count_before_merge(
294283
def test_tdm_merge_append_deltas_adds_records_to_static_db(
295284
timdex_metadata_with_deltas, timdex_metadata_merged_deltas
296285
):
297-
append_deltas = timdex_metadata_with_deltas.conn.query(
298-
f"""
286+
append_deltas = timdex_metadata_with_deltas.conn.query(f"""
299287
select
300288
{','.join(ORDERED_METADATA_COLUMN_NAMES)}
301289
from metadata.append_deltas
302-
"""
303-
).to_df()
290+
""").to_df()
304291

305-
merged_static_db = timdex_metadata_merged_deltas.conn.query(
306-
f"""
292+
merged_static_db = timdex_metadata_merged_deltas.conn.query(f"""
307293
select
308294
{','.join(ORDERED_METADATA_COLUMN_NAMES)}
309295
from static_db.records
310-
"""
311-
).to_df()
296+
""").to_df()
312297

313298
assert set(map(tuple, append_deltas.to_numpy())).issubset(
314299
set(map(tuple, merged_static_db.to_numpy()))
@@ -332,18 +317,12 @@ def test_td_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid(
332317
monkeypatch.setenv("HOME", str(preset_home))
333318

334319
td = TIMDEXDataset(timdex_dataset_with_runs.location)
335-
df = (
336-
td.conn.query(
337-
"""
320+
df = td.conn.query("""
338321
select
339322
current_setting('secret_directory') as secret_directory,
340323
current_setting('extension_directory') as extension_directory
341324
;
342-
"""
343-
)
344-
.to_df()
345-
.iloc[0]
346-
)
325+
""").to_df().iloc[0]
347326
assert "my-account" in df.secret_directory
348327
assert df.extension_directory == "" # expected and okay when HOME set
349328

@@ -355,18 +334,12 @@ def test_td_prepare_duckdb_secret_and_extensions_home_env_var_unset(
355334

356335
td = TIMDEXDataset(timdex_dataset_with_runs.location)
357336

358-
df = (
359-
td.conn.query(
360-
"""
337+
df = td.conn.query("""
361338
select
362339
current_setting('secret_directory') as secret_directory,
363340
current_setting('extension_directory') as extension_directory
364341
;
365-
"""
366-
)
367-
.to_df()
368-
.iloc[0]
369-
)
342+
""").to_df().iloc[0]
370343
assert df.secret_directory == "/tmp/.duckdb/secrets"
371344
assert df.extension_directory == "/tmp/.duckdb/extensions"
372345

@@ -378,18 +351,12 @@ def test_td_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty(
378351

379352
td = TIMDEXDataset(timdex_dataset_with_runs.location)
380353

381-
df = (
382-
td.conn.query(
383-
"""
354+
df = td.conn.query("""
384355
select
385356
current_setting('secret_directory') as secret_directory,
386357
current_setting('extension_directory') as extension_directory
387358
;
388-
"""
389-
)
390-
.to_df()
391-
.iloc[0]
392-
)
359+
""").to_df().iloc[0]
393360
assert df.secret_directory == "/tmp/.duckdb/secrets"
394361
assert df.extension_directory == "/tmp/.duckdb/extensions"
395362

@@ -411,16 +378,14 @@ def test_tdm_preload_false_no_temp_table(timdex_dataset_with_runs):
411378
td = TIMDEXDataset(timdex_dataset_with_runs.location)
412379

413380
# assert that materialized, temporary table "temp.current_records" does not exist
414-
temp_table_count = td.metadata.conn.query(
415-
"""
381+
temp_table_count = td.metadata.conn.query("""
416382
select count(*)
417383
from information_schema.tables
418384
where table_catalog = 'temp'
419385
and table_name = 'current_records'
420386
and table_type = 'LOCAL TEMPORARY'
421387
;
422-
"""
423-
).fetchone()[0]
388+
""").fetchone()[0]
424389

425390
assert temp_table_count == 0
426391

@@ -430,15 +395,13 @@ def test_tdm_preload_true_has_temp_table(timdex_dataset_with_runs):
430395
td = TIMDEXDataset(timdex_dataset_with_runs.location, preload_current_records=True)
431396

432397
# assert that materialized, temporary table "temp.current_records" does exist
433-
temp_table_count = td.metadata.conn.query(
434-
"""
398+
temp_table_count = td.metadata.conn.query("""
435399
select count(*)
436400
from information_schema.tables
437401
where table_catalog = 'temp'
438402
and table_name = 'current_records'
439403
and table_type = 'LOCAL TEMPORARY'
440404
;
441-
"""
442-
).fetchone()[0]
405+
""").fetchone()[0]
443406

444407
assert temp_table_count == 1

tests/utils.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,12 @@ def generate_sample_embeddings_for_run(
105105
embedding_dimensions: int = 3,
106106
) -> Iterator[DatasetEmbedding]:
107107
"""Generate sample DatasetEmbeddings for a given ETL run."""
108-
records_metadata = timdex_dataset.conn.query(
109-
f"""
108+
records_metadata = timdex_dataset.conn.query(f"""
110109
select
111110
*
112111
from metadata.records
113112
where run_id = '{run_id}';
114-
"""
115-
).to_df()
113+
""").to_df()
116114

117115
if not embedding_timestamp:
118116
embedding_timestamp = records_metadata.iloc[0].run_timestamp.isoformat()

timdex_dataset_api/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
66
from timdex_dataset_api.record import DatasetRecord
77

8-
__version__ = "3.10.0"
8+
__version__ = "3.11.0"
99

1010
__all__ = [
1111
"DatasetEmbedding",

timdex_dataset_api/embeddings.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,7 @@ def _create_embeddings_view(self, conn: DuckDBPyConnection) -> None:
181181
"""Create a view that projects over embeddings parquet files."""
182182
logger.debug("creating view data.embeddings")
183183

184-
conn.execute(
185-
f"""
184+
conn.execute(f"""
186185
create or replace view data.embeddings as
187186
(
188187
select *
@@ -192,8 +191,7 @@ def _create_embeddings_view(self, conn: DuckDBPyConnection) -> None:
192191
filename=true
193192
)
194193
);
195-
"""
196-
)
194+
""")
197195

198196
def _create_current_embeddings_view(self, conn: DuckDBPyConnection) -> None:
199197
"""Create a view of current embedding records.
@@ -205,8 +203,7 @@ def _create_current_embeddings_view(self, conn: DuckDBPyConnection) -> None:
205203
logger.debug("creating view data.current_embeddings")
206204

207205
# SQL for the current records logic (CTEs)
208-
conn.execute(
209-
"""
206+
conn.execute("""
210207
create or replace view data.current_embeddings as
211208
(
212209
with
@@ -229,8 +226,7 @@ def _create_current_embeddings_view(self, conn: DuckDBPyConnection) -> None:
229226
from ce_ranked_embeddings
230227
where rn = 1
231228
);
232-
"""
233-
)
229+
""")
234230

235231
def _create_current_run_embeddings_view(self, conn: DuckDBPyConnection) -> None:
236232
"""Create a view of current embedding records per run.
@@ -242,8 +238,7 @@ def _create_current_run_embeddings_view(self, conn: DuckDBPyConnection) -> None:
242238
logger.debug("creating view data.current_run_embeddings")
243239

244240
# SQL for the current records logic (CTEs)
245-
conn.execute(
246-
"""
241+
conn.execute("""
247242
create or replace view data.current_run_embeddings as
248243
(
249244
with
@@ -267,8 +262,7 @@ def _create_current_run_embeddings_view(self, conn: DuckDBPyConnection) -> None:
267262
from ce_ranked_embeddings
268263
where rn = 1
269264
);
270-
"""
271-
)
265+
""")
272266

273267
def write(
274268
self,

0 commit comments

Comments
 (0)