Skip to content

Commit cb0502c

Browse files
committed
Add MinIO S3 support
Why these changes are being introduced: As we prepare for expanding the role of dataset metadata, which may be stored as a static file in S3 next to the dataset, it will be helpful to test and develop against a realistic S3 backdrop but do so locally. How this addresses that need: Configures pyarrow and DuckDB to create appropriate connections when a 'MINIO_S3_ENDPOINT_URL' env var is present. Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-515
1 parent ed1cf59 commit cb0502c

5 files changed

Lines changed: 86 additions & 17 deletions

File tree

Makefile

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
-include .env
2+
13
SHELL=/bin/bash
24
DATETIME:=$(shell date -u +%Y%m%dT%H%M%SZ)
35

@@ -53,4 +55,17 @@ black-apply: # Apply changes with 'black'
5355
pipenv run black .
5456

5557
ruff-apply: # Resolve 'fixable errors' with 'ruff'
56-
pipenv run ruff check --fix .
58+
pipenv run ruff check --fix .
59+
60+
61+
######################
62+
# Minio S3 Instance
63+
######################
64+
minio-start:
65+
docker run \
66+
-p 9000:9000 \
67+
-p 9001:9001 \
68+
-v $(MINIO_DATA):/data \
69+
-e "MINIO_ROOT_USER=$(MINIO_USERNAME)" \
70+
-e "MINIO_ROOT_PASSWORD=$(MINIO_PASSWORD)" \
71+
quay.io/minio/minio server /data --console-address ":9001"

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,31 @@ None at this time.
4545
```shell
4646
TDA_LOG_LEVEL=# log level for timdex-dataset-api, accepts [DEBUG, INFO, WARNING, ERROR], default INFO
4747
WARNING_ONLY_LOGGERS=# Comma-seperated list of logger names to set as WARNING only, e.g. 'botocore,charset_normalizer,smart_open'
48+
49+
MINIO_S3_ENDPOINT_URL=# If set, informs the library to use this Minio S3 instance. Requires the http(s):// protocol.
50+
MINIO_USERNAME=# Username / AWS Key for Minio; required when MINIO_S3_ENDPOINT_URL is set
51+
MINIO_PASSWORD=# Pasword / AWS Secret for Minio; required when MINIO_S3_ENDPOINT_URL is set
52+
MINIO_DATA=# Path to persist MinIO data if started via Makefile command
53+
```
54+
55+
## Local S3 via MinIO
56+
57+
Set env vars:
58+
```shell
59+
MINIO_S3_ENDPOINT_URL=http://localhost:9000
60+
MINIO_USERNAME="admin"
61+
MINIO_PASSWORD="password"
62+
MINIO_DATA=<path to persist MinIO data, e.g. /tmp/minio>
4863
```
4964

65+
Use a `Makefile` command that will start a MinIO instance:
66+
67+
```shell
68+
make minio-start
69+
```
70+
71+
With the env var `MINIO_S3_ENDPOINT_URL` set, this library will configure `pyarrow` and DuckDB connections to point at this local MinIO S3 instance.
72+
5073
## Usage
5174

5275
Currently, the most common use cases are:

tests/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def _test_env(monkeypatch):
2121
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "fake_secret_key")
2222
monkeypatch.setenv("AWS_SESSION_TOKEN", "fake_session_token")
2323
monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1")
24+
monkeypatch.delenv("MINIO_S3_ENDPOINT_URL", raising=False)
2425

2526

2627
@pytest.fixture

timdex_dataset_api/dataset.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,11 +285,23 @@ def _parse_date_filters(self, run_date: str | date | None) -> DatasetFilters:
285285

286286
@staticmethod
287287
def get_s3_filesystem() -> fs.FileSystem:
288-
"""Instantiate a pyarrow S3 Filesystem for dataset loading."""
288+
"""Instantiate a pyarrow S3 Filesystem for dataset loading.
289+
290+
If the env var 'MINIO_S3_ENDPOINT_URL' is present, assume a local MinIO S3
291+
instance and configure accordingly, otherwise assume normal AWS S3.
292+
"""
289293
session = boto3.session.Session()
290294
credentials = session.get_credentials()
291295
if not credentials:
292296
raise RuntimeError("Could not locate AWS credentials")
297+
298+
if os.getenv("MINIO_S3_ENDPOINT_URL"):
299+
return fs.S3FileSystem(
300+
access_key=os.environ["MINIO_USERNAME"],
301+
secret_key=os.environ["MINIO_PASSWORD"],
302+
endpoint_override=os.environ["MINIO_S3_ENDPOINT_URL"],
303+
)
304+
293305
return fs.S3FileSystem(
294306
secret_key=credentials.secret_key,
295307
access_key=credentials.access_key,

timdex_dataset_api/metadata.py

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
"""timdex_dataset_api/metadata.py"""
22

3+
import os
34
import time
45
from typing import TYPE_CHECKING, Unpack
6+
from urllib.parse import urlparse
57

68
import duckdb
79

@@ -87,8 +89,8 @@ def _setup_database(self) -> None:
8789
# bump threads for high parallelization of lightweight data calls for metadata
8890
self.set_database_thread_usage(64)
8991

90-
# setup AWS credentials chain
91-
self._create_aws_credential_chain()
92+
# configure s3 connection
93+
self._configure_s3_connection()
9294

9395
# create a table of metadata about all rows in dataset
9496
self._create_full_dataset_table()
@@ -101,21 +103,37 @@ def _setup_database(self) -> None:
101103
f"path: '{self.db_path}'"
102104
)
103105

104-
def _create_aws_credential_chain(self) -> None:
105-
"""Setup AWS credentials chain in database connection.
106+
def _configure_s3_connection(self) -> None:
107+
"""Configure S3 connection for DuckDB access.
106108
107-
https://duckdb.org/docs/stable/core_extensions/aws.html
109+
If the env var 'MINIO_S3_ENDPOINT_URL' is present, assume a local MinIO S3
110+
instance and configure accordingly, otherwise assume normal AWS S3 and setup a
111+
credentials chain in DuckDB.
108112
"""
109-
logger.info("setting up AWS credentials chain")
110-
query = """
111-
create or replace secret secret (
112-
type s3,
113-
provider credential_chain,
114-
chain 'sso;env;config',
115-
refresh true
116-
);
117-
"""
118-
self.conn.execute(query)
113+
logger.info("configuring S3 connection")
114+
115+
if os.getenv("MINIO_S3_ENDPOINT_URL"):
116+
self.conn.execute(
117+
f"""
118+
PRAGMA s3_endpoint='{urlparse(os.environ["MINIO_S3_ENDPOINT_URL"]).netloc}';
119+
PRAGMA s3_access_key_id='{os.environ["MINIO_USERNAME"]}';
120+
PRAGMA s3_secret_access_key='{os.environ["MINIO_PASSWORD"]}';
121+
PRAGMA s3_use_ssl=false;
122+
PRAGMA s3_url_style='path';
123+
"""
124+
)
125+
126+
else:
127+
self.conn.execute(
128+
"""
129+
create or replace secret secret (
130+
type s3,
131+
provider credential_chain,
132+
chain 'sso;env;config',
133+
refresh true
134+
);
135+
"""
136+
)
119137

120138
def _create_full_dataset_table(self) -> None:
121139
"""Create a table of metadata about all records in the parquet dataset.

0 commit comments

Comments
 (0)