
Commit c212b50

Add S3Client for metadata management
Why these changes are being introduced:
With the addition of dataset/metadata assets in S3, we will need to perform actions like downloading the static DB file, uploading a new one, and deleting append deltas.

How this addresses that need:
Creates new utility class S3Client that performs these actions.

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-530
1 parent 9ff7594 commit c212b50

5 files changed

Lines changed: 252 additions & 5 deletions


Pipfile

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,6 @@ pyarrow = "*"
 
 [dev-packages]
 black = "*"
-boto3-stubs = {version = "*", extras = ["s3"]}
 coveralls = "*"
 ipython = "*"
 moto = "*"
@@ -25,6 +24,7 @@ pytest = "*"
 ruff = "*"
 setuptools = "*"
 pip-audit = "*"
+boto3-stubs = {extras = ["essential"], version = "*"}
 
 [requires]
 python_version = "3.12"
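
Moving boto3-stubs from the s3 extra to the essential extra keeps the S3 type stubs (the essential extra bundles stubs for a handful of core services, S3 among them) while remaining a dev-only dependency. As a rough sketch of what this enables, not part of the commit: boto3-stubs installs the mypy_boto3_s3 package that utils.py below imports, so type checkers can see a precise type for the S3 resource.

import boto3
from mypy_boto3_s3.service_resource import S3ServiceResource

# typed for mypy at check time; at runtime this is plain boto3
s3: S3ServiceResource = boto3.resource("s3")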

Pipfile.lock

Lines changed: 50 additions & 2 deletions
Some generated files are not rendered by default.

tests/conftest.py

Lines changed: 15 additions & 2 deletions
@@ -1,9 +1,9 @@
 """tests/conftest.py"""
 
-# ruff: noqa: D205, D209
-
 import os
 
+import boto3
+import moto
 import pytest
 
 from tests.utils import (
@@ -198,3 +198,16 @@ def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
 @pytest.fixture
 def timdex_dataset_metadata(dataset_with_same_day_runs):
     return TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
+
+
+@pytest.fixture
+def timdex_bucket():
+    return "timdex"
+
+
+@pytest.fixture
+def mock_s3_resource(timdex_bucket):
+    with moto.mock_aws():
+        conn = boto3.resource("s3", region_name="us-east-1")
+        conn.create_bucket(Bucket=timdex_bucket)
+        yield conn
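
These fixtures give every test an in-memory S3: inside moto.mock_aws(), boto3 calls are intercepted by moto, so no real AWS credentials or network access are required, and the "timdex" bucket exists only for the duration of the test. A minimal sketch of how a test might consume the fixture (this hypothetical test is illustrative, not part of the commit):

def test_mock_bucket_is_visible(mock_s3_resource, timdex_bucket):
    # the bucket created in the fixture lives only in moto's in-memory store
    bucket_names = [bucket.name for bucket in mock_s3_resource.buckets.all()]
    assert timdex_bucket in bucket_names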

tests/test_s3client.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+"""tests/test_s3client.py"""
+
+# ruff: noqa: PLR2004, SLF001
+
+import pytest
+
+from timdex_dataset_api.utils import S3Client
+
+
+def test_s3client_init():
+    """Test S3Client initialization."""
+    client = S3Client()
+    assert client.resource is not None
+
+
+def test_s3client_init_with_minio_env(caplog, monkeypatch):
+    """Test S3Client initialization with MinIO environment variables."""
+    caplog.set_level("DEBUG")
+
+    monkeypatch.setenv("MINIO_S3_ENDPOINT_URL", "http://localhost:9000")
+    monkeypatch.setenv("MINIO_USERNAME", "minioadmin")
+    monkeypatch.setenv("MINIO_PASSWORD", "minioadmin")
+    monkeypatch.setenv("MINIO_REGION", "us-east-1")
+
+    client = S3Client()
+    assert client.resource is not None
+    assert "MinIO env vars detected, using for S3Client" in caplog.text
+
+
+def test_split_s3_uri():
+    """Test _split_s3_uri method."""
+    client = S3Client()
+    bucket, key = client._split_s3_uri("s3://timdex/path/to/file.txt")
+    assert bucket == "timdex"
+    assert key == "path/to/file.txt"
+
+
+def test_split_s3_uri_invalid():
+    """Test _split_s3_uri method with invalid URI."""
+    client = S3Client()
+    with pytest.raises(ValueError, match="Invalid S3 URI"):
+        client._split_s3_uri("timdex/path/to/file.txt")
+
+
+def test_upload_download_file(mock_s3_resource, tmp_path):
+    """Test upload_file and download_file methods."""
+    client = S3Client()
+
+    # Create a test file
+    test_file = tmp_path / "test.txt"
+    test_file.write_text("test content")
+
+    # Upload the file
+    s3_uri = "s3://timdex/test.txt"
+    client.upload_file(test_file, s3_uri)
+
+    # Download the file to a different location
+    download_path = tmp_path / "downloaded.txt"
+    client.download_file(s3_uri, download_path)
+
+    # Verify the content
+    assert download_path.read_text() == "test content"
+
+
+def test_delete_file(mock_s3_resource, tmp_path):
+    """Test delete_file method."""
+    client = S3Client()
+
+    # Create and upload a test file
+    test_file = tmp_path / "test.txt"
+    test_file.write_text("test content")
+    s3_uri = "s3://timdex/test.txt"
+    client.upload_file(test_file, s3_uri)
+
+    # Delete the file
+    client.delete_file(s3_uri)
+
+    # Verify the file is deleted
+    bucket = mock_s3_resource.Bucket("timdex")
+    objects = list(bucket.objects.all())
+    assert len(objects) == 0
+
+
+def test_delete_folder(mock_s3_resource, tmp_path):
+    """Test delete_folder method."""
+    client = S3Client()
+
+    # Create and upload test files
+    for i in range(3):
+        test_file = tmp_path / f"test{i}.txt"
+        test_file.write_text(f"test content {i}")
+        s3_uri = f"s3://timdex/folder/test{i}.txt"
+        client.upload_file(test_file, s3_uri)
+
+    # Upload a file outside the folder
+    other_file = tmp_path / "other.txt"
+    other_file.write_text("other content")
+    client.upload_file(other_file, "s3://timdex/other.txt")
+
+    # Delete the folder
+    deleted_keys = client.delete_folder("s3://timdex/folder/")
+
+    # Verify only folder contents are deleted
+    assert len(deleted_keys) == 3
+    assert all(key.startswith("folder/") for key in deleted_keys)
+
+    bucket = mock_s3_resource.Bucket("timdex")
+    objects = list(bucket.objects.all())
+    assert len(objects) == 1
+    assert objects[0].key == "other.txt"

timdex_dataset_api/utils.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+"""timdex_dataset_api/utils.py"""
+
+import logging
+import os
+import pathlib
+from urllib.parse import urlparse
+
+import boto3
+from mypy_boto3_s3.service_resource import S3ServiceResource
+
+logger = logging.getLogger(__name__)
+
+
+class S3Client:
+    def __init__(
+        self,
+    ) -> None:
+        self.resource = self._create_resource()
+
+    def _create_resource(self) -> S3ServiceResource:
+        """Instantiate a boto3 S3 resource.
+
+        If env var MINIO_S3_ENDPOINT_URL is set, assume using local set of MinIO env vars.
+        """
+        endpoint_url = os.getenv("MINIO_S3_ENDPOINT_URL")
+        if endpoint_url:
+            logger.debug("MinIO env vars detected, using for S3Client.")
+            return boto3.resource(
+                "s3",
+                endpoint_url=endpoint_url,
+                aws_access_key_id=os.getenv("MINIO_USERNAME"),
+                aws_secret_access_key=os.getenv("MINIO_PASSWORD"),
+                region_name=os.getenv("MINIO_REGION", "us-east-1"),
+            )
+        return boto3.resource("s3")
+
+    def download_file(self, s3_uri: str, local_path: str | pathlib.Path) -> None:
+        bucket, key = self._split_s3_uri(s3_uri)
+        local_path = pathlib.Path(local_path)
+        local_path.parent.mkdir(parents=True, exist_ok=True)
+        self.resource.Bucket(bucket).download_file(key, str(local_path))
+        logger.info(f"Downloaded {s3_uri} to {local_path}")
+
+    def upload_file(self, local_path: str | pathlib.Path, s3_uri: str) -> None:
+        bucket, key = self._split_s3_uri(s3_uri)
+        local_path = pathlib.Path(local_path)
+        self.resource.Bucket(bucket).upload_file(str(local_path), key)
+        logger.info(f"Uploaded {local_path} to {s3_uri}")
+
+    def delete_file(self, s3_uri: str) -> None:
+        bucket, key = self._split_s3_uri(s3_uri)
+        self.resource.Object(bucket, key).delete()
+        logger.info(f"Deleted {s3_uri}")
+
+    def delete_folder(self, s3_uri: str) -> list[str]:
+        """Delete all objects whose keys start with the given prefix."""
+        bucket, prefix = self._split_s3_uri(s3_uri)
+        bucket_obj = self.resource.Bucket(bucket)
+        receipt = bucket_obj.objects.filter(Prefix=prefix).delete()
+
+        deleted_keys = []
+        for request in receipt:
+            deleted_keys.extend([item["Key"] for item in request["Deleted"]])
+        logger.info(f"Deleted {deleted_keys}")
+        return deleted_keys
+
+    @staticmethod
+    def _split_s3_uri(s3_uri: str) -> tuple[str, str]:
+        """Validate and split an S3 URI into (bucket, key)."""
+        parsed = urlparse(s3_uri)
+        if parsed.scheme != "s3" or not parsed.netloc or not parsed.path:
+            raise ValueError(f"Invalid S3 URI: {s3_uri!r}")
+
+        bucket = parsed.netloc
+        key = parsed.path.lstrip("/")  # strip leading slash from /key
+        return bucket, key
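
Connecting the new class back to the commit message, a minimal sketch of the intended workflow: download the static DB file, upload a rebuilt one, and delete append deltas. The bucket name and key paths below are hypothetical, chosen only for illustration:

from timdex_dataset_api.utils import S3Client

client = S3Client()

# pull the static DB file down for local use
client.download_file("s3://timdex/dataset/metadata/static.db", "/tmp/static.db")

# push a rebuilt DB file back up
client.upload_file("/tmp/static.db", "s3://timdex/dataset/metadata/static.db")

# clear out append deltas once they have been folded into the static DB
deleted_keys = client.delete_folder("s3://timdex/dataset/metadata/appends/")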
