Skip to content

Commit d924fc6

Browse files
committed
WIP: transition to MSSDK for package loading/saving, v2/eForms only
- Refactor the MongoDB package repo to use MSSDK's repository - Extend the current pipeline model from the MSSDK v2 (eForms) model - Completely refactor loading from GH and saving to MongoDB. TODO: Update the tests that fail because, with the extended model, any MSSDK package leaves the legacy fields unpopulated. WARNING: Loading test data, and therefore notice processing, is currently BROKEN until the migration is complete. P.S.: The MSSDK model retains any package outputs in `test_results` -- we forcefully discard these before persistence because, for production packages, they push storage beyond the 16MB MongoDB limit and raise the `pymongo.errors.DocumentTooLarge: BSON document too large` error.
1 parent 1a3e707 commit d924fc6

3 files changed

Lines changed: 269 additions & 165 deletions

File tree

src/ted_sws/core/model/transform.py

Lines changed: 68 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -11,37 +11,34 @@
1111
from enum import Enum
1212
from typing import List, Optional, Union
1313

14-
from pydantic import field_validator, ConfigDict
14+
from pydantic import field_validator, ConfigDict, Field
1515

1616
from src.ted_sws.core.model import PropertyBaseModel
1717

18+
from mapping_suite_sdk.mapping_package_v2.models import MappingPackageV2
19+
from mapping_suite_sdk.core.models.collection_asset import TestDataCollectionAsset, SPARQLTestCollectionAsset, SHACLTestCollectionAsset, \
20+
TestResultCollectionAsset
1821

1922
class MappingPackageComponent(PropertyBaseModel, abc.ABC):
2023
model_config = ConfigDict(validate_assignment=True)
2124

2225

2326
class FileResource(MappingPackageComponent):
24-
"""
25-
26-
"""
27+
"""Represents a file resource in a mapping package."""
2728
file_name: str
2829
file_content: str
2930
original_name: Optional[str] = None
30-
parents: Optional[List[str]] = []
31+
parents: List[str] = Field(default_factory=list)
3132

3233

3334
class NoticeFileResource(FileResource):
34-
"""
35-
36-
"""
35+
"""Represents a file resource associated with a notice."""
3736
notice_id: str
3837

3938

40-
4139
class MetadataConstraintsStandardForm(MappingPackageComponent):
42-
"""
43-
Metadata constraints structure for Standard forms
44-
"""
40+
"""Metadata constraints structure for Standard forms."""
41+
# TODO: MSSDK must fix the SF (v1) model to use str even when the data is int
4542
eforms_subtype: List[str]
4643
start_date: Optional[List[str]] = None
4744
end_date: Optional[List[str]] = None
@@ -54,84 +51,106 @@ def coerce_eforms_subtype(cls, value):
5451
return [str(item) for item in value]
5552
return value
5653

54+
5755
class MetadataConstraintsEform(MappingPackageComponent):
58-
"""
59-
Metadata constraints structure for eForms
60-
"""
56+
"""Metadata constraints structure for eForms."""
6157
eforms_subtype: List[str]
6258
start_date: Optional[List[str]] = None
6359
end_date: Optional[List[str]] = None
6460
eforms_sdk_versions: List[str]
6561

6662

6763
class MetadataConstraints(MappingPackageComponent):
68-
"""
69-
Metadata constraints
70-
"""
64+
"""Metadata constraints container."""
7165
constraints: Union[MetadataConstraintsStandardForm, MetadataConstraintsEform]
7266

7367

7468
class TransformationRuleSet(MappingPackageComponent):
75-
"""
76-
77-
"""
69+
"""Transformation rule set with vocabulary resources and RML mappings."""
7870
resources: List[FileResource]
7971
rml_mapping_rules: List[FileResource]
8072

8173

8274
class SHACLTestSuite(MappingPackageComponent):
83-
"""
84-
85-
"""
75+
"""SHACL test suite."""
8676
identifier: str
8777
shacl_tests: List[FileResource]
8878

8979

9080
class SPARQLTestSuite(MappingPackageComponent):
91-
"""
92-
93-
"""
81+
"""SPARQL test suite."""
9482
identifier: str
9583
sparql_tests: List[FileResource]
9684

9785

9886
class TransformationTestData(MappingPackageComponent):
99-
"""
100-
101-
"""
87+
"""Transformation test data."""
10288
test_data: List[FileResource]
10389

10490

10591
class MappingXPATH(MappingPackageComponent):
92+
"""Mapping XPath expression."""
10693
xpath: str
10794
form_field: Optional[str] = None
10895

10996

11097
class MappingPackageType(str, Enum):
98+
"""Type of mapping package."""
11199
STANDARD_FORMS = "standard_forms"
112100
ELECTRONIC_FORMS = "eforms"
113101

114102
def __str__(self):
115103
return self.value
116104

117105

118-
class MappingPackage(MappingPackageComponent):
119-
"""
120-
121-
"""
122-
created_at: str = datetime.now().replace(microsecond=0).isoformat()
123-
identifier: str = "no_id"
124-
title: str = "no_title"
125-
version: str = "0.1.1"
126-
ontology_version: str = "0.0.1"
127-
git_latest_commit_hash: str = "no_hash"
128-
mapping_suite_hash_digest: str = "no_hash"
129-
mapping_type: Optional[MappingPackageType] = MappingPackageType.STANDARD_FORMS
130-
metadata_constraints: MetadataConstraints
131-
transformation_rule_set: TransformationRuleSet
132-
shacl_test_suites: List[SHACLTestSuite]
133-
sparql_test_suites: List[SPARQLTestSuite]
134-
transformation_test_data: TransformationTestData
135-
106+
# this will become a union- or composition-based class when more versions are added
107+
class MappingPackage(MappingPackageComponent, MappingPackageV2):
108+
"""
109+
Extended mapping package model that inherits from an MSSDK model.
110+
111+
Combines compatibility with MSSDK version 2 while adding legacy pipeline-specific fields.
112+
113+
IMPORTANT: Many legacy fields are optional with defaults to avoid conflicts with MSSDK models.
114+
"""
115+
116+
# Legacy pipeline-specific fields - MOSTLY OPTIONAL
117+
created_at: str = Field(
118+
default_factory=lambda: datetime.now().replace(microsecond=0).isoformat()
119+
)
120+
identifier: str = Field(default="no_id")
121+
title: str = Field(default="no_title")
122+
version: str = Field(default="0.1.1")
123+
ontology_version: str = Field(default="0.0.1")
124+
git_latest_commit_hash: str = Field(default="")
125+
mapping_suite_hash_digest: str = Field(default="")
126+
mapping_type: Optional[MappingPackageType] = Field( default=MappingPackageType.STANDARD_FORMS)
127+
metadata_constraints: Optional[MetadataConstraints] = Field(default=None)
128+
transformation_rule_set: Optional[TransformationRuleSet] = Field(default=None)
129+
shacl_test_suites: List[SHACLTestSuite] = Field(default_factory=list)
130+
sparql_test_suites: List[SPARQLTestSuite] = Field(default_factory=list)
131+
transformation_test_data: Optional[TransformationTestData] = Field(default=None)
132+
previous_version: Optional[str] = Field(default=None)
133+
134+
# TODO: forward this fix to MSSDK; remove it here once it is implemented there
135+
# Override large/optional collection assets in MSSDK model
136+
test_results: Optional[TestResultCollectionAsset] = Field(
137+
default=None,
138+
description="Collections of test transformation results (optional due to large storage requirements -- will cause MongoDB BSON error for 16MB limit)"
139+
)
140+
test_data_suites: List[TestDataCollectionAsset] = Field(
141+
default_factory=list,
142+
description="Collections of test data for transformation"
143+
)
144+
test_suites_sparql: List[SPARQLTestCollectionAsset] = Field(
145+
default_factory=list,
146+
description="Collections of SPARQL-based test suites"
147+
)
148+
test_suites_shacl: Optional[SHACLTestCollectionAsset] = Field(
149+
default=None,
150+
description="Collections of SHACL-based validation test suites"
151+
)
152+
153+
# TODO: check whether this is still needed and remove it if not (see whether any production package ID comes without a version)
136154
def get_mongodb_id(self) -> str:
137-
return f"{self.identifier}_v{self.version}"
155+
"""Get MongoDB _id for this package."""
156+
return f"{self.id}_v{self.version}"

0 commit comments

Comments
 (0)