Skip to content

Commit 130a4ff

Browse files
Kolea PLESCO authored and schivmeister committed
feat: migrate metadata resource files loading to read from MS Config
1 parent 70dcca7 commit 130a4ff

15 files changed

Lines changed: 48081 additions & 141 deletions

File tree

src/ted_sws/core/model/notice.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -192,6 +192,7 @@ class Notice(LazyWorkExpression):
192192
_mets_manifestation: Optional[METSManifestation] = None
193193
_xml_metadata: Optional[XMLMetadata] = None
194194
validation_summary: Optional[ValidationSummaryReport] = None
195+
mapping_package_identifier: Optional[str] = None
195196

196197
@computed_field
197198
@property

src/ted_sws/core/model/transform.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,7 @@ class MappingPackage(MappingPackageComponent, MappingPackageV3):
160160
version: str = Field(default="0.1.1")
161161
ontology_version: str = Field(default="0.0.1")
162162
git_latest_commit_hash: str = Field(default="")
163+
mapping_suite_identifier: str = Field(default="")
163164
mapping_suite_hash_digest: str = Field(default="")
164165
mapping_type: Optional[MappingPackageType] = Field( default=MappingPackageType.STANDARD_FORMS)
165166
metadata_constraints: Optional[MetadataConstraints] = Field(default=None)

src/ted_sws/data_manager/adapters/notice_repository.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
XMLMetadataRepository
1919
from src.ted_sws.data_manager.adapters.repository_abc import NoticeRepositoryABC
2020
from src.ted_sws.notice_metadata_processor.services.metadata_normalizer import create_normalised_metadata_view
21+
from src.ted_sws.notice_validator.adapters.validation_summary_runner import MAPPING_PACKAGE_IDENTIFIER
2122

2223
logger = logging.getLogger(__name__)
2324

@@ -244,7 +245,8 @@ def _create_dict_from_notice(notice: Notice) -> dict:
244245
"""
245246

246247
notice_dict = notice.model_dump(include={NOTICE_TED_ID: True, NOTICE_STATUS: True,
247-
NOTICE_CREATED_AT: True, VALIDATION_SUMMARY: True})
248+
NOTICE_CREATED_AT: True, VALIDATION_SUMMARY: True,
249+
MAPPING_PACKAGE_IDENTIFIER: True})
248250
notice_dict[MONGODB_COLLECTION_ID] = notice_dict[NOTICE_TED_ID]
249251
notice_dict[NOTICE_STATUS] = str(notice_dict[NOTICE_STATUS])
250252
notice_dict[NOTICE_CREATED_AT] = datetime.fromisoformat(notice_dict[NOTICE_CREATED_AT])

src/ted_sws/mapping_suite_processor/adapters/github_ms_project_downloader.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -72,19 +72,16 @@ def get_git_head_hash(git_repository_path: pathlib.Path) -> str:
7272
shell=True,
7373
stdout=subprocess.PIPE)
7474
git_head_hash = result.stdout.decode(encoding="utf-8")
75-
return git_head_hash
75+
return (git_head_hash or "").strip()
7676

7777
with tempfile.TemporaryDirectory() as tmp_dir:
7878
temp_dir_path = pathlib.Path(tmp_dir)
7979
bash_script = f"cd {temp_dir_path} && git clone --depth 1 --branch {self.branch_or_tag_name} {self.github_repository_url}"
8080
result = subprocess.run(bash_script, shell=True,
8181
capture_output=True, text=True)
82-
log_technical_info(
83-
message=f"Downloaded stdout '{result.stdout}'")
84-
log_technical_info(
85-
message=f"Downloaded stderr '{result.stderr}'")
86-
git_last_commit_hash = get_git_head_hash(
87-
git_repository_path=temp_dir_path / self.repository_name)
82+
log_technical_info(message=f"Downloaded stdout '{result.stdout}'")
83+
log_technical_info(message=f"Downloaded stderr '{result.stderr}'")
84+
git_last_commit_hash = get_git_head_hash(git_repository_path=temp_dir_path / self.repository_name)
8885
downloaded_tmp_project_path = temp_dir_path / self.repository_name
8986
shutil.copytree(downloaded_tmp_project_path, output_project_path, dirs_exist_ok=True)
9087

src/ted_sws/mapping_suite_processor/services/mapping_package_processor.py

Lines changed: 22 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import tempfile
33
from typing import List
44

5+
from mapping_suite_sdk.mapping_suite.models import MappingSuite
56
from pymongo import MongoClient
67

78
from src.ted_sws import config
@@ -36,10 +37,11 @@
3637

3738

3839
def mapping_package_processor_load_package_in_mongo_db(
39-
package: MappingPackage,
40-
mongodb_client: MongoClient,
41-
load_test_data: bool = False,
42-
git_last_commit_hash: str = None
40+
package: MappingPackage,
41+
mongodb_client: MongoClient,
42+
load_test_data: bool = False,
43+
git_last_commit_hash: str = None,
44+
mapping_suite: MappingSuite = None
4345
) -> List[str]:
4446
"""Load a mapping package to MongoDB.
4547
@@ -50,6 +52,7 @@ def mapping_package_processor_load_package_in_mongo_db(
5052
mongodb_client: MongoDB client instance
5153
load_test_data: Whether to load test data as notices
5254
git_last_commit_hash: Optional git commit hash to store
55+
mapping_suite: Optional mapping suite that the package belongs to
5356
5457
Returns:
5558
List of notice IDs that were loaded (if load_test_data=True)
@@ -60,6 +63,9 @@ def mapping_package_processor_load_package_in_mongo_db(
6063
# Update git hash if provided and field exists
6164
if git_last_commit_hash is not None:
6265
package.git_latest_commit_hash = git_last_commit_hash
66+
67+
if mapping_suite:
68+
package.mapping_suite_identifier = get_mapping_suite_identifier(mapping_suite)
6369
result_notice_ids = []
6470

6571
# Load test data if requested
@@ -68,7 +74,7 @@ def mapping_package_processor_load_package_in_mongo_db(
6874
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
6975
for test_data in tests_data:
7076
notice_id = test_data.file_name.split(".")[0]
71-
notice = Notice(ted_id=notice_id)
77+
notice = Notice(ted_id=notice_id, mapping_package_identifier=package.identifier)
7278
notice.set_xml_manifestation(XMLManifestation(object_data=test_data.file_content))
7379
notice_repository.add(notice=notice)
7480
result_notice_ids.append(notice_id)
@@ -113,6 +119,7 @@ def load_mapping_suite_and_packages_from_github_to_mongo_db(mongodb_client: Mong
113119
git_last_commit_hash = mapping_package_downloader.download(output_project_path=tmp_dir_path)
114120

115121
# load project config if available
122+
mapping_suite = None
116123
if ms_config_file_path.is_file():
117124
log_technical_info(message=f"Mapping suite config found at '{ms_config_file_path}'")
118125
mapping_suite = load_mapping_suite_from_folder(mapping_suite_folder_path=tmp_dir_path)
@@ -131,7 +138,7 @@ def load_mapping_suite_and_packages_from_github_to_mongo_db(mongodb_client: Mong
131138

132139
# continue loading mapping packages
133140
mapping_package_paths = [
134-
mappings_dir_path / mapping_package_name ] if mapping_package_name else list(mappings_dir_path.iterdir())
141+
mappings_dir_path / mapping_package_name] if mapping_package_name else list(mappings_dir_path.iterdir())
135142
result_notice_ids = []
136143
for mapping_package_path in mapping_package_paths:
137144
detected_version = detect_mapping_package_version(mapping_package_path)
@@ -159,7 +166,8 @@ def load_mapping_suite_and_packages_from_github_to_mongo_db(mongodb_client: Mong
159166
package=mapping_package,
160167
mongodb_client=mongodb_client,
161168
load_test_data=load_test_data,
162-
git_last_commit_hash=git_last_commit_hash
169+
git_last_commit_hash=git_last_commit_hash,
170+
mapping_suite=mapping_suite
163171
))
164172
log_mapping_package_info(
165173
message=f"Mapping package with id={mapping_package.id} loaded with success in MongoDB!",
@@ -178,3 +186,10 @@ def _convert_to_mapping_package(mssdk_package) -> MappingPackage:
178186
"""Convert MSSDK package to extended MappingPackage."""
179187
data = mssdk_package.model_dump(exclude={'test_results'})
180188
return MappingPackage(**data)
189+
190+
191+
def get_mapping_suite_identifier(mapping_suite):
    """Look up the identifier stored in a mapping suite's metadata.

    Walks ``mapping_suite.mapping_suite_config.mapping_suite_metadata`` and
    reads ``mapping_suite_identifier`` from the object found there.

    Args:
        mapping_suite: Object expected to expose ``mapping_suite_config``;
            any object (including ``None``) is accepted.

    Returns:
        The ``mapping_suite_identifier`` attribute value, or ``None`` when the
        config or metadata attribute is missing or falsy, or when the
        identifier attribute itself is missing.
    """
    current = mapping_suite
    # Both intermediate levels must exist and be truthy before descending.
    for attribute_name in ("mapping_suite_config", "mapping_suite_metadata"):
        current = getattr(current, attribute_name, None)
        if not current:
            return None
    # The identifier itself is returned as-is, even when it is falsy (e.g. "").
    return getattr(current, "mapping_suite_identifier", None)

src/ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py

Lines changed: 28 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,15 @@
11
import abc
2+
import html
3+
import re
24
from datetime import datetime
35
from typing import Dict, Tuple, List
4-
import re
6+
57
import pandas as pd
6-
import html
8+
from pymongo import MongoClient
79

810
from src.ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString, NoticeSource
11+
from src.ted_sws.core.model.notice import Notice
12+
from src.ted_sws.event_manager.services.log import log_notice_info
913
from src.ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
1014
from src.ted_sws.notice_metadata_processor.services.metadata_constraints import filter_df_by_variables
1115
from src.ted_sws.resources.mapping_files_registry import MappingFilesRegistry
@@ -40,7 +44,6 @@
4044
EFORM_SDK_VERSION_KEY = "eform_sdk_version"
4145
NOTICE_SOURCE_KEY = "notice_source"
4246
ENGLISH_LANGUAGE_TAG = "EN"
43-
mapping_registry = MappingFilesRegistry()
4447

4548

4649
def get_html_compatible_string(input_string: LanguageTaggedString) -> LanguageTaggedString:
@@ -86,6 +89,8 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
8689

8790

8891
class DefaultNoticeMetadataNormaliser(NoticeMetadataNormaliserABC):
92+
def __init__(self, notice: Notice, mongodb_client: MongoClient = None):
    """
    Build the mapping-files registry this normaliser reads its lookup tables from

    :param notice: forwarded unchanged to ``MappingFilesRegistry``
    :param mongodb_client: forwarded unchanged to ``MappingFilesRegistry`` (``None`` when omitted)
    """
    registry = MappingFilesRegistry(notice=notice, mongodb_client=mongodb_client)
    self.mapping_registry = registry
8994

9095
@classmethod
9196
def normalise_legal_basis_value(cls, value: str) -> str:
@@ -207,15 +212,15 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
207212
Generate the normalised metadata
208213
:return:
209214
"""
210-
countries_map = mapping_registry.countries
211-
form_type_map = mapping_registry.form_type
212-
languages_map = mapping_registry.languages
213-
legal_basis_map = mapping_registry.legal_basis
214-
notice_type_map = mapping_registry.notice_type
215-
nuts_map = mapping_registry.nuts
216-
standard_forms_map = mapping_registry.sf_notice_df
217-
eforms_map = mapping_registry.ef_notice_df
218-
filter_map = mapping_registry.filter_map_df
215+
countries_map = self.mapping_registry.countries
216+
form_type_map = self.mapping_registry.form_type
217+
languages_map = self.mapping_registry.languages
218+
legal_basis_map = self.mapping_registry.legal_basis
219+
notice_type_map = self.mapping_registry.notice_type
220+
nuts_map = self.mapping_registry.nuts
221+
standard_forms_map = self.mapping_registry.sf_notice_df
222+
eforms_map = self.mapping_registry.ef_notice_df
223+
filter_map = self.mapping_registry.filter_map_df
219224
form_type, notice_type, legal_basis, eforms_subtype = self.get_form_type_and_notice_type(
220225
sf_map=standard_forms_map, ef_map=eforms_map, filter_map=filter_map,
221226
extracted_notice_type=extracted_metadata.extracted_notice_type,
@@ -273,6 +278,8 @@ class EformsNoticeMetadataNormaliser(NoticeMetadataNormaliserABC):
273278
"""
274279
Metadata normaliser for eForms
275280
"""
281+
def __init__(self, notice: Notice, mongodb_client: MongoClient = None):
    """
    Build the mapping-files registry used by the eForms normaliser

    :param notice: forwarded unchanged to ``MappingFilesRegistry``
    :param mongodb_client: forwarded unchanged to ``MappingFilesRegistry`` (``None`` when omitted)
    """
    registry = MappingFilesRegistry(notice=notice, mongodb_client=mongodb_client)
    self.mapping_registry = registry
276283

277284
@classmethod
278285
def iso_date_format(cls, _date: str, with_none=False):
@@ -283,17 +290,17 @@ def iso_date_format(cls, _date: str, with_none=False):
283290
return datetime.fromisoformat(_date).isoformat()
284291
return None
285292

286-
@classmethod
287-
def get_form_type_notice_type_and_legal_basis(cls, extracted_notice_subtype: str) -> Tuple:
293+
def get_form_type_notice_type_and_legal_basis(self, extracted_notice_subtype: str) -> Tuple:
288294
"""
289295
Get the values for form type, notice type and legal basis from the eForm mapping files
290296
"""
291-
ef_map: pd.DataFrame = mapping_registry.ef_notice_df
297+
ef_map: pd.DataFrame = self.mapping_registry.ef_notice_df
292298
try:
293299
filtered_df = ef_map.query(f"{E_FORMS_SUBTYPE_KEY}=='{extracted_notice_subtype}'").to_dict(orient='records')[0]
294300
except:
295301
raise Exception(
296302
f'No mapping available for {extracted_notice_subtype} notice subtype. Please check that the field exists in the XML content if the notice subtype is not specified in this message')
303+
297304
try:
298305
form_type = filtered_df[FORM_TYPE_KEY]
299306
notice_type = filtered_df[E_FORM_NOTICE_TYPE_COLUMN]
@@ -312,11 +319,11 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
312319
:return:
313320
"""
314321
extracted_metadata = extracted_metadata
315-
form_type_map = mapping_registry.form_type
316-
languages_map = mapping_registry.languages
317-
legal_basis_map = mapping_registry.legal_basis
318-
notice_type_map = mapping_registry.notice_type
319-
nuts_map = mapping_registry.nuts
322+
form_type_map = self.mapping_registry.form_type
323+
languages_map = self.mapping_registry.languages
324+
legal_basis_map = self.mapping_registry.legal_basis
325+
notice_type_map = self.mapping_registry.notice_type
326+
nuts_map = self.mapping_registry.nuts
320327
form_type, notice_type, legal_basis = self.get_form_type_notice_type_and_legal_basis(
321328
extracted_notice_subtype=extracted_metadata.extracted_notice_subtype)
322329
metadata = {
@@ -354,3 +361,4 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
354361
}
355362

356363
return NormalisedMetadata(**metadata)
364+

src/ted_sws/notice_metadata_processor/services/metadata_normalizer.py

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
from typing import Optional
22
import xml.etree.ElementTree as ET
33

4+
from pymongo import MongoClient
5+
46
from src.ted_sws.core.model.manifestation import XMLManifestation
57
from src.ted_sws.core.model.metadata import NormalisedMetadata, NormalisedMetadataView
68
from src.ted_sws.core.model.notice import Notice
@@ -32,14 +34,17 @@ def find_metadata_extractor_based_on_xml_manifestation(
3234

3335

3436
def find_metadata_normaliser_based_on_xml_manifestation(
35-
xml_manifestation: XMLManifestation) -> NoticeMetadataNormaliserABC:
37+
notice: Notice,
38+
xml_manifestation: XMLManifestation,
39+
mongodb_client: MongoClient = None
40+
) -> NoticeMetadataNormaliserABC:
3641
"""
3742
Find the correct extractor based on the XML Manifestation
3843
"""
3944
if check_if_xml_manifestation_is_eform(xml_manifestation):
40-
return EformsNoticeMetadataNormaliser()
45+
return EformsNoticeMetadataNormaliser(notice=notice, mongodb_client=mongodb_client)
4146
else:
42-
return DefaultNoticeMetadataNormaliser()
47+
return DefaultNoticeMetadataNormaliser(notice=notice, mongodb_client=mongodb_client)
4348

4449

4550
def extract_notice_metadata(metadata_extractor: NoticeMetadataExtractorABC) -> ExtractedMetadata:
@@ -57,32 +62,36 @@ def normalise_notice_metadata(extracted_metadata: ExtractedMetadata,
5762
return metadata_normaliser.normalise_metadata(extracted_metadata)
5863

5964

60-
def extract_and_normalise_notice_metadata(xml_manifestation: XMLManifestation) -> NormalisedMetadata:
65+
def extract_and_normalise_notice_metadata(notice: Notice, xml_manifestation: XMLManifestation, mongodb_client: MongoClient = None) -> NormalisedMetadata:
6166
"""
6267
Extract and normalise metadata using the correct extractor and normaliser type
6368
"""
6469
metadata_extractor = find_metadata_extractor_based_on_xml_manifestation(xml_manifestation)
6570
extracted_metadata = extract_notice_metadata(metadata_extractor)
66-
metadata_normaliser = find_metadata_normaliser_based_on_xml_manifestation(xml_manifestation)
71+
metadata_normaliser = find_metadata_normaliser_based_on_xml_manifestation(
72+
notice=notice,
73+
xml_manifestation=xml_manifestation,
74+
mongodb_client=mongodb_client
75+
)
6776
normalised_metadata = normalise_notice_metadata(extracted_metadata, metadata_normaliser)
6877
return normalised_metadata
6978

7079

71-
def extract_and_normalise_notice_metadata_from_notice(notice: Notice) -> NormalisedMetadata:
80+
def extract_and_normalise_notice_metadata_from_notice(notice: Notice, mongodb_client: MongoClient = None) -> NormalisedMetadata:
7281
"""
7382
Extract and normalise metadata using the correct extractor and normaliser type
7483
"""
7584
xml_manifestation = notice.xml_manifestation
76-
return extract_and_normalise_notice_metadata(xml_manifestation)
85+
return extract_and_normalise_notice_metadata(notice=notice, xml_manifestation=xml_manifestation, mongodb_client=mongodb_client)
7786

7887

79-
def normalise_notice(notice: Notice) -> Notice:
88+
def normalise_notice(notice: Notice, mongodb_client: MongoClient = None) -> Notice:
8089
"""
8190
Given a notice object, normalise metadata and return the updated object
8291
:param notice:
8392
:return:
8493
"""
85-
normalised_metadata = extract_and_normalise_notice_metadata_from_notice(notice=notice)
94+
normalised_metadata = extract_and_normalise_notice_metadata_from_notice(notice=notice, mongodb_client=mongodb_client)
8695
notice.set_normalised_metadata(normalised_metadata)
8796
return notice
8897

src/ted_sws/resources/__init__.py

Lines changed: 0 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,5 @@
1-
import json
21
import pathlib
32

4-
import pandas as pd
5-
6-
try:
7-
import importlib.resources as pkg_resources
8-
except ImportError:
9-
# Try backported to PY<37 `importlib_resources`.
10-
import importlib_resources as pkg_resources
11-
12-
import src.ted_sws.resources.mapping_files
13-
14-
15-
def get_mapping_json_file(mapping_file_name: str) -> dict:
16-
"""
17-
get a predefined index mapping by reference to file name
18-
"""
19-
with pkg_resources.path(mapping_files, mapping_file_name) as path:
20-
return json.loads(path.read_bytes())
21-
22-
23-
def get_mapping_csv_file(mapping_file_name: str) -> pd.DataFrame:
24-
"""
25-
get content of a csv file in pandas dataframe format
26-
"""
27-
with pkg_resources.path(mapping_files, mapping_file_name) as path:
28-
return pd.read_csv(path).fillna("")
29-
30-
313
RESOURCES_PATH = pathlib.Path(__file__).parent.resolve()
324

335
PREFIXES_PATH = RESOURCES_PATH / 'prefixes'

0 commit comments

Comments
 (0)