Skip to content

Commit faf19c3

Browse files
Merge branch 'main' into feature/TED-1046
2 parents 1068a4a + 2897792 commit faf19c3

10 files changed

Lines changed: 34504 additions & 28 deletions

File tree

docs/antora/modules/ROOT/attachments/FATs/2022-11-15-TED-SWS-FAT-complete.html

Lines changed: 17288 additions & 0 deletions
Large diffs are not rendered by default.

docs/antora/modules/ROOT/attachments/FATs/2022-11-22-TED-SWS-FAT-complete.html

Lines changed: 17158 additions & 0 deletions
Large diffs are not rendered by default.
Binary file not shown.

docs/antora/modules/ROOT/pages/index.adoc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,15 @@ xref:demo_installation.adoc[Installation instructions for development and testin
1010

1111
xref:attachment$/aws-infra-docs/TED-SWS-AWS-Infrastructure-architecture-overview-v0.9.pdf[TED-SWS AWS Infrastructure architecture overview v0.9]
1212

13-
xref:attachment$/aws-infra-docs/TED-SWS-AWS-Installation-manual-v0.9.pdf[TED-SWS AWS Installation manual v0.9]
14-
15-
Developers who need to know how the logger works can find here a xref:event_manager.adoc[little explanation].
13+
xref:attachment$/aws-infra-docs/TED-SWS Installation manual v2.0.2.pdf[TED-SWS AWS Installation manual v2.0.2]
1614

1715
== Project roadmap
1816

1917
|===
2018
|Reference | Description | Estimated delivery | Factory Acceptance Test | Release date | Release
2119

2220
| Phase 1 | The first phase places high priority on the deployment into the OP AWS Cloud environment.| August 2022 | xref:attachment$/FATs/2022-08-29-report/index.html[2022-08-29 report] | 29 August 2022 | link:https://github.com/OP-TED/ted-rdf-conversion-pipeline/releases/tag/0.0.9-beta[0.0.9-beta]
23-
| Phase 2 | Provided that the deployment in the acceptance environment is successful, the delivery of Phase 2 aims to provide the first production version of the TED SWS system. | Nov 2022 | --- | --- | ---
21+
| Phase 2 | Provided that the deployment in the acceptance environment is successful, the delivery of Phase 2 aims to provide the first production version of the TED SWS system. | Nov 2022 | xref:attachment$/FATs/2022-11-22-TED-SWS-FAT-complete.html[2022-11-22 report] | 20 Nov 2022 | https://github.com/OP-TED/ted-rdf-conversion-pipeline/releases/tag/1.0.0-beta[1.0.0-beta]
2422
| Phase 3 | This phase delivers the documentation and components and improvements that could not be covered in the previous phases. | Feb 2023 | --- | --- | ---
2523

2624
|===

ted_sws/core/model/transform.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,6 @@ class MappingSuite(MappingSuiteComponent):
150150
title: str = "no_title"
151151
version: str = "0.1.1"
152152
ontology_version: str = "0.0.1"
153-
xsd_version: str = "no_xsd_version"
154153
git_latest_commit_hash: str = "no_hash"
155154
mapping_suite_hash_digest: str = "no_hash"
156155
metadata_constraints: MetadataConstraints

ted_sws/data_manager/services/create_batch_collection_materialised_view.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,4 @@ def create_batch_collection_materialised_view(mongo_client: MongoClient):
3030
{
3131
"$out": NOTICE_PROCESS_BATCH_COLLECTION_NAME
3232
}
33-
])
33+
], allowDiskUse=True)

ted_sws/data_manager/services/create_notice_collection_materialised_view.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def create_notice_collection_materialised_view(mongo_client: MongoClient):
4040
{
4141
"$out": NOTICES_MATERIALISED_VIEW_NAME
4242
}
43-
])
43+
], allowDiskUse=True)
4444
materialised_view = database[NOTICES_MATERIALISED_VIEW_NAME]
4545
materialised_view.create_index([("created_at", DESCENDING)])
4646
materialised_view.create_index([("publication_date", DESCENDING)])
@@ -80,4 +80,4 @@ def create_notice_kpi_collection(mongo_client: MongoClient):
8080
{
8181
"$out": NOTICE_KPI_COLLECTION_NAME
8282
}
83-
])
83+
], allowDiskUse=True)

ted_sws/master_data_registry/services/entity_deduplication.py

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pathlib
22
import tempfile
33
from io import StringIO
4-
from typing import List, Set, Tuple, Dict
4+
from typing import List, Tuple, Dict
55
import rdflib
66
from pymongo import MongoClient
77
from rdflib import RDF, URIRef, OWL
@@ -12,16 +12,17 @@
1212
from ted_sws.core.model.notice import Notice
1313
from ted_sws.data_manager.adapters.notice_repository import NoticeRepository
1414
from ted_sws.data_manager.adapters.sparql_endpoint import SPARQLStringEndpoint
15-
from ted_sws.data_manager.adapters.triple_store import FusekiAdapter, TripleStoreABC, FusekiException, \
15+
from ted_sws.data_manager.adapters.triple_store import FusekiAdapter, TripleStoreABC, \
1616
FUSEKI_REPOSITORY_ALREADY_EXIST_ERROR_MSG
17-
from ted_sws.event_manager.services.log import log_error
17+
from ted_sws.event_manager.services.log import log_error, log_notice_error
1818
from ted_sws.master_data_registry.services.rdf_fragment_processor import get_rdf_fragments_by_cet_uri_from_notices, \
1919
merge_rdf_fragments_into_graph, write_rdf_fragments_in_triple_store, RDF_FRAGMENT_FROM_NOTICE_PROPERTY, \
20-
get_subjects_by_cet_uri
20+
get_subjects_by_cet_uri, get_rdf_fragment_by_cet_uri_from_notice
2121

2222
MDR_TEMPORARY_FUSEKI_DATASET_NAME = "tmp_mdr_dataset"
2323
MDR_FUSEKI_DATASET_NAME = "mdr_dataset"
2424
MDR_CANONICAL_CET_PROPERTY = rdflib.term.URIRef("http://www.meaningfy.ws/mdr#isCanonicalEntity")
25+
DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION = "deduplicate_procedure_entities"
2526

2627

2728
def generate_mdr_alignment_links(merged_rdf_fragments: rdflib.Graph, cet_uri: str,
@@ -225,12 +226,14 @@ def deduplicate_entities_by_cet_uri(notices: List[Notice], cet_uri: str,
225226
alignment_graph=cet_alignment_links, inject_reflexive_links=True)
226227

227228

228-
def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str, mongodb_client: MongoClient):
229+
def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str, mongodb_client: MongoClient,
230+
mdr_dataset_name: str = MDR_FUSEKI_DATASET_NAME):
229231
"""
230232
This function deduplicates procedure entities for each notice from a batch of notices.
231233
:param notices:
232234
:param procedure_cet_uri:
233235
:param mongodb_client:
236+
:param mdr_dataset_name:
234237
:return:
235238
"""
236239
notice_families = defaultdict(list)
@@ -242,26 +245,57 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
242245

243246
parent_uries = {}
244247
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
248+
triple_store = FusekiAdapter()
249+
if mdr_dataset_name not in triple_store.list_repositories():
250+
try:
251+
triple_store.create_repository(repository_name=mdr_dataset_name)
252+
except Exception as exception:
253+
if str(exception) != FUSEKI_REPOSITORY_ALREADY_EXIST_ERROR_MSG:
254+
log_error(message=str(exception))
255+
245256
for parent_notice_id in notice_families.keys():
246257
parent_notice = notice_repository.get(reference=parent_notice_id)
247258
if parent_notice and parent_notice.rdf_manifestation and parent_notice.rdf_manifestation.object_data:
248259
rdf_content = parent_notice.rdf_manifestation.object_data
249260
sparql_endpoint = SPARQLStringEndpoint(rdf_content=rdf_content)
250261
result_uris = get_subjects_by_cet_uri(sparql_endpoint=sparql_endpoint, cet_uri=procedure_cet_uri)
251-
assert len(result_uris) == 1
252-
parent_procedure_uri = rdflib.URIRef(result_uris[0])
253-
parent_uries[parent_notice_id] = parent_procedure_uri
262+
result_uris_len = len(result_uris)
263+
if result_uris_len != 1:
264+
notice_normalised_metadata = parent_notice.normalised_metadata
265+
log_notice_error(
266+
message=f"Parent notice with notice_id=[{parent_notice.ted_id}] have {result_uris_len} Procedure CETs!",
267+
notice_id=parent_notice.ted_id, domain_action=DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION,
268+
notice_form_number=notice_normalised_metadata.form_number if notice_normalised_metadata else None,
269+
notice_status=parent_notice.status,
270+
notice_eforms_subtype=notice_normalised_metadata.eforms_subtype if notice_normalised_metadata else None)
271+
else:
272+
parent_procedure_uri = rdflib.URIRef(result_uris[0])
273+
parent_uries[parent_notice_id] = parent_procedure_uri
274+
parent_procedure_rdf_fragments = get_rdf_fragment_by_cet_uri_from_notice(notice=parent_notice,
275+
cet_uri=procedure_cet_uri)
276+
parent_new_cet = {parent_procedure_uri: parent_procedure_rdf_fragments[0]}
277+
register_new_cets_in_mdr(new_canonical_entities=parent_new_cet, triple_store=triple_store,
278+
mdr_dataset_name=mdr_dataset_name)
254279

255280
for parent_uri_key in parent_uries.keys():
256281
parent_uri = parent_uries[parent_uri_key]
257282
for child_notice in notice_families[parent_uri_key]:
258283
rdf_content = child_notice.rdf_manifestation.object_data
259284
sparql_endpoint = SPARQLStringEndpoint(rdf_content=rdf_content)
260285
result_uris = get_subjects_by_cet_uri(sparql_endpoint=sparql_endpoint, cet_uri=procedure_cet_uri)
261-
assert len(result_uris) == 1
262-
child_procedure_uri = rdflib.URIRef(result_uris[0])
263-
inject_links = rdflib.Graph()
264-
inject_links.add((child_procedure_uri, OWL.sameAs, parent_uri))
265-
child_notice.distilled_rdf_manifestation.object_data = '\n'.join(
266-
[child_notice.distilled_rdf_manifestation.object_data,
267-
str(inject_links.serialize(format="nt"))])
286+
result_uris_len = len(result_uris)
287+
if result_uris_len != 1:
288+
notice_normalised_metadata = child_notice.normalised_metadata
289+
log_notice_error(
290+
message=f"Child notice with notice_id=[{child_notice.ted_id}] have {result_uris_len} Procedure CETs!",
291+
notice_id=child_notice.ted_id, domain_action=DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION,
292+
notice_form_number=notice_normalised_metadata.form_number if notice_normalised_metadata else None,
293+
notice_status=child_notice.status,
294+
notice_eforms_subtype=notice_normalised_metadata.eforms_subtype if notice_normalised_metadata else None)
295+
else:
296+
child_procedure_uri = rdflib.URIRef(result_uris[0])
297+
inject_links = rdflib.Graph()
298+
inject_links.add((child_procedure_uri, OWL.sameAs, parent_uri))
299+
child_notice.distilled_rdf_manifestation.object_data = '\n'.join(
300+
[child_notice.distilled_rdf_manifestation.object_data,
301+
str(inject_links.serialize(format="nt"))])

ted_sws/resources/prefixes/prefixes.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"locn": "http://www.w3.org/ns/locn#",
2525
"legal": "https://www.w3.org/ns/legal#",
2626

27-
"core-voc": "http://data.europa.eu/m8g/",
27+
"cv": "http://data.europa.eu/m8g/",
2828
"cccev": "http://data.europa.eu/m8g/",
2929
"cpov": "http://data.europa.eu/m8g/",
3030
"geosparql": "http://www.opengis.net/ont/geosparql#",
@@ -51,10 +51,10 @@
5151

5252
"prefix_selections" : {
5353
"sparql_generator" : ["xml", "xsd", "rdf", "rdfs", "owl", "skos", "dc", "dct", "foaf", "epo", "epd", "locn", "legal"],
54-
"sparql_generator_ext" : ["time", "vann", "cc", "org", "core-voc", "cccev", "cpov", "geosparql", "dul"],
54+
"sparql_generator_ext" : ["time", "vann", "cc", "org", "cv", "cccev", "cpov", "geosparql", "dul"],
5555
"yarrrml_rules" : ["epo", "epd", "locn", "xml", "xsd", "rdf", "rdfs", "dct"],
5656
"rml_rules" : ["rr", "rml", "ql", "epo", "epd", "tedm", "locn", "xml", "xsd", "rdf", "rdfs", "owl", "dct", "cc", "cccev", "cpov", "skos", "time", "vann", "legal"],
57-
"epo_ontology" : ["epo", "epd", "locn", "xml", "xsd", "rdf", "rdfs", "owl", "skos", "dc", "dct", "foaf", "core-voc", "cccev", "cpov", "geosparql", "dul", "cc", "time", "vann", "legal"],
57+
"epo_ontology" : ["epo", "epd", "locn", "xml", "xsd", "rdf", "rdfs", "owl", "skos", "dc", "dct", "foaf", "cv", "cccev", "cpov", "geosparql", "dul", "cc", "time", "vann", "legal"],
5858
"currently_unused" : ["epor", "epos", "grel", "xf", "map", "sd", "ht", "v"]
5959
}
6060
}

tests/features/notice_publisher/test_notice_publisher_s3.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ def the_notice_rdf_manifestation_publication_is_executed(publish_eligible_notice
5858
publish_result = publish_notice_rdf_into_s3(notice=publish_eligible_notice, s3_publisher=s3_publisher,
5959
bucket_name=s3_bucket_name)
6060
assert publish_result
61-
publish_eligible_notice.update_status_to(new_status=NoticeStatus.PUBLISHED)
6261
return publish_eligible_notice
6362

6463

@@ -80,5 +79,5 @@ def the_rdf_manifestation_is_available_in_a_s3_bucket(s3_publisher, s3_bucket_na
8079
@then('the notice status is PUBLISHED')
8180
def the_notice_status_is_published(published_notice: Notice, s3_publisher, s3_bucket_name):
8281
"""the notice status is PUBLISHED."""
83-
assert published_notice.status == NoticeStatus.PUBLISHED
82+
assert published_notice.status == NoticeStatus.ELIGIBLE_FOR_PUBLISHING
8483

0 commit comments

Comments
 (0)