|
1 | 1 | import pathlib |
2 | 2 | import tempfile |
3 | 3 | from io import StringIO |
4 | | -from typing import List, Set, Tuple, Dict |
| 4 | +from typing import List, Tuple, Dict |
5 | 5 | import rdflib |
6 | 6 | from pymongo import MongoClient |
7 | 7 | from rdflib import RDF, URIRef, OWL |
|
12 | 12 | from ted_sws.core.model.notice import Notice |
13 | 13 | from ted_sws.data_manager.adapters.notice_repository import NoticeRepository |
14 | 14 | from ted_sws.data_manager.adapters.sparql_endpoint import SPARQLStringEndpoint |
15 | | -from ted_sws.data_manager.adapters.triple_store import FusekiAdapter, TripleStoreABC, FusekiException, \ |
| 15 | +from ted_sws.data_manager.adapters.triple_store import FusekiAdapter, TripleStoreABC, \ |
16 | 16 | FUSEKI_REPOSITORY_ALREADY_EXIST_ERROR_MSG |
17 | | -from ted_sws.event_manager.services.log import log_error |
| 17 | +from ted_sws.event_manager.services.log import log_error, log_notice_error |
18 | 18 | from ted_sws.master_data_registry.services.rdf_fragment_processor import get_rdf_fragments_by_cet_uri_from_notices, \ |
19 | 19 | merge_rdf_fragments_into_graph, write_rdf_fragments_in_triple_store, RDF_FRAGMENT_FROM_NOTICE_PROPERTY, \ |
20 | 20 | get_subjects_by_cet_uri |
21 | 21 |
|
22 | 22 | MDR_TEMPORARY_FUSEKI_DATASET_NAME = "tmp_mdr_dataset" |
23 | 23 | MDR_FUSEKI_DATASET_NAME = "mdr_dataset" |
24 | 24 | MDR_CANONICAL_CET_PROPERTY = rdflib.term.URIRef("http://www.meaningfy.ws/mdr#isCanonicalEntity") |
| 25 | +DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION = "deduplicate_procedure_entities" |
25 | 26 |
|
26 | 27 |
|
27 | 28 | def generate_mdr_alignment_links(merged_rdf_fragments: rdflib.Graph, cet_uri: str, |
@@ -248,20 +249,38 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str |
248 | 249 | rdf_content = parent_notice.rdf_manifestation.object_data |
249 | 250 | sparql_endpoint = SPARQLStringEndpoint(rdf_content=rdf_content) |
250 | 251 | result_uris = get_subjects_by_cet_uri(sparql_endpoint=sparql_endpoint, cet_uri=procedure_cet_uri) |
251 | | - assert len(result_uris) == 1 |
252 | | - parent_procedure_uri = rdflib.URIRef(result_uris[0]) |
253 | | - parent_uries[parent_notice_id] = parent_procedure_uri |
| 252 | + result_uris_len = len(result_uris) |
| 253 | + if result_uris_len != 1: |
| 254 | + notice_normalised_metadata = parent_notice.normalised_metadata |
| 255 | + log_notice_error( |
| 256 | + message=f"Parent notice with notice_id=[{parent_notice.ted_id}] have {result_uris_len} Procedure CETs!", |
| 257 | + notice_id=parent_notice.ted_id, domain_action=DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION, |
| 258 | + notice_form_number=notice_normalised_metadata.form_number if notice_normalised_metadata else None, |
| 259 | + notice_status=parent_notice.status, |
| 260 | + notice_eforms_subtype=notice_normalised_metadata.eforms_subtype if notice_normalised_metadata else None) |
| 261 | + else: |
| 262 | + parent_procedure_uri = rdflib.URIRef(result_uris[0]) |
| 263 | + parent_uries[parent_notice_id] = parent_procedure_uri |
254 | 264 |
|
255 | 265 | for parent_uri_key in parent_uries.keys(): |
256 | 266 | parent_uri = parent_uries[parent_uri_key] |
257 | 267 | for child_notice in notice_families[parent_uri_key]: |
258 | 268 | rdf_content = child_notice.rdf_manifestation.object_data |
259 | 269 | sparql_endpoint = SPARQLStringEndpoint(rdf_content=rdf_content) |
260 | 270 | result_uris = get_subjects_by_cet_uri(sparql_endpoint=sparql_endpoint, cet_uri=procedure_cet_uri) |
261 | | - assert len(result_uris) == 1 |
262 | | - child_procedure_uri = rdflib.URIRef(result_uris[0]) |
263 | | - inject_links = rdflib.Graph() |
264 | | - inject_links.add((child_procedure_uri, OWL.sameAs, parent_uri)) |
265 | | - child_notice.distilled_rdf_manifestation.object_data = '\n'.join( |
266 | | - [child_notice.distilled_rdf_manifestation.object_data, |
267 | | - str(inject_links.serialize(format="nt"))]) |
| 271 | + result_uris_len = len(result_uris) |
| 272 | + if result_uris_len != 1: |
| 273 | + notice_normalised_metadata = child_notice.normalised_metadata |
| 274 | + log_notice_error( |
| 275 | + message=f"Child notice with notice_id=[{child_notice.ted_id}] have {result_uris_len} Procedure CETs!", |
| 276 | + notice_id=child_notice.ted_id, domain_action=DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION, |
| 277 | + notice_form_number=notice_normalised_metadata.form_number if notice_normalised_metadata else None, |
| 278 | + notice_status=child_notice.status, |
| 279 | + notice_eforms_subtype=notice_normalised_metadata.eforms_subtype if notice_normalised_metadata else None) |
| 280 | + else: |
| 281 | + child_procedure_uri = rdflib.URIRef(result_uris[0]) |
| 282 | + inject_links = rdflib.Graph() |
| 283 | + inject_links.add((child_procedure_uri, OWL.sameAs, parent_uri)) |
| 284 | + child_notice.distilled_rdf_manifestation.object_data = '\n'.join( |
| 285 | + [child_notice.distilled_rdf_manifestation.object_data, |
| 286 | + str(inject_links.serialize(format="nt"))]) |
0 commit comments