Skip to content

Commit a2bb216

Browse files
Merge pull request #389 from OP-TED/feature/TED-1027
Update entity_deduplication.py
2 parents 31214c8 + 8eb39d8 commit a2bb216

1 file changed

Lines changed: 32 additions & 13 deletions

File tree

ted_sws/master_data_registry/services/entity_deduplication.py

Lines changed: 32 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import pathlib
22
import tempfile
33
from io import StringIO
4-
from typing import List, Set, Tuple, Dict
4+
from typing import List, Tuple, Dict
55
import rdflib
66
from pymongo import MongoClient
77
from rdflib import RDF, URIRef, OWL
@@ -12,16 +12,17 @@
1212
from ted_sws.core.model.notice import Notice
1313
from ted_sws.data_manager.adapters.notice_repository import NoticeRepository
1414
from ted_sws.data_manager.adapters.sparql_endpoint import SPARQLStringEndpoint
15-
from ted_sws.data_manager.adapters.triple_store import FusekiAdapter, TripleStoreABC, FusekiException, \
15+
from ted_sws.data_manager.adapters.triple_store import FusekiAdapter, TripleStoreABC, \
1616
FUSEKI_REPOSITORY_ALREADY_EXIST_ERROR_MSG
17-
from ted_sws.event_manager.services.log import log_error
17+
from ted_sws.event_manager.services.log import log_error, log_notice_error
1818
from ted_sws.master_data_registry.services.rdf_fragment_processor import get_rdf_fragments_by_cet_uri_from_notices, \
1919
merge_rdf_fragments_into_graph, write_rdf_fragments_in_triple_store, RDF_FRAGMENT_FROM_NOTICE_PROPERTY, \
2020
get_subjects_by_cet_uri
2121

2222
MDR_TEMPORARY_FUSEKI_DATASET_NAME = "tmp_mdr_dataset"
2323
MDR_FUSEKI_DATASET_NAME = "mdr_dataset"
2424
MDR_CANONICAL_CET_PROPERTY = rdflib.term.URIRef("http://www.meaningfy.ws/mdr#isCanonicalEntity")
25+
DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION = "deduplicate_procedure_entities"
2526

2627

2728
def generate_mdr_alignment_links(merged_rdf_fragments: rdflib.Graph, cet_uri: str,
@@ -248,20 +249,38 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
248249
rdf_content = parent_notice.rdf_manifestation.object_data
249250
sparql_endpoint = SPARQLStringEndpoint(rdf_content=rdf_content)
250251
result_uris = get_subjects_by_cet_uri(sparql_endpoint=sparql_endpoint, cet_uri=procedure_cet_uri)
251-
assert len(result_uris) == 1
252-
parent_procedure_uri = rdflib.URIRef(result_uris[0])
253-
parent_uries[parent_notice_id] = parent_procedure_uri
252+
result_uris_len = len(result_uris)
253+
if result_uris_len != 1:
254+
notice_normalised_metadata = parent_notice.normalised_metadata
255+
log_notice_error(
256+
message=f"Parent notice with notice_id=[{parent_notice.ted_id}] have {result_uris_len} Procedure CETs!",
257+
notice_id=parent_notice.ted_id, domain_action=DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION,
258+
notice_form_number=notice_normalised_metadata.form_number if notice_normalised_metadata else None,
259+
notice_status=parent_notice.status,
260+
notice_eforms_subtype=notice_normalised_metadata.eforms_subtype if notice_normalised_metadata else None)
261+
else:
262+
parent_procedure_uri = rdflib.URIRef(result_uris[0])
263+
parent_uries[parent_notice_id] = parent_procedure_uri
254264

255265
for parent_uri_key in parent_uries.keys():
256266
parent_uri = parent_uries[parent_uri_key]
257267
for child_notice in notice_families[parent_uri_key]:
258268
rdf_content = child_notice.rdf_manifestation.object_data
259269
sparql_endpoint = SPARQLStringEndpoint(rdf_content=rdf_content)
260270
result_uris = get_subjects_by_cet_uri(sparql_endpoint=sparql_endpoint, cet_uri=procedure_cet_uri)
261-
assert len(result_uris) == 1
262-
child_procedure_uri = rdflib.URIRef(result_uris[0])
263-
inject_links = rdflib.Graph()
264-
inject_links.add((child_procedure_uri, OWL.sameAs, parent_uri))
265-
child_notice.distilled_rdf_manifestation.object_data = '\n'.join(
266-
[child_notice.distilled_rdf_manifestation.object_data,
267-
str(inject_links.serialize(format="nt"))])
271+
result_uris_len = len(result_uris)
272+
if result_uris_len != 1:
273+
notice_normalised_metadata = child_notice.normalised_metadata
274+
log_notice_error(
275+
message=f"Child notice with notice_id=[{child_notice.ted_id}] have {result_uris_len} Procedure CETs!",
276+
notice_id=child_notice.ted_id, domain_action=DEDUPLICATE_PROCEDURE_ENTITIES_DOMAIN_ACTION,
277+
notice_form_number=notice_normalised_metadata.form_number if notice_normalised_metadata else None,
278+
notice_status=child_notice.status,
279+
notice_eforms_subtype=notice_normalised_metadata.eforms_subtype if notice_normalised_metadata else None)
280+
else:
281+
child_procedure_uri = rdflib.URIRef(result_uris[0])
282+
inject_links = rdflib.Graph()
283+
inject_links.add((child_procedure_uri, OWL.sameAs, parent_uri))
284+
child_notice.distilled_rdf_manifestation.object_data = '\n'.join(
285+
[child_notice.distilled_rdf_manifestation.object_data,
286+
str(inject_links.serialize(format="nt"))])

0 commit comments

Comments (0)