Skip to content

Commit 94b1dcc

Browse files
add deduplication report generation
1 parent 9d2cca8 commit 94b1dcc

4 files changed

Lines changed: 49 additions & 0 deletions

File tree

dags/pipelines/notice_processor_pipelines.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from dags.pipelines.pipeline_protocols import NoticePipelineOutput
33
from ted_sws.core.model.notice import Notice, NoticeStatus
44
from ted_sws.event_manager.services.log import log_notice_error
5+
from ted_sws.notice_validator.services.entity_deduplication_validation import \
6+
generate_rdf_manifestation_entity_deduplication_report
57

68

79
def notice_normalisation_pipeline(notice: Notice, mongodb_client: MongoClient) -> NoticePipelineOutput:
@@ -75,6 +77,9 @@ def notice_validation_pipeline(notice: Notice, mongodb_client: MongoClient) -> N
7577
log_notice_info(message="Validation :: Summary :: START", notice_id=notice.ted_id)
7678
validation_summary_report_notice(notice=notice)
7779
log_notice_info(message="Validation :: Summary :: END", notice_id=notice.ted_id)
80+
log_notice_info(message="Validation :: Entity deduplication :: START", notice_id=notice.ted_id)
81+
generate_rdf_manifestation_entity_deduplication_report(rdf_manifestation=notice.distilled_rdf_manifestation)
82+
log_notice_info(message="Validation :: Entity deduplication :: END", notice_id=notice.ted_id)
7883
return NoticePipelineOutput(notice=notice)
7984

8085

ted_sws/core/model/manifestation.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,13 +181,21 @@ class SHACLTestSuiteValidationReport(RDFValidationManifestation):
181181
validation_results: Union[QueriedSHACLShapeValidationResult, str]
182182

183183

184+
class EntityDeduplicationReport(Manifestation):
# Summary of an entity deduplication check; stored on
# RDFManifestation.deduplication_report and produced by
# generate_rdf_manifestation_entity_deduplication_report.
185+
# Optional textual payload of the report. The generator function does not
# pass a value for this field when it builds the report.
object_data: Optional[str]
186+
# Count of distinct owl:sameAs subjects whose object is a different resource.
number_of_duplicates: int
187+
# Count of distinct owl:sameAs subjects whose object is the subject itself.
# NOTE(review): "cets" presumably refers to canonical entities — confirm.
number_of_cets: int
188+
# String URIs of every subject counted in either of the two numbers above.
uries: List[str]
189+
190+
184191
class RDFManifestation(Manifestation):
185192
"""
186193
Transformed manifestation in RDF format
187194
"""
188195
mapping_suite_id = "unknown_mapping_suite_id"
189196
shacl_validations: List[SHACLTestSuiteValidationReport] = []
190197
sparql_validations: List[SPARQLTestSuiteValidationReport] = []
198+
deduplication_report: Optional[EntityDeduplicationReport]
191199

192200
def validation_exists(self, validation, validations):
193201
"""
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import rdflib
2+
from rdflib import OWL
3+
4+
from ted_sws.core.model.manifestation import RDFManifestation, EntityDeduplicationReport
5+
from ted_sws.data_manager.adapters.sparql_endpoint import DEFAULT_RDF_FILE_FORMAT
6+
7+
8+
def generate_rdf_manifestation_entity_deduplication_report(rdf_manifestation: RDFManifestation) -> None:
    """
    Generate the entity deduplication report for an RDF manifestation.

    The RDF content of the manifestation is parsed and every ``owl:sameAs``
    triple is inspected. A subject that is ``owl:sameAs`` itself is collected
    as a new entity (reported in ``number_of_cets``); a subject that is
    ``owl:sameAs`` a different resource is collected as a duplicate (reported
    in ``number_of_duplicates``). The resulting report is stored on
    ``rdf_manifestation.deduplication_report``.

    :param rdf_manifestation: RDF manifestation whose ``object_data`` holds the serialised RDF content
    :return: None; the report is attached to the manifestation in place
    :raises ValueError: if the manifestation or its RDF content is missing
    """
    # Fail with an explicit message instead of an opaque AttributeError on `.encode`.
    if rdf_manifestation is None or rdf_manifestation.object_data is None:
        raise ValueError("Cannot generate entity deduplication report: RDF manifestation content is missing.")

    rdf_content = rdflib.Graph()
    # Pass the payload through `data=` so it is always treated as RDF content,
    # never as a location/path, regardless of how `source` is interpreted.
    rdf_content.parse(data=rdf_manifestation.object_data.encode(encoding="utf-8"),
                      format=DEFAULT_RDF_FILE_FORMAT)

    duplicate_entities = set()
    new_entities = set()
    for triple_sub, _, triple_obj in rdf_content.triples(triple=(None, OWL.sameAs, None)):
        if triple_sub == triple_obj:
            new_entities.add(str(triple_sub))
        else:
            duplicate_entities.add(str(triple_sub))

    rdf_manifestation.deduplication_report = EntityDeduplicationReport(
        number_of_duplicates=len(duplicate_entities),
        number_of_cets=len(new_entities),
        # Sorted so the stored report is deterministic across runs (set order is not).
        uries=sorted(duplicate_entities | new_entities),
    )

tests/e2e/master_data_registry/test_entity_deduplication.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
from ted_sws.master_data_registry.services.entity_deduplication import deduplicate_entities_by_cet_uri, \
77
deduplicate_procedure_entities
8+
from ted_sws.notice_validator.services.entity_deduplication_validation import \
9+
generate_rdf_manifestation_entity_deduplication_report
810

911
TEST_MDR_REPOSITORY = "tmp_mdr_test_repository"
1012
TEST_QUERY_UNIQUE_NAMES = """SELECT distinct ?name
@@ -53,6 +55,12 @@ def test_deduplicate_entities_by_cet_uri(notice_with_rdf_manifestation, organisa
5355
for triple in canonical_cets_same_as_triples:
5456
assert str(triple[2]) in canonical_cets_set
5557

58+
assert notice_with_rdf_manifestation.distilled_rdf_manifestation.deduplication_report is None
59+
generate_rdf_manifestation_entity_deduplication_report(notice_with_rdf_manifestation.distilled_rdf_manifestation)
60+
assert notice_with_rdf_manifestation.distilled_rdf_manifestation.deduplication_report is not None
61+
deduplication_report = notice_with_rdf_manifestation.distilled_rdf_manifestation.deduplication_report
62+
assert deduplication_report.number_of_cets == len(canonical_cets_set)
63+
5664
fuseki_triple_store.delete_repository(repository_name=TEST_MDR_REPOSITORY)
5765

5866

0 commit comments

Comments
 (0)