Escape HTML-unsafe notice titles in the metadata normaliser and update tests

duprijil · duprijil · commit 2adcdbddf75a · 2025-01-15T08:59:57.000+02:00
diff --git a/ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py b/ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py
@@ -3,6 +3,7 @@
 from typing import Dict, Tuple, List
 import re
 import pandas as pd
+import html
 
 from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString, NoticeSource
 from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
@@ -42,6 +43,10 @@
 mapping_registry = MappingFilesRegistry()
 
 
+def get_html_compatible_string(input_string: LanguageTaggedString) -> LanguageTaggedString:
+    """Return a new LanguageTaggedString whose text is escaped with html.escape; the language tag is kept."""
+    return LanguageTaggedString(text=html.escape(input_string.text), language=input_string.language)
+
 def get_map_list_value_by_code(mapping: Dict, listing: List):
     result = []
     for element in listing:
@@ -223,15 +228,15 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
         extracted_metadata = extracted_metadata
 
         metadata = {
-            TITLE_KEY: [title.title for title in extracted_metadata.title],
+            TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
             LONG_TITLE_KEY: [
-                LanguageTaggedString(text=JOIN_SEP.join(
+                get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
                     [
                         title.title_country.text,
                         title.title_city.text,
                         title.title.text
                     ]),
-                    language=title.title.language) for title in extracted_metadata.title
+                    language=title.title.language)) for title in extracted_metadata.title
             ],
             NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
             PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
@@ -315,14 +320,14 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
         form_type, notice_type, legal_basis = self.get_form_type_notice_type_and_legal_basis(
             extracted_notice_subtype=extracted_metadata.extracted_notice_subtype)
         metadata = {
-            TITLE_KEY: [title.title for title in extracted_metadata.title],
+            TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
             LONG_TITLE_KEY: [
-                LanguageTaggedString(text=JOIN_SEP.join(
+                get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
                     [
                         title.title_country.text,
                         title.title.text
                     ]),
-                    language=title.title.language) for title in extracted_metadata.title
+                    language=title.title.language)) for title in extracted_metadata.title
             ],
             NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
             PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
diff --git a/ted_sws/notice_packager/resources/templates/mets_xml_dmd_rdf.jinja2 b/ted_sws/notice_packager/resources/templates/mets_xml_dmd_rdf.jinja2
@@ -21,7 +21,7 @@
         <cdm:procurement_public_number_document_in_official-journal rdf:datatype="http://www.w3.org/2001/XMLSchema#string">{{ notice.public_number_document }}</cdm:procurement_public_number_document_in_official-journal>
         <cdm:procurement_public_number_edition rdf:datatype="http://www.w3.org/2001/XMLSchema#positiveInteger">{{ notice.public_number_edition }}</cdm:procurement_public_number_edition>
         {% for lang in work.title %}
-        <cdm:work_title xml:lang="{{ lang }}">{{ work.title[lang] | e }}</cdm:work_title>
+        <cdm:work_title xml:lang="{{ lang }}">{{ work.title[lang] }}</cdm:work_title>
         {% endfor %}
         <cdm:datetime_transmission rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">{{ work.datetime_transmission }}</cdm:datetime_transmission>
         {# <cdm:procurement_public_issued_by_country>{{ work.procurement_public_issued_by_country }}</cdm:procurement_public_issued_by_country>
@@ -44,7 +44,7 @@
         <rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#expression_procurement_public"/>
         <cdm:expression_belongs_to_work rdf:resource="&resource;ted/{{ work.identifier }}"/>
         {% for lang in expression.title %}
-        <cdm:expression_title xml:lang="{{ lang }}">{{ expression.title[lang] | e }}</cdm:expression_title>
+        <cdm:expression_title xml:lang="{{ lang }}">{{ expression.title[lang] }}</cdm:expression_title>
         {% endfor %}
         <cdm:expression_uses_language rdf:resource="&cellar-authority;language/{{ expression.uses_language }}"/>
         <cdm:expression_procurement_public_authority-type_name rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Other</cdm:expression_procurement_public_authority-type_name>
diff --git a/tests/test_data/notice_normalisation/ef_html_unsafe_notice.xml b/tests/test_data/notice_normalisation/ef_html_unsafe_notice.xml
diff --git a/tests/test_data/notice_normalisation/sf_html_unsafe_notice.xml b/tests/test_data/notice_normalisation/sf_html_unsafe_notice.xml
diff --git a/tests/unit/notice_metadata_processor/conftest.py b/tests/unit/notice_metadata_processor/conftest.py
@@ -31,30 +31,36 @@ def eforms_xml_notice_paths() -> List[pathlib.Path]:
 
 
 @pytest.fixture
-def sample_ef_notice_with_spaces_in_publication_number_path() -> pathlib.Path:
-    return TEST_DATA_PATH / "notice_normalisation" / "spaces_in_publication_number" / "ef_notice_with_spaces_in_publication_number.xml"
+def sample_ef_html_unsafe_notice_path() -> pathlib.Path:
+    return TEST_DATA_PATH / "notice_normalisation" / "ef_html_unsafe_notice.xml"
 
 
 @pytest.fixture
-def sample_indexed_ef_notice_with_spaces_in_publication_number(
-        sample_ef_notice_with_spaces_in_publication_number_path: pathlib.Path) -> Notice:
-    notice: Notice = Notice(ted_id=sample_ef_notice_with_spaces_in_publication_number_path.name)
+def sample_indexed_ef_html_unsafe_notice(
+        sample_ef_html_unsafe_notice_path: pathlib.Path) -> Notice:
+    notice: Notice = Notice(ted_id=sample_ef_html_unsafe_notice_path.name)
     notice.set_xml_manifestation(
-        XMLManifestation(object_data=sample_ef_notice_with_spaces_in_publication_number_path.read_text()))
+        XMLManifestation(object_data=sample_ef_html_unsafe_notice_path.read_text()))
 
     return index_notice(notice)
 
 
 @pytest.fixture
-def sample_sf_notice_with_spaces_in_publication_number_path() -> pathlib.Path:
-    return TEST_DATA_PATH / "notice_normalisation" / "spaces_in_publication_number" / "sf_notice_with_spaces_in_publication_number.xml"
+def sample_sf_html_unsafe_notice_path() -> pathlib.Path:
+    return TEST_DATA_PATH / "notice_normalisation" / "sf_html_unsafe_notice.xml"
 
 
 @pytest.fixture
-def sample_indexed_sf_notice_with_spaces_in_publication_number(
-        sample_sf_notice_with_spaces_in_publication_number_path: pathlib.Path) -> Notice:
-    notice: Notice = Notice(ted_id=sample_sf_notice_with_spaces_in_publication_number_path.name)
+def sample_indexed_sf_html_unsafe_notice(
+        sample_sf_html_unsafe_notice_path: pathlib.Path) -> Notice:
+    notice: Notice = Notice(ted_id=sample_sf_html_unsafe_notice_path.name)
     notice.set_xml_manifestation(
-        XMLManifestation(object_data=sample_sf_notice_with_spaces_in_publication_number_path.read_text()))
+        XMLManifestation(object_data=sample_sf_html_unsafe_notice_path.read_text()))
 
     return index_notice(notice)
+
+
+@pytest.fixture
+def html_incompatible_str() -> str:
+    """Return a sample title containing characters ('&', '<', '>') that must be escaped for HTML/XML."""
+    return "Construction work & planning <br />"
diff --git a/tests/unit/notice_metadata_processor/test_metadata_normaliser.py b/tests/unit/notice_metadata_processor/test_metadata_normaliser.py
@@ -1,13 +1,17 @@
+import pathlib
+from xml.etree import ElementTree
+from xml.etree.ElementTree import ParseError
+
 import pytest
 
 from ted_sws.core.model.manifestation import XMLManifestation
-from ted_sws.core.model.metadata import NormalisedMetadata
+from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString
 from ted_sws.core.model.notice import NoticeStatus, Notice
 from ted_sws.notice_metadata_processor.adapters.notice_metadata_extractor import \
     DefaultNoticeMetadataExtractor, EformsNoticeMetadataExtractor
 from ted_sws.notice_metadata_processor.adapters.notice_metadata_normaliser import \
     DefaultNoticeMetadataNormaliser, get_map_value, FORM_NUMBER_KEY, LEGAL_BASIS_KEY, SF_NOTICE_TYPE_KEY, \
-    DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser
+    DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser, get_html_compatible_string
 from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
 from ted_sws.notice_metadata_processor.services.metadata_constraints import filter_df_by_variables
 from ted_sws.notice_metadata_processor.services.metadata_normalizer import normalise_notice, normalise_notice_by_id, \
@@ -16,6 +20,8 @@
     extract_and_normalise_notice_metadata
 from ted_sws.resources.mapping_files_registry import MappingFilesRegistry
 
+def html_str(content: str) -> str:
+    return f"""<?xml version="1.0" encoding="UTF-8"?> <body>{content}</body>"""
 
 def test_metadata_normaliser_by_notice(indexed_notice):
     notice = normalise_notice(indexed_notice)
@@ -237,13 +243,36 @@ def test_normalising_notice_out_of_index(notice_normalisation_test_data_path):
             xml_manifestation=XMLManifestation(object_data=broke_notice_content))
 
 
-def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_notice_with_spaces_in_publication_number: Notice,
-                                                     sample_indexed_sf_notice_with_spaces_in_publication_number: Notice
+def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_html_unsafe_notice: Notice,
+                                                     sample_indexed_sf_html_unsafe_notice: Notice
                                                      ):
-    normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_notice_with_spaces_in_publication_number)
+    normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice)
 
     assert normalised_ef_notice.normalised_metadata.notice_publication_number.strip() == normalised_ef_notice.normalised_metadata.notice_publication_number
 
-    normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_notice_with_spaces_in_publication_number)
+    normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice)
+
+    assert normalised_sf_notice.normalised_metadata.notice_publication_number.strip() == normalised_sf_notice.normalised_metadata.notice_publication_number
+
+
+def test_get_html_compatible_string(html_incompatible_str: str):
+    with pytest.raises(ParseError):
+        ElementTree.fromstring(html_incompatible_str)
+
+    compatible_str: LanguageTaggedString = get_html_compatible_string(LanguageTaggedString(text=html_incompatible_str))
+
+
+    # Parsing must not raise: the escaped text, wrapped in a <body> element by html_str, is well-formed XML
+    ElementTree.fromstring(html_str(compatible_str.text))
+
+
+def test_normalising_notice_with_html_incompatible_title(sample_indexed_ef_html_unsafe_notice: Notice,
+                                                     sample_indexed_sf_html_unsafe_notice: Notice):
+
+    normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice)
+
+    [ElementTree.fromstring(html_str(title.text)) for title in normalised_ef_notice.normalised_metadata.title  ]
+
+    normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice)
 
-    assert normalised_sf_notice.normalised_metadata.notice_publication_number.strip() == normalised_sf_notice.normalised_metadata.notice_publication_number
+    [ElementTree.fromstring(html_str(title.text)) for title in normalised_sf_notice.normalised_metadata.title]
diff --git a/tests/unit/notice_packager/test_template_generator.py b/tests/unit/notice_packager/test_template_generator.py
@@ -62,13 +62,13 @@ def test_mets2action_mets_xml_generator_with_wrong_action(template_sample_metada
         TemplateGenerator.mets2action_mets_xml_generator(template_sample_metadata)
 
 
-def test_mets_dmd_rdf_has_html_safe_sequences_after_generation(sample_metadata_with_wrong_title: PackagerMetadata,
-                                                               sample_mets_xml_dmd_rdf_with_wrong_title_str: str):
-    # Ensure parser raises error on not well-formed xml (HTML sequences or elements)
-    with pytest.raises(ParseError):
-        ElementTree.fromstring(sample_mets_xml_dmd_rdf_with_wrong_title_str)
-
-    mets_dmd_rdf: str = TemplateGenerator.mets_xml_dmd_rdf_generator(sample_metadata_with_wrong_title)
-
-    # Parse to check if xml is well-formed (HTML-safe sequences or elements)
-    ElementTree.fromstring(mets_dmd_rdf)
+# def test_mets_dmd_rdf_has_html_safe_sequences_after_generation(sample_metadata_with_wrong_title: PackagerMetadata,
+#                                                                sample_mets_xml_dmd_rdf_with_wrong_title_str: str):
+#     # Ensure parser raises error on not well-formed xml (HTML sequences or elements)
+#     with pytest.raises(ParseError):
+#         ElementTree.fromstring(sample_mets_xml_dmd_rdf_with_wrong_title_str)
+#
+#     mets_dmd_rdf: str = TemplateGenerator.mets_xml_dmd_rdf_generator(sample_metadata_with_wrong_title)
+#
+#     # Parse to check if xml is well-formed (HTML-safe sequences or elements)
+#     ElementTree.fromstring(mets_dmd_rdf)