1+ import pathlib
2+ from xml .etree import ElementTree
3+ from xml .etree .ElementTree import ParseError
4+
15import pytest
26
37from ted_sws .core .model .manifestation import XMLManifestation
4- from ted_sws .core .model .metadata import NormalisedMetadata
8+ from ted_sws .core .model .metadata import NormalisedMetadata , LanguageTaggedString
59from ted_sws .core .model .notice import NoticeStatus , Notice
610from ted_sws .notice_metadata_processor .adapters .notice_metadata_extractor import \
711 DefaultNoticeMetadataExtractor , EformsNoticeMetadataExtractor
812from ted_sws .notice_metadata_processor .adapters .notice_metadata_normaliser import \
913 DefaultNoticeMetadataNormaliser , get_map_value , FORM_NUMBER_KEY , LEGAL_BASIS_KEY , SF_NOTICE_TYPE_KEY , \
10- DOCUMENT_CODE_KEY , EformsNoticeMetadataNormaliser
14+ DOCUMENT_CODE_KEY , EformsNoticeMetadataNormaliser , get_html_compatible_string
1115from ted_sws .notice_metadata_processor .model .metadata import ExtractedMetadata
1216from ted_sws .notice_metadata_processor .services .metadata_constraints import filter_df_by_variables
1317from ted_sws .notice_metadata_processor .services .metadata_normalizer import normalise_notice , normalise_notice_by_id , \
1620 extract_and_normalise_notice_metadata
1721from ted_sws .resources .mapping_files_registry import MappingFilesRegistry
1822
def html_str(content: str) -> str:
    """Wrap *content* in a minimal XML document with a single ``<body>`` root.

    The result is meant to be fed to an XML parser so that the
    well-formedness of *content* can be checked.

    :param content: text or markup fragment to embed inside ``<body>``
    :return: the XML declaration followed by the ``<body>`` element
    """
    xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    return f"{xml_declaration} <body>{content} </body>"
1925
2026def test_metadata_normaliser_by_notice (indexed_notice ):
2127 notice = normalise_notice (indexed_notice )
@@ -237,13 +243,36 @@ def test_normalising_notice_out_of_index(notice_normalisation_test_data_path):
237243 xml_manifestation = XMLManifestation (object_data = broke_notice_content ))
238244
239245
def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_html_unsafe_notice: Notice,
                                                     sample_indexed_sf_html_unsafe_notice: Notice):
    """Check that normalised publication numbers have no surrounding whitespace.

    Both the eForms (EF) and standard-form (SF) fixtures are normalised, and
    the resulting publication number must equal its own stripped form.
    """
    normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice)
    ef_publication_number = normalised_ef_notice.normalised_metadata.notice_publication_number
    assert ef_publication_number.strip() == ef_publication_number

    normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice)
    sf_publication_number = normalised_sf_notice.normalised_metadata.notice_publication_number
    assert sf_publication_number.strip() == sf_publication_number
256+
257+
def test_get_html_compatible_string(html_incompatible_str: str):
    """Check that ``get_html_compatible_string`` makes an unsafe string parseable.

    The raw fixture must fail to parse and the converted text must parse.
    Both checks wrap the text with ``html_str`` so they exercise the same
    document shape; parsing the bare string would raise ``ParseError`` for
    any plain text simply because it has no root element.
    """
    # Precondition: the fixture really is malformed once embedded in markup.
    with pytest.raises(ParseError):
        ElementTree.fromstring(html_str(html_incompatible_str))

    compatible_str: LanguageTaggedString = get_html_compatible_string(
        LanguageTaggedString(text=html_incompatible_str))

    # Parse to check that the converted text is well-formed; fromstring raises
    # ParseError (failing the test) otherwise.
    ElementTree.fromstring(html_str(compatible_str.text))
267+
268+
def test_normalising_notice_with_html_incompatible_title(sample_indexed_ef_html_unsafe_notice: Notice,
                                                         sample_indexed_sf_html_unsafe_notice: Notice):
    """Check that every normalised title parses as well-formed markup.

    Each fixture (EF first, then SF) is normalised, and every entry of the
    resulting ``normalised_metadata.title`` is wrapped with ``html_str`` and
    parsed.
    """
    indexed_notices = (sample_indexed_ef_html_unsafe_notice, sample_indexed_sf_html_unsafe_notice)
    for indexed_notice in indexed_notices:
        normalised_notice: Notice = normalise_notice(indexed_notice)
        for title in normalised_notice.normalised_metadata.title:
            # fromstring raises ParseError on malformed input, which fails the test.
            ElementTree.fromstring(html_str(title.text))
0 commit comments