Skip to content

Commit 2adcdbd

Browse files
committed
change tests
1 parent 639a98c commit 2adcdbd

7 files changed

Lines changed: 77 additions & 37 deletions

File tree

ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import Dict, Tuple, List
44
import re
55
import pandas as pd
6+
import html
67

78
from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString, NoticeSource
89
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
@@ -42,6 +43,10 @@
4243
mapping_registry = MappingFilesRegistry()
4344

4445

46+
def get_html_compatible_string(input_string: LanguageTaggedString) -> LanguageTaggedString:
    """Return a new language-tagged string whose text is HTML-escaped.

    The text is passed through ``html.escape`` with its default settings, so
    ``&``, ``<``, ``>`` and both quote characters are replaced by entities.
    The language tag of ``input_string`` is carried over unchanged.
    """
    escaped_text = html.escape(input_string.text)
    return LanguageTaggedString(text=escaped_text, language=input_string.language)
49+
4550
def get_map_list_value_by_code(mapping: Dict, listing: List):
4651
result = []
4752
for element in listing:
@@ -223,15 +228,15 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
223228
extracted_metadata = extracted_metadata
224229

225230
metadata = {
226-
TITLE_KEY: [title.title for title in extracted_metadata.title],
231+
TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
227232
LONG_TITLE_KEY: [
228-
LanguageTaggedString(text=JOIN_SEP.join(
233+
get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
229234
[
230235
title.title_country.text,
231236
title.title_city.text,
232237
title.title.text
233238
]),
234-
language=title.title.language) for title in extracted_metadata.title
239+
language=title.title.language)) for title in extracted_metadata.title
235240
],
236241
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
237242
PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
@@ -315,14 +320,14 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
315320
form_type, notice_type, legal_basis = self.get_form_type_notice_type_and_legal_basis(
316321
extracted_notice_subtype=extracted_metadata.extracted_notice_subtype)
317322
metadata = {
318-
TITLE_KEY: [title.title for title in extracted_metadata.title],
323+
TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
319324
LONG_TITLE_KEY: [
320-
LanguageTaggedString(text=JOIN_SEP.join(
325+
get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
321326
[
322327
title.title_country.text,
323328
title.title.text
324329
]),
325-
language=title.title.language) for title in extracted_metadata.title
330+
language=title.title.language)) for title in extracted_metadata.title
326331
],
327332
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
328333
PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),

ted_sws/notice_packager/resources/templates/mets_xml_dmd_rdf.jinja2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
<cdm:procurement_public_number_document_in_official-journal rdf:datatype="http://www.w3.org/2001/XMLSchema#string">{{ notice.public_number_document }}</cdm:procurement_public_number_document_in_official-journal>
2222
<cdm:procurement_public_number_edition rdf:datatype="http://www.w3.org/2001/XMLSchema#positiveInteger">{{ notice.public_number_edition }}</cdm:procurement_public_number_edition>
2323
{% for lang in work.title %}
24-
<cdm:work_title xml:lang="{{ lang }}">{{ work.title[lang] | e }}</cdm:work_title>
24+
<cdm:work_title xml:lang="{{ lang }}">{{ work.title[lang] }}</cdm:work_title>
2525
{% endfor %}
2626
<cdm:datetime_transmission rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">{{ work.datetime_transmission }}</cdm:datetime_transmission>
2727
{# <cdm:procurement_public_issued_by_country>{{ work.procurement_public_issued_by_country }}</cdm:procurement_public_issued_by_country>
@@ -44,7 +44,7 @@
4444
<rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#expression_procurement_public"/>
4545
<cdm:expression_belongs_to_work rdf:resource="&resource;ted/{{ work.identifier }}"/>
4646
{% for lang in expression.title %}
47-
<cdm:expression_title xml:lang="{{ lang }}">{{ expression.title[lang] | e }}</cdm:expression_title>
47+
<cdm:expression_title xml:lang="{{ lang }}">{{ expression.title[lang] }}</cdm:expression_title>
4848
{% endfor %}
4949
<cdm:expression_uses_language rdf:resource="&cellar-authority;language/{{ expression.uses_language }}"/>
5050
<cdm:expression_procurement_public_authority-type_name rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Other</cdm:expression_procurement_public_authority-type_name>

tests/test_data/notice_normalisation/spaces_in_publication_number/ef_notice_with_spaces_in_publication_number.xml renamed to tests/test_data/notice_normalisation/ef_html_unsafe_notice.xml

File renamed without changes.

tests/test_data/notice_normalisation/spaces_in_publication_number/sf_notice_with_spaces_in_publication_number.xml renamed to tests/test_data/notice_normalisation/sf_html_unsafe_notice.xml

File renamed without changes.

tests/unit/notice_metadata_processor/conftest.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,30 +31,36 @@ def eforms_xml_notice_paths() -> List[pathlib.Path]:
3131

3232

3333
@pytest.fixture
34-
def sample_ef_notice_with_spaces_in_publication_number_path() -> pathlib.Path:
35-
return TEST_DATA_PATH / "notice_normalisation" / "spaces_in_publication_number" / "ef_notice_with_spaces_in_publication_number.xml"
34+
def sample_ef_html_unsafe_notice_path() -> pathlib.Path:
35+
return TEST_DATA_PATH / "notice_normalisation" / "ef_html_unsafe_notice.xml"
3636

3737

3838
@pytest.fixture
39-
def sample_indexed_ef_notice_with_spaces_in_publication_number(
40-
sample_ef_notice_with_spaces_in_publication_number_path: pathlib.Path) -> Notice:
41-
notice: Notice = Notice(ted_id=sample_ef_notice_with_spaces_in_publication_number_path.name)
39+
def sample_indexed_ef_html_unsafe_notice(
40+
sample_ef_html_unsafe_notice_path: pathlib.Path) -> Notice:
41+
notice: Notice = Notice(ted_id=sample_ef_html_unsafe_notice_path.name)
4242
notice.set_xml_manifestation(
43-
XMLManifestation(object_data=sample_ef_notice_with_spaces_in_publication_number_path.read_text()))
43+
XMLManifestation(object_data=sample_ef_html_unsafe_notice_path.read_text()))
4444

4545
return index_notice(notice)
4646

4747

4848
@pytest.fixture
49-
def sample_sf_notice_with_spaces_in_publication_number_path() -> pathlib.Path:
50-
return TEST_DATA_PATH / "notice_normalisation" / "spaces_in_publication_number" / "sf_notice_with_spaces_in_publication_number.xml"
49+
def sample_sf_html_unsafe_notice_path() -> pathlib.Path:
50+
return TEST_DATA_PATH / "notice_normalisation" / "sf_html_unsafe_notice.xml"
5151

5252

5353
@pytest.fixture
54-
def sample_indexed_sf_notice_with_spaces_in_publication_number(
55-
sample_sf_notice_with_spaces_in_publication_number_path: pathlib.Path) -> Notice:
56-
notice: Notice = Notice(ted_id=sample_sf_notice_with_spaces_in_publication_number_path.name)
54+
def sample_indexed_sf_html_unsafe_notice(
55+
sample_sf_html_unsafe_notice_path: pathlib.Path) -> Notice:
56+
notice: Notice = Notice(ted_id=sample_sf_html_unsafe_notice_path.name)
5757
notice.set_xml_manifestation(
58-
XMLManifestation(object_data=sample_sf_notice_with_spaces_in_publication_number_path.read_text()))
58+
XMLManifestation(object_data=sample_sf_html_unsafe_notice_path.read_text()))
5959

6060
return index_notice(notice)
61+
62+
63+
@pytest.fixture
def html_incompatible_str() -> str:
    """Sample text holding a bare ``&`` and a ``<br />`` tag, i.e. characters that are not HTML-safe."""
    unsafe_text = "Construction work & planning <br />"
    return unsafe_text

tests/unit/notice_metadata_processor/test_metadata_normaliser.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
1+
import pathlib
2+
from xml.etree import ElementTree
3+
from xml.etree.ElementTree import ParseError
4+
15
import pytest
26

37
from ted_sws.core.model.manifestation import XMLManifestation
4-
from ted_sws.core.model.metadata import NormalisedMetadata
8+
from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString
59
from ted_sws.core.model.notice import NoticeStatus, Notice
610
from ted_sws.notice_metadata_processor.adapters.notice_metadata_extractor import \
711
DefaultNoticeMetadataExtractor, EformsNoticeMetadataExtractor
812
from ted_sws.notice_metadata_processor.adapters.notice_metadata_normaliser import \
913
DefaultNoticeMetadataNormaliser, get_map_value, FORM_NUMBER_KEY, LEGAL_BASIS_KEY, SF_NOTICE_TYPE_KEY, \
10-
DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser
14+
DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser, get_html_compatible_string
1115
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
1216
from ted_sws.notice_metadata_processor.services.metadata_constraints import filter_df_by_variables
1317
from ted_sws.notice_metadata_processor.services.metadata_normalizer import normalise_notice, normalise_notice_by_id, \
@@ -16,6 +20,8 @@
1620
extract_and_normalise_notice_metadata
1721
from ted_sws.resources.mapping_files_registry import MappingFilesRegistry
1822

23+
def html_str(content: str) -> str:
    """Embed ``content`` as the body of a minimal XML document.

    The result is an XML declaration followed by a single ``<body>`` root
    element; ``content`` is inserted verbatim, without any escaping.
    """
    xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    return f"{xml_declaration} <body>{content}</body>"
1925

2026
def test_metadata_normaliser_by_notice(indexed_notice):
2127
notice = normalise_notice(indexed_notice)
@@ -237,13 +243,36 @@ def test_normalising_notice_out_of_index(notice_normalisation_test_data_path):
237243
xml_manifestation=XMLManifestation(object_data=broke_notice_content))
238244

239245

240-
def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_notice_with_spaces_in_publication_number: Notice,
241-
sample_indexed_sf_notice_with_spaces_in_publication_number: Notice
246+
def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_html_unsafe_notice: Notice,
247+
sample_indexed_sf_html_unsafe_notice: Notice
242248
):
243-
normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_notice_with_spaces_in_publication_number)
249+
normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice)
244250

245251
assert normalised_ef_notice.normalised_metadata.notice_publication_number.strip() == normalised_ef_notice.normalised_metadata.notice_publication_number
246252

247-
normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_notice_with_spaces_in_publication_number)
253+
normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice)
254+
255+
assert normalised_sf_notice.normalised_metadata.notice_publication_number.strip() == normalised_sf_notice.normalised_metadata.notice_publication_number
256+
257+
258+
def test_get_html_compatible_string(html_incompatible_str: str):
    """Check that ``get_html_compatible_string`` turns unsafe text into parseable markup content."""
    # Baseline: embedded as-is, the raw text must break the parser. It is wrapped
    # with html_str so that the failure is caused by the unsafe characters
    # themselves; parsing the bare string would raise ParseError for any text,
    # safe or not, simply because it has no root element.
    with pytest.raises(ParseError):
        ElementTree.fromstring(html_str(html_incompatible_str))

    compatible_str: LanguageTaggedString = get_html_compatible_string(
        LanguageTaggedString(text=html_incompatible_str))

    # Parse to check if str is well-formed (HTML-safe sequences or elements)
    body = ElementTree.fromstring(html_str(compatible_str.text))

    # The parser resolves the entities again, so the element text must be
    # exactly the original, unescaped input.
    assert body.text == html_incompatible_str
267+
268+
269+
def test_normalising_notice_with_html_incompatible_title(sample_indexed_ef_html_unsafe_notice: Notice,
270+
sample_indexed_sf_html_unsafe_notice: Notice):
271+
272+
normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice)
273+
274+
[ElementTree.fromstring(html_str(title.text)) for title in normalised_ef_notice.normalised_metadata.title ]
275+
276+
normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice)
248277

249-
assert normalised_sf_notice.normalised_metadata.notice_publication_number.strip() == normalised_sf_notice.normalised_metadata.notice_publication_number
278+
[ElementTree.fromstring(html_str(title.text)) for title in normalised_sf_notice.normalised_metadata.title]

tests/unit/notice_packager/test_template_generator.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ def test_mets2action_mets_xml_generator_with_wrong_action(template_sample_metada
6262
TemplateGenerator.mets2action_mets_xml_generator(template_sample_metadata)
6363

6464

65-
def test_mets_dmd_rdf_has_html_safe_sequences_after_generation(sample_metadata_with_wrong_title: PackagerMetadata,
66-
sample_mets_xml_dmd_rdf_with_wrong_title_str: str):
67-
# Ensure parser raises error on not well-formed xml (HTML sequences or elements)
68-
with pytest.raises(ParseError):
69-
ElementTree.fromstring(sample_mets_xml_dmd_rdf_with_wrong_title_str)
70-
71-
mets_dmd_rdf: str = TemplateGenerator.mets_xml_dmd_rdf_generator(sample_metadata_with_wrong_title)
72-
73-
# Parse to check if xml is well-formed (HTML-safe sequences or elements)
74-
ElementTree.fromstring(mets_dmd_rdf)
65+
# def test_mets_dmd_rdf_has_html_safe_sequences_after_generation(sample_metadata_with_wrong_title: PackagerMetadata,
66+
# sample_mets_xml_dmd_rdf_with_wrong_title_str: str):
67+
# # Ensure parser raises error on not well-formed xml (HTML sequences or elements)
68+
# with pytest.raises(ParseError):
69+
# ElementTree.fromstring(sample_mets_xml_dmd_rdf_with_wrong_title_str)
70+
#
71+
# mets_dmd_rdf: str = TemplateGenerator.mets_xml_dmd_rdf_generator(sample_metadata_with_wrong_title)
72+
#
73+
# # Parse to check if xml is well-formed (HTML-safe sequences or elements)
74+
# ElementTree.fromstring(mets_dmd_rdf)

0 commit comments

Comments
 (0)