Skip to content

Commit adc2f3f

Browse files
authored
Merge pull request #560 from OP-TED/feature/SWS1-12
Fix issue #549
2 parents 6c03617 + fdfc7f6 commit adc2f3f

9 files changed

Lines changed: 4767 additions & 11 deletions

File tree

ted_sws/data_manager/services/create_notice_collection_materialised_view.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ def create_notice_collection_materialised_view(mongo_client: MongoClient):
4242
"publication_date_str_ym": "$normalised_metadata.publication_date_str_ym",
4343
"publication_date_str_ymd": "$normalised_metadata.publication_date_str_ymd",
4444
"deduplication_report": "$rdf_manifestation.deduplication_report",
45+
"notice_source": "$normalised_metadata.notice_source",
46+
"eform_sdk_version": "$normalised_metadata.eform_sdk_version",
4547
}
4648
},
4749
{

ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import Dict, Tuple, List
44
import re
55
import pandas as pd
6+
import html
67

78
from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString, NoticeSource
89
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
@@ -42,6 +43,10 @@
4243
mapping_registry = MappingFilesRegistry()
4344

4445

46+
def get_html_compatible_string(input_string: LanguageTaggedString) -> LanguageTaggedString:
    """Return a new language-tagged string whose text is HTML-escaped.

    The text is passed through ``html.escape`` with its default arguments and
    the language tag is copied over unchanged. The input object is only read.
    """
    escaped_text = html.escape(input_string.text)
    return LanguageTaggedString(text=escaped_text, language=input_string.language)
49+
4550
def get_map_list_value_by_code(mapping: Dict, listing: List):
4651
result = []
4752
for element in listing:
@@ -223,17 +228,17 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
223228
extracted_metadata = extracted_metadata
224229

225230
metadata = {
226-
TITLE_KEY: [title.title for title in extracted_metadata.title],
231+
TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
227232
LONG_TITLE_KEY: [
228-
LanguageTaggedString(text=JOIN_SEP.join(
233+
get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
229234
[
230235
title.title_country.text,
231236
title.title_city.text,
232237
title.title.text
233238
]),
234-
language=title.title.language) for title in extracted_metadata.title
239+
language=title.title.language)) for title in extracted_metadata.title
235240
],
236-
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number,
241+
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
237242
PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
238243
OJS_NUMBER_KEY: extracted_metadata.ojs_issue_number,
239244
OJS_TYPE_KEY: extracted_metadata.ojs_type if extracted_metadata.ojs_type else "S",
@@ -315,16 +320,16 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
315320
form_type, notice_type, legal_basis = self.get_form_type_notice_type_and_legal_basis(
316321
extracted_notice_subtype=extracted_metadata.extracted_notice_subtype)
317322
metadata = {
318-
TITLE_KEY: [title.title for title in extracted_metadata.title],
323+
TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
319324
LONG_TITLE_KEY: [
320-
LanguageTaggedString(text=JOIN_SEP.join(
325+
get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
321326
[
322327
title.title_country.text,
323328
title.title.text
324329
]),
325-
language=title.title.language) for title in extracted_metadata.title
330+
language=title.title.language)) for title in extracted_metadata.title
326331
],
327-
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number,
332+
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
328333
PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
329334
OJS_NUMBER_KEY: extracted_metadata.ojs_issue_number,
330335
OJS_TYPE_KEY: extracted_metadata.ojs_type if extracted_metadata.ojs_type else "S",

tests/test_data/notice_normalisation/ef_html_unsafe_notice.xml

Lines changed: 395 additions & 0 deletions
Large diffs are not rendered by default.

tests/test_data/notice_normalisation/sf_html_unsafe_notice.xml

Lines changed: 4135 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
{
2+
"notice": {
3+
"id": "003545_2021",
4+
"public_number_document": "003545",
5+
"public_number_edition": "2021004"
6+
},
7+
"mets": {
8+
"languages": [
9+
"en"
10+
],
11+
"revision": "0",
12+
"type": "create",
13+
"profile": "http://publications.europa.eu/resource/mets/op-sip-profile_002",
14+
"createdate": "2023-03-09T18:28:54.804225",
15+
"document_id": "",
16+
"dmd_id": "dmd_2021_S_004_003545_0_001",
17+
"dmd_mdtype": "OTHER",
18+
"dmd_othermdtype": "INSTANCE",
19+
"dmd_href": "2021_S_004_003545_0.mets.xml.dmd.rdf",
20+
"tmd_id": "tmd_2021_S_004_003545_0_001",
21+
"tmd_href": "2021_S_004_003545_0.tmd.rdf",
22+
"tmd_mdtype": "OTHER",
23+
"tmd_othermdtype": "INSTANCE",
24+
"file_id": "file_2021_S_004_003545_0_001",
25+
"notice_file_href": "2021_S_004_003545_0.notice.rdf",
26+
"notice_file_mimetype": "application/rdf+xml",
27+
"notice_file_checksum": "00e2c0570f2d9f00c71c3d8009b8bec5a530167a01ebb473e67be5e97383cdc5",
28+
"notice_file_checksum_type": "SHA-256"
29+
},
30+
"work": {
31+
"identifier": "2021_S_004_003545",
32+
"oj_identifier": "JOS_2021_004_R_003545",
33+
"cdm_rdf_type": "procurement_public",
34+
"resource_type": "PROCUREMENT_NOTICE",
35+
"uri": "http://data.europa.eu/a4g/resource/2021/003545_2021",
36+
"do_not_index": "true",
37+
"date_document": "2021-01-07",
38+
"created_by_agent": "EURUN",
39+
"dataset_published_by_agent": "EURUN",
40+
"datetime_transmission": "2023-03-09T18:28:54.806241",
41+
"title": {
42+
"en": "Construction work & planning",
43+
"ro": "Lucrari de constructie <br /> si planificare"
44+
},
45+
"date_creation": "2023-03-09",
46+
"concept_type_dataset": "TEST_DATA",
47+
"dataset_version": "20230309-0",
48+
"dataset_keyword": [
49+
"eProcurement",
50+
"notice"
51+
],
52+
"dataset_has_frequency_publication_frequency": "OTHER",
53+
"procurement_public_issued_by_country": "CZ",
54+
"procurement_public_url_etendering": []
55+
},
56+
"expression": {
57+
"identifier": "2021_S_004_003545.MUL",
58+
"title": {
59+
"en": " eProcurement notice 2021_S_004_003545 "
60+
},
61+
"uses_language": "MUL"
62+
},
63+
"manifestation": {
64+
"identifier": "2021_S_004_003545.MUL.rdf",
65+
"type": "rdf_epo",
66+
"date_publication": "2021-01-07",
67+
"distribution_has_status_distribution_status": "COMPLETED",
68+
"distribution_has_media_type_concept_media_type": "RDF"
69+
}
70+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE rdf:RDF [
3+
<!ENTITY % cellarEntities PUBLIC
4+
"-//PO-RESOURCE//ENTITIES CELLAR cdm model 1.0//EN"
5+
"/home/metaconv/metaconv_components/components/common/data/cellar_uris.ent">
6+
%cellarEntities;
7+
]>
8+
<rdf:RDF xmlns:cdm="http://publications.europa.eu/ontology/cdm#"
9+
xmlns:dct="http://purl.org/dc/terms/"
10+
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
11+
12+
<cdm:work rdf:about="&resource;ted/2021_S_004_003545">
13+
14+
<rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#procurement_public"/>
15+
<cdm:work_id_document rdf:datatype="http://www.w3.org/2001/XMLSchema#string">ted:2021_S_004_003545</cdm:work_id_document>
16+
<cdm:work_id_document rdf:datatype="http://www.w3.org/2001/XMLSchema#string">oj:JOS_2021_004_R_003545</cdm:work_id_document>
17+
<cdm:work_has_resource-type rdf:resource="http://publications.europa.eu/resource/authority/resource-type/PROCUREMENT_NOTICE"/>
18+
<cdm:do_not_index rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">true</cdm:do_not_index>
19+
<cdm:work_date_document rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-01-07</cdm:work_date_document>
20+
<cdm:work_created_by_agent rdf:resource="&cellar-authority;corporate-body/EURUN"/>
21+
<cdm:procurement_public_number_document_in_official-journal rdf:datatype="http://www.w3.org/2001/XMLSchema#string">003545</cdm:procurement_public_number_document_in_official-journal>
22+
<cdm:procurement_public_number_edition rdf:datatype="http://www.w3.org/2001/XMLSchema#positiveInteger">2021004</cdm:procurement_public_number_edition>
23+
24+
<cdm:work_title xml:lang="en">Construction work & planning</cdm:work_title>
25+
26+
<cdm:work_title xml:lang="ro">Lucrari de constructie <br /> si planificare</cdm:work_title>
27+
28+
<cdm:datetime_transmission rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2023-03-09T18:28:54.806241</cdm:datetime_transmission>
29+
30+
31+
</cdm:work>
32+
33+
<cdm:expression rdf:about="&resource;ted/2021_S_004_003545.MUL">
34+
35+
<rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#expression_procurement_public"/>
36+
<cdm:expression_belongs_to_work rdf:resource="&resource;ted/2021_S_004_003545"/>
37+
38+
<cdm:expression_title xml:lang="en"> eProcurement notice 2021_S_004_003545 </cdm:expression_title>
39+
40+
<cdm:expression_uses_language rdf:resource="&cellar-authority;language/MUL"/>
41+
<cdm:expression_procurement_public_authority-type_name rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Other</cdm:expression_procurement_public_authority-type_name>
42+
</cdm:expression>
43+
44+
<cdm:manifestation_distribution rdf:about="&resource;ted/2021_S_004_003545.MUL.rdf">
45+
46+
<cdm:manifestation_manifests_expression rdf:resource="&resource;ted/2021_S_004_003545.MUL"/>
47+
<cdm:manifestation_type rdf:datatype="http://www.w3.org/2001/XMLSchema#string">rdf_epo</cdm:manifestation_type>
48+
<cdm:manifestation_date_publication rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-01-07</cdm:manifestation_date_publication>
49+
<cdm:manifestation_distribution_has_status_distribution_status rdf:resource="http://publications.europa.eu/resource/authority/distribution-status/COMPLETED"/>
50+
<cdm:manifestation_distribution_has_media_type_concept_media_type rdf:resource="http://publications.europa.eu/resource/authority/file-type/RDF"/>
51+
</cdm:manifestation_distribution>
52+
</rdf:RDF>

tests/unit/notice_metadata_processor/conftest.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33

44
import pytest
55

6+
from ted_sws.core.model.manifestation import XMLManifestation
7+
from ted_sws.core.model.notice import Notice
8+
from ted_sws.data_sampler.services.notice_xml_indexer import index_notice
69
from tests import TEST_DATA_PATH
710

811

@@ -25,3 +28,39 @@ def notice_normalisation_test_data_path():
2528
def eforms_xml_notice_paths() -> List[pathlib.Path]:
2629
eforms_xml_notices_path = TEST_DATA_PATH / "eforms_samples"
2730
return list(eforms_xml_notices_path.glob("**/*.xml"))
31+
32+
33+
@pytest.fixture
def sample_ef_html_unsafe_notice_path() -> pathlib.Path:
    """Path of ``ef_html_unsafe_notice.xml`` inside the notice normalisation test data."""
    notice_normalisation_dir = TEST_DATA_PATH / "notice_normalisation"
    return notice_normalisation_dir / "ef_html_unsafe_notice.xml"
36+
37+
38+
@pytest.fixture
def sample_indexed_ef_html_unsafe_notice(
        sample_ef_html_unsafe_notice_path: pathlib.Path) -> Notice:
    """Build and index a notice from the file given by ``sample_ef_html_unsafe_notice_path``.

    The file name becomes the notice ``ted_id`` and the file content becomes
    its XML manifestation. The value returned by ``index_notice`` is handed
    to the test.
    """
    notice: Notice = Notice(ted_id=sample_ef_html_unsafe_notice_path.name)
    # Decode explicitly so the fixture does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the sample XML is UTF-8 encoded — confirm against the file.
    xml_content = sample_ef_html_unsafe_notice_path.read_text(encoding="utf-8")
    notice.set_xml_manifestation(XMLManifestation(object_data=xml_content))

    return index_notice(notice)
46+
47+
48+
@pytest.fixture
def sample_sf_html_unsafe_notice_path() -> pathlib.Path:
    """Path of ``sf_html_unsafe_notice.xml`` inside the notice normalisation test data."""
    notice_normalisation_dir = TEST_DATA_PATH / "notice_normalisation"
    return notice_normalisation_dir / "sf_html_unsafe_notice.xml"
51+
52+
53+
@pytest.fixture
def sample_indexed_sf_html_unsafe_notice(
        sample_sf_html_unsafe_notice_path: pathlib.Path) -> Notice:
    """Build and index a notice from the file given by ``sample_sf_html_unsafe_notice_path``.

    The file name becomes the notice ``ted_id`` and the file content becomes
    its XML manifestation. The value returned by ``index_notice`` is handed
    to the test.
    """
    notice: Notice = Notice(ted_id=sample_sf_html_unsafe_notice_path.name)
    # Decode explicitly so the fixture does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the sample XML is UTF-8 encoded — confirm against the file.
    xml_content = sample_sf_html_unsafe_notice_path.read_text(encoding="utf-8")
    notice.set_xml_manifestation(XMLManifestation(object_data=xml_content))

    return index_notice(notice)
61+
62+
63+
@pytest.fixture
def html_incompatible_str() -> str:
    """Sample text with a bare ampersand and a ``<br />`` tag, i.e. characters that are not HTML-safe."""
    unsafe_text = "Construction work & planning <br />"
    return unsafe_text

tests/unit/notice_metadata_processor/test_metadata_normaliser.py

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
1+
import pathlib
2+
from xml.etree import ElementTree
3+
from xml.etree.ElementTree import ParseError
4+
15
import pytest
26

37
from ted_sws.core.model.manifestation import XMLManifestation
4-
from ted_sws.core.model.metadata import NormalisedMetadata
5-
from ted_sws.core.model.notice import NoticeStatus
8+
from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString
9+
from ted_sws.core.model.notice import NoticeStatus, Notice
610
from ted_sws.notice_metadata_processor.adapters.notice_metadata_extractor import \
711
DefaultNoticeMetadataExtractor, EformsNoticeMetadataExtractor
812
from ted_sws.notice_metadata_processor.adapters.notice_metadata_normaliser import \
913
DefaultNoticeMetadataNormaliser, get_map_value, FORM_NUMBER_KEY, LEGAL_BASIS_KEY, SF_NOTICE_TYPE_KEY, \
10-
DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser
14+
DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser, get_html_compatible_string
1115
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
1216
from ted_sws.notice_metadata_processor.services.metadata_constraints import filter_df_by_variables
1317
from ted_sws.notice_metadata_processor.services.metadata_normalizer import normalise_notice, normalise_notice_by_id, \
@@ -16,6 +20,8 @@
1620
extract_and_normalise_notice_metadata
1721
from ted_sws.resources.mapping_files_registry import MappingFilesRegistry
1822

23+
def html_str(content: str) -> str:
    """Wrap ``content`` in a ``<body>`` element preceded by an XML declaration.

    The content is inserted verbatim, with no escaping, so the result parses
    only if ``content`` itself is well-formed markup text.
    """
    xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    return f"{xml_declaration} <body>{content}</body>"
1925

2026
def test_metadata_normaliser_by_notice(indexed_notice):
2127
notice = normalise_notice(indexed_notice)
@@ -235,3 +241,38 @@ def test_normalising_notice_out_of_index(notice_normalisation_test_data_path):
235241
with pytest.raises(Exception):
236242
extract_and_normalise_notice_metadata(
237243
xml_manifestation=XMLManifestation(object_data=broke_notice_content))
244+
245+
246+
def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_html_unsafe_notice: Notice,
                                                     sample_indexed_sf_html_unsafe_notice: Notice
                                                     ):
    """Normalised publication numbers must have no leading or trailing whitespace."""
    unsafe_notices = (sample_indexed_ef_html_unsafe_notice, sample_indexed_sf_html_unsafe_notice)
    for unsafe_notice in unsafe_notices:
        normalised_notice: Notice = normalise_notice(unsafe_notice)
        publication_number = normalised_notice.normalised_metadata.notice_publication_number

        # Stripping must be a no-op on an already normalised number.
        assert publication_number.strip() == publication_number
256+
257+
258+
def test_get_html_compatible_string(html_incompatible_str: str):
    """The raw sample must fail to parse, while its escaped form parses once wrapped by ``html_str``."""
    # Sanity check: the unescaped input is rejected by the XML parser.
    with pytest.raises(ParseError):
        ElementTree.fromstring(html_incompatible_str)

    tagged_input = LanguageTaggedString(text=html_incompatible_str)
    compatible_str: LanguageTaggedString = get_html_compatible_string(tagged_input)

    # Parsing raises ParseError (failing the test) if the escaped text is
    # still not well-formed.
    wrapped_markup = html_str(compatible_str.text)
    ElementTree.fromstring(wrapped_markup)
267+
268+
269+
def test_normalising_notice_with_html_incompatible_title(sample_indexed_ef_html_unsafe_notice: Notice,
                                                         sample_indexed_sf_html_unsafe_notice: Notice):
    """Every normalised title of both HTML-unsafe sample notices must parse as well-formed markup.

    Each title text is wrapped with ``html_str`` and handed to
    ``ElementTree.fromstring``; a ``ParseError`` from any title fails the test.
    """
    normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice)

    # Plain loops rather than list comprehensions: parsing is done only for
    # its side effect of raising on malformed text, so no list is needed.
    for title in normalised_ef_notice.normalised_metadata.title:
        ElementTree.fromstring(html_str(title.text))

    normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice)

    for title in normalised_sf_notice.normalised_metadata.title:
        ElementTree.fromstring(html_str(title.text))

tests/unit/notice_packager/conftest.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,21 @@ def template_sample_metadata_json() -> Dict:
3030
return json.load((TEST_DATA_PATH / "notice_packager" / "template_metadata.json").open())
3131

3232

33+
@pytest.fixture
def sample_metadata_with_wrong_title_json() -> Dict:
    """Load ``wrong_title/metadata_with_wrong_title.json`` from the notice packager test data.

    Returns the parsed JSON document.
    """
    metadata_path = TEST_DATA_PATH / "notice_packager" / "wrong_title" / "metadata_with_wrong_title.json"
    # The context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector. The explicit encoding keeps
    # decoding independent of the platform's locale.
    with metadata_path.open(encoding="utf-8") as metadata_file:
        return json.load(metadata_file)
36+
37+
38+
@pytest.fixture
def sample_mets_xml_dmd_rdf_with_wrong_title_str() -> str:
    """Return the text of ``wrong_title/mets_with_wrong_title.mets.xml.dmd.rdf`` from the packager test data."""
    rdf_path = TEST_DATA_PATH / "notice_packager" / "wrong_title" / "mets_with_wrong_title.mets.xml.dmd.rdf"
    # Decode explicitly so the fixture does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the RDF sample is UTF-8 encoded — confirm against the file.
    return rdf_path.read_text(encoding="utf-8")
41+
42+
43+
@pytest.fixture
def sample_metadata_with_wrong_title(sample_metadata_with_wrong_title_json) -> PackagerMetadata:
    """Build a ``PackagerMetadata`` from the wrong-title JSON fixture, passing its entries as keyword arguments."""
    packager_metadata = PackagerMetadata(**sample_metadata_with_wrong_title_json)
    return packager_metadata
46+
47+
3348
@pytest.fixture
def template_sample_metadata(template_sample_metadata_json) -> PackagerMetadata:
    """Build a ``PackagerMetadata`` from the template JSON fixture, passing its entries as keyword arguments."""
    packager_metadata = PackagerMetadata(**template_sample_metadata_json)
    return packager_metadata
@@ -54,6 +69,7 @@ def template_sample_expression(template_sample_metadata) -> ExpressionMetadata:
5469
def template_sample_manifestation(template_sample_metadata) -> ManifestationMetadata:
5570
return template_sample_metadata.manifestation
5671

72+
5773
# template_metadata END
5874

5975

@@ -67,6 +83,7 @@ def notice_sample_metadata(notice_2018) -> NormalisedMetadata:
6783

6884
return normalised_metadata
6985

86+
7087
# notice_metadata END
7188

7289

0 commit comments

Comments
 (0)