Skip to content

Commit 3390149

Browse files
authored
Merge pull request #366 from OP-TED/feature/TED-677
2 parents 70c59f9 + 3ef25bf commit 3390149

19 files changed

Lines changed: 300 additions & 184 deletions

ted_sws/notice_metadata_processor/model/metadata.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ class ExtractedMetadata(Metadata):
1515
city_of_buyer: List[LanguageTaggedString] = None
1616
name_of_buyer: List[LanguageTaggedString] = None
1717
original_language: str = None
18+
# Filled from XMLManifestationMetadataExtractor.uri_list, which returns a list of
# LanguageTaggedString (one per matched URI element), not a plain string.
uri_list: List[LanguageTaggedString] = None
1819
country_of_buyer: str = None
1920
type_of_buyer: EncodedValue = None
2021
eu_institution: str = None

ted_sws/notice_metadata_processor/services/xml_manifestation_metadata_extractor.py

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,17 @@ def name_of_buyer(self):
8686
def eu_institution(self):
8787
return self.type_of_buyer.value if self.type_of_buyer.code == "5" else "-"
8888

89+
@property
def uri_list(self):
    """
    Collect the URI entries of the manifestation as language-tagged strings.

    :return: a list with one LanguageTaggedString per element matched by
        ``xpath_registry.xpath_uri_elements``; ``text`` comes from
        extract_text_from_element and ``language`` from the element's ``LG`` attribute
    """
    matched_elements = self.manifestation_root.findall(
        self.xpath_registry.xpath_uri_elements,
        namespaces=self.namespaces)

    tagged_uris = []
    for matched in matched_elements:
        # Same "." lookup as before, resolved once and reused for both extractions.
        node = matched.find(".")
        tagged_uris.append(
            LanguageTaggedString(text=extract_text_from_element(element=node),
                                 language=extract_attribute_from_element(element=node,
                                                                         attrib_key="LG")))
    return tagged_uris
99+
89100
@property
90101
def country_of_buyer(self):
91102
return extract_attribute_from_element(element=self.manifestation_root.find(
@@ -219,14 +230,16 @@ def to_metadata(self) -> ExtractedMetadata:
219230
Creating extracted metadata
220231
:return:
221232
"""
222-
metadata = ExtractedMetadata()
233+
metadata: ExtractedMetadata = ExtractedMetadata()
223234
metadata.title = self.title
224235
metadata.notice_publication_number = self.notice_publication_number
225236
metadata.publication_date = self.publication_date
237+
metadata.ojs_type = self.ojs_type
226238
metadata.ojs_issue_number = self.ojs_issue_number
227239
metadata.city_of_buyer = self.city_of_buyer
228240
metadata.name_of_buyer = self.name_of_buyer
229241
metadata.original_language = self.original_language
242+
metadata.uri_list = self.uri_list
230243
metadata.country_of_buyer = self.country_of_buyer
231244
metadata.type_of_buyer = self.type_of_buyer
232245
metadata.eu_institution = self.eu_institution

ted_sws/notice_metadata_processor/services/xpath_registry.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,10 @@ def xpath_name_of_buyer_elements(self):
4343
def xpath_country_of_buyer(self):
4444
return "manifestation_ns:CODED_DATA_SECTION/manifestation_ns:NOTICE_DATA/manifestation_ns:ISO_COUNTRY"
4545

46+
@property
def xpath_uri_elements(self):
    """
    XPath used to locate the URI entries under NOTICE_DATA/URI_LIST.

    :return: the XPath expression as a string
    """
    # NOTE(review): the trailing "/" presumably selects the children of URI_LIST;
    # confirm against the XML library used by the extractor.
    uri_list_children_xpath = "manifestation_ns:CODED_DATA_SECTION/manifestation_ns:NOTICE_DATA/manifestation_ns:URI_LIST/"
    return uri_list_children_xpath
49+
4650
@property
4751
def xpath_original_language(self):
4852
return "manifestation_ns:CODED_DATA_SECTION/manifestation_ns:NOTICE_DATA/manifestation_ns:LG_ORIG"

ted_sws/notice_packager/entrypoints/cli/cmd_bulk_packager.py

Lines changed: 53 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -11,17 +11,22 @@
1111
import base64
1212
import os
1313
from pathlib import Path
14+
from typing import List
1415

1516
import click
17+
from pymongo import MongoClient
1618

19+
from ted_sws import config
1720
from ted_sws.core.adapters.cmd_runner import CmdRunner as BaseCmdRunner
1821
from ted_sws.core.model.manifestation import XMLManifestation
1922
from ted_sws.core.model.notice import Notice
23+
from ted_sws.data_manager.adapters.notice_repository import NoticeRepository
24+
from ted_sws.event_manager.adapters.log import LOG_WARN_TEXT
2025
from ted_sws.notice_metadata_processor.services.xml_manifestation_metadata_extractor import \
2126
XMLManifestationMetadataExtractor
22-
from ted_sws.notice_packager.services.metadata_transformer import MetadataTransformer
23-
from ted_sws.notice_packager.services.notice_packager import create_notice_package
2427
from ted_sws.notice_packager import DEFAULT_NOTICE_PACKAGE_EXTENSION
28+
from ted_sws.notice_packager.services.metadata_transformer import MetadataTransformer
29+
from ted_sws.notice_packager.services.notice_packager import create_notice_package, package_notice_and_save_to
2530

2631
CMD_NAME = "CMD_BULK_PACKAGER"
2732
DEFAULT_FILES_COUNT: int = 3000
@@ -35,47 +40,60 @@ class PackageNotice(Notice):
3540

3641

3742
class CmdRunner(BaseCmdRunner):
38-
def __init__(self, rdf_files_folder, output_folder, pkgs_count: int):
43+
def __init__(self, rdf_files_folder, output_folder, pkgs_count: int, notice_ids: List = None,
             mongodb_client=None):
    """
    Prepare the bulk packager either for stored notices or for generated test packages.

    :param rdf_files_folder: folder with the RDF files used for generated packages;
        only read when no notice_ids are given
    :param output_folder: folder the packages are written to; created if missing
    :param pkgs_count: number of packages to generate; only stored when no notice_ids are given
    :param notice_ids: references of the notices to load through NoticeRepository
    :param mongodb_client: client handed to NoticeRepository; when omitted, a
        MongoClient for config.MONGO_DB_AUTH_URL is created on demand
    :raises FileNotFoundError: if no notice_ids are given and rdf_files_folder is
        missing or is not a directory
    """
    super().__init__(name=CMD_NAME)
    self.output_path = Path(os.path.realpath(output_folder))
    self.notices = None
    if notice_ids:
        self.log(LOG_WARN_TEXT.format("Notices: ") + str(notice_ids))
        # Build the client here instead of in the signature: a default argument is
        # evaluated once at import time and would be shared by every instance.
        if mongodb_client is None:
            mongodb_client = MongoClient(config.MONGO_DB_AUTH_URL)
        self.notice_repository = NoticeRepository(mongodb_client=mongodb_client)
        self.notices = [self.notice_repository.get(reference=notice_id) for notice_id in notice_ids]
    else:
        self.pkgs_count = pkgs_count
        # The CLI argument is optional, so the folder may be None; realpath(None)
        # would raise TypeError, so report it like any other missing folder.
        self.rdf_files_path = None if rdf_files_folder is None else Path(os.path.realpath(rdf_files_folder))
        if self.rdf_files_path is None or not self.rdf_files_path.is_dir():
            error_msg = f"No such folder :: [{rdf_files_folder}]"
            self.log_failed_msg(error_msg)
            raise FileNotFoundError(error_msg)

    self.output_path.mkdir(parents=True, exist_ok=True)
4763

4864
def run_cmd(self):
    """
    Write the packages to the output folder and report the outcome.

    When notices were loaded, each one is packaged with package_notice_and_save_to.
    Otherwise ``pkgs_count`` packages are generated, cycling through the files
    found in ``rdf_files_path``. Any exception is captured and handed to
    ``run_cmd_result`` instead of being raised.

    :return: whatever ``run_cmd_result`` returns for the captured error (None on success)
    """
    failure = None
    try:
        if self.notices:
            self.log("Saving packages to " + str(self.output_path))
            for stored_notice in self.notices:
                package_notice_and_save_to(notice=stored_notice,
                                           save_to=self.output_path)
        else:
            source_files = []
            for entry in os.listdir(self.rdf_files_path):
                candidate = os.path.join(self.rdf_files_path, entry)
                if os.path.isfile(candidate):
                    source_files.append(Path(str(candidate)))
            source_count = len(source_files)
            first_number, fake_year = 100000, 2021

            for offset in range(self.pkgs_count):
                # Wrap around so that more packages than source files can be produced.
                source_file = source_files[offset % source_count]
                generated_id = f"{first_number + offset}_{fake_year}"
                self.generate_package(generated_id, self.output_path, source_file)
    except Exception as caught:
        failure = caught

    return self.run_cmd_result(failure)
6888

6989
@classmethod
70-
def generate_package(cls, notice_id, output_path, rdf_file_path, pkg_name):
90+
def generate_package(cls, notice_id, output_path, rdf_file_path):
7191

7292
with open(rdf_file_path, "r") as f:
7393
rdf_content = f.read()
7494

7595
encoded_rdf_content = base64.b64encode(bytes(rdf_content, 'utf-8'))
7696

77-
output_file = output_path / (pkg_name + DEFAULT_NOTICE_PACKAGE_EXTENSION)
78-
7997
notice = PackageNotice(ted_id=notice_id)
8098
notice_metadata = XMLManifestationMetadataExtractor(
8199
xml_manifestation=notice.xml_manifestation).to_metadata()
@@ -84,24 +102,26 @@ def generate_package(cls, notice_id, output_path, rdf_file_path, pkg_name):
84102
create_notice_package(
85103
notice_metadata,
86104
rdf_content=encoded_rdf_content,
87-
save_to=output_file
105+
save_to=output_path
88106
)
89107

90108

91-
def run(rdf_files_count, output_folder, pkgs_count):
92-
cmd = CmdRunner(rdf_files_count, output_folder, pkgs_count)
109+
def run(rdf_files_count=None, output_folder=None, pkgs_count=None, notice_id=None,
        mongodb_client=None):
    """
    Build a CmdRunner from the given arguments and run it.

    :param rdf_files_count: passed to CmdRunner as its ``rdf_files_folder`` argument
        (the name is kept for backward compatibility)
    :param output_folder: folder the packages are written to
    :param pkgs_count: number of test packages to generate
    :param notice_id: iterable of notice references to package; None means none
    :param mongodb_client: client used to load the notices; when omitted and notice
        ids are given, a MongoClient for config.MONGO_DB_AUTH_URL is created
    """
    notice_ids = list(notice_id or [])
    # Create the client at call time, and only when notices must be loaded; a
    # default argument would be evaluated once at import time.
    if mongodb_client is None and notice_ids:
        mongodb_client = MongoClient(config.MONGO_DB_AUTH_URL)
    cmd = CmdRunner(rdf_files_count, output_folder, pkgs_count, notice_ids, mongodb_client)
    cmd.run()
94113

95114

96115
@click.command()
@click.argument('rdf-files-folder', nargs=1, required=False)
@click.argument('pkgs-count', nargs=1, type=click.INT, required=False, default=DEFAULT_FILES_COUNT)
@click.option('--output-folder', required=False, default=".")
@click.option('--notice-id', required=False, multiple=True, default=None)
def main(rdf_files_folder, pkgs_count, output_folder, notice_id):
    """
    Generates test METS packages
    """
    # The docstring above is the command's help text, so it is left untouched.
    # run() names its first parameter rdf_files_count, but it receives the folder.
    run(rdf_files_count=rdf_files_folder,
        output_folder=output_folder,
        pkgs_count=pkgs_count,
        notice_id=notice_id)
105125

106126

107127
if __name__ == '__main__':

ted_sws/notice_packager/model/metadata.py

Lines changed: 14 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
"""
1111

1212
import datetime
13-
from typing import List, Dict
13+
from typing import List, Dict, Optional
1414

1515
from pydantic import validator
1616

@@ -28,12 +28,12 @@
2828
BASE_TITLE = "eProcurement notice"
2929

3030
WORK_DO_NOT_INDEX = "true"
31-
MANIFESTATION_TYPE = "E_PROCUREMENT_ONTOLOGY"
31+
MANIFESTATION_TYPE = "rdf_epo"
3232
DISTRIBUTION_STATUS = "COMPLETED"
3333
MEDIA_TYPE = "RDF"
3434
LANGUAGES = ["en"]
3535
LANGUAGE = LANGUAGES[0]
36-
USES_LANGUAGE = "ENG"
36+
USES_LANGUAGE = "MUL"
3737

3838
ACTION_CREATE = "create"
3939
ACTION_UPDATE = "update"
@@ -64,7 +64,7 @@ class NoticeMetadata(Metadata):
6464
"""
6565
General notice metadata
6666
"""
67-
id: str = None
67+
id: Optional[str] = None
6868
languages: List[str] = LANGUAGES
6969
action: NoticeActionMetadata = NoticeActionMetadata()
7070

@@ -75,22 +75,27 @@ class WorkMetadata(Metadata):
7575
and the rest is a bunch of constants OR generated values (e.g. date, URI, ...)
7676
"""
7777

78-
uri: str = None
78+
identifier: Optional[str]
79+
cdm_rdf_type: Optional[str]
80+
resource_type: Optional[str]
81+
uri: Optional[str] = None
7982
do_not_index: str = WORK_DO_NOT_INDEX
8083
date_document: str = datetime.datetime.now().strftime('%Y-%m-%d')
8184
created_by_agent: str = WORK_AGENT
8285
dataset_published_by_agent: str = WORK_AGENT
8386
datetime_transmission: str = datetime.datetime.now().isoformat()
84-
title: Dict[str, str] = None
85-
date_creation: str = None
87+
title: Optional[Dict[str, str]] = None
88+
date_creation: Optional[str] = datetime.datetime.now().strftime('%Y-%m-%d')
8689
concept_type_dataset: str = CONCEPT_TYPE_DATASET
87-
dataset_version: str = None
90+
dataset_version: Optional[str] = None
8891
dataset_keyword: List[str] = DATASET_KEYWORD
8992
dataset_has_frequency_publication_frequency: str = PUBLICATION_FREQUENCY
93+
procurement_public_issued_by_country: Optional[str]
94+
procurement_public_url_etendering: Optional[List[str]]
9095

9196

9297
class ExpressionMetadata(Metadata):
93-
title: Dict[str, str] = None
98+
title: Optional[Dict[str, str]] = None
9499
uses_language: str = USES_LANGUAGE
95100

96101

ted_sws/notice_packager/resources/templates/mets2action_mets_xml.jinja2

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
TYPE="{{ notice.action.type }}"
77
PROFILE="http://publications.europa.eu/resource/mets/op-sip-profile_002">
88
<metsHdr {{ notice.action.type|upper }}DATE="{{ notice.action.date }}">
9-
<metsDocumentID>{{ notice.id }}_mets2{{ notice.action.type }}</metsDocumentID>
9+
<metsDocumentID>{{ work.identifier }}_{{ notice.action.type }}</metsDocumentID>
1010
</metsHdr>
1111
<dmdSec ID="dmdSec01">
1212
<mdRef MDTYPE="OTHER" LOCTYPE="URL" MIMETYPE="application/rdf+xml" OTHERMDTYPE="INSTANCE" xlink:href="{{ notice.id }}-0.mets.xml.dmd.rdf"/>

ted_sws/notice_packager/resources/templates/mets_xml_dmd_rdf.jinja2

Lines changed: 18 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9,39 +9,46 @@
99
xmlns:dct="http://purl.org/dc/terms/"
1010
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
1111

12-
<cdm:work rdf:about="&resource;dataset/{{ notice.id }}">
12+
<cdm:work rdf:about="&resource;ted/{{ work.identifier }}">
13+
<rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#work"/>
14+
<rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#procurement_public"/>
15+
<cdm:work_has_resource-type rdf:resource="http://publications.europa.eu/resource/authority/resource-type/PROCUREMENT_NOTICE"/>
1316
<cdm:do_not_index rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">{{ work.do_not_index }}</cdm:do_not_index>
1417
<cdm:work_date_document rdf:datatype="http://www.w3.org/2001/XMLSchema#date">{{ work.date_document }}</cdm:work_date_document>
1518
<cdm:work_created_by_agent rdf:resource="&cellar-authority;corporate-body/{{ work.created_by_agent }}"/>
16-
<cdm:work_dataset_published_by_agent rdf:resource="&cellar-authority;corporate-body/{{ work.dataset_published_by_agent }}"/>
17-
<cdm:datetime_transmission rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">{{ work.datetime_transmission }}</cdm:datetime_transmission>
18-
<cdm:datetime_negotiation rdf:resource="http://publications.europa.eu/ontology/cdm#datetime_transmission"/>
1919
{% for lang in notice.languages %}
2020
<cdm:work_title xml:lang="{{ lang }}">{{ work.title[lang] }}</cdm:work_title>
2121
{% endfor %}
22+
<cdm:datetime_transmission rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">{{ work.datetime_transmission }}</cdm:datetime_transmission>
23+
{# <cdm:procurement_public_issued_by_country>{{ work.procurement_public_issued_by_country }}</cdm:procurement_public_issued_by_country>
24+
{% for uri in work.procurement_public_url_etendering %}
25+
<cdm:procurement_public_url_etendering xml:lang="{{ uri.language }}">{{ uri.text }}</cdm:procurement_public_url_etendering>
26+
{% endfor %} #}
27+
{# <cdm:datetime_negotiation rdf:resource="http://publications.europa.eu/ontology/cdm#datetime_transmission"/>
28+
<cdm:work_dataset_published_by_agent rdf:resource="&cellar-authority;corporate-body/{{ work.dataset_published_by_agent }}"/>
2229
<cdm:work_date_creation rdf:datatype="http://www.w3.org/2001/XMLSchema#date">{{ work.date_creation }}</cdm:work_date_creation>
2330
<cdm:work_id>{{ work.uri }}</cdm:work_id>
2431
<cdm:work_dataset_version>{{ work.dataset_version }}</cdm:work_dataset_version>
2532
<cdm:work_dataset_has_type_concept_type_dataset rdf:resource="http://publications.europa.eu/resource/authority/dataset-type/{{ work.concept_type_dataset }}"/>
2633
{% for dataset_keyword in work.dataset_keyword %}
2734
<cdm:work_dataset_keyword>{{ dataset_keyword }}</cdm:work_dataset_keyword>
2835
{% endfor %}
29-
<cdm:work_dataset_has_frequency_publication_frequency rdf:resource="http://publications.europa.eu/resource/authority/frequency/{{ work.dataset_has_frequency_publication_frequency }}"/>
36+
<cdm:work_dataset_has_frequency_publication_frequency rdf:resource="http://publications.europa.eu/resource/authority/frequency/{{ work.dataset_has_frequency_publication_frequency }}"/> #}
3037
</cdm:work>
3138

32-
<cdm:expression rdf:about="&resource;expression/{{ notice.id }}">
33-
<cdm:expression_belongs_to_work rdf:resource="&resource;dataset/{{ notice.id }}"/>
39+
<cdm:expression rdf:about="&resource;ted/{{ work.identifier }}.MUL">
40+
<cdm:expression_belongs_to_work rdf:resource="&resource;ted/{{ work.identifier }}"/>
3441
{% for lang in notice.languages %}
3542
<cdm:expression_title xml:lang="{{ lang }}">{{ expression.title[lang] }}</cdm:expression_title>
3643
{% endfor %}
3744
<cdm:expression_uses_language rdf:resource="&cellar-authority;language/{{ expression.uses_language }}"/>
3845
</cdm:expression>
3946

40-
<cdm:manifestation_distribution rdf:about="&resource;distribution/{{ notice.id }}/{{ notice.id }}_rdf">
41-
<cdm:manifestation_manifests_expression rdf:resource="&resource;expression/{{ notice.id }}"/>
47+
<cdm:manifestation_distribution rdf:about="&resource;ted/{{ work.identifier }}.MUL.rdf">
48+
<cdm:manifestation_manifests_expression rdf:resource="&resource;ted/{{ work.identifier }}.MUL"/>
4249
<cdm:manifestation_type rdf:datatype="http://www.w3.org/2001/XMLSchema#string">{{ manifestation.type }}</cdm:manifestation_type>
4350
<cdm:manifestation_date_publication rdf:datatype="http://www.w3.org/2001/XMLSchema#date">{{ manifestation.date_publication }}</cdm:manifestation_date_publication>
44-
<cdm:manifestation_distribution_has_status_distribution_status rdf:resource="http://publications.europa.eu/resource/authority/dataset-status/{{ manifestation.distribution_has_status_distribution_status }}"/>
45-
<cdm:manifestation_distribution_has_media_type_concept_media_type rdf:resource="http://publications.europa.eu/resource/authority/file-type/{{ manifestation.distribution_has_media_type_concept_media_type }}"/>
51+
{# <cdm:manifestation_distribution_has_status_distribution_status rdf:resource="http://publications.europa.eu/resource/authority/dataset-status/{{ manifestation.distribution_has_status_distribution_status }}"/>
52+
<cdm:manifestation_distribution_has_media_type_concept_media_type rdf:resource="http://publications.europa.eu/resource/authority/file-type/{{ manifestation.distribution_has_media_type_concept_media_type }}"/> #}
4653
</cdm:manifestation_distribution>
4754
</rdf:RDF>

ted_sws/notice_packager/services/metadata_transformer.py

Lines changed: 27 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,9 @@
2323
# This is used in TED API
2424
DENORMALIZED_SEPARATOR = '-'
2525

26+
PROCUREMENT_PUBLIC = "procurement_public"
27+
PROCUREMENT_NOTICE = "PROCUREMENT_NOTICE"
28+
2629

2730
class MetadataTransformer:
2831
def __init__(self, notice_metadata: ExtractedMetadata):
@@ -64,23 +67,40 @@ def from_notice_metadata(cls, notice_metadata: ExtractedMetadata) -> PackagerMet
6467
metadata.notice.id = cls.normalize_value(notice_metadata.notice_publication_number)
6568

6669
# WORK
67-
metadata.work.uri = publication_notice_uri(metadata.notice.id)
70+
publication_date = datetime.datetime.strptime(notice_metadata.publication_date, '%Y%m%d').strftime('%Y-%m-%d')
71+
metadata.work.identifier = publication_work_identifier(metadata.notice.id, notice_metadata)
72+
metadata.work.cdm_rdf_type = PROCUREMENT_PUBLIC
73+
metadata.work.resource_type = PROCUREMENT_NOTICE
74+
metadata.work.date_document = publication_date
75+
metadata.work.uri = publication_notice_uri(metadata.notice.id, notice_metadata)
6876
title_search = [t.title.text for t in notice_metadata.title if t.title.language == LANGUAGE.upper()]
6977
if len(title_search) > 0:
7078
metadata.work.title = {LANGUAGE: title_search[0]}
71-
metadata.work.date_creation = datetime.datetime \
72-
.strptime(notice_metadata.publication_date, '%Y%m%d').strftime('%Y-%m-%d')
7379
metadata.work.dataset_version = _date.strftime('%Y%m%d') + '-' + _revision
80+
metadata.work.procurement_public_issued_by_country = notice_metadata.country_of_buyer
81+
metadata.work.procurement_public_url_etendering = notice_metadata.uri_list
7482

7583
# EXPRESSION
7684
metadata.expression.title = {LANGUAGE: BASE_TITLE + " " + metadata.notice.id}
7785

86+
# MANIFESTATION
87+
metadata.manifestation.date_publication = publication_date
7888
return metadata
7989

8090

81-
def publication_notice_year(notice_id):
82-
return notice_id.split(NORMALIZED_SEPARATOR)[1]
91+
def publication_notice_year(notice_metadata):
    """
    Extract the year from the notice publication date.

    :param notice_metadata: object whose ``publication_date`` is a ``YYYYMMDD`` string
    :return: the year formatted with ``%Y``
    """
    published_on = datetime.datetime.strptime(notice_metadata.publication_date, '%Y%m%d')
    return published_on.strftime('%Y')
93+
94+
95+
def publication_notice_number(notice_id):
    """
    Return the part of the notice id that precedes the first NORMALIZED_SEPARATOR.

    :param notice_id: normalized notice id string
    :return: the leading segment of the id; the whole id if it has no separator
    """
    leading_part, *_ = notice_id.split(NORMALIZED_SEPARATOR)
    return leading_part
97+
98+
99+
def publication_notice_uri(notice_id, notice_metadata):
    """
    Build the work URI of a notice as ``<BASE_WORK><year>/<notice_id>``.

    :param notice_id: normalized notice id, used as the last path segment
    :param notice_metadata: metadata providing the publication date for the year
    :return: the URI string
    """
    year = publication_notice_year(notice_metadata)
    return f"{BASE_WORK}{year}/{notice_id}"
83101

84102

85-
def publication_notice_uri(notice_id):
86-
return f"{BASE_WORK}{publication_notice_year(notice_id)}/{notice_id}"
103+
def publication_work_identifier(notice_id, notice_metadata):
    """
    Build the work identifier ``<year>_<ojs_type>_<ojs_issue_number>_<number>``.

    :param notice_id: normalized notice id; its leading segment is the number
    :param notice_metadata: metadata providing publication date, ojs_type and ojs_issue_number
    :return: the identifier string
    """
    publication_year = publication_notice_year(notice_metadata)
    notice_number = publication_notice_number(notice_id)
    return "{}_{}_{}_{}".format(publication_year,
                                notice_metadata.ojs_type,
                                notice_metadata.ojs_issue_number,
                                notice_number)

0 commit comments

Comments
 (0)