Skip to content

Commit d28a5d0

Browse files
committed
Merge branch 'main' into feature/TED-1030
2 parents 81c0dbd + e52c3d6 commit d28a5d0

28 files changed

Lines changed: 340 additions & 197 deletions

File tree

dags/notice_fetch_by_date_workflow.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from datetime import timedelta
2+
13
from airflow.decorators import dag, task
24
from airflow.operators.dummy import DummyOperator
35
from airflow.operators.python import BranchPythonOperator, PythonOperator
@@ -58,7 +60,9 @@ def validate_fetched_notices():
5860
from datetime import datetime
5961
from pymongo import MongoClient
6062

61-
publication_date = datetime.strptime(get_dag_param(key=WILD_CARD_DAG_KEY), "%Y%m%d*")
63+
publication_date = datetime.strptime(get_dag_param(key=WILD_CARD_DAG_KEY,
64+
default_value=(datetime.now() - timedelta(days=1)).strftime(
65+
"%Y%m%d*")), "%Y%m%d*")
6266
mongodb_client = MongoClient(config.MONGO_DB_AUTH_URL)
6367
validate_and_update_daily_supra_notice(notice_publication_day=publication_date,
6468
mongodb_client=mongodb_client)

ted_sws/data_manager/adapters/triple_store.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
from ted_sws import config
1414
from ted_sws.data_manager.adapters.sparql_endpoint import TripleStoreEndpointABC, SPARQLTripleStoreEndpoint
1515

16-
1716
FUSEKI_REPOSITORY_ALREADY_EXIST_ERROR_MSG = 'A repository with this name already exists.'
1817

18+
1919
class TripleStoreABC:
2020
@abc.abstractmethod
2121
def create_repository(self, repository_name: str):

ted_sws/mapping_suite_processor/adapters/mapping_suite_hasher.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
""" """
99
import hashlib
1010
import pathlib
11+
import re
1112
from typing import Tuple, List, Union
1213

1314
from ted_sws.data_manager.adapters.mapping_suite_repository import MS_TRANSFORM_FOLDER_NAME, \
@@ -35,7 +36,10 @@ def _hash_a_file(file_path: pathlib.Path) -> Tuple[str, str]:
3536
"""
3637
Return a tuple of the relative file path and the file hash.
3738
"""
38-
hashed_line = hashlib.sha256(file_path.read_bytes()).hexdigest()
39+
# remove new-lines to align content generated on different operating systems
40+
new_line_pattern = re.compile(b'\r\n|\r|\n')
41+
file_content = re.sub(new_line_pattern, b'', file_path.read_bytes())
42+
hashed_line = hashlib.sha256(file_content).hexdigest()
3943
relative_path = str(file_path).replace(str(self.mapping_suite_path), "")
4044
return relative_path, hashed_line
4145

@@ -62,7 +66,7 @@ def hash_mapping_suite(self, with_version: str = "") -> str:
6266
"""
6367
Returns a hash of the mapping suite.
6468
Only the critical resources are hashed in the mapping suite.
65-
The decission which rescources are "critical" is implemented
69+
The decision which resources are "critical" is implemented
6670
in self.hash_critical_mapping_files() function.
6771
6872
If "with_version" parameter is used, then it computed the mapping

ted_sws/mapping_suite_processor/services/load_mapping_suite_output_into_triple_store.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pathlib
22

33
from ted_sws import config
4-
from ted_sws.data_manager.adapters.triple_store import AllegroGraphTripleStore, FusekiAdapter, TripleStoreABC
4+
from ted_sws.data_manager.adapters.triple_store import FusekiAdapter, TripleStoreABC
55

66

77
def repository_exists(triple_store: TripleStoreABC, repository_name) -> bool:

ted_sws/master_data_registry/services/entity_deduplication.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from ted_sws.event_manager.services.log import log_error, log_notice_error
1818
from ted_sws.master_data_registry.services.rdf_fragment_processor import get_rdf_fragments_by_cet_uri_from_notices, \
1919
merge_rdf_fragments_into_graph, write_rdf_fragments_in_triple_store, RDF_FRAGMENT_FROM_NOTICE_PROPERTY, \
20-
get_subjects_by_cet_uri
20+
get_subjects_by_cet_uri, get_rdf_fragment_by_cet_uri_from_notice
2121

2222
MDR_TEMPORARY_FUSEKI_DATASET_NAME = "tmp_mdr_dataset"
2323
MDR_FUSEKI_DATASET_NAME = "mdr_dataset"
@@ -226,12 +226,14 @@ def deduplicate_entities_by_cet_uri(notices: List[Notice], cet_uri: str,
226226
alignment_graph=cet_alignment_links, inject_reflexive_links=True)
227227

228228

229-
def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str, mongodb_client: MongoClient):
229+
def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str, mongodb_client: MongoClient,
230+
mdr_dataset_name: str = MDR_FUSEKI_DATASET_NAME):
230231
"""
231232
This function deduplicate procedure entities for each notice from batch of notices.
232233
:param notices:
233234
:param procedure_cet_uri:
234235
:param mongodb_client:
236+
:param mdr_dataset_name:
235237
:return:
236238
"""
237239
notice_families = defaultdict(list)
@@ -243,6 +245,14 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
243245

244246
parent_uries = {}
245247
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
248+
triple_store = FusekiAdapter()
249+
if mdr_dataset_name not in triple_store.list_repositories():
250+
try:
251+
triple_store.create_repository(repository_name=mdr_dataset_name)
252+
except Exception as exception:
253+
if str(exception) != FUSEKI_REPOSITORY_ALREADY_EXIST_ERROR_MSG:
254+
log_error(message=str(exception))
255+
246256
for parent_notice_id in notice_families.keys():
247257
parent_notice = notice_repository.get(reference=parent_notice_id)
248258
if parent_notice and parent_notice.rdf_manifestation and parent_notice.rdf_manifestation.object_data:
@@ -261,6 +271,11 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
261271
else:
262272
parent_procedure_uri = rdflib.URIRef(result_uris[0])
263273
parent_uries[parent_notice_id] = parent_procedure_uri
274+
parent_procedure_rdf_fragments = get_rdf_fragment_by_cet_uri_from_notice(notice=parent_notice,
275+
cet_uri=procedure_cet_uri)
276+
parent_new_cet = {parent_procedure_uri: parent_procedure_rdf_fragments[0]}
277+
register_new_cets_in_mdr(new_canonical_entities=parent_new_cet, triple_store=triple_store,
278+
mdr_dataset_name=mdr_dataset_name)
264279

265280
for parent_uri_key in parent_uries.keys():
266281
parent_uri = parent_uries[parent_uri_key]

ted_sws/notice_fetcher/adapters/ted_api.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["AA", "AC", "CY", "DD", "DI", "DS", "TVL", "TY",
1717
"DT", "MA", "NC", "ND", "OC", "OJ", "OL", "OY",
1818
"PC", "PD", "PR", "RC", "RN", "RP", "TD", "TVH",
19-
"CONTENT", "notice-type", "award-criterion-type", "corporate-body",
20-
"funding", "notice-identifier", "notice-version"
19+
"CONTENT",
20+
# INFO: This query result fields is not supported correctly by TED-API.
21+
#"notice-type", "award-criterion-type", "corporate-body",
22+
#"funding", "notice-identifier", "notice-version"
2123
]}
2224

2325
TOTAL_DOCUMENTS_NUMBER = "total"

ted_sws/notice_metadata_processor/model/metadata.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class ExtractedMetadata(Metadata):
1515
city_of_buyer: List[LanguageTaggedString] = None
1616
name_of_buyer: List[LanguageTaggedString] = None
1717
original_language: str = None
18+
uri_list: str = None
1819
country_of_buyer: str = None
1920
type_of_buyer: EncodedValue = None
2021
eu_institution: str = None

ted_sws/notice_metadata_processor/services/xml_manifestation_metadata_extractor.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,17 @@ def name_of_buyer(self):
8686
def eu_institution(self):
8787
return self.type_of_buyer.value if self.type_of_buyer.code == "5" else "-"
8888

89+
@property
90+
def uri_list(self):
91+
uri_elements = self.manifestation_root.findall(
92+
self.xpath_registry.xpath_uri_elements,
93+
namespaces=self.namespaces)
94+
95+
return [LanguageTaggedString(text=extract_text_from_element(element=uri.find(".")),
96+
language=extract_attribute_from_element(element=uri.find("."),
97+
attrib_key="LG")) for
98+
uri in uri_elements]
99+
89100
@property
90101
def country_of_buyer(self):
91102
return extract_attribute_from_element(element=self.manifestation_root.find(
@@ -219,14 +230,16 @@ def to_metadata(self) -> ExtractedMetadata:
219230
Creating extracted metadata
220231
:return:
221232
"""
222-
metadata = ExtractedMetadata()
233+
metadata: ExtractedMetadata = ExtractedMetadata()
223234
metadata.title = self.title
224235
metadata.notice_publication_number = self.notice_publication_number
225236
metadata.publication_date = self.publication_date
237+
metadata.ojs_type = self.ojs_type
226238
metadata.ojs_issue_number = self.ojs_issue_number
227239
metadata.city_of_buyer = self.city_of_buyer
228240
metadata.name_of_buyer = self.name_of_buyer
229241
metadata.original_language = self.original_language
242+
metadata.uri_list = self.uri_list
230243
metadata.country_of_buyer = self.country_of_buyer
231244
metadata.type_of_buyer = self.type_of_buyer
232245
metadata.eu_institution = self.eu_institution

ted_sws/notice_metadata_processor/services/xpath_registry.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ def xpath_name_of_buyer_elements(self):
4343
def xpath_country_of_buyer(self):
4444
return "manifestation_ns:CODED_DATA_SECTION/manifestation_ns:NOTICE_DATA/manifestation_ns:ISO_COUNTRY"
4545

46+
@property
47+
def xpath_uri_elements(self):
48+
return "manifestation_ns:CODED_DATA_SECTION/manifestation_ns:NOTICE_DATA/manifestation_ns:URI_LIST/"
49+
4650
@property
4751
def xpath_original_language(self):
4852
return "manifestation_ns:CODED_DATA_SECTION/manifestation_ns:NOTICE_DATA/manifestation_ns:LG_ORIG"

ted_sws/notice_packager/entrypoints/cli/cmd_bulk_packager.py

Lines changed: 53 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,22 @@
1111
import base64
1212
import os
1313
from pathlib import Path
14+
from typing import List
1415

1516
import click
17+
from pymongo import MongoClient
1618

19+
from ted_sws import config
1720
from ted_sws.core.adapters.cmd_runner import CmdRunner as BaseCmdRunner
1821
from ted_sws.core.model.manifestation import XMLManifestation
1922
from ted_sws.core.model.notice import Notice
23+
from ted_sws.data_manager.adapters.notice_repository import NoticeRepository
24+
from ted_sws.event_manager.adapters.log import LOG_WARN_TEXT
2025
from ted_sws.notice_metadata_processor.services.xml_manifestation_metadata_extractor import \
2126
XMLManifestationMetadataExtractor
22-
from ted_sws.notice_packager.services.metadata_transformer import MetadataTransformer
23-
from ted_sws.notice_packager.services.notice_packager import create_notice_package
2427
from ted_sws.notice_packager import DEFAULT_NOTICE_PACKAGE_EXTENSION
28+
from ted_sws.notice_packager.services.metadata_transformer import MetadataTransformer
29+
from ted_sws.notice_packager.services.notice_packager import create_notice_package, package_notice_and_save_to
2530

2631
CMD_NAME = "CMD_BULK_PACKAGER"
2732
DEFAULT_FILES_COUNT: int = 3000
@@ -35,47 +40,60 @@ class PackageNotice(Notice):
3540

3641

3742
class CmdRunner(BaseCmdRunner):
38-
def __init__(self, rdf_files_folder, output_folder, pkgs_count: int):
43+
def __init__(self, rdf_files_folder, output_folder, pkgs_count: int, notice_ids: List = None,
44+
mongodb_client=MongoClient(config.MONGO_DB_AUTH_URL)):
3945
super().__init__(name=CMD_NAME)
40-
self.rdf_files_path = Path(os.path.realpath(rdf_files_folder))
4146
self.output_path = Path(os.path.realpath(output_folder))
42-
self.pkgs_count = pkgs_count
43-
if not self.rdf_files_path.is_dir():
44-
error_msg = f"No such folder :: [{rdf_files_folder}]"
45-
self.log_failed_msg(error_msg)
46-
raise FileNotFoundError(error_msg)
47+
self.notices = None
48+
if notice_ids:
49+
self.log(LOG_WARN_TEXT.format("Notices: ") + str(notice_ids))
50+
self.notice_repository = NoticeRepository(mongodb_client=mongodb_client)
51+
self.notices = []
52+
for notice_id in notice_ids:
53+
self.notices.append(self.notice_repository.get(reference=notice_id))
54+
else:
55+
self.rdf_files_path = Path(os.path.realpath(rdf_files_folder))
56+
self.pkgs_count = pkgs_count
57+
if not self.rdf_files_path.is_dir():
58+
error_msg = f"No such folder :: [{rdf_files_folder}]"
59+
self.log_failed_msg(error_msg)
60+
raise FileNotFoundError(error_msg)
61+
62+
self.output_path.mkdir(parents=True, exist_ok=True)
4763

4864
def run_cmd(self):
4965
error = None
5066
try:
51-
self.output_path.mkdir(parents=True, exist_ok=True)
52-
rdf_files = [Path(str(f_path)) for f in os.listdir(self.rdf_files_path) if
53-
os.path.isfile(f_path := os.path.join(self.rdf_files_path, f))]
54-
rdf_files_count = len(rdf_files)
55-
base_idx = 100000
56-
year = 2021
57-
58-
for i in range(self.pkgs_count):
59-
rdf_idx = i % rdf_files_count
60-
rdf_file_path = rdf_files[rdf_idx]
61-
notice_id = str(base_idx + i) + "_" + str(year)
62-
pkg_name = notice_id
63-
self.generate_package(notice_id, self.output_path, rdf_file_path, pkg_name)
67+
if self.notices:
68+
self.log("Saving packages to " + str(self.output_path))
69+
for notice in self.notices:
70+
package_notice_and_save_to(notice=notice,
71+
save_to=self.output_path)
72+
else:
73+
rdf_files = [Path(str(f_path)) for f in os.listdir(self.rdf_files_path) if
74+
os.path.isfile(f_path := os.path.join(self.rdf_files_path, f))]
75+
rdf_files_count = len(rdf_files)
76+
base_idx = 100000
77+
year = 2021
78+
79+
for i in range(self.pkgs_count):
80+
rdf_idx = i % rdf_files_count
81+
rdf_file_path = rdf_files[rdf_idx]
82+
notice_id = str(base_idx + i) + "_" + str(year)
83+
self.generate_package(notice_id, self.output_path, rdf_file_path)
6484
except Exception as e:
6585
error = e
6686

6787
return self.run_cmd_result(error)
6888

6989
@classmethod
70-
def generate_package(cls, notice_id, output_path, rdf_file_path, pkg_name):
90+
def generate_package(cls, notice_id, output_path, rdf_file_path):
7191

7292
with open(rdf_file_path, "r") as f:
7393
rdf_content = f.read()
7494

7595
encoded_rdf_content = base64.b64encode(bytes(rdf_content, 'utf-8'))
7696

77-
output_file = output_path / (pkg_name + DEFAULT_NOTICE_PACKAGE_EXTENSION)
78-
7997
notice = PackageNotice(ted_id=notice_id)
8098
notice_metadata = XMLManifestationMetadataExtractor(
8199
xml_manifestation=notice.xml_manifestation).to_metadata()
@@ -84,24 +102,26 @@ def generate_package(cls, notice_id, output_path, rdf_file_path, pkg_name):
84102
create_notice_package(
85103
notice_metadata,
86104
rdf_content=encoded_rdf_content,
87-
save_to=output_file
105+
save_to=output_path
88106
)
89107

90108

91-
def run(rdf_files_count, output_folder, pkgs_count):
92-
cmd = CmdRunner(rdf_files_count, output_folder, pkgs_count)
109+
def run(rdf_files_count=None, output_folder=None, pkgs_count=None, notice_id=None,
110+
mongodb_client=MongoClient(config.MONGO_DB_AUTH_URL)):
111+
cmd = CmdRunner(rdf_files_count, output_folder, pkgs_count, list(notice_id or []), mongodb_client)
93112
cmd.run()
94113

95114

96115
@click.command()
97-
@click.argument('rdf-files-folder', nargs=1)
98-
@click.argument('output-folder', nargs=1)
116+
@click.argument('rdf-files-folder', nargs=1, required=False)
99117
@click.argument('pkgs-count', nargs=1, type=click.INT, required=False, default=DEFAULT_FILES_COUNT)
100-
def main(rdf_files_folder, output_folder, pkgs_count):
118+
@click.option('--output-folder', required=False, default=".")
119+
@click.option('--notice-id', required=False, multiple=True, default=None)
120+
def main(rdf_files_folder, pkgs_count, output_folder, notice_id):
101121
"""
102-
Generates <PKGS_COUNT> test METS packages
122+
Generates test METS packages
103123
"""
104-
run(rdf_files_folder, output_folder, pkgs_count)
124+
run(rdf_files_folder, output_folder, pkgs_count, notice_id)
105125

106126

107127
if __name__ == '__main__':

0 commit comments

Comments
 (0)