Skip to content

Commit d31b1f6

Browse files
Merge pull request #422 from OP-TED/feature/TED-1173
check availability in Cellar with a batch of URIs
2 parents 43990b8 + 0eee7d7 commit d31b1f6

4 files changed

Lines changed: 48 additions & 13 deletions

File tree

dags/dags_utils.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,10 @@
1-
from itertools import islice, chain
2-
from typing import Any, Iterable
1+
from typing import Any
32

43
from airflow.operators.python import get_current_context
54

65
TASK_INSTANCE = "ti"
76

87

9-
def chunks(iterable: Iterable, chunk_size: int):
10-
iterator = iter(iterable)
11-
for first in iterator:
12-
yield chain([first], islice(iterator, chunk_size - 1))
13-
14-
158
def select_first_non_none(data):
169
"""
1710

dags/operators/DagBatchPipelineOperator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
55
from pymongo import MongoClient
66

7-
from dags.dags_utils import pull_dag_upstream, push_dag_downstream, chunks, get_dag_param, smart_xcom_pull, \
7+
from dags.dags_utils import pull_dag_upstream, push_dag_downstream, get_dag_param, smart_xcom_pull, \
88
smart_xcom_push
9+
from ted_sws.core.service.batch_processing import chunks
910
from dags.pipelines.pipeline_protocols import NoticePipelineCallable
1011
from ted_sws import config
1112
from ted_sws.data_manager.adapters.notice_repository import NoticeRepository
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from itertools import chain, islice
2+
from typing import Iterable
3+
4+
5+
def chunks(iterable: Iterable, chunk_size: int):
    """
    Split an iterable into consecutive chunks of at most chunk_size items.

    Every chunk is materialised as a list before it is yielded. A lazy view over
    the shared iterator would silently produce misaligned chunks whenever the
    caller moves on to the next chunk before fully reading the current one
    (for example ``list(chunks(data, 2))``).
    :param iterable: any iterable; it is consumed one chunk at a time
    :param chunk_size: maximum number of items per chunk, must be at least 1
    :return: a generator of lists; only the last list may be shorter than chunk_size
    :raises ValueError: when iteration starts and chunk_size is lower than 1
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    iterator = iter(iterable)
    while True:
        # islice pulls at most chunk_size items from the shared iterator.
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            # The source iterator is exhausted; never yield an empty chunk.
            return
        yield chunk

ted_sws/notice_validator/services/check_availability_of_notice_in_cellar.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
1-
from typing import List
1+
from typing import List, Set
22

33
from pymongo import MongoClient
44
from ted_sws.core.model.notice import Notice, NoticeStatus
5+
from ted_sws.core.service.batch_processing import chunks
56
from ted_sws.data_manager.adapters.notice_repository import NoticeRepository
67
from ted_sws.data_manager.adapters.sparql_endpoint import SPARQLTripleStoreEndpoint
78

89
WEBAPI_SPARQL_URL = "https://publications.europa.eu/webapi/rdf/sparql"
910
CELLAR_NOTICE_AVAILABILITY_QUERY = "ASK {{ VALUES ?instance {{<{notice_uri}>}} ?instance ?predicate [] . }}"
11+
CELLAR_NOTICES_AVAILABILITY_QUERY = "select distinct ?s {{VALUES ?s {{$notice_uries}} ?s ?p ?o . }}"
1012
WEBAPI_SPARQL_RUN_FORMAT = "application/sparql-results+json"
1113
INVALID_NOTICE_URI = 'https://www.w3.org/1999/02/22-rdf-syntax-ns#type-invalid'
14+
DEFAULT_NOTICES_BATCH_SIZE = 1000
1215

1316

1417
def check_availability_of_notice_in_cellar(notice_uri: str, endpoint_url: str = WEBAPI_SPARQL_URL) -> bool:
@@ -23,6 +26,19 @@ def check_availability_of_notice_in_cellar(notice_uri: str, endpoint_url: str =
2326
return result['boolean']
2427

2528

29+
def check_availability_of_notices_in_cellar(notice_uries: List[str], endpoint_url: str = WEBAPI_SPARQL_URL) -> Set[str]:
    """
    Check which of the given notice URIs are available in Cellar, using one SPARQL query for the whole batch.
    :param notice_uries: notice URIs to look up; each one is wrapped in angle brackets for the VALUES clause
    :param endpoint_url: SPARQL endpoint to query, defaults to WEBAPI_SPARQL_URL
    :return: the set of values found in the ``s`` column of the query result,
        i.e. the URIs that matched at least one triple; empty set for empty input
    """
    from string import Template

    if not notice_uries:
        # Nothing to look up: skip the remote call entirely.
        return set()
    values_clause = " ".join(f"<{notice_uri}>" for notice_uri in notice_uries)
    # The batch template escapes its braces for str.format ("{{" / "}}") but marks the
    # placeholder with "$notice_uries": collapse the braces first, then substitute.
    query_template = Template(CELLAR_NOTICES_AVAILABILITY_QUERY.format())
    query = query_template.substitute(notice_uries=values_clause)
    result = SPARQLTripleStoreEndpoint(endpoint_url=endpoint_url).with_query(sparql_query=query).fetch_tabular()
    # NOTE(review): assumes fetch_tabular() returns a table exposing the selected ?s variable as column 's'.
    return set(result['s'].to_list())
40+
41+
2642
def generate_notice_uri_from_notice_id(notice_id: str) -> str:
2743
"""
2844
This service generates Cellar URI for a notice, determined by notice_id
@@ -60,6 +76,17 @@ def validate_notices_availability_in_cellar(notice_statuses: List[NoticeStatus],
6076
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
6177
for notice_status in notice_statuses:
6278
selected_notices = notice_repository.get_notices_by_status(notice_status=notice_status)
63-
for selected_notice in selected_notices:
64-
validate_notice_availability_in_cellar(notice=selected_notice)
65-
notice_repository.update(notice=selected_notice)
79+
for selected_notices_chunk in chunks(selected_notices, chunk_size=DEFAULT_NOTICES_BATCH_SIZE):
80+
selected_notices_map = {
81+
generate_notice_uri_from_notice_id(notice_id=notice.ted_id): notice
82+
for notice in selected_notices_chunk
83+
}
84+
selected_notices_uries = list(selected_notices_map.keys())
85+
available_notice_uries_in_cellar = check_availability_of_notices_in_cellar(
86+
notice_uries=selected_notices_uries)
87+
for notice_uri, notice in selected_notices_map.items():
88+
if notice_uri in available_notice_uries_in_cellar:
89+
notice.update_status_to(new_status=NoticeStatus.PUBLICLY_AVAILABLE)
90+
else:
91+
notice.update_status_to(new_status=NoticeStatus.PUBLICLY_UNAVAILABLE)
92+
notice_repository.update(notice=notice)

0 commit comments

Comments
 (0)