Skip to content

Commit f012140

Browse files
authored
Merge pull request #580 from OP-TED/future-dev
Future dev
2 parents 43e3b0f + 707c50a commit f012140

61 files changed

Lines changed: 522 additions & 321 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/unit-tests-srv.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@ jobs:
2323
steps:
2424
- uses: actions/checkout@v2
2525

26+
# # <-- ADDING THIS STEP TO SOLVE CACHE ISSUE -->
27+
# - uses: actions/cache@v4
28+
# with:
29+
# path: |
30+
# ~/.cache/pip
31+
# .pytest_cache
32+
# key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
33+
2634
- name: Make envfile
2735
run: make staging-dotenv-file
2836

@@ -55,7 +63,7 @@ jobs:
5563
run: mkdir -p $GITHUB_WORKSPACE/.scannerwork && chmod -R 777 $GITHUB_WORKSPACE/.scannerwork && pwd && ls -a
5664

5765
- name: SonarCloud Scan
58-
uses: SonarSource/sonarqube-scan-action@v4.1.0
66+
uses: SonarSource/sonarqube-scan-action@v4.2.0
5967
env:
6068
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any
6169
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}

.github/workflows/unit-tests.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ jobs:
2020
with:
2121
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
2222
- name: Set up Python ${{ matrix.python-version }}
23-
uses: actions/setup-python@v2
23+
uses: actions/setup-python@v4
2424
with:
25-
python-version: 3.8
25+
python-version: '3.10'
2626
- name: Install dependencies
2727
run: |
2828
sudo apt-get update

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@ CAROOT = $(shell pwd)/infra/traefik/certs
2424
install:
2525
@ echo -e "$(BUILD_PRINT)Installing the requirements$(END_BUILD_PRINT)"
2626
@ pip install --upgrade pip
27-
@ pip install --no-cache-dir -r requirements.txt --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.1/constraints-no-providers-3.8.txt"
27+
@ pip install --no-cache-dir -r requirements.txt --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.5/constraints-no-providers-3.10.txt"
2828

2929
install-dev:
3030
@ echo -e "$(BUILD_PRINT)Installing the dev requirements$(END_BUILD_PRINT)"
3131
@ pip install --upgrade pip
32-
@ pip install --no-cache-dir -r requirements.dev.txt --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.1/constraints-no-providers-3.8.txt"
32+
@ pip install --no-cache-dir -r requirements.dev.txt --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.5/constraints-no-providers-3.10.txt"
3333

3434
test: test-unit
3535

dags/fetch_notices_by_date.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from datetime import timedelta
1+
from datetime import timedelta, date, datetime
22

33
from airflow.decorators import dag, task
4-
from airflow.operators.dummy import DummyOperator
4+
from airflow.models import Param
5+
from airflow.operators.empty import EmptyOperator
56
from airflow.operators.python import BranchPythonOperator, PythonOperator
67
from airflow.timetables.trigger import CronTriggerTimetable
78
from airflow.utils.trigger_rule import TriggerRule
@@ -33,7 +34,25 @@
3334
timetable=CronTriggerTimetable(
3435
cron=config.SCHEDULE_DAG_FETCH,
3536
timezone=DAG_DEFAULT_TIMEZONE),
36-
tags=['selector', 'daily-fetch'])
37+
tags=['selector', 'daily-fetch'],
38+
params={
39+
WILD_CARD_DAG_KEY: Param(
40+
default=f"{date.today() - timedelta(days=1)}",
41+
type="string",
42+
format="date",
43+
title="Date",
44+
description="""This field is required.
45+
Date to fetch notices from TED."""
46+
),
47+
TRIGGER_COMPLETE_WORKFLOW_DAG_KEY: Param(
48+
default=True,
49+
type="boolean",
50+
title="Trigger Complete Workflow",
51+
description="""This field is required.
52+
If true, the complete workflow will be triggered, otherwise only the partial workflow will be triggered."""
53+
)
54+
}
55+
)
3756
def fetch_notices_by_date():
3857
@task
3958
@event_log(TechnicalEventMessage(
@@ -43,7 +62,11 @@ def fetch_notices_by_date():
4362
))
4463
)
4564
def fetch_by_date_notice_from_ted():
46-
notice_ids = notice_fetcher_by_date_pipeline(date_wild_card=get_dag_param(key=WILD_CARD_DAG_KEY))
65+
default_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
66+
selected_date = datetime.strptime(get_dag_param(key=WILD_CARD_DAG_KEY,
67+
default_value=default_date), "%Y-%m-%d")
68+
date_wild_card = datetime.strftime(selected_date, "%Y%m%d*")
69+
notice_ids = notice_fetcher_by_date_pipeline(date_wild_card=date_wild_card)
4770
if not notice_ids:
4871
log_error("No notices has been fetched!")
4972
else:
@@ -65,10 +88,10 @@ def validate_fetched_notices():
6588
from ted_sws.supra_notice_manager.services.supra_notice_validator import validate_and_update_daily_supra_notice
6689
from datetime import datetime
6790
from pymongo import MongoClient
68-
91+
default_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
6992
publication_date = datetime.strptime(get_dag_param(key=WILD_CARD_DAG_KEY,
70-
default_value=(datetime.now() - timedelta(days=1)).strftime(
71-
"%Y%m%d*")), "%Y%m%d*")
93+
default_value=default_date
94+
), "%Y-%m-%d")
7295
mongodb_client = MongoClient(config.MONGO_DB_AUTH_URL)
7396
validate_and_update_daily_supra_notice(ted_publication_date=publication_date, mongodb_client=mongodb_client)
7497

@@ -95,7 +118,7 @@ def _branch_selector():
95118
python_callable=validate_fetched_notices
96119
)
97120

98-
finish_step = DummyOperator(task_id=FINISH_FETCH_BY_DATE_TASK_ID,
121+
finish_step = EmptyOperator(task_id=FINISH_FETCH_BY_DATE_TASK_ID,
99122
trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS)
100123

101124
fetch_by_date_notice_from_ted() >> branch_task >> [trigger_normalisation_workflow,

dags/fetch_notices_by_date_range.py

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
from datetime import datetime
1+
from datetime import datetime, date
22
from typing import Any
33
from dateutil import rrule
44

55
from airflow.decorators import dag, task
66
from airflow.operators.python import get_current_context
77
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
8+
from airflow.models import Param
89

910
from dags import DEFAULT_DAG_ARGUMENTS
1011
from dags.dags_utils import get_dag_param
@@ -20,20 +21,46 @@
2021
END_DATE_KEY = "end_date"
2122

2223

23-
def generate_wildcards_foreach_day_in_range(start_date: str, end_date: str) -> list:
24+
def generate_list_of_dates_from_date_range(start_date: str, end_date: str) -> list:
2425
"""
2526
Given a date range returns all daily dates in that range
2627
:param start_date:
2728
:param end_date:
2829
:return:
2930
"""
30-
return [dt.strftime('%Y%m%d*')
31+
return [dt
3132
for dt in rrule.rrule(rrule.DAILY,
32-
dtstart=datetime.strptime(start_date, '%Y%m%d'),
33-
until=datetime.strptime(end_date, '%Y%m%d'))]
33+
dtstart=datetime.strptime(start_date, '%Y-%m-%d'),
34+
until=datetime.strptime(end_date, '%Y-%m-%d'))]
3435

3536

36-
@dag(default_args=DEFAULT_DAG_ARGUMENTS, schedule_interval=None, tags=['master'])
37+
@dag(default_args=DEFAULT_DAG_ARGUMENTS, schedule_interval=None, tags=['master'],
38+
params={
39+
START_DATE_KEY: Param(
40+
default=f"{date.today()}",
41+
type="string",
42+
format="date",
43+
title="Start Date",
44+
description="""This field is required.
45+
Start date of the date range to fetch notices from TED."""
46+
),
47+
END_DATE_KEY: Param(
48+
default=f"{date.today()}",
49+
type="string",
50+
format="date",
51+
title="End Date",
52+
description="""This field is required.
53+
End date of the date range to fetch notices from TED."""
54+
),
55+
TRIGGER_COMPLETE_WORKFLOW_DAG_KEY: Param(
56+
default=False,
57+
type="boolean",
58+
title="Trigger Complete Workflow",
59+
description="""This field is required.
60+
If true, the complete workflow will be triggered, otherwise only the partial workflow will be triggered."""
61+
)
62+
}
63+
)
3764
def fetch_notices_by_date_range():
3865
@task
3966
@event_log(TechnicalEventMessage(
@@ -47,12 +74,12 @@ def trigger_notice_by_date_for_each_date_in_range():
4774
start_date = get_dag_param(key=START_DATE_KEY, raise_error=True)
4875
end_date = get_dag_param(key=END_DATE_KEY, raise_error=True)
4976
trigger_complete_workflow = get_dag_param(key=TRIGGER_COMPLETE_WORKFLOW_DAG_KEY, default_value=False)
50-
date_wildcards = generate_wildcards_foreach_day_in_range(start_date, end_date)
51-
for date_wildcard in date_wildcards:
77+
fetch_dates = generate_list_of_dates_from_date_range(start_date, end_date)
78+
for fetch_date in fetch_dates:
5279
TriggerDagRunOperator(
53-
task_id=f'trigger_notice_fetch_by_date_workflow_dag_{date_wildcard[:-1]}',
80+
task_id=f'trigger_notice_fetch_by_date_workflow_dag_{fetch_date.strftime("%Y_%m_%d")}',
5481
trigger_dag_id=FETCH_NOTICES_BY_DATE_DAG_NAME,
55-
conf={WILD_CARD_DAG_KEY: date_wildcard,
82+
conf={WILD_CARD_DAG_KEY: fetch_date.strftime('%Y-%m-%d'),
5683
TRIGGER_COMPLETE_WORKFLOW_DAG_KEY: trigger_complete_workflow,
5784
}
5885
).execute(context=context)

dags/fetch_notices_by_query.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from airflow.decorators import dag, task
2-
from airflow.operators.dummy import DummyOperator
2+
from airflow.operators.empty import EmptyOperator
3+
from airflow.models import Param
34
from airflow.operators.python import BranchPythonOperator
45
from airflow.utils.trigger_rule import TriggerRule
56
from dags import DEFAULT_DAG_ARGUMENTS
@@ -22,7 +23,24 @@
2223

2324
@dag(default_args=DEFAULT_DAG_ARGUMENTS,
2425
schedule_interval=None,
25-
tags=['fetch'])
26+
tags=['fetch'],
27+
params={
28+
QUERY_DAG_KEY: Param(
29+
default=None,
30+
type="string",
31+
title="Query",
32+
description="""This field is required.
33+
Query to fetch notices from TED."""
34+
),
35+
TRIGGER_COMPLETE_WORKFLOW_DAG_KEY: Param(
36+
default=True,
37+
type="boolean",
38+
title="Trigger Complete Workflow",
39+
description="""This field is required.
40+
If true, the complete workflow will be triggered, otherwise only the partial workflow will be triggered."""
41+
)
42+
}
43+
)
2644
def fetch_notices_by_query():
2745
@task
2846
@event_log(TechnicalEventMessage(
@@ -58,7 +76,7 @@ def _branch_selector():
5876
python_callable=_branch_selector,
5977
)
6078

61-
finish_step = DummyOperator(task_id=FINISH_FETCH_BY_DATE_TASK_ID,
79+
finish_step = EmptyOperator(task_id=FINISH_FETCH_BY_DATE_TASK_ID,
6280
trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS)
6381

6482
fetch_by_query_notice_from_ted() >> branch_task >> [trigger_normalisation_workflow,

dags/load_mapping_suite_in_database.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from airflow.decorators import dag, task
2-
from airflow.operators.dummy import DummyOperator
2+
from airflow.models import Param
3+
from airflow.operators.empty import EmptyOperator
34
from airflow.operators.python import get_current_context, BranchPythonOperator
45
from airflow.utils.trigger_rule import TriggerRule
56
from pymongo import MongoClient
@@ -30,7 +31,37 @@
3031

3132
@dag(default_args=DEFAULT_DAG_ARGUMENTS,
3233
schedule_interval=None,
33-
tags=['fetch', 'mapping-suite', 'github'])
34+
tags=['fetch', 'mapping-suite', 'github'],
35+
params={
36+
GITHUB_REPOSITORY_URL_DAG_PARAM_KEY: Param(
37+
default=None,
38+
type=["null", "string"],
39+
title="Github repository url",
40+
description="""This is optional field.
41+
Github repository url to fetch mapping suite package from."""
42+
),
43+
BRANCH_OR_TAG_NAME_DAG_PARAM_KEY: Param(
44+
default=None,
45+
type=["null", "string"],
46+
title="Branch or tag name",
47+
description="""This is optional field.
48+
Branch or tag name to fetch mapping suite package from."""
49+
),
50+
MAPPING_SUITE_PACKAGE_NAME_DAG_PARAM_KEY: Param(
51+
default=None,
52+
type=["null", "string"],
53+
title="Mapping suite package name",
54+
description="""This is optional field.
55+
Mapping suite package name to fetch from github repository."""
56+
),
57+
LOAD_TEST_DATA_DAG_PARAM_KEY: Param(
58+
default=False,
59+
type="boolean",
60+
title="Load test data",
61+
description="""This field is used to load test data."""
62+
)
63+
}
64+
)
3465
def load_mapping_suite_in_database():
3566
@task
3667
@event_log(is_loggable=False)
@@ -77,7 +108,7 @@ def _branch_selector():
77108
task_id=CHECK_IF_LOAD_TEST_DATA_TASK_ID,
78109
python_callable=_branch_selector,
79110
)
80-
finish_step = DummyOperator(task_id=FINISH_LOADING_MAPPING_SUITE_TASK_ID,
111+
finish_step = EmptyOperator(task_id=FINISH_LOADING_MAPPING_SUITE_TASK_ID,
81112
trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS)
82113

83114
trigger_document_proc_pipeline = TriggerNoticeBatchPipelineOperator(task_id=TRIGGER_DOCUMENT_PROC_PIPELINE_TASK_ID)

dags/pipelines/notice_batch_processor_pipelines.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def notices_batch_distillation_pipeline(notice_ids: List[str], mongodb_client: M
2222
for notice_id in notice_ids:
2323
notice = notice_repository.get(reference=notice_id)
2424
notice.set_distilled_rdf_manifestation(
25-
distilled_rdf_manifestation=notice.rdf_manifestation.copy())
25+
distilled_rdf_manifestation=notice.rdf_manifestation.model_copy())
2626
notices.append(notice)
2727
for cet_uri in CET_URIS:
2828
deduplicate_entities_by_cet_uri(notices=notices, cet_uri=cet_uri)

dags/pipelines/notice_fetcher_pipelines.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def notice_fetcher_by_query_pipeline(query: str = None) -> List[str]:
3737
from ted_sws.event_manager.services.log import log_error
3838
notice_ids = None
3939
try:
40-
ted_api_query = {"q": query}
40+
ted_api_query = {"query": query}
4141
mongodb_client = MongoClient(config.MONGO_DB_AUTH_URL)
4242
notice_ids = NoticeFetcher(notice_repository=NoticeRepository(mongodb_client=mongodb_client),
4343
ted_api_adapter=TedAPIAdapter(

infra/airflow-cluster/Dockerfile

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,30 @@
1-
FROM docker.io/apache/airflow:2.5.1-python3.8
1+
FROM docker.io/apache/airflow:2.10.5-python3.10
22

33
# quick sudo
44
USER root
55

66
RUN apt-get update && apt-get install -y \
77
build-essential=12.9 \
8-
default-jre=2:1.11-72 \
9-
git=1:2.30.2-1+deb11u2 \
8+
default-jre=2:1.17-74 \
9+
git=1:2.39.5-0+deb12u2 \
1010
make=4.3-4.1 \
11-
wget=1.21-1+deb11u1 \
12-
unzip=6.0-26+deb11u1 \
11+
wget=1.21.3-1+deb12u1 \
12+
unzip=6.0-28 \
13+
gosu=1.14-1+b10 \
1314
&& rm -rf /var/lib/apt/lists/*
1415

1516
# back to normal user
1617
USER airflow
1718

1819
COPY libraries /home/airflow
19-
2020
# requirements.txt shall be made availble from the **ted-sws** GitHub repository
2121
COPY requirements.txt /opt/airflow
2222

2323
# working in the /opt/airflow
2424
WORKDIR /opt/airflow
25+
2526
RUN mkdir -p ./dags ./ted_sws
2627

2728

2829
RUN pip install --upgrade pip
29-
RUN pip install --no-cache-dir -r requirements.txt --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.1/constraints-no-providers-3.8.txt"
30+
RUN pip install --no-cache-dir -r requirements.txt --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.5/constraints-no-providers-3.10.txt"

0 commit comments

Comments
 (0)