Skip to content

Commit 4ced645

Browse files
Merge pull request #518 from OP-TED/feature/TED4-101
integrate TED-API v3 and add eForms sample data
2 parents 44b8a0b + 7d8774d commit 4ced645

94 files changed

Lines changed: 24249 additions & 124 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

ted_sws/core/model/metadata.py

Lines changed: 9 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
""" """
99
from enum import Enum
10-
from typing import List, Optional
10+
from typing import List, Optional, Union
1111

12-
from pydantic import Field
12+
from pydantic import Field, validator
1313
from pydantic.annotated_types import NamedTuple
1414

1515
from ted_sws.core.model import PropertyBaseModel
@@ -124,39 +124,14 @@ class NormalisedMetadataView(Metadata):
124124
eform_sdk_version: Optional[str]
125125

126126

127-
128127
class TEDMetadata(Metadata):
129128
"""
130129
Stores notice original metadata
131130
"""
132-
AA: List[str] = None
133-
AC: str = None
134-
CY: List[str] = None
135-
DD: str = None
136-
DI: str = None
137-
DS: str = None
138-
DT: List[str] = None
139-
MA: List[str] = None
140-
NC: List[str] = None
141-
ND: str = None
142-
NL: str = None
143-
OC: List[str] = None
144-
OJ: str = None
145-
OL: str = None
146-
OY: List[str] = None
147-
PC: List[str] = None
148-
PD: str = None
149-
PR: str = None
150-
RC: List[str] = None
151-
RN: List[str] = None
152-
RP: str = None
153-
TD: str = None
154-
TVH: str = None
155-
TVL: str = None
156-
TY: str = None
157-
award_criterion_type: str = Field(default=None, alias='award-criterion-type')
158-
corporate_body: List[str] = Field(default=None, alias='corporate-body')
159-
funding: List[str] = None
160-
notice_identifier: str = Field(default=None, alias='notice-identifier')
161-
notice_type: str = Field(default=None, alias='notice-type')
162-
notice_version: str = Field(default=None, alias='notice-version')
131+
ND: Optional[str] = None
132+
PD: Optional[str] = None
133+
# ------------------------------------------------------------------
134+
# Note: In TED-API v3 this field is a str; in the past it was a list of str.
135+
# ------------------------------------------------------------------
136+
RN: Optional[Union[List[str], str]] = None
137+
# ------------------------------------------------------------------

ted_sws/master_data_registry/services/entity_deduplication.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,14 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
239239
notice_families = defaultdict(list)
240240
for notice in notices:
241241
if notice.original_metadata and notice.original_metadata.RN:
242-
parent_notice_id = notice.original_metadata.RN[0]
242+
parent_notice_id_field = notice.original_metadata.RN
243+
# ------------------------------------------------------------------
244+
# Note: This logic is kept for backward compatibility with the old TED-API data format.
245+
# ------------------------------------------------------------------
246+
if isinstance(parent_notice_id_field, list):
247+
parent_notice_id_field = parent_notice_id_field[0]
248+
# ------------------------------------------------------------------
249+
parent_notice_id = parent_notice_id_field
243250
parent_notice_id = f"{parent_notice_id[4:]}-{parent_notice_id[:4]}"
244251
notice_families[parent_notice_id].append(notice)
245252

Lines changed: 93 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,34 @@
1-
import base64
21
import json
2+
import time
33
from datetime import date
4-
from typing import List
4+
from http import HTTPStatus
5+
from typing import List, Generator
56

67
import requests
78

89
from ted_sws import config
10+
from ted_sws.event_manager.services.log import log_warning
911
from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC, RequestAPI
1012

11-
DEFAULT_TED_API_QUERY_RESULT_SIZE = {"pageSize": 100,
12-
"pageNum": 1,
13-
"scope": 3
13+
DOCUMENTS_PER_PAGE = 100
14+
15+
DEFAULT_TED_API_QUERY_RESULT_SIZE = {"limit": DOCUMENTS_PER_PAGE,
16+
"page": 1,
17+
"scope": "ALL",
1418
}
1519

16-
DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["AA", "AC", "CY", "DD", "DI", "DS", "TVL", "TY",
17-
"DT", "MA", "NC", "ND", "OC", "OJ", "OL", "OY",
18-
"PC", "PD", "PR", "RC", "RN", "RP", "TD", "TVH",
19-
"CONTENT",
20-
# INFO: This query result fields is not supported correctly by TED-API.
21-
#"notice-type", "award-criterion-type", "corporate-body",
22-
#"funding", "notice-identifier", "notice-version"
23-
]}
24-
25-
TOTAL_DOCUMENTS_NUMBER = "total"
26-
RESPONSE_RESULTS = "results"
20+
DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["ND", "PD", "RN"]}
21+
22+
TOTAL_DOCUMENTS_NUMBER = "totalNoticeCount"
23+
RESPONSE_RESULTS = "notices"
2724
DOCUMENT_CONTENT = "content"
28-
RESULT_PAGE_NUMBER = "pageNum"
25+
RESULT_PAGE_NUMBER = "page"
2926
TED_API_FIELDS = "fields"
30-
DOCUMENT_CONTENT_FIELD = "CONTENT"
27+
LINKS_TO_CONTENT_KEY = "links"
28+
XML_CONTENT_KEY = "xml"
29+
MULTIPLE_LANGUAGE_CONTENT_KEY = "MUL"
30+
ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
31+
DOCUMENT_NOTICE_ID_KEY = "ND"
3132

3233

3334
class TedRequestAPI(RequestAPI):
@@ -40,15 +41,21 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
4041
:return: dict
4142
"""
4243

43-
response = requests.get(api_url, params=api_query)
44+
response = requests.post(api_url, json=api_query)
45+
try_again_request_count = 0
46+
while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
47+
try_again_request_count += 1
48+
time.sleep(try_again_request_count * 0.1)
49+
response = requests.post(api_url, json=api_query)
50+
if try_again_request_count > 5:
51+
break
4452
if response.ok:
4553
response_content = json.loads(response.text)
4654
return response_content
4755
else:
4856
raise Exception(f"The TED-API call failed with: {response}")
4957

5058

51-
5259
class TedAPIAdapter(TedAPIAdapterABC):
5360
"""
5461
This class will fetch documents content
@@ -71,7 +78,7 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
7178
:return: List[str]
7279
"""
7380

74-
query = {"q": f"PD=[{wildcard_date}]"}
81+
query = {"query": f"PD={wildcard_date}"}
7582

7683
return self.get_by_query(query=query)
7784

@@ -83,48 +90,94 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
8390
:return:List[str]
8491
"""
8592

86-
date_filter = f">={start_date.strftime('%Y%m%d')} AND <={end_date.strftime('%Y%m%d')}"
93+
date_filter = f"PD>={start_date.strftime('%Y%m%d')} AND PD<={end_date.strftime('%Y%m%d')}"
8794

88-
query = {"q": f"PD=[{date_filter}]"}
95+
query = {"query": date_filter}
8996

9097
return self.get_by_query(query=query)
9198

92-
def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
99+
def _retrieve_document_content(self, document_content: dict) -> str:
    """
    Download the XML content of a single notice returned by the TED-API.

    The language variant is picked in this order: the multi-language link,
    the English link, otherwise the first link present in the response.
    :param document_content: one notice entry of the API response; it must hold the
        "links" -> "xml" mapping from language key to download URL
    :return: the XML document as text
    :raises Exception: when the notice has no XML link at all or the download fails
    """
    xml_links = document_content[LINKS_TO_CONTENT_KEY][XML_CONTENT_KEY]
    language_key = MULTIPLE_LANGUAGE_CONTENT_KEY
    if language_key not in xml_links:
        if ENGLISH_LANGUAGE_CONTENT_KEY in xml_links:
            language_key = ENGLISH_LANGUAGE_CONTENT_KEY
        else:
            # dict views are not subscriptable in Python 3 (`keys()[0]` raises TypeError),
            # so take the first key through an iterator instead.
            language_key = next(iter(xml_links), None)
            if language_key is None:
                raise Exception(
                    f"No XML content link found for notice {document_content.get(DOCUMENT_NOTICE_ID_KEY)}!")

        log_warning(
            f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {document_content[DOCUMENT_NOTICE_ID_KEY]},"
            f" and will be used language key {language_key}!")

    xml_document_content_link = xml_links[language_key]
    response = requests.get(xml_document_content_link)
    # Retry on HTTP 429 with a linearly growing pause (0.1s, 0.2s, ...), at most 6 retries.
    try_again_request_count = 0
    while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        try_again_request_count += 1
        time.sleep(try_again_request_count * 0.1)
        response = requests.get(xml_document_content_link)
        if try_again_request_count > 5:
            break
    if response.ok:
        return response.text
    else:
        raise Exception(f"The notice content can't be loaded!: {response}, {response.content}")
130+
131+
def get_generator_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> Generator[
132+
dict, None, None]:
93133
"""
94134
Method to get a documents content by passing a query to the API (json)
95135
:param query:
96136
:param result_fields:
97-
:return:List[str]
137+
:param load_content:
138+
:return:Generator[dict]
98139
"""
99140
query.update(DEFAULT_TED_API_QUERY_RESULT_SIZE)
100141
query.update(result_fields or DEFAULT_TED_API_QUERY_RESULT_FIELDS)
101142
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
102-
103143
documents_number = response_body[TOTAL_DOCUMENTS_NUMBER]
104-
result_pages = 1 + int(documents_number) // 100
144+
result_pages = 1 + int(documents_number) // DOCUMENTS_PER_PAGE
105145
documents_content = response_body[RESPONSE_RESULTS]
146+
if result_pages > 1:
147+
for page_number in range(2, result_pages + 1):
148+
query[RESULT_PAGE_NUMBER] = page_number
149+
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
150+
documents_content += response_body[RESPONSE_RESULTS]
106151

107-
for page_number in range(2, result_pages + 1):
108-
query[RESULT_PAGE_NUMBER] = page_number
109-
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
110-
documents_content += response_body[RESPONSE_RESULTS]
111-
if DOCUMENT_CONTENT_FIELD in query[TED_API_FIELDS]:
112-
decoded_documents_content = []
113152
for document_content in documents_content:
114-
document_content[DOCUMENT_CONTENT] = base64.b64decode(document_content[DOCUMENT_CONTENT]).decode(
115-
encoding="utf-8")
116-
decoded_documents_content.append(document_content)
117-
return decoded_documents_content
153+
if load_content:
154+
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
155+
del document_content[LINKS_TO_CONTENT_KEY]
156+
yield document_content
118157
else:
119-
return documents_content
158+
for document_content in documents_content:
159+
if load_content:
160+
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
161+
del document_content[LINKS_TO_CONTENT_KEY]
162+
yield document_content
163+
164+
def get_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> List[dict]:
    """
    Run a query against the API and return all resulting documents as a list.

    This is the eager counterpart of get_generator_by_query: the arguments are
    forwarded unchanged and the generator is consumed completely.
    :param query: forwarded to get_generator_by_query
    :param result_fields: forwarded to get_generator_by_query
    :param load_content: forwarded to get_generator_by_query
    :return:List[dict]
    """
    documents = self.get_generator_by_query(
        query=query,
        result_fields=result_fields,
        load_content=load_content,
    )
    return list(documents)
120173

121174
def get_by_id(self, document_id: str) -> dict:
122175
"""
123176
Method to get a document content by passing an ID
124177
:param document_id:
125-
:return: str
178+
:return: dict
126179
"""
127180

128-
query = {"q": f"ND=[{document_id}]"}
181+
query = {"query": f"ND={document_id}"}
129182

130183
return self.get_by_query(query=query)[0]

ted_sws/notice_fetcher/services/notice_fetcher.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,10 @@ def _create_notice(self, notice_data: dict) -> Notice:
6868
:param notice_data:
6969
:return:
7070
"""
71-
xml_manifestation = XMLManifestation(object_data=notice_data["content"])
71+
try:
72+
xml_manifestation = XMLManifestation(object_data=notice_data["content"])
73+
except Exception as e:
74+
raise Exception(str(e), notice_data)
7275
del notice_data["content"]
7376
ted_id = notice_data["ND"]
7477
original_metadata = TEDMetadata(**notice_data)

ted_sws/supra_notice_manager/services/supra_notice_validator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ def validate_and_update_daily_supra_notice(ted_publication_date: day_type, mongo
3535
fetched_notice_ids = set(fetched_notice_ids_list)
3636

3737
ted_api_adapter: TedAPIAdapter = TedAPIAdapter(request_api=request_api)
38-
query = {"q": f"PD=[{ted_publication_date.strftime('%Y%m%d*')}]"}
39-
documents = ted_api_adapter.get_by_query(query=query, result_fields={"fields": ["ND"]})
38+
query = {"query": f"PD={ted_publication_date.strftime('%Y%m%d*')}"}
39+
documents = ted_api_adapter.get_by_query(query=query, result_fields={"fields": ["ND"]}, load_content=False)
4040
api_notice_ids_list = [document["ND"] for document in documents] if documents and len(documents) else []
4141
api_notice_ids = set(api_notice_ids_list)
4242

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
@pytest.fixture
3131
def notice_id():
32-
return "067623-2022"
32+
return "67623-2022"
3333

3434

3535
@pytest.fixture

tests/e2e/data_manager/test_mongodb_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def test_mongo_db_query_2():
142142

143143
def test_create_matview_for_notices(fake_mongodb_client):
144144
notice_id = "696661-2022"
145-
ted_api_query = {"q": f"ND=[{notice_id}]"}
145+
ted_api_query = {"query": f"ND={notice_id}"}
146146
mongodb_client = fake_mongodb_client
147147
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
148148
NoticeFetcher(notice_repository=notice_repository,

tests/e2e/data_manager/test_notice_repository.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,22 @@ def test_notice_repository_create(mongodb_client):
1414
notice_repository = NoticeRepository(mongodb_client=mongodb_client, database_name=TEST_DATABASE_NAME)
1515
notice = Notice(ted_id=NOTICE_TED_ID)
1616
notice.set_xml_manifestation(XMLManifestation(object_data="HELLO"))
17-
notice.set_original_metadata(TEDMetadata(**{"AA": ["Metadata"]}))
17+
notice.set_original_metadata(TEDMetadata(**{"RN": ["Metadata"]}))
1818
notice_repository.add(notice)
1919
result_notice = notice_repository.get(reference=NOTICE_TED_ID)
2020
assert result_notice
2121
assert result_notice.ted_id == NOTICE_TED_ID
22-
assert result_notice.original_metadata.AA == ["Metadata"]
22+
assert result_notice.original_metadata.RN == ["Metadata"]
2323
result_notices = list(notice_repository.list())
2424
assert result_notices
2525
assert len(result_notices) == 1
2626
notice_repository.add(notice)
27-
notice.set_original_metadata(ted_metadata=TEDMetadata(**{"AA": ["Updated metadata"]}))
27+
notice.set_original_metadata(ted_metadata=TEDMetadata(**{"RN": ["Updated metadata"]}))
2828
notice_repository.update(notice)
2929
result_notice = notice_repository.get(reference=NOTICE_TED_ID)
3030
assert result_notice
3131
assert result_notice.ted_id == NOTICE_TED_ID
32-
assert result_notice.original_metadata.AA == ["Updated metadata"]
32+
assert result_notice.original_metadata.RN == ["Updated metadata"]
3333
mongodb_client.drop_database(TEST_DATABASE_NAME)
3434

3535

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import pathlib
2+
3+
TED_API_EFORMS_QUERY = """
4+
TD NOT IN (C E G I D P M Q O R 0 1 2 3 4 5 6 7 8 9 B S Y V F A H J K) AND
5+
notice-subtype IN ({eforms_subtype}) AND
6+
FT~"eforms-sdk-{eforms_sdk_version}"
7+
"""
8+
9+
EFORMS_SUBTYPES = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
10+
EFORMS_SDK_VERSIONS = [f"1.{version}" for version in range(3, 11)]
11+
12+
13+
def _test_generate_eforms_sample_dataset(ted_document_search):
    """
    Store one sample notice XML per (eForms SDK version, eForms subtype) pair.

    For every combination of EFORMS_SDK_VERSIONS and EFORMS_SUBTYPES the query template
    is filled in, the first notice yielded for it (if any) is taken, and its content is
    written to eforms_samples/eforms_sdk_v<version>/eform_subtype_<subtype>/<ND>.xml
    next to this file. The leading underscore in the name keeps pytest from collecting it.
    :param ted_document_search: object exposing get_generator_by_query(query=...)
    """
    samples_root = pathlib.Path(__file__).parent / "eforms_samples"

    for sdk_version in EFORMS_SDK_VERSIONS:
        for subtype in EFORMS_SUBTYPES:
            target_dir = samples_root / f"eforms_sdk_v{sdk_version}" / f"eform_subtype_{subtype}"

            print(f"Load for {target_dir}")
            query = {"query": TED_API_EFORMS_QUERY.format(eforms_sdk_version=sdk_version,
                                                          eforms_subtype=subtype)}
            print(query)
            found_notices = ted_document_search.get_generator_by_query(query=query)
            # Only the first notice of each combination is kept as a sample.
            sample_notice = next(found_notices, None)
            if sample_notice is None:
                continue
            target_dir.mkdir(parents=True, exist_ok=True)
            sample_xml_path = target_dir / f"{sample_notice['ND']}.xml"
            sample_xml_path.write_text(sample_notice["content"], encoding="utf-8")
32+
33+
34+
def test_fetch_notice_by_id(ted_document_search):
    """
    Fetch one notice by its identifier and dump the returned data as JSON.

    The JSON text is written to "epo_notice.xml" in the directory of this file.
    :param ted_document_search: object exposing get_by_id(document_id=...)
    """
    import json

    fetched_notice = ted_document_search.get_by_id(document_id="067623-2022")
    serialised_notice = json.dumps(fetched_notice)
    output_path = pathlib.Path(__file__).parent / "epo_notice.xml"
    output_path.write_text(serialised_notice, encoding="utf-8")
40+

0 commit comments

Comments
 (0)