22import time
33from datetime import date
44from http import HTTPStatus
5- from typing import List , Generator
5+ from typing import List , Generator , Callable , Optional
66
77import requests
8+ from requests import Response
89
910from ted_sws import config
10- from ted_sws .event_manager .services .log import log_error
11+ from ted_sws .event_manager .services .log import log_error , log_warning
1112from ted_sws .notice_fetcher .adapters .ted_api_abc import TedAPIAdapterABC , RequestAPI
1213
1314DOCUMENTS_PER_PAGE = 100
3031ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
3132DOCUMENT_NOTICE_ID_KEY = "ND"
3233
CUSTOM_HEADER = {'User-Agent': 'TED-SWS-Pipeline-Fetcher'}
MAX_RETRIES = 5
DEFAULT_BACKOFF_FACTOR = 1

# Statuses for which a retry can plausibly succeed: rate limiting and
# transient server-side failures. Permanent client errors (404, 403, ...)
# will never succeed on retry, so retrying them only wastes time.
RETRYABLE_STATUS_CODES = {
    HTTPStatus.TOO_MANY_REQUESTS,
    HTTPStatus.INTERNAL_SERVER_ERROR,
    HTTPStatus.BAD_GATEWAY,
    HTTPStatus.SERVICE_UNAVAILABLE,
    HTTPStatus.GATEWAY_TIMEOUT,
}


def execute_request_with_retries(request_lambda: Callable,
                                 max_retries: int = MAX_RETRIES,
                                 backoff_factor: float = DEFAULT_BACKOFF_FACTOR) -> "Response":
    """Execute an HTTP request, retrying transient failures with linear backoff.

    :param request_lambda: zero-argument callable performing the request and
        returning a ``requests.Response``; it is re-invoked on each retry.
    :param max_retries: maximum number of retries after the initial attempt.
    :param backoff_factor: multiplier for the sleep between attempts; the wait
        before retry *k* is ``backoff_factor * k`` seconds (linear, not
        exponential, backoff).
    :return: the last ``Response`` received — successful or not; the caller is
        expected to inspect ``response.ok`` / ``response.status_code``.
    """
    response = request_lambda()
    requests_counter = 0
    while response.status_code != HTTPStatus.OK:
        # Permanent (non-retryable) errors: hand the response straight back.
        if response.status_code not in RETRYABLE_STATUS_CODES:
            return response
        if requests_counter >= max_retries:
            log_warning(f"Max retries exceeded, retried {max_retries} times!")
            return response
        requests_counter += 1
        time_to_sleep = backoff_factor * requests_counter
        log_warning(f"Request returned status code {response.status_code}, retrying in {time_to_sleep} seconds!")
        time.sleep(time_to_sleep)
        response = request_lambda()
    return response
54+
55+
def get_configured_custom_headers(custom_header: Optional[dict] = None) -> dict:
    """Build the header set for outgoing requests.

    Starts from the library-default headers and overlays any entries supplied
    in *custom_header* (existing keys are overwritten, others are preserved).

    :param custom_header: optional mapping of header names to values to merge
        on top of the defaults; ``None`` or an empty dict leaves them untouched.
    :return: the merged header mapping, ready to pass as ``headers=``.
    """
    configured_headers = requests.utils.default_headers()
    configured_headers.update(custom_header or {})
    return configured_headers
61+
3362
3463class TedRequestAPI (RequestAPI ):
3564
@@ -40,15 +69,9 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
4069 :param api_query:
4170 :return: dict
4271 """
43-
44- response = requests .post (api_url , json = api_query )
45- try_again_request_count = 0
46- while response .status_code == HTTPStatus .TOO_MANY_REQUESTS :
47- try_again_request_count += 1
48- time .sleep (try_again_request_count * 0.1 )
49- response = requests .post (api_url , json = api_query )
50- if try_again_request_count > 5 :
51- break
72+ headers = get_configured_custom_headers (CUSTOM_HEADER )
73+ response = execute_request_with_retries (
74+ request_lambda = lambda : requests .post (api_url , json = api_query , headers = headers ))
5275 if response .ok :
5376 response_content = json .loads (response .text )
5477 return response_content
@@ -108,14 +131,9 @@ def _retrieve_document_content(self, document_content: dict) -> str:
108131 log_error (exception_message )
109132 raise Exception (exception_message )
110133 xml_document_content_link = xml_links [MULTIPLE_LANGUAGE_CONTENT_KEY ]
111- response = requests .get (xml_document_content_link )
112- try_again_request_count = 0
113- while response .status_code == HTTPStatus .TOO_MANY_REQUESTS :
114- try_again_request_count += 1
115- time .sleep (try_again_request_count * 0.1 )
116- response = requests .get (xml_document_content_link )
117- if try_again_request_count > 5 :
118- break
134+ headers = get_configured_custom_headers (CUSTOM_HEADER )
135+ response = execute_request_with_retries (
136+ request_lambda = lambda : requests .get (xml_document_content_link , headers = headers ))
119137 if response .ok :
120138 return response .text
121139 else :
@@ -142,17 +160,11 @@ def get_generator_by_query(self, query: dict, result_fields: dict = None, load_c
142160 response_body = self .request_api (api_url = self .ted_api_url , api_query = query )
143161 documents_content += response_body [RESPONSE_RESULTS ]
144162
145- for document_content in documents_content :
146- if load_content :
147- document_content [DOCUMENT_CONTENT ] = self ._retrieve_document_content (document_content )
148- del document_content [LINKS_TO_CONTENT_KEY ]
149- yield document_content
150- else :
151- for document_content in documents_content :
152- if load_content :
153- document_content [DOCUMENT_CONTENT ] = self ._retrieve_document_content (document_content )
154- del document_content [LINKS_TO_CONTENT_KEY ]
155- yield document_content
163+ for document_content in documents_content :
164+ if load_content :
165+ document_content [DOCUMENT_CONTENT ] = self ._retrieve_document_content (document_content )
166+ del document_content [LINKS_TO_CONTENT_KEY ]
167+ yield document_content
156168
157169 def get_by_query (self , query : dict , result_fields : dict = None , load_content : bool = True ) -> List [dict ]:
158170 """
0 commit comments