1- import base64
21import json
2+ import time
33from datetime import date
4- from typing import List
4+ from http import HTTPStatus
5+ from typing import List , Generator
56
67import requests
78
89from ted_sws import config
10+ from ted_sws .event_manager .services .log import log_warning
911from ted_sws .notice_fetcher .adapters .ted_api_abc import TedAPIAdapterABC , RequestAPI
1012
11- DEFAULT_TED_API_QUERY_RESULT_SIZE = {"pageSize" : 100 ,
12- "pageNum" : 1 ,
13- "scope" : 3
13+ DOCUMENTS_PER_PAGE = 100
14+
15+ DEFAULT_TED_API_QUERY_RESULT_SIZE = {"limit" : DOCUMENTS_PER_PAGE ,
16+ "page" : 1 ,
17+ "scope" : "ALL" ,
1418 }
1519
16- DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields" : ["AA" , "AC" , "CY" , "DD" , "DI" , "DS" , "TVL" , "TY" ,
17- "DT" , "MA" , "NC" , "ND" , "OC" , "OJ" , "OL" , "OY" ,
18- "PC" , "PD" , "PR" , "RC" , "RN" , "RP" , "TD" , "TVH" ,
19- "CONTENT" ,
20- # INFO: This query result fields is not supported correctly by TED-API.
21- #"notice-type", "award-criterion-type", "corporate-body",
22- #"funding", "notice-identifier", "notice-version"
23- ]}
24-
25- TOTAL_DOCUMENTS_NUMBER = "total"
26- RESPONSE_RESULTS = "results"
20+ DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields" : ["ND" , "PD" , "RN" ]}
21+
22+ TOTAL_DOCUMENTS_NUMBER = "totalNoticeCount"
23+ RESPONSE_RESULTS = "notices"
2724DOCUMENT_CONTENT = "content"
28- RESULT_PAGE_NUMBER = "pageNum "
25+ RESULT_PAGE_NUMBER = "page "
2926TED_API_FIELDS = "fields"
30- DOCUMENT_CONTENT_FIELD = "CONTENT"
27+ LINKS_TO_CONTENT_KEY = "links"
28+ XML_CONTENT_KEY = "xml"
29+ MULTIPLE_LANGUAGE_CONTENT_KEY = "MUL"
30+ ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
31+ DOCUMENT_NOTICE_ID_KEY = "ND"
3132
3233
3334class TedRequestAPI (RequestAPI ):
@@ -40,15 +41,21 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
4041 :return: dict
4142 """
4243
43- response = requests .get (api_url , params = api_query )
44+ response = requests .post (api_url , json = api_query )
45+ try_again_request_count = 0
46+ while response .status_code == HTTPStatus .TOO_MANY_REQUESTS :
47+ try_again_request_count += 1
48+ time .sleep (try_again_request_count * 0.1 )
49+ response = requests .post (api_url , json = api_query )
50+ if try_again_request_count > 5 :
51+ break
4452 if response .ok :
4553 response_content = json .loads (response .text )
4654 return response_content
4755 else :
4856 raise Exception (f"The TED-API call failed with: { response } " )
4957
5058
51-
5259class TedAPIAdapter (TedAPIAdapterABC ):
5360 """
5461 This class will fetch documents content
@@ -71,7 +78,7 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
7178 :return: List[str]
7279 """
7380
74- query = {"q " : f"PD=[ { wildcard_date } ] " }
81+ query = {"query " : f"PD={ wildcard_date } " }
7582
7683 return self .get_by_query (query = query )
7784
@@ -83,48 +90,94 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
8390 :return:List[str]
8491 """
8592
86- date_filter = f">={ start_date .strftime ('%Y%m%d' )} AND <={ end_date .strftime ('%Y%m%d' )} "
93+ date_filter = f"PD >={ start_date .strftime ('%Y%m%d' )} AND PD <={ end_date .strftime ('%Y%m%d' )} "
8794
88- query = {"q " : f"PD=[ { date_filter } ]" }
95+ query = {"query " : date_filter }
8996
9097 return self .get_by_query (query = query )
9198
92- def get_by_query (self , query : dict , result_fields : dict = None ) -> List [dict ]:
def _retrieve_document_content(self, document_content: dict) -> str:
    """
        Method to retrieve the XML content of a single document from the TED API.
        The download link is chosen by language: the multi-language ("MUL") link is preferred,
        then the English ("ENG") one, then whichever language link is listed first.
    :param document_content: a document (notice) as returned by the API search; it must contain
        the content links (document_content["links"]["xml"]) and the notice id ("ND")
    :return: str, the body of the HTTP response behind the selected link
    :raises Exception: when the document has no XML links or the content can't be downloaded
    """
    xml_links = document_content[LINKS_TO_CONTENT_KEY][XML_CONTENT_KEY]
    language_key = MULTIPLE_LANGUAGE_CONTENT_KEY
    if language_key not in xml_links:
        if ENGLISH_LANGUAGE_CONTENT_KEY in xml_links:
            language_key = ENGLISH_LANGUAGE_CONTENT_KEY
        else:
            # dict views are not subscriptable in Python 3, so `keys()[0]` would raise TypeError;
            # take the first available language through an iterator instead.
            language_key = next(iter(xml_links), None)
            if language_key is None:
                raise Exception(
                    f"The notice content can't be loaded!: no XML links found in "
                    f"{document_content[DOCUMENT_NOTICE_ID_KEY]}")

        log_warning(
            f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {document_content[DOCUMENT_NOTICE_ID_KEY]},"
            f" and will be used language key {language_key}!")

    xml_document_content_link = xml_links[language_key]
    response = requests.get(xml_document_content_link)
    # On HTTP 429 retry up to 6 times, waiting 0.1s, 0.2s, ... 0.6s before each new attempt.
    for try_again_request_count in range(1, 7):
        if response.status_code != HTTPStatus.TOO_MANY_REQUESTS:
            break
        time.sleep(try_again_request_count * 0.1)
        response = requests.get(xml_document_content_link)
    if response.ok:
        return response.text
    raise Exception(f"The notice content can't be loaded!: {response}, {response.content}")
def get_generator_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> Generator[
    dict, None, None]:
    """
        Method to get documents by passing a query to the API (json), one document at a time.
        All result pages are requested first, then the documents are yielded in the order received.
    :param query: the search query; it is copied, so the caller's dict is left unchanged
    :param result_fields: the fields to be returned by the API, the default fields are used when omitted
    :param load_content: when True, the XML content of each document is downloaded and stored
        under the "content" key, and the "links" entry is removed from the document
    :return:Generator[dict]
    """
    # Work on a copy: paging and field settings must not leak into the caller's query.
    api_query = dict(query)
    api_query.update(DEFAULT_TED_API_QUERY_RESULT_SIZE)
    api_query.update(result_fields or DEFAULT_TED_API_QUERY_RESULT_FIELDS)

    response_body = self.request_api(api_url=self.ted_api_url, api_query=api_query)
    documents_number = int(response_body[TOTAL_DOCUMENTS_NUMBER])
    # Ceiling division: an exact multiple of the page size must not trigger an extra, empty page request.
    result_pages = (documents_number + DOCUMENTS_PER_PAGE - 1) // DOCUMENTS_PER_PAGE
    documents_content = list(response_body[RESPONSE_RESULTS])

    # The first page is already loaded; the range is empty when there is at most one page.
    for page_number in range(2, result_pages + 1):
        api_query[RESULT_PAGE_NUMBER] = page_number
        response_body = self.request_api(api_url=self.ted_api_url, api_query=api_query)
        documents_content.extend(response_body[RESPONSE_RESULTS])

    for document_content in documents_content:
        if load_content:
            document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
            # The links are only needed to download the content, so they are dropped afterwards.
            del document_content[LINKS_TO_CONTENT_KEY]
        yield document_content
163+
def get_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> List[dict]:
    """
        Method to get all the documents matched by a query to the API (json) as a list.
        It exhausts the generator returned by get_generator_by_query.
    :param query: the search query, passed through unchanged
    :param result_fields: the fields to be returned by the API, passed through unchanged
    :param load_content: passed through unchanged
    :return:List[dict]
    """
    documents_generator = self.get_generator_by_query(query=query,
                                                      result_fields=result_fields,
                                                      load_content=load_content)
    return [*documents_generator]
120173
def get_by_id(self, document_id: str) -> dict:
    """
        Method to get a single document by passing its ID.
        The ID is searched through get_by_query and the first match is returned.
    :param document_id: the value matched against the "ND" field
    :return: dict
    :raises IndexError: when the query returns no document
    """
    id_query = {"query": f"ND={document_id}"}
    matching_documents = self.get_by_query(query=id_query)
    return matching_documents[0]
0 commit comments