|
| 1 | +import abc |
1 | 2 | import xml.etree.ElementTree as ET |
2 | 3 | from io import StringIO |
| 4 | +from typing import Dict |
3 | 5 |
|
4 | 6 | from ted_sws.core.model.manifestation import XMLManifestation |
5 | | -from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata, LanguageTaggedString, CompositeTitle, \ |
6 | | - EncodedValue |
7 | | -from ted_sws.notice_metadata_processor.services.xpath_registry import XpathRegistry |
| 7 | +from ted_sws.core.model.metadata import LanguageTaggedString, CompositeTitle, EncodedValue |
| 8 | +from ted_sws.notice_metadata_processor.adapters.xpath_registry import EformsXPathRegistry, DefaultXPathRegistry |
| 9 | +from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata |
8 | 10 |
|
| 11 | +MANIFESTATION_NAMESPACE_KEY = "manifestation_ns" |
| 12 | +NUTS_NAMESPACE_KEY = "nuts" |
9 | 13 |
|
10 | | -class XMLManifestationMetadataExtractor: |
11 | | - """ |
12 | | - Extracts metadata from an XML manifestation. |
13 | | - """ |
| 14 | + |
class NoticeMetadataExtractorABC(abc.ABC):
    """
    Interface shared by the notice metadata extractors.
    """

    @abc.abstractmethod
    def extract_metadata(self) -> ExtractedMetadata:
        """
        Create the extracted metadata
        :return: the extracted metadata
        """
| 20 | + |
| 21 | + |
| 22 | +class DefaultNoticeMetadataExtractor(NoticeMetadataExtractorABC): |
14 | 23 |
|
15 | 24 | def __init__(self, xml_manifestation: XMLManifestation): |
16 | 25 | self.xml_manifestation = xml_manifestation |
17 | | - self.manifestation_root = self._parse_manifestation() |
18 | | - self.namespaces = self._get_normalised_namespaces() |
19 | | - self.xpath_registry = XpathRegistry() |
| 26 | + self.xpath_registry = DefaultXPathRegistry() |
| 27 | + self.manifestation_root = parse_xml_manifestation(xml_manifestation=xml_manifestation) |
| 28 | + self.namespaces = normalised_namespaces_from_xml_manifestation(xml_manifestation=xml_manifestation) |
20 | 29 |
|
21 | 30 | @property |
22 | 31 | def title(self): |
@@ -225,11 +234,7 @@ def extracted_notice_type(self): |
225 | 234 | self.xpath_registry.xpath_notice_type, |
226 | 235 | namespaces=self.namespaces), attrib_key="TYPE") |
227 | 236 |
|
228 | | - def to_metadata(self) -> ExtractedMetadata: |
229 | | - """ |
230 | | - Creating extracted metadata |
231 | | - :return: |
232 | | - """ |
| 237 | + def extract_metadata(self) -> ExtractedMetadata: |
233 | 238 | metadata: ExtractedMetadata = ExtractedMetadata() |
234 | 239 | metadata.title = self.title |
235 | 240 | metadata.notice_publication_number = self.notice_publication_number |
@@ -260,34 +265,127 @@ def to_metadata(self) -> ExtractedMetadata: |
260 | 265 | metadata.extracted_notice_type = self.extracted_notice_type |
261 | 266 | return metadata |
262 | 267 |
|
263 | | - def _parse_manifestation(self): |
264 | | - """ |
265 | | - Parsing XML manifestation and getting the root |
266 | | - :return: |
267 | | - """ |
268 | | - xml_manifestation_content = self.xml_manifestation.object_data |
269 | | - return ET.fromstring(xml_manifestation_content) |
270 | 268 |
|
271 | | - def _get_normalised_namespaces(self): |
272 | | - """ |
273 | | - Get normalised namespaces from XML manifestation |
274 | | - :return: |
275 | | - """ |
276 | | - namespaces = dict([node for _, node in ET.iterparse(source=StringIO(self.xml_manifestation.object_data), |
277 | | - events=['start-ns'])]) |
class EformsNoticeMetadataExtractor(NoticeMetadataExtractorABC):
    """
    Extracts metadata from an XML manifestation using the XPaths provided by EformsXPathRegistry.
    """

    def __init__(self, xml_manifestation: XMLManifestation):
        """
        Parse the manifestation and collect its normalised namespaces
        :param xml_manifestation: the XML manifestation to extract metadata from
        """
        self.xpath_registry = EformsXPathRegistry()
        self.xml_manifestation = xml_manifestation
        self.manifestation_root = parse_xml_manifestation(xml_manifestation=xml_manifestation)
        self.namespaces = normalised_namespaces_from_xml_manifestation(xml_manifestation=xml_manifestation)

    def _find_element(self, xpath: str):
        """
        Find the first element matching the XPath under the manifestation root
        :param xpath: XPath expression taken from the XPath registry
        :return: the result of find() on the manifestation root
        """
        return self.manifestation_root.find(xpath, namespaces=self.namespaces)

    def _find_text(self, xpath: str):
        """
        Extract the text of the first element matching the XPath
        :param xpath: XPath expression taken from the XPath registry
        :return: the result of extract_text_from_element for the found element
        """
        return extract_text_from_element(element=self._find_element(xpath))

    @property
    def title(self):
        """
        Title of the notice, as a list holding a single composite title
        :return:
        """
        title_country = LanguageTaggedString(text=self._find_text(self.xpath_registry.xpath_title_country),
                                             language='')
        # The title element supplies both the text and its language, so it is looked up only once.
        title_element = self._find_element(self.xpath_registry.xpath_title)
        title_text = LanguageTaggedString(
            text=extract_text_from_element(element=title_element),
            language=extract_attribute_from_element(element=title_element, attrib_key="languageID"))
        return [CompositeTitle(title=title_text, title_country=title_country)]

    @property
    def publication_date(self):
        return self._find_text(self.xpath_registry.xpath_publication_date)

    @property
    def notice_publication_number(self):
        return self._find_text(self.xpath_registry.xpath_publication_number)

    @property
    def ojs_issue_number(self):
        return self._find_text(self.xpath_registry.xpath_ojs_issue_number)

    @property
    def original_language(self):
        return self._find_text(self.xpath_registry.xpath_original_language)

    @property
    def document_sent_date(self):
        return self._find_text(self.xpath_registry.xpath_document_sent_date)

    @property
    def type_of_contract(self):
        return EncodedValue(value=self._find_text(self.xpath_registry.xpath_type_of_contract))

    @property
    def type_of_procedure(self):
        return EncodedValue(value=self._find_text(self.xpath_registry.xpath_type_of_procedure))

    @property
    def place_of_performance(self):
        """
        Place of performance, as a list holding a single encoded value
        :return:
        """
        # The extracted text is used both as the value and as the code.
        extracted_nuts_code = self._find_text(self.xpath_registry.xpath_place_of_performance)
        return [EncodedValue(value=extracted_nuts_code, code=extracted_nuts_code)]

    @property
    def common_procurement(self):
        """
        One encoded value per common procurement element found in the manifestation
        :return:
        """
        common_procurement_elements = self.manifestation_root.findall(
            self.xpath_registry.xpath_common_procurement_elements,
            namespaces=self.namespaces)
        return [extract_code_from_element(element=element) for element in common_procurement_elements]

    @property
    def internet_address(self):
        return self._find_text(self.xpath_registry.xpath_internet_address)

    @property
    def legal_basis_directive(self):
        return self._find_text(self.xpath_registry.xpath_legal_basis_directive)

    @property
    def extracted_notice_subtype(self):
        return self._find_text(self.xpath_registry.xpath_notice_subtype)

    @property
    def extracted_eform_type(self):
        # Unlike the other properties, this reads the "listName" attribute rather than the element text.
        return extract_attribute_from_element(
            element=self._find_element(self.xpath_registry.xpath_form_type),
            attrib_key="listName")

    @property
    def extracted_notice_type(self):
        return self._find_text(self.xpath_registry.xpath_notice_type)

    @property
    def xml_schema_version(self):
        return self._find_text(self.xpath_registry.xpath_eform_sdk_version)

    def extract_metadata(self) -> ExtractedMetadata:
        """
        Creating extracted metadata
        :return: the metadata populated from the properties of this extractor
        """
        metadata: ExtractedMetadata = ExtractedMetadata()
        metadata.title = self.title
        metadata.notice_publication_number = self.notice_publication_number
        metadata.publication_date = self.publication_date
        metadata.ojs_issue_number = self.ojs_issue_number
        metadata.original_language = self.original_language
        metadata.document_sent_date = self.document_sent_date
        metadata.type_of_contract = self.type_of_contract
        metadata.type_of_procedure = self.type_of_procedure
        metadata.common_procurement = self.common_procurement
        metadata.place_of_performance = self.place_of_performance
        metadata.internet_address = self.internet_address
        metadata.legal_basis_directive = self.legal_basis_directive
        metadata.xml_schema_version = self.xml_schema_version
        metadata.extracted_notice_type = self.extracted_notice_type
        metadata.extracted_notice_subtype = self.extracted_notice_subtype
        metadata.extracted_eform_type = self.extracted_eform_type
        return metadata
291 | 389 |
|
292 | 390 |
|
293 | 391 | def extract_text_from_element(element: ET.Element) -> str: |
@@ -320,3 +418,43 @@ def extract_code_and_value_from_element(element: ET.Element) -> EncodedValue: |
320 | 418 | if element is not None: |
321 | 419 | return EncodedValue(code=extract_attribute_from_element(element=element, attrib_key="CODE"), |
322 | 420 | value=extract_text_from_element(element=element)) |
| 421 | + |
def extract_code_from_element(element: ET.Element) -> EncodedValue:
    """
    Build an encoded value whose code and value are both the text of an element in the XML structure
    :param element: the element to take the text from; may be None
    :return: the encoded value, or None when no element is given
    """
    if element is None:
        return None
    # The same extracted text serves as both code and value, so it is extracted once.
    element_text = extract_text_from_element(element=element)
    return EncodedValue(code=element_text, value=element_text)
| 431 | + |
def parse_xml_manifestation(xml_manifestation: XMLManifestation) -> ET.Element:
    """
    Parse the content of an XML manifestation
    :param xml_manifestation: the manifestation whose object_data holds the XML content
    :return: the root element of the parsed content
    """
    # NOTE(review): xml.etree.ElementTree is not hardened against maliciously constructed XML;
    # confirm that manifestations reaching this point come from a trusted source.
    return ET.fromstring(xml_manifestation.object_data)
| 439 | + |
| 440 | + |
def normalised_namespaces_from_xml_manifestation(xml_manifestation: XMLManifestation) -> Dict:
    """
    Get normalised namespaces from XML manifestation

    The returned mapping always contains:
      - MANIFESTATION_NAMESPACE_KEY: the default (un-prefixed) namespace, or "" when none is declared
      - NUTS_NAMESPACE_KEY: the namespace whose URI ends with "nuts", or "no_nuts" when none is declared
    :param xml_manifestation: the manifestation whose object_data holds the XML content
    :return: mapping from namespace prefix to namespace URI
    """
    namespace_events = ET.iterparse(source=StringIO(xml_manifestation.object_data),
                                    events=['start-ns'])
    # Each 'start-ns' event carries a (prefix, uri) pair; a prefix declared again overrides the earlier one.
    namespaces = {prefix: uri for _, (prefix, uri) in namespace_events}

    # Give the default namespace (empty prefix) a fixed key that can be used in XPath expressions.
    namespaces[MANIFESTATION_NAMESPACE_KEY] = namespaces.pop("", "")

    # Iterate over a snapshot because matching entries are re-keyed in place.
    # NUTS_NAMESPACE_KEY is both the target key and the URI suffix that identifies the namespace.
    for prefix, uri in list(namespaces.items()):
        if uri.endswith(NUTS_NAMESPACE_KEY):
            namespaces[NUTS_NAMESPACE_KEY] = namespaces.pop(prefix)

    # Guarantee the key exists so XPaths using the prefix do not fail on an unknown namespace.
    namespaces.setdefault(NUTS_NAMESPACE_KEY, "no_nuts")

    return namespaces
0 commit comments