Skip to content

Commit fa1a4eb

Browse files
authored
Merge pull request #515 from OP-TED/feature/TED4-82
Feature/ted4 82
2 parents 92c34d0 + 9261199 commit fa1a4eb

22 files changed

Lines changed: 963 additions & 448 deletions

ted_sws/core/model/metadata.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ class NormalisedMetadata(Metadata):
8181
eforms_subtype: str
8282
xsd_version: str
8383
published_in_cellar_counter: int = Field(default=0)
84+
is_eform: Optional[bool] = False
8485

8586

8687
class NormalisedMetadataView(Metadata):

ted_sws/notice_metadata_processor/adapters/__init__.py

Whitespace-only changes.

ted_sws/notice_metadata_processor/services/xml_manifestation_metadata_extractor.py renamed to ted_sws/notice_metadata_processor/adapters/notice_metadata_extractor.py

Lines changed: 176 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,31 @@
1+
import abc
12
import xml.etree.ElementTree as ET
23
from io import StringIO
4+
from typing import Dict
35

46
from ted_sws.core.model.manifestation import XMLManifestation
5-
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata, LanguageTaggedString, CompositeTitle, \
6-
EncodedValue
7-
from ted_sws.notice_metadata_processor.services.xpath_registry import XpathRegistry
7+
from ted_sws.core.model.metadata import LanguageTaggedString, CompositeTitle, EncodedValue
8+
from ted_sws.notice_metadata_processor.adapters.xpath_registry import EformsXPathRegistry, DefaultXPathRegistry
9+
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
810

11+
MANIFESTATION_NAMESPACE_KEY = "manifestation_ns"
12+
NUTS_NAMESPACE_KEY = "nuts"
913

10-
class XMLManifestationMetadataExtractor:
11-
"""
12-
Extracts metadata from an XML manifestation.
13-
"""
14+
15+
class NoticeMetadataExtractorABC(abc.ABC):
    """
    Abstract interface for notice metadata extractors.
    """

    @abc.abstractmethod
    def extract_metadata(self) -> ExtractedMetadata:
        """
        Produce the extracted metadata; concrete extractors must implement this
        :return:
        """
20+
21+
22+
class DefaultNoticeMetadataExtractor(NoticeMetadataExtractorABC):
1423

1524
def __init__(self, xml_manifestation: XMLManifestation):
1625
self.xml_manifestation = xml_manifestation
17-
self.manifestation_root = self._parse_manifestation()
18-
self.namespaces = self._get_normalised_namespaces()
19-
self.xpath_registry = XpathRegistry()
26+
self.xpath_registry = DefaultXPathRegistry()
27+
self.manifestation_root = parse_xml_manifestation(xml_manifestation=xml_manifestation)
28+
self.namespaces = normalised_namespaces_from_xml_manifestation(xml_manifestation=xml_manifestation)
2029

2130
@property
2231
def title(self):
@@ -225,11 +234,7 @@ def extracted_notice_type(self):
225234
self.xpath_registry.xpath_notice_type,
226235
namespaces=self.namespaces), attrib_key="TYPE")
227236

228-
def to_metadata(self) -> ExtractedMetadata:
229-
"""
230-
Creating extracted metadata
231-
:return:
232-
"""
237+
def extract_metadata(self) -> ExtractedMetadata:
233238
metadata: ExtractedMetadata = ExtractedMetadata()
234239
metadata.title = self.title
235240
metadata.notice_publication_number = self.notice_publication_number
@@ -260,34 +265,127 @@ def to_metadata(self) -> ExtractedMetadata:
260265
metadata.extracted_notice_type = self.extracted_notice_type
261266
return metadata
262267

263-
def _parse_manifestation(self):
264-
"""
265-
Parsing XML manifestation and getting the root
266-
:return:
267-
"""
268-
xml_manifestation_content = self.xml_manifestation.object_data
269-
return ET.fromstring(xml_manifestation_content)
270268

271-
def _get_normalised_namespaces(self):
272-
"""
273-
Get normalised namespaces from XML manifestation
274-
:return:
275-
"""
276-
namespaces = dict([node for _, node in ET.iterparse(source=StringIO(self.xml_manifestation.object_data),
277-
events=['start-ns'])])
269+
class EformsNoticeMetadataExtractor(NoticeMetadataExtractorABC):
    """
    Extracts metadata from an XML manifestation using the XPath expressions
    provided by EformsXPathRegistry.
    """

    def __init__(self, xml_manifestation: XMLManifestation):
        """
        Parse the manifestation once and keep its root and normalised namespaces
        :param xml_manifestation: manifestation whose XML content is read
        """
        self.xpath_registry = EformsXPathRegistry()
        self.xml_manifestation = xml_manifestation
        self.manifestation_root = parse_xml_manifestation(xml_manifestation=xml_manifestation)
        self.namespaces = normalised_namespaces_from_xml_manifestation(xml_manifestation=xml_manifestation)

    def _find(self, xpath):
        """
        Look up the first element matching an xpath under the manifestation root
        :param xpath: xpath expression taken from the xpath registry
        :return: result of the lookup (ElementTree yields None when nothing matches)
        """
        return self.manifestation_root.find(xpath, namespaces=self.namespaces)

    def _find_text(self, xpath):
        """
        Look up an element by xpath and extract its text
        :param xpath: xpath expression taken from the xpath registry
        :return: whatever extract_text_from_element returns for the lookup result
        """
        return extract_text_from_element(element=self._find(xpath))

    @property
    def title(self):
        """
        Title as a one-item list holding a CompositeTitle (title text + country)
        :return:
        """
        title_country = LanguageTaggedString(
            text=self._find_text(self.xpath_registry.xpath_title_country),
            language='')
        # The title element is looked up once; it supplies both the text and
        # the "languageID" attribute used as the language tag.
        title_element = self._find(self.xpath_registry.xpath_title)
        title_text = LanguageTaggedString(
            text=extract_text_from_element(element=title_element),
            language=extract_attribute_from_element(element=title_element, attrib_key="languageID"))
        return [CompositeTitle(title=title_text, title_country=title_country)]

    @property
    def publication_date(self):
        """Text found at the registry's publication date xpath."""
        return self._find_text(self.xpath_registry.xpath_publication_date)

    @property
    def notice_publication_number(self):
        """Text found at the registry's publication number xpath."""
        return self._find_text(self.xpath_registry.xpath_publication_number)

    @property
    def ojs_issue_number(self):
        """Text found at the registry's OJS issue number xpath."""
        return self._find_text(self.xpath_registry.xpath_ojs_issue_number)

    @property
    def original_language(self):
        """Text found at the registry's original language xpath."""
        return self._find_text(self.xpath_registry.xpath_original_language)

    @property
    def document_sent_date(self):
        """Text found at the registry's document sent date xpath."""
        return self._find_text(self.xpath_registry.xpath_document_sent_date)

    @property
    def type_of_contract(self):
        """Type of contract wrapped in an EncodedValue (only the value is set)."""
        return EncodedValue(value=self._find_text(self.xpath_registry.xpath_type_of_contract))

    @property
    def type_of_procedure(self):
        """Type of procedure wrapped in an EncodedValue (only the value is set)."""
        return EncodedValue(value=self._find_text(self.xpath_registry.xpath_type_of_procedure))

    @property
    def place_of_performance(self):
        """
        Place of performance as a one-item list of EncodedValue
        :return:
        """
        # The same extracted text is used for both the value and the code.
        extracted_nuts_code = self._find_text(self.xpath_registry.xpath_place_of_performance)
        return [EncodedValue(value=extracted_nuts_code, code=extracted_nuts_code)]

    @property
    def common_procurement(self):
        """
        One EncodedValue per element matching the common procurement xpath
        :return:
        """
        common_procurement_elements = self.manifestation_root.findall(
            self.xpath_registry.xpath_common_procurement_elements,
            namespaces=self.namespaces)
        return [extract_code_from_element(element=element) for element in common_procurement_elements]

    @property
    def internet_address(self):
        """Text found at the registry's internet address xpath."""
        return self._find_text(self.xpath_registry.xpath_internet_address)

    @property
    def legal_basis_directive(self):
        """Text found at the registry's legal basis directive xpath."""
        return self._find_text(self.xpath_registry.xpath_legal_basis_directive)

    @property
    def extracted_notice_subtype(self):
        """Text found at the registry's notice subtype xpath."""
        return self._find_text(self.xpath_registry.xpath_notice_subtype)

    @property
    def extracted_eform_type(self):
        """Value of the "listName" attribute on the element at the form type xpath."""
        return extract_attribute_from_element(
            element=self._find(self.xpath_registry.xpath_form_type),
            attrib_key="listName")

    @property
    def extracted_notice_type(self):
        """Text found at the registry's notice type xpath."""
        return self._find_text(self.xpath_registry.xpath_notice_type)

    @property
    def xml_schema_version(self):
        """Text found at the registry's eForms SDK version xpath."""
        return self._find_text(self.xpath_registry.xpath_eform_sdk_version)

    def extract_metadata(self) -> ExtractedMetadata:
        """
        Creating extracted metadata from the properties of this extractor
        :return:
        """
        metadata: ExtractedMetadata = ExtractedMetadata()
        metadata.title = self.title
        metadata.notice_publication_number = self.notice_publication_number
        metadata.publication_date = self.publication_date
        metadata.ojs_issue_number = self.ojs_issue_number
        metadata.original_language = self.original_language
        metadata.document_sent_date = self.document_sent_date
        metadata.type_of_contract = self.type_of_contract
        metadata.type_of_procedure = self.type_of_procedure
        metadata.common_procurement = self.common_procurement
        metadata.place_of_performance = self.place_of_performance
        metadata.internet_address = self.internet_address
        metadata.legal_basis_directive = self.legal_basis_directive
        metadata.xml_schema_version = self.xml_schema_version
        metadata.extracted_notice_type = self.extracted_notice_type
        metadata.extracted_notice_subtype = self.extracted_notice_subtype
        metadata.extracted_eform_type = self.extracted_eform_type
        return metadata
291389

292390

293391
def extract_text_from_element(element: ET.Element) -> str:
@@ -320,3 +418,43 @@ def extract_code_and_value_from_element(element: ET.Element) -> EncodedValue:
320418
if element is not None:
321419
return EncodedValue(code=extract_attribute_from_element(element=element, attrib_key="CODE"),
322420
value=extract_text_from_element(element=element))
421+
422+
def extract_code_from_element(element: ET.Element) -> EncodedValue:
    """
    Build an EncodedValue whose code and value are both the text of an element
    :param element: element of the XML structure, may be None
    :return: the EncodedValue, or None when no element is given
    """
    if element is None:
        return None
    # Extract the text once and reuse it for both fields.
    text = extract_text_from_element(element=element)
    return EncodedValue(code=text, value=text)
431+
432+
def parse_xml_manifestation(xml_manifestation: XMLManifestation) -> ET.Element:
    """
    Parse the XML held in the manifestation's object_data and return its root
    :param xml_manifestation: manifestation carrying the XML content
    :return: root element of the parsed document
    """
    xml_content = xml_manifestation.object_data
    root = ET.fromstring(xml_content)
    return root
439+
440+
441+
def normalised_namespaces_from_xml_manifestation(xml_manifestation: XMLManifestation) -> Dict[str, str]:
    """
    Get normalised namespaces from XML manifestation

    The namespace declarations of the document are collected into a
    prefix -> URI mapping, then two keys are guaranteed to exist:
    MANIFESTATION_NAMESPACE_KEY (the default namespace, or "" when the
    document declares none) and NUTS_NAMESPACE_KEY (the namespace whose URI
    ends with that key, or "no_nuts" when there is no such namespace).
    :param xml_manifestation: manifestation carrying the XML content
    :return: mapping of namespace prefixes to namespace URIs
    """
    # Each 'start-ns' event yields a (prefix, uri) pair; a prefix declared
    # more than once keeps its last declaration.
    namespaces = dict(node for _, node in ET.iterparse(source=StringIO(xml_manifestation.object_data),
                                                       events=['start-ns']))

    # The default namespace is reported under the empty prefix; re-key it.
    namespaces[MANIFESTATION_NAMESPACE_KEY] = namespaces.pop("", "")

    # Iterate over a snapshot because the mapping is re-keyed inside the loop.
    for prefix, uri in list(namespaces.items()):
        if uri.endswith(NUTS_NAMESPACE_KEY):
            namespaces[NUTS_NAMESPACE_KEY] = namespaces.pop(prefix)

    # Placeholder so that lookups using the nuts prefix always find a key.
    namespaces.setdefault(NUTS_NAMESPACE_KEY, "no_nuts")

    return namespaces

0 commit comments

Comments
 (0)