Skip to content

Commit dbf94d6

Browse files
committed
WIP- for extractor and normaliser
1 parent 92c34d0 commit dbf94d6

16 files changed

Lines changed: 1162 additions & 438 deletions

File tree

ted_sws/notice_metadata_processor/adapters/__init__.py

Whitespace-only changes.

ted_sws/notice_metadata_processor/services/xml_manifestation_metadata_extractor.py renamed to ted_sws/notice_metadata_processor/adapters/notice_metadata_extractor.py

Lines changed: 173 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,28 @@
1+
import abc
12
import xml.etree.ElementTree as ET
23
from io import StringIO
4+
from typing import Dict
35

46
from ted_sws.core.model.manifestation import XMLManifestation
5-
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata, LanguageTaggedString, CompositeTitle, \
6-
EncodedValue
7-
from ted_sws.notice_metadata_processor.services.xpath_registry import XpathRegistry
7+
from ted_sws.core.model.metadata import LanguageTaggedString, CompositeTitle, EncodedValue
8+
from ted_sws.notice_metadata_processor.adapters.xpath_registry import EformsXPathRegistry, DefaultXPathRegistry
9+
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
810

911

10-
class XMLManifestationMetadataExtractor:
11-
"""
12-
Extracts metadata from an XML manifestation.
13-
"""
12+
class NoticeMetadataExtractorABC(abc.ABC):
    """
    Interface implemented by the notice metadata extractors
    """

    @abc.abstractmethod
    def extract_metadata(self) -> ExtractedMetadata:
        """
        Produce the metadata extracted from a notice
        :return:
        """
17+
18+
19+
class DefaultNoticeMetadataExtractor(NoticeMetadataExtractorABC):
1420

1521
def __init__(self, xml_manifestation: XMLManifestation):
1622
self.xml_manifestation = xml_manifestation
17-
self.manifestation_root = self._parse_manifestation()
18-
self.namespaces = self._get_normalised_namespaces()
19-
self.xpath_registry = XpathRegistry()
23+
self.xpath_registry = DefaultXPathRegistry()
24+
self.manifestation_root = parse_xml_manifestation(xml_manifestation=xml_manifestation)
25+
self.namespaces = normalised_namespaces_from_xml_manifestation(xml_manifestation=xml_manifestation)
2026

2127
@property
2228
def title(self):
@@ -225,11 +231,7 @@ def extracted_notice_type(self):
225231
self.xpath_registry.xpath_notice_type,
226232
namespaces=self.namespaces), attrib_key="TYPE")
227233

228-
def to_metadata(self) -> ExtractedMetadata:
229-
"""
230-
Creating extracted metadata
231-
:return:
232-
"""
234+
def extract_metadata(self) -> ExtractedMetadata:
233235
metadata: ExtractedMetadata = ExtractedMetadata()
234236
metadata.title = self.title
235237
metadata.notice_publication_number = self.notice_publication_number
@@ -260,34 +262,127 @@ def to_metadata(self) -> ExtractedMetadata:
260262
metadata.extracted_notice_type = self.extracted_notice_type
261263
return metadata
262264

263-
def _parse_manifestation(self):
264-
"""
265-
Parsing XML manifestation and getting the root
266-
:return:
267-
"""
268-
xml_manifestation_content = self.xml_manifestation.object_data
269-
return ET.fromstring(xml_manifestation_content)
270265

271-
def _get_normalised_namespaces(self):
272-
"""
273-
Get normalised namespaces from XML manifestation
274-
:return:
275-
"""
276-
namespaces = dict([node for _, node in ET.iterparse(source=StringIO(self.xml_manifestation.object_data),
277-
events=['start-ns'])])
266+
class EformsNoticeMetadataExtractor(NoticeMetadataExtractorABC):
    """
    Extracts metadata from an XML manifestation using the XPaths held by EformsXPathRegistry
    """

    # Names shared by the properties of this class and the ExtractedMetadata attributes
    # they populate, listed in the order in which extract_metadata assigns them.
    _METADATA_FIELDS = (
        "title",
        "notice_publication_number",
        "publication_date",
        "ojs_issue_number",
        "original_language",
        "document_sent_date",
        "type_of_contract",
        "type_of_procedure",
        "common_procurement",
        "place_of_performance",
        "internet_address",
        "legal_basis_directive",
        "xml_schema_version",
        "extracted_notice_type",
        "extracted_notice_subtype",
        "extracted_eform_type",
    )

    def __init__(self, xml_manifestation: XMLManifestation):
        self.xpath_registry = EformsXPathRegistry()
        self.xml_manifestation = xml_manifestation
        self.manifestation_root = parse_xml_manifestation(xml_manifestation=xml_manifestation)
        self.namespaces = normalised_namespaces_from_xml_manifestation(xml_manifestation=xml_manifestation)

    def _find(self, xpath):
        """
        Look up the first element matching the xpath under the manifestation root
        :param xpath: XPath expression taken from the registry
        :return: result of find() on the manifestation root
        """
        return self.manifestation_root.find(xpath, namespaces=self.namespaces)

    def _text_at(self, xpath):
        """
        Apply extract_text_from_element to the first element matching the xpath
        :param xpath: XPath expression taken from the registry
        :return:
        """
        return extract_text_from_element(element=self._find(xpath))

    @property
    def title(self):
        registry = self.xpath_registry
        country = LanguageTaggedString(text=self._text_at(registry.xpath_title_country), language='')
        # The title text and its language attribute are read from the same element
        title_element = self._find(registry.xpath_title)
        text = LanguageTaggedString(
            text=extract_text_from_element(element=title_element),
            language=extract_attribute_from_element(element=title_element, attrib_key="languageID"))
        return [CompositeTitle(title=text, title_country=country)]

    @property
    def publication_date(self):
        return self._text_at(self.xpath_registry.xpath_publication_date)

    @property
    def notice_publication_number(self):
        return self._text_at(self.xpath_registry.xpath_publication_number)

    @property
    def ojs_issue_number(self):
        return self._text_at(self.xpath_registry.xpath_ojs_issue_number)

    @property
    def original_language(self):
        return self._text_at(self.xpath_registry.xpath_original_language)

    @property
    def document_sent_date(self):
        return self._text_at(self.xpath_registry.xpath_document_sent_date)

    @property
    def type_of_contract(self):
        contract_text = self._text_at(self.xpath_registry.xpath_type_of_contract)
        return EncodedValue(value=contract_text)

    @property
    def type_of_procedure(self):
        procedure_text = self._text_at(self.xpath_registry.xpath_type_of_procedure)
        return EncodedValue(value=procedure_text)

    @property
    def place_of_performance(self):
        # The same extracted text is used for both the value and the code
        nuts_code = self._text_at(self.xpath_registry.xpath_place_of_performance)
        return [EncodedValue(value=nuts_code, code=nuts_code)]

    @property
    def common_procurement(self):
        matches = self.manifestation_root.findall(
            self.xpath_registry.xpath_common_procurement_elements,
            namespaces=self.namespaces)
        return [extract_code_from_element(element=match) for match in matches]

    @property
    def internet_address(self):
        return self._text_at(self.xpath_registry.xpath_internet_address)

    @property
    def legal_basis_directive(self):
        return self._text_at(self.xpath_registry.xpath_legal_basis_directive)

    @property
    def extracted_notice_subtype(self):
        return self._text_at(self.xpath_registry.xpath_notice_subtype)

    @property
    def extracted_eform_type(self):
        form_type_element = self._find(self.xpath_registry.xpath_form_type)
        return extract_attribute_from_element(element=form_type_element, attrib_key="listName")

    @property
    def extracted_notice_type(self):
        return self._text_at(self.xpath_registry.xpath_notice_type)

    @property
    def xml_schema_version(self):
        return self._text_at(self.xpath_registry.xpath_eform_sdk_version)

    def extract_metadata(self) -> ExtractedMetadata:
        """
        Create an ExtractedMetadata and fill it from the properties of this extractor
        :return:
        """
        metadata: ExtractedMetadata = ExtractedMetadata()
        for field_name in self._METADATA_FIELDS:
            setattr(metadata, field_name, getattr(self, field_name))
        return metadata
291386

292387

293388
def extract_text_from_element(element: ET.Element) -> str:
@@ -320,3 +415,43 @@ def extract_code_and_value_from_element(element: ET.Element) -> EncodedValue:
320415
if element is not None:
321416
return EncodedValue(code=extract_attribute_from_element(element=element, attrib_key="CODE"),
322417
value=extract_text_from_element(element=element))
418+
419+
def extract_code_from_element(element: ET.Element) -> EncodedValue:
    """
    Build an EncodedValue whose code and value both hold the result of
    extract_text_from_element for the given element
    :param element: element in the XML structure, or None
    :return: the EncodedValue, or None when no element is given
    """
    if element is None:
        return None
    element_text = extract_text_from_element(element=element)
    return EncodedValue(code=element_text, value=element_text)
428+
429+
def parse_xml_manifestation(xml_manifestation: XMLManifestation) -> ET.Element:
    """
    Parse the object_data of an XML manifestation
    :param xml_manifestation: manifestation whose object_data holds the XML content
    :return: root element of the parsed content
    """
    return ET.fromstring(xml_manifestation.object_data)
436+
437+
438+
def normalised_namespaces_from_xml_manifestation(xml_manifestation: XMLManifestation) -> Dict:
    """
    Collect the namespaces declared in an XML manifestation under normalised prefixes
    :param xml_manifestation: manifestation whose object_data holds the XML content
    :return: prefix to URI mapping that always has the keys "manifestation_ns" and "nuts"
    """
    namespace_events = ET.iterparse(source=StringIO(xml_manifestation.object_data),
                                    events=['start-ns'])
    namespaces = {prefix: uri for _, (prefix, uri) in namespace_events}

    # The default namespace (empty prefix) is exposed as "manifestation_ns";
    # an empty string stands in when the document declares none.
    namespaces["manifestation_ns"] = namespaces.pop("", "")

    # Iterate over a snapshot because entries are re-keyed while looping
    for prefix, uri in list(namespaces.items()):
        if uri.endswith("nuts"):
            namespaces["nuts"] = namespaces.pop(prefix)

    # Placeholder used when no declared namespace URI ends with "nuts"
    namespaces.setdefault("nuts", "no_nuts")

    return namespaces

0 commit comments

Comments
 (0)