Skip to content

Commit ce9288b

Browse files
Update notice_xml_indexer.py
1 parent 2ffa403 commit ce9288b

1 file changed

Lines changed: 56 additions & 5 deletions

File tree

ted_sws/data_sampler/services/notice_xml_indexer.py

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import pathlib
2+
import re
23
import tempfile
3-
from typing import List
4+
import xml.etree.ElementTree as XMLElementTree
5+
from io import StringIO
6+
from typing import List, Set, Generator, Optional
47

58
from pymongo import MongoClient
69

@@ -10,12 +13,13 @@
1013
from ted_sws.data_manager.adapters.notice_repository import NoticeRepository
1114
from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import ConceptualMappingReader
1215
from ted_sws.resources import XSLT_FILES_PATH
13-
import xml.etree.ElementTree as XMLElementTree
14-
import re
1516

1617
UNIQUE_XPATHS_XSLT_FILE_PATH = "get_unique_xpaths.xsl"
1718
XSLT_PREFIX_RESULT = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
1819

20+
# Attribute names whose value is appended to the generated XPath
# ("<xpath>@<name>=<value>") when eForms notices are indexed.
INCLUDE_VALUES_BY_ATTRIBUTES_NAMES = {"schemeName", "unitCode", "listName"}
21+
# Attribute values that are never appended to a generated XPath when eForms
# notices are indexed; an attribute carrying one of them is indexed by name only.
EXCLUDE_ATTRIBUTES_VALUES = {"nuts", "country", "cpv"}
22+
1923

2024
def index_notice_by_id(notice_id: str, mongodb_client: MongoClient):
2125
"""
@@ -58,8 +62,54 @@ def index_notice_xslt(notice: Notice, xslt_transformer=None) -> Notice:
5862
return notice
5963

6064

61-
def index_notice(notice: Notice, base_xpath="") -> Notice:
65+
def get_all_xpath_generator(xml_content: str,
                            remove_namespaces: bool = True,
                            include_values_by_attribute_names: Optional[Set[str]] = None,
                            exclude_attribute_values: Optional[Set[str]] = None
                            ) -> Generator[str, None, None]:
    """
    Generate all XPaths based on the given XML content.

    Elements are visited in document order. For every element, one entry per
    attribute is yielded first ("<xpath>@<name>" or "<xpath>@<name>=<value>"),
    followed by the XPath of the element itself. Duplicates are not filtered out.

    :param xml_content: the XML document, as text
    :param remove_namespaces: when True, the "{namespace-uri}" prefix is stripped
        from element tags; attribute names are emitted exactly as the parser
        reports them
    :param include_values_by_attribute_names: names of the attributes whose value
        is appended to the attribute XPath; None means no attribute
    :param exclude_attribute_values: attribute values that are never appended,
        even for an included attribute name; None means no value is excluded
    :return: generator of all XPaths based on the given XML content
    """
    # Both filters are optional: treat None as "empty" so that the membership
    # tests below do not raise TypeError when the defaults are used.
    include_names = include_values_by_attribute_names or set()
    exclude_values = exclude_attribute_values or set()

    # Stack of the tags of the currently open elements, root first.
    path = []
    events = XMLElementTree.iterparse(StringIO(xml_content), events=('start', 'end'))
    for event, element in events:
        if event == 'end':
            path.pop()
            continue

        tag = element.tag
        if remove_namespaces:
            # ElementTree spells qualified names as "{namespace-uri}local-name";
            # keep only what follows the closing brace (the whole tag if none).
            tag = tag.rpartition('}')[2]
        path.append(tag)

        xpath = "/" + '/'.join(path)
        for attribute_key, attribute_value in element.attrib.items():
            if attribute_key in include_names and attribute_value not in exclude_values:
                yield f"{xpath}@{attribute_key}={attribute_value}"
            else:
                yield f"{xpath}@{attribute_key}"
        yield xpath
98+
99+
100+
def index_eforms_notice(notice: Notice) -> Notice:
    """
    Index an eForms notice by the unique XPaths found in its XML manifestation.

    :param notice: the notice to index; its XML manifestation is read and its
        XML metadata is set
    :return: the same notice, with XML metadata holding the unique XPaths
    """
    xpath_stream = get_all_xpath_generator(
        xml_content=notice.xml_manifestation.object_data,
        remove_namespaces=True,
        include_values_by_attribute_names=INCLUDE_VALUES_BY_ATTRIBUTES_NAMES,
        exclude_attribute_values=EXCLUDE_ATTRIBUTES_VALUES,
    )
    # Going through a set drops the duplicates produced by repeated elements.
    deduplicated_xpaths = list(set(xpath_stream))

    metadata = XMLMetadata()
    metadata.unique_xpaths = deduplicated_xpaths
    notice.set_xml_metadata(xml_metadata=metadata)
    return notice
110+
62111

112+
def index_notice(notice: Notice, base_xpath="") -> Notice:
63113
# To be removed later if it turns out not to be used
64114
# def _notice_namespaces(xml_file) -> dict:
65115
# _namespaces = dict([node for _, node in XMLElementTree.iterparse(xml_file, events=['start-ns'])])
@@ -229,7 +279,8 @@ def get_unique_xpaths_covered_by_notices(notice_ids: List[str], mongodb_client:
229279
"""
230280
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
231281
results = notice_repository.xml_metadata_repository.collection.aggregate([{"$match": {"ted_id": {"$in": notice_ids},
232-
"metadata_type": {"$eq":"xml"}
282+
"metadata_type": {
283+
"$eq": "xml"}
233284
}
234285
}], allowDiskUse=True)
235286
unique_xpaths = set()

0 commit comments

Comments
 (0)