|
1 | 1 | import pathlib |
| 2 | +import re |
2 | 3 | import tempfile |
3 | | -from typing import List |
| 4 | +import xml.etree.ElementTree as XMLElementTree |
| 5 | +from io import StringIO |
| 6 | +from typing import List, Set, Generator, Optional |
4 | 7 |
|
5 | 8 | from pymongo import MongoClient |
6 | 9 |
|
|
10 | 13 | from ted_sws.data_manager.adapters.notice_repository import NoticeRepository |
11 | 14 | from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import ConceptualMappingReader |
12 | 15 | from ted_sws.resources import XSLT_FILES_PATH |
13 | | -import xml.etree.ElementTree as XMLElementTree |
14 | | -import re |
15 | 16 |
|
16 | 17 | UNIQUE_XPATHS_XSLT_FILE_PATH = "get_unique_xpaths.xsl" |
17 | 18 | XSLT_PREFIX_RESULT = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" |
18 | 19 |
|
| 20 | +INCLUDE_VALUES_BY_ATTRIBUTES_NAMES = {"schemeName", "unitCode", "listName"} |
| 21 | +EXCLUDE_ATTRIBUTES_VALUES = {"nuts", "country", "cpv"} |
| 22 | + |
19 | 23 |
|
20 | 24 | def index_notice_by_id(notice_id: str, mongodb_client: MongoClient): |
21 | 25 | """ |
@@ -58,8 +62,54 @@ def index_notice_xslt(notice: Notice, xslt_transformer=None) -> Notice: |
58 | 62 | return notice |
59 | 63 |
|
60 | 64 |
|
61 | | -def index_notice(notice: Notice, base_xpath="") -> Notice: |
def get_all_xpath_generator(xml_content: str,
                            remove_namespaces: bool = True,
                            include_values_by_attribute_names: Optional[Set[str]] = None,
                            exclude_attribute_values: Optional[Set[str]] = None
                            ) -> Generator[str, None, None]:
    """
        Generate all XPaths based on the given XML content.

        For every element, one entry is yielded per attribute, followed by the XPath of the element itself.
        An attribute entry has the form "<xpath>@<name>=<value>" when the attribute name is in
        include_values_by_attribute_names and its value is not in exclude_attribute_values;
        otherwise it has the form "<xpath>@<name>". Duplicates are not filtered out.
    :param xml_content: XML document as a string
    :param remove_namespaces: when True, the "{namespace}" prefix is stripped from element tags
    :param include_values_by_attribute_names: attribute names whose values are appended to the XPath;
        None means that no attribute value is ever appended
    :param exclude_attribute_values: attribute values that must never be appended to the XPath;
        None means that no value is excluded
    :return: generator of all XPaths based on the given XML content
    """
    # The filters default to None; without this normalisation the membership tests below
    # raise TypeError on the first element that carries an attribute.
    include_names = frozenset() if include_values_by_attribute_names is None \
        else include_values_by_attribute_names
    exclude_values = frozenset() if exclude_attribute_values is None else exclude_attribute_values

    xml_file = StringIO(xml_content)
    # Stack of tag names from the document root down to the element currently open.
    path = []
    for event, element in XMLElementTree.iterparse(xml_file, events=('start', 'end')):
        if event != 'start':
            # 'end' event: the element is closed, so it leaves the current path.
            path.pop()
            continue

        if remove_namespaces:
            # "{namespace}tag" splits into ['', 'namespace', 'tag']; a tag without a namespace
            # yields fewer parts and is used unchanged.
            ns_tag = re.split('[{}]', element.tag, 2)[1:]
            path.append(ns_tag[1] if len(ns_tag) > 1 else element.tag)
        else:
            path.append(element.tag)

        xpath = "/" + '/'.join(path)
        for attribute_key, attribute_value in element.attrib.items():
            if attribute_key in include_names and attribute_value not in exclude_values:
                yield f"{xpath}@{attribute_key}={attribute_value}"
            else:
                yield f"{xpath}@{attribute_key}"
        yield xpath
| 98 | + |
| 99 | + |
def index_eforms_notice(notice: Notice) -> Notice:
    """
        Index an eForms notice with the distinct XPaths found in its XML manifestation.
    :param notice: notice whose XML manifestation content is scanned
    :return: the same notice, with a newly created XML metadata object set on it
    """
    xpath_stream = get_all_xpath_generator(
        xml_content=notice.xml_manifestation.object_data,
        remove_namespaces=True,
        include_values_by_attribute_names=INCLUDE_VALUES_BY_ATTRIBUTES_NAMES,
        exclude_attribute_values=EXCLUDE_ATTRIBUTES_VALUES
    )
    # Going through a set removes duplicates; the order of the resulting list is unspecified.
    distinct_xpaths = list(set(xpath_stream))

    metadata = XMLMetadata()
    metadata.unique_xpaths = distinct_xpaths
    notice.set_xml_metadata(xml_metadata=metadata)
    return notice
| 110 | + |
62 | 111 |
|
| 112 | +def index_notice(notice: Notice, base_xpath="") -> Notice: |
63 | 113 | # To be removed later if will not be used |
64 | 114 | # def _notice_namespaces(xml_file) -> dict: |
65 | 115 | # _namespaces = dict([node for _, node in XMLElementTree.iterparse(xml_file, events=['start-ns'])]) |
@@ -229,7 +279,8 @@ def get_unique_xpaths_covered_by_notices(notice_ids: List[str], mongodb_client: |
229 | 279 | """ |
230 | 280 | notice_repository = NoticeRepository(mongodb_client=mongodb_client) |
231 | 281 | results = notice_repository.xml_metadata_repository.collection.aggregate([{"$match": {"ted_id": {"$in": notice_ids}, |
232 | | - "metadata_type": {"$eq":"xml"} |
| 282 | + "metadata_type": { |
| 283 | + "$eq": "xml"} |
233 | 284 | } |
234 | 285 | }], allowDiskUse=True) |
235 | 286 | unique_xpaths = set() |
|
0 commit comments