Skip to content

Commit b3cd5d1

Browse files
bug fixes
1 parent addcc68 commit b3cd5d1

6 files changed

Lines changed: 37 additions & 23 deletions

File tree

ted_sws/core/model/notice.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def original_metadata(self) -> Optional[TEDMetadata]:
203203
@property
204204
def xml_manifestation(self) -> XMLManifestation:
205205
if self._xml_manifestation is None:
206-
self.load_lazy_field(property_field=Notice.xml_metadata)
206+
self.load_lazy_field(property_field=Notice.xml_manifestation)
207207
return self._xml_manifestation
208208

209209
def set_original_metadata(self, ted_metadata: TEDMetadata):

ted_sws/data_manager/adapters/manifestation_repository.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
FILE_STORAGE_COLLECTION_NAME = "fs.files"
1414
MANIFESTATION_ID = "manifestation_id"
1515
OBJECT_DATA_KEY = "object_data"
16+
AGGREGATE_REFERENCE_ID = "ted_id"
1617

1718

1819
class BaseManifestationRepository(ManifestationRepositoryABC):
@@ -25,6 +26,7 @@ def __init__(self, mongodb_client: MongoClient, database_name: str = None):
2526
db = mongodb_client[self._database_name]
2627
self.file_storage = gridfs.GridFS(db) # TODO: Investigate how it works in multiple processes in parallel.
2728
self.collection = db[self._collection_name]
29+
self.collection.create_index([(AGGREGATE_REFERENCE_ID, ASCENDING)])
2830
self.file_storage_collection = db[FILE_STORAGE_COLLECTION_NAME]
2931
self.file_storage_collection.create_index([(MANIFESTATION_ID,
3032
ASCENDING)]) # TODO: index creation may cause a race condition error.
@@ -56,6 +58,8 @@ def _update_manifestation(self, reference: str, manifestation: Manifestation, up
5658
"""
5759
if manifestation is not None:
5860
manifestation_dict = manifestation.dict()
61+
manifestation_dict[AGGREGATE_REFERENCE_ID] = reference
62+
reference = self._build_reference(base_reference=reference)
5963
manifestation_dict[MONGODB_COLLECTION_ID] = reference
6064
old_linked_manifestation_file = self.file_storage.find_one({MANIFESTATION_ID: reference})
6165
manifestation_dict[OBJECT_DATA_KEY] = self._put_file_content_in_grid_fs(file_reference=reference,
@@ -66,10 +70,12 @@ def _update_manifestation(self, reference: str, manifestation: Manifestation, up
6670
self.file_storage.delete(file_id=old_linked_manifestation_file._id)
6771

6872
def _get_manifestation_dict(self, reference: str) -> Optional[dict]:
73+
reference = self._build_reference(base_reference=reference)
6974
result_dict = self.collection.find_one({MONGODB_COLLECTION_ID: reference})
7075
if result_dict:
7176
result_dict[OBJECT_DATA_KEY] = self._get_file_content_from_grid_fs(file_id=result_dict[OBJECT_DATA_KEY])
7277
del result_dict[MONGODB_COLLECTION_ID]
78+
del result_dict[AGGREGATE_REFERENCE_ID]
7379
return result_dict
7480

7581
@abc.abstractmethod
@@ -95,7 +101,6 @@ def add(self, reference: str, manifestation: Manifestation):
95101
:param manifestation:
96102
:return:
97103
"""
98-
reference = self._build_reference(base_reference=reference)
99104
self._update_manifestation(reference=reference, manifestation=manifestation, upsert=True)
100105

101106
def update(self, reference: str, manifestation: Manifestation):
@@ -105,8 +110,6 @@ def update(self, reference: str, manifestation: Manifestation):
105110
:param manifestation:
106111
:return:
107112
"""
108-
reference = self._build_reference(base_reference=reference)
109-
reference = f"{reference}_rdf"
110113
self._update_manifestation(reference=reference, manifestation=manifestation)
111114

112115
def get(self, reference: str) -> Optional[Manifestation]:
@@ -115,7 +118,6 @@ def get(self, reference: str) -> Optional[Manifestation]:
115118
:param reference:
116119
:return:
117120
"""
118-
reference = self._build_reference(base_reference=reference)
119121
result_dict = self._get_manifestation_dict(reference=reference)
120122
if result_dict is not None:
121123
return self._build_manifestation_from_dict(manifestation_dict=result_dict)

ted_sws/data_manager/adapters/metadata_repository.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
import abc
22
from typing import Optional
33

4-
from pymongo import MongoClient
4+
from pymongo import MongoClient, ASCENDING
55

66
from ted_sws import config
77
from ted_sws.core.model.metadata import Metadata, NormalisedMetadata, TEDMetadata, XMLMetadata
88
from ted_sws.data_manager.adapters.repository_abc import RepositoryABC
99

1010
MONGODB_COLLECTION_ID = "_id"
11+
AGGREGATE_REFERENCE_ID = "ted_id"
1112

1213

1314
class BaseMetadataRepository(RepositoryABC):
@@ -22,6 +23,7 @@ def __init__(self, mongodb_client: MongoClient, database_name: str = None):
2223
self.mongodb_client = mongodb_client
2324
db = mongodb_client[self._database_name]
2425
self.collection = db[self._collection_name]
26+
self.collection.create_index([(AGGREGATE_REFERENCE_ID, ASCENDING)])
2527

2628
def _update_metadata(self, reference: str, metadata: Metadata, upsert: bool = False):
2729
"""
@@ -33,6 +35,7 @@ def _update_metadata(self, reference: str, metadata: Metadata, upsert: bool = Fa
3335
"""
3436
if metadata is not None:
3537
metadata_dict = metadata.dict()
38+
metadata_dict[AGGREGATE_REFERENCE_ID] = reference
3639
reference = self._build_reference(base_reference=reference)
3740
metadata_dict[MONGODB_COLLECTION_ID] = reference
3841
self.collection.update_one({MONGODB_COLLECTION_ID: reference}, {"$set": metadata_dict}, upsert=upsert)
@@ -47,6 +50,7 @@ def _get_metadata_dict(self, reference: str) -> Optional[dict]:
4750
result_dict = self.collection.find_one({MONGODB_COLLECTION_ID: reference})
4851
if result_dict:
4952
del result_dict[MONGODB_COLLECTION_ID]
53+
del result_dict[AGGREGATE_REFERENCE_ID]
5054
return result_dict
5155

5256
@abc.abstractmethod

ted_sws/data_manager/adapters/notice_repository.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
NOTICE_TED_METADATA = "original_metadata"
3939
NOTICE_XML_MANIFESTATION = "xml_manifestation"
4040
NOTICE_XML_METADATA = "xml_metadata"
41+
VALIDATION_SUMMARY = "validation_summary"
4142

4243
METADATA_PUBLICATION_DATE = "publication_date"
4344
METADATA_DOCUMENT_SENT_DATE = "document_sent_date"
@@ -46,11 +47,13 @@
4647
NOTICE_NORMALISED_METADATA_PRIVATE_KEY = "_normalised_metadata"
4748
NOTICE_XML_METADATA_PRIVATE_KEY = "_xml_metadata"
4849
NOTICE_XML_MANIFESTATION_PRIVATE_KEY = "_xml_manifestation"
50+
NOTICE_PREPROCESSED_XML_MANIFESTATION_KEY = "_preprocessed_xml_manifestation"
4951
NOTICE_RDF_MANIFESTATION_PRIVATE_KEY = "_rdf_manifestation"
5052
NOTICE_DISTILLED_RDF_MANIFESTATION_PRIVATE_KEY = "_distilled_rdf_manifestation"
5153
NOTICE_METS_MANIFESTATION_PRIVATE_KEY = "_mets_manifestation"
5254

5355

56+
5457
class NoticeRepositoryInFileSystem(NoticeRepositoryABC):
5558
"""
5659
This repository is intended for storing Notice objects as JSON files in file system.
@@ -182,6 +185,10 @@ def _mapping_lazy_fields(self):
182185
self.xml_metadata_repository),
183186
Notice.xml_manifestation: (NOTICE_XML_MANIFESTATION_PRIVATE_KEY,
184187
self.xml_manifestation_repository),
188+
# NOTE: preprocessed_xml_manifestation is currently the same as xml_manifestation,
189+
# so the same repository is used here; a dedicated repository should be created in the future.
190+
Notice.preprocessed_xml_manifestation: (NOTICE_PREPROCESSED_XML_MANIFESTATION_KEY,
191+
self.xml_manifestation_repository),
185192
Notice.rdf_manifestation: (NOTICE_RDF_MANIFESTATION_PRIVATE_KEY,
186193
self.rdf_manifestation_repository),
187194
Notice.distilled_rdf_manifestation: (NOTICE_DISTILLED_RDF_MANIFESTATION_PRIVATE_KEY,
@@ -234,7 +241,8 @@ def _create_dict_from_notice(notice: Notice) -> dict:
234241
:return:
235242
"""
236243

237-
notice_dict = notice.dict(include={NOTICE_TED_ID: True, NOTICE_STATUS: True, NOTICE_CREATED_AT: True})
244+
notice_dict = notice.dict(include={NOTICE_TED_ID: True, NOTICE_STATUS: True,
245+
NOTICE_CREATED_AT: True, VALIDATION_SUMMARY: True})
238246
notice_dict[MONGODB_COLLECTION_ID] = notice_dict[NOTICE_TED_ID]
239247
notice_dict[NOTICE_STATUS] = str(notice_dict[NOTICE_STATUS])
240248
notice_dict[NOTICE_CREATED_AT] = datetime.fromisoformat(notice_dict[NOTICE_CREATED_AT])

ted_sws/data_sampler/services/notice_xml_indexer.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def get_unique_xpaths_from_notice_repository(mongodb_client: MongoClient) -> Lis
114114
:return:
115115
"""
116116
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
117-
return notice_repository.collection.distinct("xml_metadata.unique_xpaths")
117+
return notice_repository.xml_metadata_repository.collection.distinct("unique_xpaths")
118118

119119

120120
def get_unique_notice_id_from_notice_repository(mongodb_client: MongoClient) -> List[str]:
@@ -138,14 +138,14 @@ def get_minimal_set_of_xpaths_for_coverage_notices(notice_ids: List[str], mongod
138138
unique_notice_ids = notice_ids.copy()
139139
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
140140
while len(unique_notice_ids):
141-
tmp_result = list(notice_repository.collection.aggregate([
142-
{"$unwind": "$xml_metadata.unique_xpaths"},
141+
tmp_result = list(notice_repository.xml_metadata_repository.collection.aggregate([
142+
{"$unwind": "$unique_xpaths"},
143143
{"$match": {
144-
"xml_metadata.unique_xpaths": {"$nin": minimal_set_of_xpaths},
144+
"unique_xpaths": {"$nin": minimal_set_of_xpaths},
145145
"ted_id": {"$in": unique_notice_ids}
146146
}
147147
},
148-
{"$group": {"_id": "$xml_metadata.unique_xpaths", "count": {"$sum": 1},
148+
{"$group": {"_id": "$unique_xpaths", "count": {"$sum": 1},
149149
"notice_ids": {"$push": "$ted_id"}}},
150150
{"$sort": {"count": -1}},
151151
{"$limit": 1}
@@ -171,17 +171,17 @@ def get_minimal_set_of_notices_for_coverage_xpaths(notice_ids: List[str], mongod
171171
search_notices = notice_ids.copy()
172172
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
173173
while len(unique_xpaths):
174-
tmp_result = list(notice_repository.collection.aggregate([
174+
tmp_result = list(notice_repository.xml_metadata_repository.collection.aggregate([
175175
{"$match": {
176-
"_id": {"$in": search_notices}
176+
"ted_id": {"$in": search_notices}
177177
}
178178
},
179-
{"$unwind": "$xml_metadata.unique_xpaths"},
179+
{"$unwind": "$unique_xpaths"},
180180
{"$match": {
181-
"xml_metadata.unique_xpaths": {"$in": unique_xpaths},
181+
"unique_xpaths": {"$in": unique_xpaths},
182182
}
183183
},
184-
{"$group": {"_id": "$ted_id", "count": {"$sum": 1}, "xpaths": {"$addToSet": "$xml_metadata.unique_xpaths"}
184+
{"$group": {"_id": "$ted_id", "count": {"$sum": 1}, "xpaths": {"$addToSet": "$unique_xpaths"}
185185
}},
186186
{"$sort": {"count": -1}},
187187
{"$limit": 1}
@@ -204,8 +204,8 @@ def get_unique_notices_id_covered_by_xpaths(xpaths: List[str], mongodb_client: M
204204
:return:
205205
"""
206206
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
207-
results = list(notice_repository.collection.aggregate([
208-
{"$match": {"xml_metadata.unique_xpaths": {"$in": xpaths}}},
207+
results = list(notice_repository.xml_metadata_repository.collection.aggregate([
208+
{"$match": {"unique_xpaths": {"$in": xpaths}}},
209209
{
210210
"$group": {"_id": None,
211211
"ted_ids": {"$push": "$ted_id"}
@@ -223,12 +223,11 @@ def get_unique_xpaths_covered_by_notices(notice_ids: List[str], mongodb_client:
223223
:return:
224224
"""
225225
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
226-
results = notice_repository.collection.aggregate([{"$match": {"ted_id": {"$in": notice_ids}}}], allowDiskUse=True)
226+
results = notice_repository.xml_metadata_repository.collection.aggregate([{"$match": {"ted_id": {"$in": notice_ids}}}], allowDiskUse=True)
227227
unique_xpaths = set()
228228
for result in results:
229-
xml_metadata = result["xml_metadata"]
230-
if xml_metadata is not None:
231-
unique_xpaths.update(result["xml_metadata"]["unique_xpaths"])
229+
if result["unique_xpaths"] is not None:
230+
unique_xpaths.update(result["unique_xpaths"])
232231
return list(unique_xpaths)
233232

234233

tests/unit/notice_validator/test_shacl_test_suite_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def test_validate_notice_by_id_with_shacl_suite(notice_with_distilled_status, rd
6666
mapping_suite_repository = MappingSuiteRepositoryInFileSystem(repository_path=path_to_file_system_repository)
6767
notice_repository.add(notice)
6868

69+
assert len(notice.get_rdf_validation()) == 0
6970
validate_notice_by_id_with_shacl_suite(notice_id="408313-2020",
7071
mapping_suite_repository=mapping_suite_repository,
7172
notice_repository=notice_repository,

0 commit comments

Comments
 (0)