Skip to content

Commit f932ed5

Browse files
refactor data model, add lazy fields, refactor repositories
1 parent ba37f66 commit f932ed5

36 files changed

Lines changed: 844 additions & 230 deletions

File tree

ted_sws/core/model/lazy_object.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import abc
2+
from typing import Optional, Any
3+
4+
5+
class LazyObjectFieldsLoaderABC(abc.ABC):
    """
        Interface of the objects that are able to load, on demand, the value of a lazy field.
    """

    @abc.abstractmethod
    def load_lazy_field(self, source_object: Any, property_field: property) -> Any:
        """
            Load the value behind one lazy field of the given object.
        :param source_object: the object whose field is requested
        :param property_field: the property descriptor that identifies the requested field
        :return: whatever the concrete loader produces for the requested field
        """
15+
16+
17+
class LazyObject:
    """
        Mixin for objects whose fields can be loaded on demand through a pluggable loader.
    """
    # Shared class-level default: no loader is attached until one is set on the instance.
    _lazy_object_fields_loader: LazyObjectFieldsLoaderABC = None

    def set_lazy_object_fields_loader(self, lazy_object_fields_loader: LazyObjectFieldsLoaderABC):
        """
            Attach the loader that will be used to load the lazy fields of this object.
        :param lazy_object_fields_loader: the loader to use from now on
        :return:
        """
        self._lazy_object_fields_loader = lazy_object_fields_loader

    def get_lazy_object_fields_loader(self) -> Optional[LazyObjectFieldsLoaderABC]:
        """
            Give access to the loader currently attached to this object.
        :return: the attached loader, or None when no loader was set
        """
        return self._lazy_object_fields_loader

    def load_lazy_field(self, property_field: property):
        """
            Delegate the loading of a lazy field to the attached loader.
        :param property_field: the property descriptor that identifies the requested field
        :return: the result of the loader, or None when there is no (truthy) loader attached
        """
        fields_loader = self._lazy_object_fields_loader
        if not fields_loader:
            return None
        return fields_loader.load_lazy_field(source_object=self, property_field=property_field)

ted_sws/core/model/metadata.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,31 @@ class NormalisedMetadata(Metadata):
8282
xsd_version: str
8383

8484

85+
class NormalisedMetadataView(Metadata):
    """
        Metadata model that declares the fields listed below.
        NOTE(review): presumably a flattened, string-valued view of NormalisedMetadata - confirm against usages.
    """
    title: str
    long_title: str
    notice_publication_number: str
    publication_date: str
    ojs_issue_number: str
    ojs_type: str
    city_of_buyer: Optional[str]
    name_of_buyer: Optional[str]
    original_language: Optional[str]
    country_of_buyer: Optional[str]
    eu_institution: Optional[bool]
    document_sent_date: Optional[str]
    deadline_for_submission: Optional[str]
    notice_type: str
    form_type: str
    place_of_performance: Optional[List[str]]
    extracted_legal_basis_directive: Optional[str]
    legal_basis_directive: str
    form_number: str
    eforms_subtype: str
    xsd_version: str
107+
108+
109+
85110
class TEDMetadata(Metadata):
86111
"""
87112
Stores notice original metadata

ted_sws/core/model/notice.py

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from pydantic import Field
2222

2323
from ted_sws.core.model import PropertyBaseModel
24+
from ted_sws.core.model.lazy_object import LazyObject
2425
from ted_sws.core.model.manifestation import METSManifestation, RDFManifestation, XMLManifestation, \
2526
RDFValidationManifestation, SPARQLTestSuiteValidationReport, SHACLTestSuiteValidationReport, \
2627
XPATHCoverageValidationReport, XMLValidationManifestation, ValidationSummaryReport
@@ -142,7 +143,7 @@ def update_status_to(self, new_status):
142143
"""
143144

144145

145-
class Notice(WorkExpression):
146+
class Notice(WorkExpression, LazyObject):
146147
"""
147148
A TED notice in any of its forms across the TED-SWS pipeline. This class is conceptualised as a merger of Work
148149
and Expression in the FRBR class hierarchy and is connected to some of its Manifestations.
@@ -161,36 +162,70 @@ class Notice(WorkExpression):
161162
The original XML manifestation of the notice as downloaded from the TED website.
162163
163164
"""
164-
_status: NoticeStatus = NoticeStatus.RAW # PrivateAttr(default=NoticeStatus.RAW)
165+
_status: NoticeStatus = NoticeStatus.RAW
165166
ted_id: str = Field(..., allow_mutation=False)
166-
original_metadata: Optional[TEDMetadata] = None
167-
xml_manifestation: XMLManifestation = Field(..., allow_mutation=False)
167+
_original_metadata: Optional[TEDMetadata] = None
168+
_xml_manifestation: Optional[XMLManifestation] = None
168169
_normalised_metadata: Optional[NormalisedMetadata] = None
169170
_preprocessed_xml_manifestation: Optional[XMLManifestation] = None
170171
_distilled_rdf_manifestation: Optional[RDFManifestation] = None
171172
_rdf_manifestation: Optional[RDFManifestation] = None
172173
_mets_manifestation: Optional[METSManifestation] = None
173-
xml_metadata: Optional[XMLMetadata] = None
174+
_xml_metadata: Optional[XMLMetadata] = None
174175
validation_summary: Optional[ValidationSummaryReport] = None
175176

177+
@property
def original_metadata(self) -> Optional[TEDMetadata]:
    """
        Original metadata of the notice, lazily loaded on first access.
        The result of load_lazy_field is not used here, so the loader is expected to fill
        the private field on this object itself.
    :return: the stored original metadata, or None when it is still unset after the load attempt
    """
    if self._original_metadata is not None:
        return self._original_metadata
    self.load_lazy_field(property_field=Notice.original_metadata)
    return self._original_metadata
182+
183+
@property
def xml_manifestation(self) -> Optional[XMLManifestation]:
    """
        XML manifestation of the notice, lazily loaded on first access.
        The result of load_lazy_field is not used here, so the loader is expected to fill
        the private field on this object itself.
    :return: the stored XML manifestation, or None when it is still unset after the load attempt
    """
    if self._xml_manifestation is None:
        # The loader must be asked for this very property; asking for Notice.xml_metadata
        # would load a different field and leave the XML manifestation unset.
        self.load_lazy_field(property_field=Notice.xml_manifestation)
    return self._xml_manifestation
188+
189+
def set_original_metadata(self, ted_metadata: TEDMetadata):
    """
        Store the original metadata of the notice; the notice status is left untouched.
    :param ted_metadata: the metadata to keep as original metadata
    :return:
    """
    self._original_metadata = ted_metadata
191+
192+
def set_xml_manifestation(self, xml_manifestation: XMLManifestation):
    """
        Store the XML manifestation of the notice; the notice status is left untouched.
    :param xml_manifestation: the manifestation to keep as XML manifestation
    :return:
    """
    self._xml_manifestation = xml_manifestation
194+
195+
@property
def xml_metadata(self) -> XMLMetadata:
    """
        XML metadata of the notice, lazily loaded on first access.
        The result of load_lazy_field is not used here, so the loader is expected to fill
        the private field on this object itself.
    :return: the stored XML metadata (still None when nothing could be loaded)
    """
    if self._xml_metadata is not None:
        return self._xml_metadata
    self.load_lazy_field(property_field=Notice.xml_metadata)
    return self._xml_metadata
200+
176201
@property
def preprocessed_xml_manifestation(self) -> XMLManifestation:
    """
        Preprocessed XML manifestation of the notice, lazily loaded on first access.
    :return: the stored preprocessed XML manifestation (still None when nothing could be loaded)
    """
    if self._preprocessed_xml_manifestation is not None:
        return self._preprocessed_xml_manifestation
    self.load_lazy_field(property_field=Notice.preprocessed_xml_manifestation)
    return self._preprocessed_xml_manifestation
179206

180207
@property
def distilled_rdf_manifestation(self) -> RDFManifestation:
    """
        Distilled RDF manifestation of the notice, lazily loaded on first access.
    :return: the stored distilled RDF manifestation (still None when nothing could be loaded)
    """
    if self._distilled_rdf_manifestation is not None:
        return self._distilled_rdf_manifestation
    self.load_lazy_field(property_field=Notice.distilled_rdf_manifestation)
    return self._distilled_rdf_manifestation
183212

184213
@property
def normalised_metadata(self) -> NormalisedMetadata:
    """
        Normalised metadata of the notice, lazily loaded on first access.
    :return: the stored normalised metadata (still None when nothing could be loaded)
    """
    if self._normalised_metadata is not None:
        return self._normalised_metadata
    self.load_lazy_field(property_field=Notice.normalised_metadata)
    return self._normalised_metadata
187218

188219
@property
def rdf_manifestation(self) -> RDFManifestation:
    """
        RDF manifestation of the notice, lazily loaded on first access.
    :return: the stored RDF manifestation (still None when nothing could be loaded)
    """
    if self._rdf_manifestation is not None:
        return self._rdf_manifestation
    self.load_lazy_field(property_field=Notice.rdf_manifestation)
    return self._rdf_manifestation
191224

192225
@property
def mets_manifestation(self) -> METSManifestation:
    """
        METS manifestation of the notice, lazily loaded on first access.
    :return: the stored METS manifestation (still None when nothing could be loaded)
    """
    if self._mets_manifestation is not None:
        return self._mets_manifestation
    self.load_lazy_field(property_field=Notice.mets_manifestation)
    return self._mets_manifestation
195230

196231
def get_rdf_validation(self) -> Optional[List[RDFValidationManifestation]]:
@@ -225,7 +260,7 @@ def set_xml_metadata(self, xml_metadata: XMLMetadata):
225260
:param xml_metadata:
226261
:return:
227262
"""
228-
self.xml_metadata = xml_metadata
263+
self._xml_metadata = xml_metadata
229264
self.update_status_to(NoticeStatus.INDEXED)
230265

231266
def set_preprocessed_xml_manifestation(self, preprocessed_xml_manifestation: XMLManifestation):
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import abc
2+
from typing import Optional
3+
4+
import gridfs
5+
from bson import ObjectId
6+
from pymongo import MongoClient, ASCENDING
7+
8+
from ted_sws import config
9+
from ted_sws.core.model.manifestation import Manifestation, RDFManifestation, XMLManifestation, METSManifestation
10+
from ted_sws.data_manager.adapters.repository_abc import ManifestationRepositoryABC
11+
12+
# Key of the primary identifier in a MongoDB document.
MONGODB_COLLECTION_ID = "_id"
# Collection that holds the GridFS file documents; indexed by the repository below.
FILE_STORAGE_COLLECTION_NAME = "fs.files"
# Field used to index and look up the GridFS files linked to a manifestation reference.
MANIFESTATION_ID = "manifestation_id"
# Key of the manifestation content, which is moved to GridFS when a manifestation is stored.
OBJECT_DATA_KEY = "object_data"
16+
17+
18+
class BaseManifestationRepository(ManifestationRepositoryABC):
    """
        Base MongoDB repository for manifestations.

        The manifestation documents are kept in the `notice_manifestations` collection, while the
        content found under the `object_data` key is stored in GridFS and replaced, inside the
        document, by the id of the GridFS file. Subclasses decide how a reference is built and
        which manifestation type is created from a stored document.
    """
    _collection_name = "notice_manifestations"

    def __init__(self, mongodb_client: MongoClient, database_name: Optional[str] = None):
        """
        :param mongodb_client: client used to reach the MongoDB server
        :param database_name: database to work with; when empty, the configured
                              MONGO_DB_AGGREGATES_DATABASE_NAME is used
        """
        self._database_name = database_name or config.MONGO_DB_AGGREGATES_DATABASE_NAME
        self.mongodb_client = mongodb_client
        db = mongodb_client[self._database_name]
        self.file_storage = gridfs.GridFS(db)  # TODO: Investigate how it works in multiple processes in parallel.
        self.collection = db[self._collection_name]
        self.file_storage_collection = db[FILE_STORAGE_COLLECTION_NAME]
        # TODO: index creation may bring race condition error.
        self.file_storage_collection.create_index([(MANIFESTATION_ID, ASCENDING)])

    def _get_file_content_from_grid_fs(self, file_id: str) -> str:
        """
            Load the content of a file from GridFS by its file id.
        :param file_id: id of the GridFS file (anything accepted by ObjectId)
        :return: the file content decoded as UTF-8 text
        """
        return self.file_storage.get(file_id=ObjectId(file_id)).read().decode("utf-8")

    def _put_file_content_in_grid_fs(self, file_reference: str, file_content: str) -> ObjectId:
        """
            Store the file content in GridFS and tag the file with the manifestation reference.
        :param file_reference: reference of the manifestation the file belongs to
        :param file_content: text content, stored UTF-8 encoded
        :return: the id of the newly created GridFS file
        """
        # The tag must use the MANIFESTATION_ID field: it is the field that is indexed and the one
        # queried when the previously linked file is searched for removal.
        return self.file_storage.put(data=file_content.encode("utf-8"), **{MANIFESTATION_ID: file_reference})

    def _update_manifestation(self, reference: str, manifestation: Manifestation, upsert: bool = False):
        """
            Write the manifestation document and move its content to GridFS.
            Nothing is done when the manifestation is None.
        :param reference: full reference, used as document id and as GridFS file tag
        :param manifestation: the manifestation to store
        :param upsert: when True, the document is created if it does not exist yet
        :return:
        """
        if manifestation is None:
            return
        manifestation_dict = manifestation.dict()
        manifestation_dict[MONGODB_COLLECTION_ID] = reference
        # The previously linked file is looked up before the new one is stored, so that the
        # lookup can not return the new file.
        old_linked_manifestation_file = self.file_storage.find_one({MANIFESTATION_ID: reference})
        manifestation_dict[OBJECT_DATA_KEY] = self._put_file_content_in_grid_fs(
            file_reference=reference,
            file_content=manifestation_dict[OBJECT_DATA_KEY])
        # NOTE(review): with upsert=False and no matching document the new GridFS file stays
        # unreferenced - confirm whether callers rely on update() only for existing documents.
        self.collection.update_one({MONGODB_COLLECTION_ID: reference}, {"$set": manifestation_dict}, upsert=upsert)
        if old_linked_manifestation_file is not None:
            self.file_storage.delete(file_id=old_linked_manifestation_file._id)

    def _get_manifestation_dict(self, reference: str) -> Optional[dict]:
        """
            Read the manifestation document and resolve its content from GridFS.
        :param reference: full reference of the manifestation document
        :return: the document without its MongoDB id and with the content inlined,
                 or None when no document is found
        """
        result_dict = self.collection.find_one({MONGODB_COLLECTION_ID: reference})
        if result_dict:
            result_dict[OBJECT_DATA_KEY] = self._get_file_content_from_grid_fs(file_id=result_dict[OBJECT_DATA_KEY])
            del result_dict[MONGODB_COLLECTION_ID]
        return result_dict

    @abc.abstractmethod
    def _build_reference(self, base_reference: str) -> str:
        """
            Build the full reference under which a manifestation is stored.
        :param base_reference: the reference received by the public methods
        :return: the reference used as document id
        """

    @abc.abstractmethod
    def _build_manifestation_from_dict(self, manifestation_dict: dict) -> Manifestation:
        """
            Create the manifestation object out of a stored document.
        :param manifestation_dict: document content as returned by _get_manifestation_dict
        :return: the manifestation instance
        """

    def add(self, reference: str, manifestation: Manifestation):
        """
            Store the manifestation, creating its document when it does not exist yet.
        :param reference: base reference of the manifestation
        :param manifestation: the manifestation to store
        :return:
        """
        reference = self._build_reference(base_reference=reference)
        self._update_manifestation(reference=reference, manifestation=manifestation, upsert=True)

    def update(self, reference: str, manifestation: Manifestation):
        """
            Update the document of an already stored manifestation (no upsert).
        :param reference: base reference of the manifestation
        :param manifestation: the manifestation to store
        :return:
        """
        # The reference is fully built by _build_reference; no extra suffix may be appended,
        # otherwise the update targets a different document than add() and get().
        reference = self._build_reference(base_reference=reference)
        self._update_manifestation(reference=reference, manifestation=manifestation)

    def get(self, reference: str) -> Optional[Manifestation]:
        """
            Load a manifestation by its base reference.
        :param reference: base reference of the manifestation
        :return: the manifestation, or None when it is not stored
        """
        reference = self._build_reference(base_reference=reference)
        result_dict = self._get_manifestation_dict(reference=reference)
        if result_dict is None:
            return None
        return self._build_manifestation_from_dict(manifestation_dict=result_dict)
123+
124+
125+
class RDFManifestationRepository(BaseManifestationRepository):
    """
        Repository of RDF manifestations, stored under references ending in `_rdf`.
    """

    def _build_reference(self, base_reference: str) -> str:
        """
        :param base_reference: the reference received by the public methods
        :return: the base reference followed by the `_rdf` suffix
        """
        rdf_reference = f"{base_reference}_rdf"
        return rdf_reference

    def _build_manifestation_from_dict(self, manifestation_dict: dict) -> Manifestation:
        """
        :param manifestation_dict: stored document content, used as keyword arguments
        :return: an RDFManifestation built from the document
        """
        rdf_manifestation = RDFManifestation(**manifestation_dict)
        return rdf_manifestation
132+
133+
134+
class DistilledRDFManifestationRepository(RDFManifestationRepository):
    """
        Repository of distilled RDF manifestations, stored under references ending in `_distilled_rdf`.
    """

    def _build_reference(self, base_reference: str) -> str:
        """
        :param base_reference: the reference received by the public methods
        :return: the base reference followed by the `_distilled_rdf` suffix
        """
        distilled_rdf_reference = f"{base_reference}_distilled_rdf"
        return distilled_rdf_reference
137+
138+
139+
class XMLManifestationRepository(BaseManifestationRepository):
    """
        Repository of XML manifestations, stored under references ending in `_xml`.
    """

    def _build_reference(self, base_reference: str) -> str:
        """
        :param base_reference: the reference received by the public methods
        :return: the base reference followed by the `_xml` suffix
        """
        xml_reference = f"{base_reference}_xml"
        return xml_reference

    def _build_manifestation_from_dict(self, manifestation_dict: dict) -> Manifestation:
        """
        :param manifestation_dict: stored document content, used as keyword arguments
        :return: an XMLManifestation built from the document
        """
        xml_manifestation = XMLManifestation(**manifestation_dict)
        return xml_manifestation
146+
147+
148+
class METSManifestationRepository(BaseManifestationRepository):
    """
        Repository of METS manifestations, stored under references ending in `_mets`.
    """

    def _build_reference(self, base_reference: str) -> str:
        """
        :param base_reference: the reference received by the public methods
        :return: the base reference followed by the `_mets` suffix
        """
        mets_reference = f"{base_reference}_mets"
        return mets_reference

    def _build_manifestation_from_dict(self, manifestation_dict: dict) -> Manifestation:
        """
        :param manifestation_dict: stored document content, used as keyword arguments
        :return: a METSManifestation built from the document
        """
        mets_manifestation = METSManifestation(**manifestation_dict)
        return mets_manifestation

0 commit comments

Comments
 (0)