Skip to content

Commit 3ac6f0a

Browse files
Adapting text format in multiple languages
1 parent c5ade67 commit 3ac6f0a

11 files changed

Lines changed: 245 additions & 91 deletions

docker-compose.yml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,10 @@ services:
6969
wikibase:
7070
condition: service_healthy
7171
environment:
72+
LABEL_UNLIMITED: ${LABEL_UNLIMITED}
73+
LABEL_TTL_DAYS: ${LABEL_TTL_DAYS}
74+
LABEL_MAX_ROWS: ${LABEL_MAX_ROWS}
75+
7276
WIKIBASE_HOST: wikibase
7377
DB_HOST: db
7478
DB_NAME: ${DB_NAME_LABEL}

main.py

Lines changed: 19 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
from fastapi.middleware.cors import CORSMiddleware
33
from fastapi import BackgroundTasks
44
import traceback
5+
import requests
56

67
from src.Normalizer import TTLNormalizer, JSONNormalizer
78
from src.WikidataLabel import WikidataLabel
@@ -26,6 +27,10 @@
2627
allow_headers=["*"],
2728
)
2829

30+
@app.on_event("startup")
31+
async def startup():
32+
WikidataLabel.initialize_database()
33+
2934
@app.get(
3035
"/",
3136
responses={
@@ -58,6 +63,7 @@ async def get_textified_wd(
5863
external_ids: bool = True,
5964
references: bool = False,
6065
all_ranks: bool = False,
66+
qualifiers: bool = True,
6167
fallback_lang: str = 'en'
6268
):
6369
"""
@@ -71,17 +77,13 @@ async def get_textified_wd(
7177
external_ids (bool): If True, includes external IDs in the response.
7278
all_ranks (bool): If True, includes statements of all ranks (preferred, normal, deprecated).
7379
references (bool): If True, includes references in the response. (only available in JSON format)
80+
qualifiers (bool): If True, includes qualifiers in the response.
7481
fallback_lang (str): The fallback language code if the preferred language is not available.
7582
7683
Returns:
7784
list: A list of dictionaries containing QIDs and the similarity scores.
7885
"""
7986
try:
80-
81-
if not id:
82-
response = "ID is missing"
83-
return HTTPException(status_code=422, detail=response)
84-
8587
filter_pids = []
8688
if pid:
8789
filter_pids = [p.strip() for p in pid.split(',')]
@@ -94,7 +96,7 @@ async def get_textified_wd(
9496
entity_data = utils.get_wikidata_ttl_by_id(qids[0], lang=lang)
9597
if not entity_data:
9698
response = "ID not found"
97-
return HTTPException(status_code=404, detail=response)
99+
raise HTTPException(status_code=404, detail=response)
98100

99101
entity_data = TTLNormalizer(
100102
entity_id=qids[0],
@@ -109,15 +111,16 @@ async def get_textified_wd(
109111
external_ids=external_ids,
110112
all_ranks=all_ranks,
111113
references=references,
112-
filter_pids=filter_pids
114+
filter_pids=filter_pids,
115+
qualifiers=qualifiers,
113116
)
114117
}
115118
else:
116119
# JSON is used with Action API for bulk retrieval
117120
entity_data = utils.get_wikidata_json_by_ids(qids)
118121
if not entity_data:
119122
response = "IDs not found"
120-
return HTTPException(status_code=404, detail=response)
123+
raise HTTPException(status_code=404, detail=response)
121124

122125
entity_data = {
123126
qid: JSONNormalizer(
@@ -135,7 +138,8 @@ async def get_textified_wd(
135138
external_ids=external_ids,
136139
all_ranks=all_ranks,
137140
references=references,
138-
filter_pids=filter_pids
141+
filter_pids=filter_pids,
142+
qualifiers=qualifiers
139143
) if entity else None
140144
for qid, entity in entity_data.items()
141145
}
@@ -147,7 +151,7 @@ async def get_textified_wd(
147151
continue
148152

149153
if format == 'text':
150-
results = str(entity)
154+
results = entity.to_text(lang)
151155
elif format == 'triplet':
152156
results = entity.to_triplet()
153157
else:
@@ -158,6 +162,10 @@ async def get_textified_wd(
158162
background_tasks.add_task(WikidataLabel.delete_old_labels)
159163
return return_data
160164

161-
except Exception as e:
165+
except HTTPException:
166+
raise
167+
except requests.RequestException:
168+
raise HTTPException(status_code=502, detail="Upstream service unavailable")
169+
except Exception:
162170
traceback.print_exc()
163171
raise HTTPException(status_code=500, detail="Internal Server Error")

requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# This file was autogenerated by uv via the following command:
22
# uv pip compile pyproject.toml --output-file requirements.txt --no-deps
3-
babel==2.17.0
4-
# via wikidatatextifier (pyproject.toml)
53
fastapi==0.116.1
64
# via wikidatatextifier (pyproject.toml)
75
gunicorn==23.0.0
86
# via wikidatatextifier (pyproject.toml)
9-
python-dateutil==2.9.0.post0
7+
pymysql==1.1.2
8+
# via wikidatatextifier (pyproject.toml)
9+
rdflib==7.6.0
1010
# via wikidatatextifier (pyproject.toml)
1111
requests==2.32.4
1212
# via wikidatatextifier (pyproject.toml)

src/Normalizer/JSONNormalizer.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from __future__ import annotations
22

33
from typing import Any, Dict, List, Optional
4+
import requests
45

56
from ..WikidataLabel import WikidataLabel, LazyLabelFactory
6-
from ..WikidataTextifier import (
7+
from ..Textifier.WikidataTextifier import (
78
WikidataClaim,
89
WikidataClaimValue,
910
WikidataCoordinates,
@@ -45,6 +46,7 @@ def normalize(
4546
external_ids: bool = True,
4647
references: bool = False,
4748
all_ranks: bool = False,
49+
qualifiers: bool = True,
4850
filter_pids: List[str] = [],
4951
) -> WikidataEntity:
5052
e = self.entity_json
@@ -91,6 +93,7 @@ def normalize(
9193
external_ids=external_ids,
9294
include_references=references,
9395
all_ranks=all_ranks,
96+
qualifiers=qualifiers
9497
)
9598
if claim_obj and claim_obj.values:
9699
claims_out.append(claim_obj)
@@ -116,6 +119,7 @@ def _build_claim(
116119
external_ids: bool,
117120
include_references: bool,
118121
all_ranks: bool,
122+
qualifiers: bool,
119123
) -> Optional[WikidataClaim]:
120124
datatype = self._claim_datatype_from_statements(statements) or "string"
121125
if (not external_ids) and datatype == "external-id":
@@ -141,6 +145,7 @@ def _build_claim(
141145
statement=st,
142146
datatype=datatype,
143147
include_references=include_references,
148+
qualifiers=qualifiers
144149
)
145150
if cv is not None:
146151
values.append(cv)
@@ -187,6 +192,7 @@ def _build_claim_value(
187192
statement: Dict[str, Any],
188193
datatype: str,
189194
include_references: bool,
195+
qualifiers: bool,
190196
) -> Optional[WikidataClaimValue]:
191197
mainsnak = statement.get("mainsnak", statement)
192198
if not isinstance(mainsnak, dict):
@@ -206,7 +212,9 @@ def _build_claim_value(
206212
datavalue = mainsnak.get("datavalue")
207213
value_obj = self._to_value_object(datatype, datavalue)
208214

209-
qualifiers_obj = self._parse_qualifiers(statement.get("qualifiers", {}) or {})
215+
qualifiers_obj: List[WikidataClaim] = []
216+
if qualifiers:
217+
qualifiers_obj = self._parse_qualifiers(statement.get("qualifiers", {}) or {})
210218
references_obj: List[List[WikidataClaim]] = []
211219
if include_references:
212220
references_obj = self._parse_references(statement.get("references", []) or [])
@@ -336,7 +344,7 @@ def _to_value_object(
336344
dv_val,
337345
self.lang,
338346
)
339-
except (ValueError, TypeError) as e:
347+
except (ValueError, TypeError, KeyError, requests.RequestException) as e:
340348
if self.debug:
341349
print(f"Warning: Failed to parse time value {time_val}: {e}")
342350
return None
@@ -376,7 +384,7 @@ def _to_value_object(
376384

377385
try:
378386
string_val = wikidata_geolocation_to_text(dv_val, self.lang)
379-
except (ValueError, TypeError) as e:
387+
except (ValueError, TypeError, KeyError, requests.RequestException) as e:
380388
if self.debug:
381389
print(f"Warning: Failed to parse coordinates ({lat}, {lon}): {e}")
382390
return None

src/Normalizer/TTLNormalizer.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
from collections import defaultdict
44
from typing import Any, DefaultDict, Dict, List, Optional, Set
5+
import requests
56

67
from rdflib import Graph, Literal, Namespace, URIRef
78
from rdflib.namespace import RDF, RDFS
89

910
from ..WikidataLabel import WikidataLabel, LazyLabelFactory
10-
from ..WikidataTextifier import (
11+
from ..Textifier.WikidataTextifier import (
1112
WikidataClaim,
1213
WikidataClaimValue,
1314
WikidataCoordinates,
@@ -80,6 +81,7 @@ def normalize(
8081
external_ids: bool = True,
8182
references: bool = False,
8283
all_ranks: bool = False,
84+
qualifiers: bool = True,
8385
filter_pids: List[str] = []
8486
) -> WikidataEntity:
8587
# Preload labels found inside TTL so LazyLabelFactory can avoid lookups.
@@ -107,6 +109,7 @@ def normalize(
107109
external_ids=external_ids,
108110
include_references=references,
109111
all_ranks=all_ranks,
112+
qualifiers=qualifiers,
110113
filter_pids=filter_pids
111114
)
112115

@@ -124,6 +127,7 @@ def normalize(
124127
pid=pid,
125128
statements=statements,
126129
include_references=references,
130+
qualifiers=qualifiers,
127131
)
128132
for pid, statements in claims_dict.items()
129133
if statements
@@ -162,6 +166,7 @@ def _claims_for_subject(
162166
external_ids: bool,
163167
include_references: bool,
164168
all_ranks: bool,
169+
qualifiers: bool,
165170
filter_pids: List[str] = []
166171
) -> Dict[str, List[Dict[str, Any]]]:
167172
"""Return mapping: pid -> list of statement dicts."""
@@ -195,7 +200,7 @@ def _claims_for_subject(
195200
is_special = self._is_special_main_value(obj, pid)
196201
main = None if is_special else self._main_value(obj, pid, datatype)
197202

198-
qualifiers = self._qualifiers(obj)
203+
qualifiers_data = self._qualifiers(obj) if qualifiers else {}
199204
refs = self._references(obj) if include_references else []
200205

201206
out[pid].append(
@@ -204,7 +209,7 @@ def _claims_for_subject(
204209
"datatype": datatype,
205210
"rank": rank,
206211
"main": main,
207-
"qualifiers": qualifiers if qualifiers else {},
212+
"qualifiers": qualifiers_data if qualifiers_data else {},
208213
"references": refs if refs else [],
209214
"is_special_value": is_special,
210215
}
@@ -273,6 +278,7 @@ def _build_claim_object(
273278
pid: str,
274279
statements: List[Dict[str, Any]],
275280
include_references: bool,
281+
qualifiers: bool = True,
276282
) -> WikidataClaim:
277283
prop_ent = WikidataEntity(
278284
id=pid,
@@ -291,15 +297,17 @@ def _build_claim_object(
291297
print(f"{pid}: {st.get('main')} (special: {st.get('is_special_value', False)})")
292298

293299
value_obj = self._to_value_object(st["datatype"], st.get("main"))
294-
295-
qualifiers_obj: List[WikidataClaim] = [
296-
self._build_snak_claim(
297-
pid=qpid,
298-
datatype=self._prop_datatype(qpid),
299-
snaks=qsnaks,
300-
)
301-
for qpid, qsnaks in (st.get("qualifiers") or {}).items()
302-
]
300+
qualifiers_obj: List[WikidataClaim] = []
301+
302+
if qualifiers:
303+
qualifiers_obj = [
304+
self._build_snak_claim(
305+
pid=qpid,
306+
datatype=self._prop_datatype(qpid),
307+
snaks=qsnaks,
308+
)
309+
for qpid, qsnaks in (st.get("qualifiers") or {}).items()
310+
]
303311

304312
refs_obj: List[List[WikidataClaim]] = []
305313
if include_references:
@@ -409,7 +417,7 @@ def _to_value_object(self, datatype: str, parsed: Any) -> Any:
409417
parsed,
410418
self.lang,
411419
)
412-
except (ValueError, TypeError) as e:
420+
except (ValueError, TypeError, KeyError, requests.RequestException) as e:
413421
if self.debug:
414422
print(f"Warning: Failed to parse time value {time_val}: {e}")
415423
return None
@@ -436,7 +444,7 @@ def _to_value_object(self, datatype: str, parsed: Any) -> Any:
436444

437445
try:
438446
string_val = wikidata_geolocation_to_text(parsed, self.lang)
439-
except (ValueError, TypeError) as e:
447+
except (ValueError, TypeError, KeyError, requests.RequestException) as e:
440448
if self.debug:
441449
print(f"Warning: Failed to parse coordinates ({lat}, {lon}): {e}")
442450
return None

0 commit comments

Comments
 (0)