Skip to content

Commit 586aa02

Browse files
External ID filtering
1 parent 21afe15 commit 586aa02

4 files changed

Lines changed: 95 additions & 29 deletions

File tree

README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Wikidata Textifier
2+
3+
**Wikidata Textifier** is an API that transforms Wikidata items into compact JSON formats or textual representations for use in LLMs and GenAI applications. It resolves missing labels of properties and claim values by querying the Wikidata Action API, making it efficient and suitable for AI pipelines.
4+
5+
🔗 Live API: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/)
6+
---
7+
8+
## Features
9+
10+
- **Textifies** any Wikidata item into a readable or JSON format suitable for LLMs.
11+
- **Resolves all labels**, including property and claim-value labels that are missing from the raw Wikidata API response.
12+
- **Caches labels** for 90 days to boost performance and reduce API load.
13+
- **Avoids SPARQL** and uses the Wikidata Action API for better efficiency and compatibility.
14+
- **Hosted on Toolforge**: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/)
15+
16+
---
17+
18+
## API Usage
19+
20+
### `GET /`
21+
22+
#### Query Parameters
23+
24+
| Name | Type | Required | Description |
25+
|----------------|---------|----------|-----------------------------------------------------------------------------|
26+
| `id` | string | Yes | Wikidata item ID (e.g., `Q42`) |
27+
| `lang` | string | No | Language code for labels (default: `en`) |
28+
| `json` | bool | No | If `true`, returns JSON. If `false`, returns a text representation (default: `false`) |
29+
| `external_ids` | bool | No | Whether to include external IDs in the output (default: `true`) |

main.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,16 @@ async def property_query_route(
5151
id: str = Query(..., examples="Q42"),
5252
lang: str = 'en',
5353
json: bool = False,
54+
external_ids: bool = True
5455
):
5556
"""
5657
Retrieve a Wikidata item with all labels or textual representations for an LLM.
5758
5859
Args:
5960
id (str): The Wikidata item ID (e.g., "Q42").
6061
json (bool): If True, returns the item in JSON format.
62+
lang (str): The language code for labels (default is 'en').
63+
external_ids (bool): If True, includes external IDs in the response.
6164
6265
Returns:
6366
dict | str: The item in JSON format if `json` is True, otherwise its textual representation.
@@ -67,7 +70,11 @@ async def property_query_route(
6770
return HTTPException(status_code=422, detail=response)
6871

6972
try:
70-
entity = WikidataEntity.from_id(id, lang=lang)
73+
entity = WikidataEntity.from_id(
74+
id,
75+
lang=lang,
76+
external_ids=external_ids
77+
)
7178

7279
if not entity:
7380
response = "Item not found"

src/WikidataLabel.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
from sqlalchemy.ext.declarative import declarative_base
33
from sqlalchemy.orm import sessionmaker
44
from sqlalchemy.types import TypeDecorator
5+
6+
from .utils import get_lang_val
7+
58
from datetime import datetime, timedelta
69
import json
710
import requests
@@ -164,7 +167,6 @@ def get_bulk_labels(ids):
164167
# Fallback when labels are missing from the database
165168
missing_ids = set(ids) - set(labels.keys())
166169
if missing_ids:
167-
print(f"Missing IDs: {missing_ids}")
168170
missing_labels = WikidataLabel._get_labels_wdapi(missing_ids)
169171
labels.update(missing_labels)
170172

@@ -267,10 +269,8 @@ def resolve_all(self):
267269

268270
def get_label(self, qid: str) -> str:
269271
label_dict = self._resolved_labels.get(qid, {})
270-
label = label_dict.get(self.lang) or label_dict.get('mul') or ''
271-
if isinstance(label, dict):
272-
return label.get('value', '')
273-
return label or ''
272+
label = get_lang_val(label_dict, lang=self.lang)
273+
return label
274274

275275
def set_lang(self, lang: str):
276276
self.lang = lang

src/WikidataTextifier.py

Lines changed: 53 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,11 @@ class WikidataClaimValue:
253253
@classmethod
254254
def from_raw(cls, claim, value, qualifiers, lazylabel):
255255
if value.get('value') is None:
256-
parsed_value = None
256+
return cls(
257+
claim=claim,
258+
value=None,
259+
qualifiers=[]
260+
)
257261
elif value.get('type') == 'wikibase-entityid':
258262
parsed_value = WikidataEntity(
259263
id=value['value']['id'],
@@ -306,14 +310,20 @@ def from_raw(cls, claim, value, qualifiers, lazylabel):
306310
)
307311

308312
def __str__(self):
313+
if not self:
314+
return ''
315+
309316
string = str(self.value)
310-
attributes = [str(q) for q in self.qualifiers if str(q) != '']
317+
attributes = [str(q) for q in self.qualifiers if q]
311318
if len(attributes) > 0:
312319
string += f" ({', '.join(attributes)})"
313320
return string
314321

322+
def __bool__(self):
323+
return (self.value is not None) and str(self.value) != ''
324+
315325
def to_json(self):
316-
if self.value is None:
326+
if not self:
317327
return None
318328

319329
value = self.value.to_json()
@@ -324,10 +334,15 @@ def to_json(self):
324334
'label': value['label']
325335
}
326336

337+
qualifiers = [q.to_json() for q in self.qualifiers if q]
338+
if len(qualifiers) == 0:
339+
return {
340+
"value": value
341+
}
342+
327343
return {
328344
"value": value,
329-
"qualifiers": [q.to_json() for q in self.qualifiers \
330-
if q is not None]
345+
"qualifiers": [q.to_json() for q in self.qualifiers if q]
331346
}
332347

333348

@@ -339,7 +354,7 @@ class WikidataClaim:
339354
datatype: str
340355

341356
@classmethod
342-
def from_raw(cls, subject, property, claim, lazylabel):
357+
def from_raw(cls, subject, property, claim, lazylabel, external_ids=True):
343358
if not claim:
344359
return cls(
345360
subject=subject,
@@ -348,6 +363,16 @@ def from_raw(cls, subject, property, claim, lazylabel):
348363
datatype='empty'
349364
)
350365

366+
datatype = claim[0].get('mainsnak', claim[0])\
367+
.get('datatype', {})
368+
if not external_ids and datatype == 'external-id':
369+
return cls(
370+
subject=subject,
371+
property=property,
372+
values=[],
373+
datatype=datatype
374+
)
375+
351376
rank_preferred_found = False
352377
for i in range(len(claim)):
353378
claim[i]['datavalue'] = claim[i].get('mainsnak', claim[i])\
@@ -369,8 +394,6 @@ def from_raw(cls, subject, property, claim, lazylabel):
369394
claim[i]['include'] = rank_normal_condition or \
370395
rank_preferred_condition
371396

372-
datatype = claim[0].get('mainsnak', claim[0])\
373-
.get('datatype', {})
374397
values = [
375398
WikidataClaimValue.from_raw(
376399
claim=None,
@@ -394,28 +417,30 @@ def from_raw(cls, subject, property, claim, lazylabel):
394417

395418

396419
def __str__(self):
397-
if str(self.property.label) == '':
420+
if not self:
398421
return ''
399422

400-
if len(self.values) == 0:
401-
values = "no value"
402-
else:
403-
values = [str(val) for val in self.values if str(val) != '']
404-
values = ", ".join(values)
405-
406-
if values == '':
423+
if not str(self.property.label):
407424
return ''
408425

426+
values = [str(v) for v in self.values if v]
427+
values = ", ".join(values)
428+
409429
return f"{str(self.property.label)}: {values}"
410430

431+
def __bool__(self):
432+
return (self.property is not None) and \
433+
str(self.property) != '' and \
434+
(len(self.values) > 0) and \
435+
any(bool(val) for val in self.values)
436+
411437
def to_json(self):
412438
property = self.property.to_json()
413439
return {
414440
"PID": property['QID'],
415441
"property_label": property['label'],
416442
"datatype": self.datatype,
417-
"values": [v.to_json() for v in self.values \
418-
if v is not None]
443+
"values": [v.to_json() for v in self.values if v]
419444
}
420445

421446

@@ -429,7 +454,7 @@ class WikidataEntity:
429454
claims: list[WikidataClaim]
430455

431456
@classmethod
432-
def from_id(cls, id: str, lang: str = 'en'):
457+
def from_id(cls, id: str, lang: str = 'en', external_ids: bool = True):
433458
entity_dict = get_wikidata_entities_by_ids(id)
434459
if id not in entity_dict:
435460
raise ValueError(f"ID not found.")
@@ -459,7 +484,8 @@ def from_id(cls, id: str, lang: str = 'en'):
459484
claims=[]
460485
),
461486
claim=claim,
462-
lazylabel=lazylabel
487+
lazylabel=lazylabel,
488+
external_ids=external_ids
463489
) for pid, claim in entity_dict.get('claims', {}).items()
464490
]
465491

@@ -492,7 +518,7 @@ def __str__(self):
492518
if self.aliases:
493519
string += f", also known as {', '.join(map(str, self.aliases))}"
494520

495-
attributes = [str(c) for c in self.claims if str(c) != '']
521+
attributes = [str(c) for c in self.claims if c]
496522
if len(attributes) > 0:
497523
attributes = "\n- ".join(attributes)
498524
string += f". Attributes:\n- {attributes}"
@@ -501,12 +527,16 @@ def __str__(self):
501527

502528
return string
503529

530+
def __bool__(self):
531+
return (self.id is not None) and \
532+
(self.label is not None) and \
533+
(str(self.label) != '')
534+
504535
def to_json(self):
505536
return {
506537
'QID': self.id,
507538
'label': str(self.label),
508539
'description': self.description,
509540
'aliases': self.aliases,
510-
'claims': [c.to_json() for c in self.claims \
511-
if c is not None]
541+
'claims': [c.to_json() for c in self.claims if c]
512542
}

0 commit comments

Comments
 (0)