Skip to content

Commit 0873dbd

Browse files
Including a triplet format
1 parent 586aa02 commit 0873dbd

4 files changed

Lines changed: 546 additions & 14 deletions

File tree

README.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# Wikidata Textifier
22

3-
**Wikidata Textifier** is an API that transforms Wikidata items into compact JSON formats or textual representations for use in LLMs and GenAI applications. It resolves missing labels of properties and claim values by querying the Wikidata Action API, making it efficient and suitable for AI pipelines.
3+
**Wikidata Textifier** is an API that transforms Wikidata items into compact formats (text, triplets, or JSON) for use in LLMs and GenAI applications. It resolves missing labels of properties and claim values by querying the Wikidata Action API, making it efficient and suitable for AI pipelines.
44

55
🔗 Live API: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/)
6+
67
---
78

89
## Functionalities
@@ -15,6 +16,14 @@
1516

1617
---
1718

19+
## Formats
20+
21+
- **Text**: A textual representation or summary of the Wikidata item, including its label, description, aliases, and claims. Useful for helping LLMs understand what the item represents.
22+
- **Triplet**: Outputs each triplet as a structured line, including labels and IDs, but omits descriptions and aliases. Ideal for agentic LLMs to traverse and explore Wikidata.
23+
- **JSON**: A structured and compact representation of the full item, suitable for custom formats.
24+
25+
---
26+
1827
## API Usage
1928

2029
### `GET /`
@@ -25,5 +34,5 @@
2534
|----------------|---------|----------|-----------------------------------------------------------------------------|
2635
| `id` | string | Yes | Wikidata item ID (e.g., `Q42`) |
2736
| `lang` | string | No | Language code for labels (default: `en`) |
28-
| `json` | bool | No | If `true`, returns JSON. If `false`, returns text representation (default: `false`) |
37+
| `format` | string | No | The format of the response: `json`, `text`, or `triplet` (default: `json`) |
2938
| `external_ids` | bool | No | Whether to include external IDs in the output (default: `true`) |

main.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
"description": "Missing or invalid query parameter",
4141
"content": {
4242
"application/json": {
43-
"example": {"detail": "ID is missing"}
43+
"example": {"detail": "Invalid format specified"}
4444
}
4545
},
4646
},
@@ -50,15 +50,15 @@ async def property_query_route(
5050
request: Request,
5151
id: str = Query(..., examples="Q42"),
5252
lang: str = 'en',
53-
json: bool = False,
53+
format: str = 'json',
5454
external_ids: bool = True
5555
):
5656
"""
5757
Retrieve a Wikidata item with all labels or textual representations for an LLM.
5858
5959
Args:
6060
id (str): The Wikidata item ID (e.g., "Q42").
61-
json (bool): If True, returns the item in JSON format.
61+
format (str): The format of the response, either 'json', 'text', or 'triplet'.
6262
lang (str): The language code for labels (default is 'en').
6363
external_ids (bool): If True, includes external IDs in the response.
6464
@@ -80,10 +80,15 @@ async def property_query_route(
8080
response = "Item not found"
8181
return HTTPException(status_code=404, detail=response)
8282

83-
if json:
83+
if format == 'json':
8484
results = entity.to_json()
85-
else:
85+
elif format == 'triplet':
86+
results = entity.to_triplet()
87+
elif format == 'text':
8688
results = str(entity)
89+
else:
90+
response = "Invalid format specified"
91+
return HTTPException(status_code=422, detail=response)
8792

8893
return results
8994
except Exception as e:

src/WikidataTextifier.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,9 @@ def __str__(self):
314314
return ''
315315

316316
string = str(self.value)
317-
attributes = [str(q) for q in self.qualifiers if q]
318-
if len(attributes) > 0:
319-
string += f" ({', '.join(attributes)})"
317+
qualifiers = [str(q) for q in self.qualifiers if q]
318+
if len(qualifiers) > 0:
319+
string += f" ({', '.join(qualifiers)})"
320320
return string
321321

322322
def __bool__(self):
@@ -345,6 +345,19 @@ def to_json(self):
345345
"qualifiers": [q.to_json() for q in self.qualifiers if q]
346346
}
347347

348+
def to_triplet(self):
    """Render this claim value as the object part of a triplet line.

    Returns:
        str: An empty string when the value is falsy. Otherwise the value
        as text -- ``"<label> (<id>)"`` when the value is a
        ``WikidataEntity``, ``str(self.value)`` for anything else --
        followed by `` | ``-separated qualifier triplets when at least one
        truthy qualifier exists.
    """
    if not self:
        return ''

    if isinstance(self.value, WikidataEntity):
        # Entities are shown with both label and ID.
        string = f"{self.value.label} ({self.value.id})"
    else:
        string = str(self.value)

    qualifiers = [q.to_triplet() for q in self.qualifiers if q]
    if qualifiers:
        # No closing parenthesis here: the separator is " | ", so the
        # previous trailing ")" left the output unbalanced.
        string += f" | {' | '.join(qualifiers)}"
    return string
360+
348361

349362
@dataclass
350363
class WikidataClaim:
@@ -420,17 +433,14 @@ def __str__(self):
420433
if not self:
421434
return ''
422435

423-
if not str(self.property.label):
424-
return ''
425-
426436
values = [str(v) for v in self.values if v]
427437
values = ", ".join(values)
428438

429439
return f"{str(self.property.label)}: {values}"
430440

431441
def __bool__(self):
432442
return (self.property is not None) and \
433-
str(self.property) != '' and \
443+
str(self.property.label) != '' and \
434444
(len(self.values) > 0) and \
435445
any(bool(val) for val in self.values)
436446

@@ -443,6 +453,20 @@ def to_json(self):
443453
"values": [v.to_json() for v in self.values if v]
444454
}
445455

456+
def to_triplet(self):
    """Render this claim as triplet lines, one line per truthy value.

    Each line has the form ``"<property label> (<property id>): <value>"``,
    where ``<value>`` is the result of the value's own ``to_triplet()``.

    Returns:
        str: The lines joined with newlines, or an empty string when the
        claim is falsy or none of its values is truthy.
    """
    if not self:
        return ''

    prefix = f"{self.property.label!s} ({self.property.id})"
    # Joining an empty sequence yields '', so no separate empty case is needed.
    return "\n".join(
        f"{prefix}: {value.to_triplet()}" for value in self.values if value
    )
469+
446470

447471
@dataclass
448472
class WikidataEntity:
@@ -540,3 +564,15 @@ def to_json(self):
540564
'aliases': self.aliases,
541565
'claims': [c.to_json() for c in self.claims if c]
542566
}
567+
568+
def to_triplet(self):
    """Render the entity as triplet lines prefixed with its label and ID.

    Every line produced by a truthy claim's ``to_triplet()`` is prefixed
    with ``"<label> (<id>): "``. A claim may produce several
    newline-separated lines; each of them receives the prefix.

    Returns:
        str: The prefixed lines joined with newlines, or just
        ``"<label> (<id>)"`` when no truthy claim exists.
    """
    subject = f"{self.label!s} ({self.id})"

    # Flatten multi-line claim output into individual triplet lines.
    lines = []
    for claim in self.claims:
        if claim:
            lines.extend(claim.to_triplet().split("\n"))

    if not lines:
        return subject

    return "\n".join(f"{subject}: {line}" for line in lines)

0 commit comments

Comments
 (0)