Skip to content

Commit bccbb13

Browse files
Fix bug with missing items
1 parent 0682185 commit bccbb13

2 files changed

Lines changed: 34 additions & 20 deletions

File tree

src/WikidataLabel.py

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -247,7 +247,7 @@ def _get_labels_wdapi(ids):
247247
'origin': '*',
248248
}
249249
headers = {
250-
'User-Agent': 'Wikidata Textifier'
250+
'User-Agent': 'Wikidata Textifier (embedding@wikimedia.de)'
251251
}
252252

253253
response = requests.get(
@@ -263,22 +263,25 @@ def _get_labels_wdapi(ids):
263263
return entities_data
264264

265265
@staticmethod
266-
def _compress_labels(labels):
266+
def _compress_labels(data):
267267
"""
268268
Compress labels by extracting the 'value' field from each label.
269269
270270
Parameters:
271-
- labels (dict): A dictionary of labels from Wikidata API.
271+
- data (dict): A dictionary of labels from Wikidata API.
272272
273273
Returns:
274274
- dict: A new dictionary with labels compressed to their 'value' field.
275275
"""
276276
new_labels = {}
277-
for qid, labels in labels.items():
278-
new_labels[qid] = {
279-
lang: label.get('value') \
280-
for lang,label in labels['labels'].items()
281-
}
277+
for qid, labels in data.items():
278+
if 'labels' in labels:
279+
new_labels[qid] = {
280+
lang: label.get('value') \
281+
for lang, label in labels['labels'].items()
282+
}
283+
else:
284+
new_labels[qid] = {}
282285
return new_labels
283286

284287
@staticmethod

src/WikidataTextifier.py

Lines changed: 23 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -288,8 +288,9 @@ def to_json(self):
288288
value = self.value.to_json()
289289
if isinstance(self.value, WikidataEntity):
290290
ID_name = "QID" if self.claim.datatype == 'wikibase-item' else "PID"
291+
entity_id = value.get('QID') or value.get('PID')
291292
value = {
292-
ID_name: value['QID'],
293+
ID_name: entity_id,
293294
'label': str(value['label'])
294295
}
295296

@@ -322,9 +323,10 @@ def to_triplet(self):
322323
if self.rank == 'deprecated':
323324
string += " [deprecated]"
324325

325-
qualifiers = [q.to_triplet() for q in self.qualifiers if q]
326+
qualifiers = [q.to_triplet(as_qualifier=True) \
327+
for q in self.qualifiers if q]
326328
if len(qualifiers) > 0:
327-
string += f" | {' | '.join(qualifiers)})"
329+
string += f" | {' | '.join(qualifiers)}"
328330
return string
329331

330332

@@ -427,24 +429,31 @@ def __bool__(self):
427429

428430
def to_json(self):
429431
property = self.property.to_json()
432+
property_id = property.get('PID') or property.get('QID')
430433
return {
431-
"PID": property['QID'],
434+
"PID": property_id,
432435
"property_label": property['label'],
433436
"datatype": self.datatype,
434437
"values": [v.to_json() for v in self.values if v]
435438
}
436439

437-
def to_triplet(self):
440+
def to_triplet(self, as_qualifier=False):
438441
if not self:
439442
return ''
440443

441444
label = f"{str(self.property.label)} ({self.property.id})"
442445
values = [v.to_triplet() for v in self.values if v]
443446

444447
if len(values) > 0:
445-
values = [f"{label}: {v}" for v in values]
446-
values = "\n".join(values)
447-
return values
448+
if as_qualifier:
449+
# For qualifiers: join multiple values with comma on same line
450+
values_str = ", ".join(values)
451+
return f"{label}: {values_str}"
452+
else:
453+
# For main claims: each value gets its own line
454+
values = [f"{label}: {v}" for v in values]
455+
values = "\n".join(values)
456+
return values
448457

449458
return ''
450459

@@ -519,7 +528,7 @@ def from_id(cls, id: str,
519528
return entity
520529

521530
def __str__(self):
522-
label_str = str(self.label)
531+
label_str = str(self.label) if self.label else '<missing>'
523532
string = label_str
524533

525534
if self.description:
@@ -542,16 +551,18 @@ def __bool__(self):
542551
(str(self.label) != '')
543552

544553
def to_json(self):
554+
id_key = 'PID' if self.id.startswith('P') else 'QID'
555+
545556
return {
546-
'QID': self.id,
547-
'label': str(self.label),
557+
id_key: self.id,
558+
'label': str(self.label) if self.label else None,
548559
'description': self.description,
549560
'aliases': self.aliases,
550561
'claims': [c.to_json() for c in self.claims if c]
551562
}
552563

553564
def to_triplet(self):
554-
label = f"{str(self.label)} ({self.id})"
565+
label = f"{str(self.label) if self.label else '<missing>'} ({self.id})"
555566
attributes = []
556567
if self.description:
557568
attributes.append(f"description: {self.description}")

0 commit comments

Comments
 (0)