Skip to content

Commit 1970006

Browse files
Time to text, multilingual support
1 parent f188823 commit 1970006

4 files changed

Lines changed: 2383 additions & 728 deletions

File tree

src/WikidataLabel.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
from sqlalchemy.orm import sessionmaker
44
from sqlalchemy.types import TypeDecorator
55

6-
from .utils import get_lang_val
7-
86
from datetime import datetime, timedelta
97
import json
108
import requests
@@ -241,6 +239,19 @@ def _compress_labels(labels):
241239
}
242240
return new_labels
243241

242+
@staticmethod
243+
def get_lang_val(data, lang='en', fallback_lang=None):
244+
"""
245+
Extracts the value for a given language from a dictionary of labels.
246+
"""
247+
label = data.get(lang, data.get('mul', {}))
248+
if fallback_lang and not label:
249+
label = data.get(fallback_lang, {})
250+
251+
if isinstance(label, str):
252+
return label
253+
return label.get('value', '')
254+
244255
class LazyLabel:
245256
def __init__(self, qid, factory):
246257
self.qid = qid
@@ -269,7 +280,7 @@ def resolve_all(self):
269280

270281
def get_label(self, qid: str) -> str:
271282
label_dict = self._resolved_labels.get(qid, {})
272-
label = get_lang_val(label_dict, lang=self.lang)
283+
label = WikidataLabel.get_lang_val(label_dict, lang=self.lang)
273284
return label
274285

275286
def set_lang(self, lang: str):

src/WikidataTextifier.py

Lines changed: 29 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

2-
from .WikidataLabel import LazyLabelFactory
3-
from .utils import get_wikidata_entities_by_ids, get_lang_val
2+
from .WikidataLabel import WikidataLabel, LazyLabelFactory
3+
from .utils import get_wikidata_entities_by_ids, wikidata_time_to_text, wikidata_geolocation_to_text
44
from datetime import datetime, date
55
from dataclasses import dataclass
66
import re
@@ -49,159 +49,76 @@ def to_json(self):
4949
class WikidataCoordinates:
5050
latitude: float | None = None
5151
longitude: float | None = None
52+
string_val: str | None = None
5253

5354
@classmethod
5455
def from_raw(cls, value, lazylabel):
5556
if not isinstance(value, dict):
5657
return cls(
57-
time=None,
58-
precision=None,
59-
calendarmodel=None
58+
latitude=None,
59+
longitude=None,
60+
string_val=None
6061
)
6162

63+
string_val = wikidata_geolocation_to_text(
64+
value.get('latitude'),
65+
value.get('longitude')
66+
)
67+
6268
return cls(
6369
latitude=value.get('latitude'),
64-
longitude=value.get('longitude')
70+
longitude=value.get('longitude'),
71+
string_val=string_val
6572
)
6673

6774
def __str__(self):
68-
latitude = abs(self.latitude)
69-
hemi = 'N' if self.latitude >= 0 else 'S'
70-
71-
degrees = int(latitude)
72-
minutes_full = (latitude - degrees) * 60
73-
minutes = int(minutes_full)
74-
seconds = (minutes_full - minutes) * 60
75-
76-
# Round to-tenth of a second, drop trailing .0
77-
seconds = round(seconds, 1)
78-
seconds_str = f"{seconds}".rstrip("0").rstrip(".")
79-
80-
lat_str = f"{degrees}°{minutes}'{seconds_str}\"{hemi}"
81-
82-
longitude = abs(self.longitude)
83-
hemi = 'E' if self.longitude >= 0 else 'W'
84-
85-
degrees = int(longitude)
86-
minutes_full = (longitude - degrees) * 60
87-
minutes = int(minutes_full)
88-
seconds = (minutes_full - minutes) * 60
89-
90-
# Round to-tenth of a second, drop trailing .0
91-
seconds = round(seconds, 1)
92-
seconds_str = f"{seconds}".rstrip("0").rstrip(".")
93-
94-
lon_str = f"{degrees}°{minutes}'{seconds_str}\"{hemi}"
95-
96-
return f'{lat_str}, {lon_str}'
75+
return self.string_val or ''
9776

9877
def to_json(self):
9978
return {
10079
'latitude': self.latitude,
101-
'longitude': self.longitude
80+
'longitude': self.longitude,
81+
'string': self.string_val
10282
}
10383

10484
@dataclass
10585
class WikidataTime:
10686
time: str | None = None
10787
precision: int | None = None
10888
calendarmodel: str | None = None
89+
string_val: str | None = None
10990

11091
@classmethod
11192
def from_raw(cls, value, lazylabel):
11293
if not isinstance(value, dict):
11394
return cls(
11495
time=None,
11596
precision=None,
116-
calendarmodel=None
97+
calendarmodel=None,
98+
string_val=None
11799
)
118100

119101
calendarmodel = value.get('calendarmodel', 'Q1985786')
120102
calendarmodel = calendarmodel.split('/')[-1]
103+
104+
string_val = wikidata_time_to_text(value, lazylabel.lang)
105+
121106
return cls(
122107
time=value.get('time'),
123108
precision=value.get('precision'),
124-
calendarmodel=calendarmodel
109+
calendarmodel=calendarmodel,
110+
string_val=string_val
125111
)
126112

127113
def __str__(self):
128-
# Use regex to parse the time string
129-
pattern = r'([+-])(\d{1,16})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z'
130-
match = re.match(pattern, self.time)
131-
132-
if not match:
133-
raise ValueError("Malformed time string")
134-
135-
sign, year_str, month_str, day_str, hour_str, minute_str, second_str = match.groups()
136-
year = int(year_str) * (1 if sign == '+' else -1)
137-
138-
# Convert Julian to Gregorian if necessary
139-
if 'Q1985786' in self.calendarmodel and year > 1 and len(str(abs(year))) <= 4: # Julian calendar
140-
try:
141-
month = 1 if month_str == '00' else int(month_str)
142-
day = 1 if day_str == '00' else int(day_str)
143-
julian_date = date(year, month, day)
144-
gregorian_ordinal = julian_date.toordinal() + (datetime(1582, 10, 15).toordinal() - datetime(1582, 10, 5).toordinal())
145-
gregorian_date = date.fromordinal(gregorian_ordinal)
146-
year, month, day = gregorian_date.year, gregorian_date.month, gregorian_date.day
147-
except ValueError:
148-
raise ValueError("Invalid date for Julian calendar")
149-
else:
150-
month = int(month_str) if month_str != '00' else 1
151-
day = int(day_str) if day_str != '00' else 1
152-
153-
# Next step: take translations from Wikidata Labels
154-
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
155-
month_str = months[month - 1] if month != 0 else ''
156-
era = 'AD' if year > 0 else 'BC'
157-
158-
if self.precision == 14:
159-
return f"{year} {month_str} {day} {hour_str}:{minute_str}:{second_str}"
160-
elif self.precision == 13:
161-
return f"{year} {month_str} {day} {hour_str}:{minute_str}"
162-
elif self.precision == 12:
163-
return f"{year} {month_str} {day} {hour_str}:00"
164-
elif self.precision == 11:
165-
return f"{day} {month_str} {year}"
166-
elif self.precision == 10:
167-
return f"{month_str} {year}"
168-
elif self.precision == 9:
169-
return f"{abs(year)} {era}"
170-
elif self.precision == 8:
171-
decade = (year // 10) * 10
172-
return f"{abs(decade)}s {era}"
173-
elif self.precision == 7:
174-
century = (abs(year) - 1) // 100 + 1
175-
return f"{century}th century {era}"
176-
elif self.precision == 6:
177-
millennium = (abs(year) - 1) // 1000 + 1
178-
return f"{millennium}th millennium {era}"
179-
elif self.precision == 5:
180-
tens_of_thousands = abs(year) // 10000
181-
return f"{tens_of_thousands} ten thousand years {era}"
182-
elif self.precision == 4:
183-
hundreds_of_thousands = abs(year) // 100000
184-
return f"{hundreds_of_thousands} hundred thousand years {era}"
185-
elif self.precision == 3:
186-
millions = abs(year) // 1000000
187-
return f"{millions} million years {era}"
188-
elif self.precision == 2:
189-
tens_of_millions = abs(year) // 10000000
190-
return f"{tens_of_millions} tens of millions of years {era}"
191-
elif self.precision == 1:
192-
hundreds_of_millions = abs(year) // 100000000
193-
return f"{hundreds_of_millions} hundred million years {era}"
194-
elif self.precision == 0:
195-
billions = abs(year) // 1000000000
196-
return f"{billions} billion years {era}"
197-
else:
198-
raise ValueError(f"Unknown precision value {self.precision}")
114+
return self.string_val or ''
199115

200116
def to_json(self):
201117
return {
202118
'time': self.time,
203119
'precision': self.precision,
204-
'calendar_QID': self.calendarmodel
120+
'calendar_QID': self.calendarmodel,
121+
'string': self.string_val
205122
}
206123

207124
@dataclass
@@ -493,8 +410,8 @@ def from_id(cls, id: str, lang: str = 'en', external_ids: bool = True):
493410
if 'labels' not in entity_dict:
494411
return None
495412

496-
label = get_lang_val(entity_dict['labels'], lang)
497-
description = get_lang_val(entity_dict['descriptions'], lang)
413+
label = WikidataLabel.get_lang_val(entity_dict['labels'], lang)
414+
description = WikidataLabel.get_lang_val(entity_dict['descriptions'], lang)
498415

499416
aliases = entity_dict['aliases'].get(lang, []) + \
500417
entity_dict['aliases'].get('mul', [])

0 commit comments

Comments
 (0)