Skip to content

Commit c121082

Browse files
Limit database cleanup to once per hour
1 parent 6b81aa2 commit c121082

3 files changed

Lines changed: 24 additions & 39 deletions

File tree

main.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from fastapi import BackgroundTasks
44
import traceback
55
import requests
6+
import time
7+
import os
68

79
from src.Normalizer import TTLNormalizer, JSONNormalizer
810
from src.WikidataLabel import WikidataLabel, LazyLabelFactory
@@ -27,6 +29,9 @@
2729
allow_headers=["*"],
2830
)
2931

32+
LABEL_CLEANUP_INTERVAL_SECONDS = int(os.environ.get("LABEL_CLEANUP_INTERVAL_SECONDS", 3600))
33+
_last_label_cleanup = 0.0
34+
3035
@app.on_event("startup")
3136
async def startup():
3237
WikidataLabel.initialize_database()
@@ -169,7 +174,11 @@ async def get_textified_wd(
169174

170175
return_data[qid] = results
171176

172-
background_tasks.add_task(WikidataLabel.delete_old_labels)
177+
global _last_label_cleanup
178+
if time.time() - _last_label_cleanup > LABEL_CLEANUP_INTERVAL_SECONDS:
179+
background_tasks.add_task(WikidataLabel.delete_old_labels)
180+
_last_label_cleanup = time.time()
181+
173182
return return_data
174183

175184
except HTTPException:

src/WikidataLabel.py

Lines changed: 2 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
from sqlalchemy.dialects.mysql import JSON
33
from sqlalchemy.orm import sessionmaker, declarative_base
44

5+
from .utils import get_wikidata_json_by_ids
56
from datetime import datetime, timedelta
6-
import requests
77
import os
88
import json
99

@@ -253,38 +253,7 @@ def _get_labels_wdapi(ids):
253253
Returns:
254254
- dict: A dictionary mapping each ID to its labels.
255255
"""
256-
entities_data = {}
257-
258-
if isinstance(ids, str):
259-
ids = ids.split('|')
260-
ids = list(set(ids)) # Ensure unique IDs
261-
262-
# Wikidata API has a limit on the number of IDs per request, typically 50 for wbgetentities.
263-
for chunk_idx in range(0, len(ids), 50):
264-
265-
ids_chunk = ids[chunk_idx:chunk_idx+50]
266-
ids_chunk = "|".join(ids_chunk)
267-
params = {
268-
'action': 'wbgetentities',
269-
'ids': ids_chunk,
270-
'props': 'labels',
271-
'format': 'json',
272-
'origin': '*',
273-
}
274-
headers = {
275-
'User-Agent': 'Wikidata Textifier (embedding@wikimedia.de)'
276-
}
277-
278-
response = requests.get(
279-
"https://www.wikidata.org/w/api.php?",
280-
params=params,
281-
headers=headers,
282-
timeout=REQUEST_TIMEOUT_SECONDS,
283-
)
284-
response.raise_for_status()
285-
chunk_data = response.json().get("entities", {})
286-
entities_data = entities_data | chunk_data
287-
256+
entities_data = get_wikidata_json_by_ids(ids, props="labels")
288257
entities_data = WikidataLabel._compress_labels(entities_data)
289258
return entities_data
290259

src/utils.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
import requests
2+
from requests.adapters import HTTPAdapter
3+
24
import json
35
import html
46
import os
57

68
REQUEST_TIMEOUT_SECONDS = float(os.environ.get("REQUEST_TIMEOUT_SECONDS", "15"))
79

10+
SESSION = requests.Session()
11+
adapter = HTTPAdapter(pool_connections=20, pool_maxsize=20)
12+
SESSION.mount("http://", adapter)
13+
SESSION.mount("https://", adapter)
14+
815
def get_wikidata_ttl_by_id(
916
id,
1017
lang='en',
@@ -25,7 +32,7 @@ def get_wikidata_ttl_by_id(
2532
'User-Agent': 'Wikidata Textifier (embeddings@wikimedia.de)'
2633
}
2734

28-
response = requests.get(
35+
response = SESSION.get(
2936
f"https://www.wikidata.org/wiki/Special:EntityData/{id}.ttl",
3037
params=params,
3138
headers=headers,
@@ -52,7 +59,7 @@ def get_wikidata_json_by_ids(
5259

5360
if isinstance(ids, str):
5461
ids = ids.split('|')
55-
ids = list(set(ids)) # Ensure unique IDs
62+
ids = list(dict.fromkeys(ids)) # Ensure unique IDs while preserving their original order
5663

5764
entities_data = {}
5865

@@ -72,7 +79,7 @@ def get_wikidata_json_by_ids(
7279
'User-Agent': 'Wikidata Textifier (embeddings@wikimedia.de)'
7380
}
7481

75-
response = requests.get(
82+
response = SESSION.get(
7683
"https://www.wikidata.org/w/api.php?",
7784
params=params,
7885
headers=headers,
@@ -116,7 +123,7 @@ def wikidata_time_to_text(value: dict, lang: str = "en"):
116123
},
117124
}
118125

119-
r = requests.post(WIKIBASE_API, data={
126+
r = SESSION.post(WIKIBASE_API, data={
120127
"action": "wbformatvalue",
121128
"format": "json",
122129
"uselang": lang,
@@ -148,7 +155,7 @@ def wikidata_geolocation_to_text(value: dict, lang: str = "en"):
148155
},
149156
}
150157

151-
r = requests.post(WIKIBASE_API, data={
158+
r = SESSION.post(WIKIBASE_API, data={
152159
"action": "wbformatvalue",
153160
"format": "json",
154161
"uselang": lang,

0 commit comments

Comments
 (0)