Skip to content

Commit 0682185

Browse files
Add description and aliases in triplet form
1 parent af66815 commit 0682185

5 files changed

Lines changed: 55 additions & 5 deletions

File tree

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,4 +13,4 @@ RUN uv sync
1313
COPY --chmod=755 . .
1414

1515
# Container start script
16-
CMD ["uv", "run", "gunicorn", "main:app", "-k", "uvicorn.workers.UvicornWorker", "-w", "4", "-b", "0.0.0.0:8000"]
16+
CMD ["uv", "run", "gunicorn", "main:app", "-k", "uvicorn.workers.UvicornWorker", "-w", "4", "-b", "0.0.0.0:5000"]

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,4 +5,4 @@ services:
55
- ./data:/workspace/data
66
container_name: wdtextifier
77
ports:
8-
- "8000:8000"
8+
- "5000:5000"

main.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
from fastapi import FastAPI, Header, HTTPException, Query, Request
22
from fastapi.middleware.cors import CORSMiddleware
3+
from fastapi import BackgroundTasks
34
import traceback
45

56
from src.WikidataTextifier import WikidataEntity
7+
from src.WikidataLabel import WikidataLabel
68

79
# Start Fastapi app
810
app = FastAPI(
@@ -46,8 +48,8 @@
4648
},
4749
},
4850
)
49-
async def property_query_route(
50-
request: Request,
51+
async def get_labels(
52+
request: Request, background_tasks: BackgroundTasks,
5153
id: str = Query(..., examples="Q42"),
5254
pid: str = Query(None, examples="P31,P279"),
5355
lang: str = 'en',
@@ -103,6 +105,7 @@ async def property_query_route(
103105
response = "Invalid format specified"
104106
return HTTPException(status_code=422, detail=response)
105107

108+
background_tasks.add_task(WikidataLabel.delete_old_labels)
106109
return results
107110
except Exception as e:
108111
traceback.print_exc()

src/WikidataLabel.py

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,48 @@ def get_bulk_labels(ids):
175175

176176
return labels
177177

178+
@staticmethod
def delete_old_labels(max_age_days=90, max_rows=10_000_000):
    """
    Prune the ``labels`` table, first by age and then by total size.

    Rows whose ``date_added`` is older than ``max_age_days`` days are
    deleted. If more than ``max_rows`` rows remain afterwards, the oldest
    rows (ordered by ``date_added``) are deleted until the table is back
    down to ``max_rows`` rows.

    Args:
        max_age_days: Age limit in days; rows older than this are
            deleted. Defaults to 90.
        max_rows: Maximum number of rows to keep in the table.
            Defaults to 10 million.

    Returns:
        True if the cleanup completed. False if any step raised; in that
        case the session is rolled back and the error is printed rather
        than propagated, so a caller running this as a best-effort
        background job is never interrupted by it.
    """
    with Session() as session:
        try:
            # Step 1: delete labels older than the age limit.
            # The cutoff is compared as a 'YYYY-MM-DD HH:MM:SS' string
            # built from the local, naive clock.
            # NOTE(review): assumes date_added is stored in that same
            # format/timezone -- confirm against the table definition.
            date_limit = (
                datetime.now() - timedelta(days=max_age_days)
            ).strftime('%Y-%m-%d %H:%M:%S')
            session.execute(
                text("DELETE FROM labels WHERE date_added < :date_limit"),
                {"date_limit": date_limit}
            )
            session.commit()

            # Step 2: enforce the row cap on whatever survived step 1.
            total_count = session.execute(
                text("SELECT COUNT(*) FROM labels")
            ).scalar()

            if total_count > max_rows:
                # Number of surplus rows that have to go.
                rows_to_delete = total_count - max_rows

                # Delete the oldest rows by date_added. The statement is a
                # plain string with a bound parameter; nothing is
                # interpolated into the SQL text.
                # NOTE(review): every row sharing an id with a selected
                # row is removed, so more than rows_to_delete rows can be
                # deleted if id is not unique -- confirm the schema.
                session.execute(text("""
                    DELETE FROM labels
                    WHERE id IN (
                        SELECT id FROM labels
                        ORDER BY date_added ASC
                        LIMIT :rows_to_delete
                    )
                """), {"rows_to_delete": rows_to_delete})

                session.commit()

            return True
        except Exception as e:
            # Best-effort cleanup: undo the partial step and report the
            # failure through the return value instead of raising.
            session.rollback()
            print(f"Error while deleting old labels: {e}")
            return False
219+
178220
@staticmethod
179221
def _get_labels_wdapi(ids):
180222
"""

src/WikidataTextifier.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -552,7 +552,12 @@ def to_json(self):
552552

553553
def to_triplet(self):
554554
label = f"{str(self.label)} ({self.id})"
555-
attributes = [c.to_triplet() for c in self.claims if c]
555+
attributes = []
556+
if self.description:
557+
attributes.append(f"description: {self.description}")
558+
if self.aliases:
559+
attributes.append(f"aliases: {', '.join(map(str, self.aliases))}")
560+
attributes = [*attributes, *[c.to_triplet() for c in self.claims if c]]
556561

557562
if len(attributes) > 0:
558563
attributes = "\n".join(attributes).split("\n")

0 commit comments

Comments
 (0)