Skip to content

Commit 085cf0c

Browse files
Major update: the textifier now runs a small Wikibase instance to properly transform time values into natural language in all 300+ languages. The textifier now also processes TTL in addition to JSON.
1 parent fd3436d commit 085cf0c

14 files changed

Lines changed: 1725 additions & 882 deletions

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
# SQLite Database
1+
# Databases
22
*.db
3+
./data/*
34

45
# Byte-compiled / optimized / DLL files
56
__pycache__/

Dockerfile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,9 @@ RUN uv sync
1313
COPY --chmod=755 . .
1414

1515
# Container start script
16-
CMD ["uv", "run", "gunicorn", "main:app", "-k", "uvicorn.workers.UvicornWorker", "-w", "4", "-b", "0.0.0.0:5000"]
16+
CMD ["uv", "run", "gunicorn", "main:app", \
17+
"--timeout", "300", \
18+
"--graceful-timeout", "30", \
19+
"-k", "uvicorn.workers.UvicornWorker", \
20+
"-w", "1", \
21+
"-b", "0.0.0.0:5000"]

docker-compose.yml

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,80 @@
11
services:
2+
3+
# Database for Wikibase & for caching the labels for the Textifier
4+
db:
5+
container_name: db
6+
image: mariadb:10.11
7+
environment:
8+
MARIADB_DATABASE: ${DB_NAME_WIKI}
9+
MARIADB_USER: ${DB_USER}
10+
MARIADB_PASSWORD: ${DB_PASS}
11+
MARIADB_ROOT_USER: ${DB_USER}
12+
MARIADB_ROOT_PASSWORD: ${DB_PASS}
13+
volumes:
14+
- ./data/mysql:/var/lib/mysql
15+
- ./docker-entrypoint-initdb:/docker-entrypoint-initdb.d
16+
ports:
17+
- "3306:3306"
18+
healthcheck:
19+
test: ["CMD-SHELL", "mariadb-admin ping -h 127.0.0.1 -u root -p$${MARIADB_ROOT_PASSWORD} --silent"]
20+
interval: 5s
21+
timeout: 5s
22+
retries: 30
23+
start_period: 20s
24+
25+
# Wikibase instance only used for formatting time datatypes
26+
wikibase:
27+
container_name: wikibase
28+
image: wikibase/wikibase:mw1.44.0
29+
depends_on:
30+
db:
31+
condition: service_healthy
32+
environment:
33+
DB_SERVER: db
34+
DB_NAME: ${DB_NAME_WIKI}
35+
DB_USER: ${DB_USER}
36+
DB_PASS: ${DB_PASS}
37+
38+
MW_ADMIN_NAME: ${MW_ADMIN_NAME}
39+
MW_ADMIN_PASS: ${MW_ADMIN_PASS}
40+
MW_ADMIN_EMAIL: ${MW_ADMIN_EMAIL}
41+
42+
MW_SITE_LANG: en
43+
MW_SITE_NAME: "Formatter"
44+
MW_WG_SERVER: "http://wikibase"
45+
46+
METADATA_CALLBACK: "false"
47+
volumes:
48+
- ./data/config:/config
49+
tmpfs:
50+
- /tmp
51+
- /var/tmp
52+
- /run
53+
- /var/cache
54+
- /var/log
55+
healthcheck:
56+
test: ["CMD-SHELL", "curl -fsS http://localhost/w/api.php?action=query&meta=siteinfo&format=json > /dev/null"]
57+
interval: 10s
58+
timeout: 5s
59+
retries: 30
60+
start_period: 60s
61+
62+
# Textifier
263
wdtextifier:
64+
container_name: wdtextifier
365
build: .
66+
depends_on:
67+
db:
68+
condition: service_healthy
69+
wikibase:
70+
condition: service_healthy
71+
environment:
72+
WIKIBASE_HOST: wikibase
73+
DB_HOST: db
74+
DB_NAME: ${DB_NAME_LABEL}
75+
DB_USER: ${DB_USER}
76+
DB_PASS: ${DB_PASS}
477
volumes:
578
- ./data:/workspace/data
6-
container_name: wdtextifier
779
ports:
8-
- "5000:5000"
80+
- "5000:5000"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/sh
2+
set -eu
3+
4+
: "${DB_NAME_LABEL:=label}"
5+
6+
mariadb -uroot -p"$MARIADB_ROOT_PASSWORD" <<SQL
7+
CREATE DATABASE IF NOT EXISTS \`$DB_NAME_LABEL\`
8+
CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
9+
10+
GRANT ALL PRIVILEGES ON \`$DB_NAME_LABEL\`.* TO '$MARIADB_USER'@'%';
11+
FLUSH PRIVILEGES;
12+
SQL
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/sh
2+
set -eu
3+
4+
: "${DB_NAME_WIKI:=wiki}"
5+
6+
mariadb -uroot -p"$MARIADB_ROOT_PASSWORD" <<SQL
7+
CREATE DATABASE IF NOT EXISTS \`$DB_NAME_WIKI\`
8+
CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
9+
10+
GRANT ALL PRIVILEGES ON \`$DB_NAME_WIKI\`.* TO '$MARIADB_USER'@'%';
11+
FLUSH PRIVILEGES;
12+
SQL

main.py

Lines changed: 60 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
from fastapi import FastAPI, Header, HTTPException, Query, Request
1+
from fastapi import FastAPI, HTTPException, Query, Request
22
from fastapi.middleware.cors import CORSMiddleware
33
from fastapi import BackgroundTasks
44
import traceback
55

6-
from src.WikidataTextifier import WikidataEntity
6+
from src.Normalizer import TTLNormalizer, JSONNormalizer
77
from src.WikidataLabel import WikidataLabel
88
from src import utils
99

@@ -82,51 +82,78 @@ async def get_textified_wd(
8282
response = "ID is missing"
8383
return HTTPException(status_code=422, detail=response)
8484

85-
filter_pids = None
85+
filter_pids = []
8686
if pid:
8787
filter_pids = [p.strip() for p in pid.split(',')]
8888

8989
qids = [q.strip() for q in id.split(',')]
90-
entity_dict = utils.get_wikidata_entities_by_ids(qids)
9190

92-
if not entity_dict:
93-
response = "ID not found"
94-
return HTTPException(status_code=404, detail=response)
91+
entities = {}
92+
if len(qids) == 1:
93+
# When one QID is requested, TTL is used
94+
entity_data = utils.get_wikidata_ttl_by_id(qids[0], lang=lang)
95+
if not entity_data:
96+
response = "ID not found"
97+
return HTTPException(status_code=404, detail=response)
9598

96-
return_data = {}
97-
for id in qids:
98-
if id in entity_dict:
99-
entity = WikidataEntity.from_wd(
100-
entity_dict[id],
101-
id=id,
102-
lang=lang,
99+
entity_data = TTLNormalizer(
100+
entity_id=qids[0],
101+
ttl_text=entity_data,
102+
lang=lang,
103+
fallback_lang=fallback_lang,
104+
debug=False,
105+
)
106+
107+
entities = {
108+
qids[0]: entity_data.normalize(
103109
external_ids=external_ids,
104110
all_ranks=all_ranks,
105111
references=references,
106-
filter_pids=filter_pids,
107-
fallback_lang=fallback_lang
112+
filter_pids=filter_pids
108113
)
114+
}
115+
else:
116+
# JSON is used with Action API for bulk retrieval
117+
entity_data = utils.get_wikidata_json_by_ids(qids)
118+
if not entity_data:
119+
response = "IDs not found"
120+
return HTTPException(status_code=404, detail=response)
109121

110-
if not entity:
111-
return_data[id] = None
112-
continue
113-
114-
if format == 'text':
115-
results = str(entity)
116-
elif format == 'triplet':
117-
results = entity.to_triplet()
118-
else:
119-
results = entity.to_json()
122+
entity_data = {
123+
qid: JSONNormalizer(
124+
entity_id=qid,
125+
entity_json=entity_data[qid],
126+
lang=lang,
127+
fallback_lang=fallback_lang,
128+
debug=False,
129+
) if entity_data.get(qid) else None
130+
for qid in qids
131+
}
132+
133+
entities = {
134+
qid: entity.normalize(
135+
external_ids=external_ids,
136+
all_ranks=all_ranks,
137+
references=references,
138+
filter_pids=filter_pids
139+
) if entity else None
140+
for qid, entity in entity_data.items()
141+
}
120142

121-
return_data[id] = results
143+
return_data = {}
144+
for qid, entity in entities.items():
145+
if not entity:
146+
return_data[qid] = None
147+
continue
148+
149+
if format == 'text':
150+
results = str(entity)
151+
elif format == 'triplet':
152+
results = entity.to_triplet()
122153
else:
123-
return_data[id] = None
154+
results = entity.to_json()
124155

125-
if len(qids) == 1:
126-
return_data = return_data[qids[0]]
127-
if not return_data:
128-
response = "Item not found"
129-
return HTTPException(status_code=404, detail=response)
156+
return_data[qid] = results
130157

131158
background_tasks.add_task(WikidataLabel.delete_old_labels)
132159
return return_data

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ description = "Transforms Wikidata entities into text representations."
55
readme = "README.md"
66
requires-python = ">=3.13"
77
dependencies = [
8-
"babel>=2.17.0",
98
"fastapi>=0.116.1",
109
"gunicorn>=23.0.0",
11-
"python-dateutil>=2.9.0.post0",
10+
"pymysql>=1.1.2",
11+
"rdflib>=7.5.0",
1212
"requests>=2.32.4",
1313
"sqlalchemy>=2.0.41",
1414
"uvicorn>=0.35.0",

0 commit comments

Comments
 (0)