Add FastAPI and Gunicorn

philippesaade-wmde · philippesaade-wmde · commit 4eac53f32420 · 2025-07-29T00:53:28.000+02:00
diff --git a/Dockerfile b/Dockerfile
@@ -13,4 +13,4 @@ RUN uv sync
 COPY --chmod=755 . .
 
 # Container start script
-CMD [ "uv", "run", "main.py" ]
+CMD ["uv", "run", "gunicorn", "main:app", "-k", "uvicorn.workers.UvicornWorker", "-w", "4", "-b", "0.0.0.0:8000"]
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,6 +1,8 @@
 services:
   wdtextifier:
     build: .
+    volumes:
+      - ./data:/workspace/data
     container_name: wdtextifier
     ports:
       - "8000:8000"
diff --git a/main.py b/main.py
@@ -1,11 +1,6 @@
-from typing import Annotated
-import time
-import os
-import traceback
-
-# Import necessary types and classes from FastAPI and other libraries.
 from fastapi import FastAPI, Header, HTTPException, Query, Request
 from fastapi.middleware.cors import CORSMiddleware
+import traceback
 
 from src.WikidataTextifier import WikidataEntity
 
@@ -53,27 +48,31 @@
 )
 async def property_query_route(
     request: Request,
-    id: str = Query(..., example="Q42"),
+    id: str = Query(..., examples="Q42"),
     lang: str = 'en',
-    json: bool = True,
+    json: bool = False,
 ):
     """
     Retrieve a Wikidata item with all labels or textual representations for an LLM.
 
     Args:
         id (str): The Wikidata item ID (e.g., "Q42").
-        json (bool): If True, returns the item in JSON format. Defaults to True.
+        json (bool): If True, returns the item in JSON format.
 
     Returns:
         list: A list of dictionaries containing QIDs and the similarity scores.
     """
     if not id:
         response = "ID is missing"
-        raise HTTPException(status_code=422, detail=response)
+        return HTTPException(status_code=422, detail=response)
 
     try:
         entity = WikidataEntity.from_id(id, lang=lang)
 
+        if not entity:
+            response = "Item not found"
+            return HTTPException(status_code=404, detail=response)
+
         if json:
             results = entity.to_json()
         else:
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,5 +6,8 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "fastapi>=0.116.1",
+    "gunicorn>=23.0.0",
+    "requests>=2.32.4",
     "sqlalchemy>=2.0.41",
+    "uvicorn>=0.35.0",
 ]
diff --git a/src/WikidataLabel.py b/src/WikidataLabel.py
@@ -10,7 +10,7 @@
 """
 SQLite database setup for storing Wikidata labels in all languages.
 """
-TOOL_DATA_DIR = os.environ.get("TOOL_DATA_DIR", "./")
+TOOL_DATA_DIR = os.environ.get("TOOL_DATA_DIR", "./data")
 DATABASE_URL = os.path.join(TOOL_DATA_DIR, 'sqlite_wikidata_labels.db')
 
 engine = create_engine(f'sqlite:///{DATABASE_URL}',
diff --git a/src/WikidataTextifier.py b/src/WikidataTextifier.py
@@ -1,6 +1,6 @@
 
-from .WikidataLabel import WikidataLabel, LazyLabelFactory
-from .utils import get_wikidata_entities_by_ids, get_all_missing_labels_ids, get_lang_val, time_to_text, quantity_to_text
+from .WikidataLabel import LazyLabelFactory
+from .utils import get_wikidata_entities_by_ids, get_lang_val
 from datetime import datetime, date
 from dataclasses import dataclass
 import re
@@ -435,6 +435,9 @@ def from_id(cls, id: str, lang: str = 'en'):
             raise ValueError(f"ID not found.")
 
         entity_dict = entity_dict[id]
+        if 'labels' not in entity_dict:
+            return None
+
         label = get_lang_val(entity_dict['labels'], lang)
         description = get_lang_val(entity_dict['descriptions'], lang)
 
diff --git a/src/utils.py b/src/utils.py
@@ -1,6 +1,4 @@
-from .WikidataLabel import WikidataLabel
 import requests
-from datetime import datetime, date
 
 def get_wikidata_entities_by_ids(
         ids,
@@ -81,171 +79,6 @@ def get_all_missing_labels_ids(data):
 
     return ids_list
 
-def time_to_text(time_data, lang='en'):
-    """
-    Converts Wikidata time data into a human-readable string.
-
-    Parameters:
-    - time_data (dict): A dictionary containing the time string, precision, and calendar model.
-    - lang (str): The language code for the output (currently not supported).
-
-    Returns:
-    - str: A textual representation of the time with appropriate granularity.
-    """
-    if time_data is None:
-        return None
-
-    time_value = time_data['time']
-    precision = time_data['precision']
-    calendarmodel = time_data.get('calendarmodel', 'http://www.wikidata.org/entity/Q1985786')
-
-    # Use regex to parse the time string
-    pattern = r'([+-])(\d{1,16})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z'
-    match = re.match(pattern, time_value)
-
-    if not match:
-        raise ValueError("Malformed time string")
-
-    sign, year_str, month_str, day_str, hour_str, minute_str, second_str = match.groups()
-    year = int(year_str) * (1 if sign == '+' else -1)
-
-    # Convert Julian to Gregorian if necessary
-    if 'Q1985786' in calendarmodel and year > 1 and len(str(abs(year))) <= 4:  # Julian calendar
-        try:
-            month = 1 if month_str == '00' else int(month_str)
-            day = 1 if day_str == '00' else int(day_str)
-            julian_date = date(year, month, day)
-            gregorian_ordinal = julian_date.toordinal() + (datetime(1582, 10, 15).toordinal() - datetime(1582, 10, 5).toordinal())
-            gregorian_date = date.fromordinal(gregorian_ordinal)
-            year, month, day = gregorian_date.year, gregorian_date.month, gregorian_date.day
-        except ValueError:
-            raise ValueError("Invalid date for Julian calendar")
-    else:
-        month = int(month_str) if month_str != '00' else 1
-        day = int(day_str) if day_str != '00' else 1
-
-    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
-    month_str = months[month - 1] if month != 0 else ''
-    era = 'AD' if year > 0 else 'BC'
-
-    if precision == 14:
-        return f"{year} {month_str} {day} {hour_str}:{minute_str}:{second_str}"
-    elif precision == 13:
-        return f"{year} {month_str} {day} {hour_str}:{minute_str}"
-    elif precision == 12:
-        return f"{year} {month_str} {day} {hour_str}:00"
-    elif precision == 11:
-        return f"{day} {month_str} {year}"
-    elif precision == 10:
-        return f"{month_str} {year}"
-    elif precision == 9:
-        return f"{abs(year)} {era}"
-    elif precision == 8:
-        decade = (year // 10) * 10
-        return f"{abs(decade)}s {era}"
-    elif precision == 7:
-        century = (abs(year) - 1) // 100 + 1
-        return f"{century}th century {era}"
-    elif precision == 6:
-        millennium = (abs(year) - 1) // 1000 + 1
-        return f"{millennium}th millennium {era}"
-    elif precision == 5:
-        tens_of_thousands = abs(year) // 10000
-        return f"{tens_of_thousands} ten thousand years {era}"
-    elif precision == 4:
-        hundreds_of_thousands = abs(year) // 100000
-        return f"{hundreds_of_thousands} hundred thousand years {era}"
-    elif precision == 3:
-        millions = abs(year) // 1000000
-        return f"{millions} million years {era}"
-    elif precision == 2:
-        tens_of_millions = abs(year) // 10000000
-        return f"{tens_of_millions} tens of millions of years {era}"
-    elif precision == 1:
-        hundreds_of_millions = abs(year) // 100000000
-        return f"{hundreds_of_millions} hundred million years {era}"
-    elif precision == 0:
-        billions = abs(year) // 1000000000
-        return f"{billions} billion years {era}"
-    else:
-        raise ValueError(f"Unknown precision value {precision}")
-
-
-def quantity_to_text(quantity_data, labels={}, lang='en'):
-    """
-    Converts Wikidata quantity data into a human-readable string.
-
-    Parameters:
-    - quantity_data (dict): A dictionary with 'amount' and optionally 'unit' (often a QID).
-    - labels (dict): A dictionary mapping QIDs to their labels, previously fetched.
-    - lang (str): The language code for the output.
-
-    Returns:
-    - str: A textual representation of the quantity (e.g., "5 kg").
-    """
-    if quantity_data is None:
-        return None
-
-    quantity = quantity_data.get('amount')
-    unit = quantity_data.get('unit')
-
-    # 'unit' of '1' means that the value is a count and doesn't require a unit.
-    if unit == '1':
-        unit = None
-    else:
-        unit_qid = unit.rsplit('/')[-1]
-        if unit_qid in labels:
-            unit = labels[unit_qid]
-        else:
-            unit = WikidataLabel.get_labels(unit_qid)
-        unit = get_lang_val(unit, lang=lang)
-
-    return quantity + (f" {unit}" if unit else "")
-
-
-def globalcoordinate_to_text(coor_data, lang='en'):
-    """
-    Convert a single decimal degree value to DMS with hemisphere suffix.
-    `hemi_pair` is ("N", "S") for latitude or ("E", "W") for longitude.
-
-    Parameters:
-    - coor_data (dict): A dictionary with 'latitude' and 'longitude' keys.
-    - lang (str): The language code for the output (currently not supported).
-
-    Returns:
-    - str: A string representation of the coordinates in DMS format.
-    """
-
-    latitude = abs(coor_data['latitude'])
-    hemi = 'N' if coor_data['latitude'] >= 0 else 'S'
-
-    degrees = int(latitude)
-    minutes_full = (latitude - degrees) * 60
-    minutes = int(minutes_full)
-    seconds = (minutes_full - minutes) * 60
-
-    # Round to-tenth of a second, drop trailing .0
-    seconds = round(seconds, 1)
-    seconds_str = f"{seconds}".rstrip("0").rstrip(".")
-
-    lat_str = f"{degrees}°{minutes}'{seconds_str}\"{hemi}"
-
-    longitude = abs(coor_data['longitude'])
-    hemi = 'E' if coor_data['longitude'] >= 0 else 'W'
-
-    degrees = int(longitude)
-    minutes_full = (longitude - degrees) * 60
-    minutes = int(minutes_full)
-    seconds = (minutes_full - minutes) * 60
-
-    # Round to-tenth of a second, drop trailing .0
-    seconds = round(seconds, 1)
-    seconds_str = f"{seconds}".rstrip("0").rstrip(".")
-
-    lon_str = f"{degrees}°{minutes}'{seconds_str}\"{hemi}"
-
-    return f'{lat_str}, {lon_str}'
-
 def get_lang_val(data, lang='en'):
     """
     Extracts the value for a given language from a dictionary of labels.
diff --git a/test.ipynb b/test.ipynb
diff --git a/uv.lock b/uv.lock

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -6,5 +6,8 @@ readme = "README.md"` |
| `6` | `6` | `requires-python = ">=3.13"` |
| `7` | `7` | `dependencies = [` |
| `8` | `8` | `"fastapi>=0.116.1",` |
| | `9` | `+ "gunicorn>=23.0.0",` |
| | `10` | `+ "requests>=2.32.4",` |
| `9` | `11` | `"sqlalchemy>=2.0.41",` |
| | `12` | `+ "uvicorn>=0.35.0",` |
| `10` | `13` | `]` |