Skip to content

Commit eaec3cc

Browse files
committed
initial commit
0 parents  commit eaec3cc

14 files changed

Lines changed: 1485 additions & 0 deletions

.dockerignore

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
.git
2+
.gitignore
3+
AGENTS.md
4+
5+
__pycache__/
6+
*.pyc
7+
*.pyo
8+
*.pyd
9+
.pytest_cache/
10+
.mypy_cache/
11+
.ruff_cache/
12+
13+
.venv/
14+
venv/
15+
env/
16+
17+
.DS_Store
18+
*.log
19+
*.tmp
20+
*.swp
21+
22+
README.md

.github/dependabot.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# Dependabot configuration: one grouped, monthly update PR per ecosystem.
version: 2
updates:
  # Python packages (requirements.txt at the repository root).
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "monthly"
    open-pull-requests-limit: 1
    groups:
      monthly-python:
        patterns:
          - "*"
    commit-message:
      prefix: "deps"

  # Actions referenced by the workflows under .github/workflows.
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"
    open-pull-requests-limit: 1
    groups:
      monthly-actions:
        patterns:
          - "*"
    commit-message:
      prefix: "deps"

  # Base image of the root Dockerfile. It is pinned by digest, so without this
  # entry nothing would ever propose a refreshed (patched) base image.
  - package-ecosystem: "docker"
    directory: "/"
    schedule:
      interval: "monthly"
    open-pull-requests-limit: 1
    groups:
      monthly-docker:
        patterns:
          - "*"
    commit-message:
      prefix: "deps"

.github/workflows/ci.yml

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
# NOTE: the Docker Publish workflow is triggered via `workflow_run` on the
# workflow named "CI"; keep this name in sync with that trigger.
name: CI

on:
  pull_request:
  push:
    branches:
      - "**"

permissions:
  contents: read

jobs:
  unit-tests:
    name: Run unit tests
    runs-on: ubuntu-latest
    # Bound the job like the smoke test below; without this a hung test would
    # hold the runner until GitHub's default job timeout.
    timeout-minutes: 15
    steps:
      - name: Check out repository
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"
          cache: "pip"

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Execute unit test suite
        run: python -m unittest discover -s tests -p "test_*.py" -v

  smoke-test:
    name: Run docker smoke test
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Check out repository
        uses: actions/checkout@v6

      # `make smoke` runs ./scripts/smoke.sh (see the Makefile).
      - name: Execute smoke test
        run: make smoke
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
# Publishes the multi-arch Docker image after the CI workflow has finished.
name: Docker Publish

on:
  workflow_run:
    workflows: ["CI"]
    types: [completed]

permissions:
  contents: read

env:
  IMAGE_NAME: html2rss/botasaurus-scrape-api

jobs:
  docker-publish:
    name: Build and publish Docker image
    # Publish only when the triggering CI run succeeded, was started by a push
    # (not a pull request), and ran on the main branch.
    if: >-
      github.event.workflow_run.conclusion == 'success' &&
      github.event.workflow_run.event == 'push' &&
      github.event.workflow_run.head_branch == 'main'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository at triggering commit
        uses: actions/checkout@v6
        with:
          # Build exactly the commit CI validated, not the branch tip.
          ref: ${{ github.event.workflow_run.head_sha }}
          # No later step needs git authentication, so do not leave the
          # token in .git/config of a job that also handles registry secrets.
          persist-credentials: false

      # QEMU provides emulation for the non-native platform in the build below.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Log in to Docker Hub
        uses: docker/login-action@v4
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # Pushes two tags: a moving `latest` and an immutable per-commit SHA tag.
      - name: Build and push multi-arch image
        uses: docker/build-push-action@v7
        with:
          context: .
          push: true
          platforms: linux/amd64,linux/arm64
          tags: |
            ${{ env.IMAGE_NAME }}:latest
            ${{ env.IMAGE_NAME }}:${{ github.event.workflow_run.head_sha }}
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.event.workflow_run.head_sha }}
            org.opencontainers.image.title=botasaurus-scrape-api
            org.opencontainers.image.description=Botasaurus scrape API image

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.ruff_cache/
2+
__pycache__/
3+
*.pyc

AGENTS.md

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# AGENTS.md
2+
3+
## Core Rules
4+
5+
- Docker-first and Docker-only unless the user asks otherwise.
6+
- Keep repo focused: stable Botasaurus scrape API wrapper, not generic framework.
7+
8+
## Contract (Do Not Break)
9+
10+
- Endpoints: `GET /health`, `POST /scrape`.
11+
- Stable legacy `/scrape` fields: `url`, `final_url`, `status_code`, `headers`, `html`, `error`, `metadata_error`.
12+
- Additive diagnostics fields (current contract): `request_id`, `attempts`, `strategy_used`, `render_ms`, `blocked_detected`, `challenge_detected`, `error_category`.
13+
- Request options (current contract): `navigation_mode`, `max_retries`, `wait_for_selector`, `wait_timeout_seconds`, `block_images`.
14+
- Error codes:
15+
- `400` validation/resolution failure
16+
- `403` SSRF guardrail block
17+
- `422` request schema validation
18+
- `502` scrape execution failure
19+
- `504` timeout
20+
21+
## Runtime + Browser Constraints
22+
23+
- `POST /scrape` is an async API over synchronous browser work (run in a threadpool).
24+
- Each scrape request must use isolated runtime state:
25+
- request-scoped runtime dir `/tmp/scrape/<request_id>`
26+
- request-scoped browser profile
27+
- no cache/profile/driver reuse across requests
28+
- Cleanup is mandatory in `finally`:
29+
- close browser driver
30+
- delete request runtime dir
31+
- remove in-memory active request id
32+
- Keep request-id collision/invariant guard (`_active_request_ids`) intact.
33+
- `driver.requests.get` metadata is best-effort; metadata failure must not fail HTML success.
34+
- Keep strategy engine behavior:
35+
- `auto` mode attempt order: `google_get` -> `google_get_bypass` -> `get`
36+
- do not alter retry semantics without docs/tests update
37+
- Multi-arch image required:
38+
- all architectures: Chromium install
39+
- keep `/usr/bin/google-chrome` symlink to Chromium for compatibility
40+
- If browser install logic changes, re-verify binary path and Botasaurus startup.
41+
42+
## Safety
43+
44+
- Keep SSRF guardrails: localhost/domain checks and blocked IP classes (loopback/private/link-local/multicast/reserved/unspecified).
45+
- Do not weaken URL validation without explicit request plus docs/tests updates.
46+
47+
## Done Criteria
48+
49+
- Run `make smoke` before finishing.
50+
- `make smoke` must cover build, boot, `/health`, `/scrape` happy path, strategy override, retry path, isolation check, localhost guardrail.
51+
- If API contract, Docker behavior, or error semantics changed, update README in same change.
52+
- Keep commits scoped (infra vs API vs docs).

Dockerfile

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
# syntax=docker/dockerfile:1
# Two-stage build: wheels are built in `builder`, then installed offline into
# the runtime stage. Both stages use the same digest-pinned official Python image.
FROM python:3.12-slim-bookworm@sha256:d97792894a6a4162cae14da44542a83c75e56c77a27b92d58f3f83b7bc961292 AS builder

WORKDIR /build

# Build wheels in an isolated stage (botasaurus dependency is sourced from git),
# so git is only installed here and not in the runtime stage.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt /build/requirements.txt
RUN pip wheel --no-cache-dir --wheel-dir /build/wheels -r /build/requirements.txt

FROM python:3.12-slim-bookworm@sha256:d97792894a6a4162cae14da44542a83c75e56c77a27b92d58f3f83b7bc961292

WORKDIR /app

# Install minimal browser/runtime dependencies for Botasaurus.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        chromium \
        xvfb \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt /app/requirements.txt
# Install strictly from the prebuilt wheels (--no-index). The wheel directory is
# bind-mounted from the builder stage for this step only (requires BuildKit):
# COPYing it in and deleting it in a later RUN would still ship the wheels in
# the COPY layer. The `botasaurus @ git+...` requirement line is replaced by the
# bare package name so pip resolves it from the local wheel.
RUN --mount=type=bind,from=builder,source=/build/wheels,target=/wheels \
    grep -v '^botasaurus @ git+' /app/requirements.txt > /app/requirements.runtime.txt \
    && echo 'botasaurus' >> /app/requirements.runtime.txt \
    && pip install --no-cache-dir --no-index --find-links /wheels -r /app/requirements.runtime.txt \
    && rm -f /app/requirements.runtime.txt

# Copy application code (after dependency install so code edits keep the
# dependency layers cached).
COPY . /app

# Keep both paths available; Botasaurus integrations often look for google-chrome.
ENV CHROME_BIN=/usr/bin/chromium
RUN ln -sf /usr/bin/chromium /usr/bin/google-chrome

# Run as unprivileged user; /app is handed over to that user.
RUN useradd --create-home --shell /usr/sbin/nologin appuser \
    && chown -R appuser:appuser /app
USER appuser

EXPOSE 4010
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "4010"]

Makefile

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
# Developer entry points for the Docker-based scrape API.
# Every knob below can be overridden on the command line,
# e.g. `make serve PORT=8080` or `make health BASE_URL=http://host:4010`.
.PHONY: build serve health scrape-example smoke

IMAGE ?= botasaurus-api
PORT ?= 4010
BASE_URL ?= http://localhost:$(PORT)

# Build the image from the Dockerfile in this directory (default goal).
build:
	docker build -t $(IMAGE) .

# Build, then run the image in the foreground; host $(PORT) maps to
# container port 4010.
serve: build
	docker run --rm -p $(PORT):4010 $(IMAGE)

# Probe GET /health. -f makes curl exit non-zero on an HTTP error status and
# -S keeps connection errors visible, so a failing service fails the target.
health:
	curl -fsS "$(BASE_URL)/health"

# POST a sample scrape request. No -f here on purpose: the response body is
# still printed when the API answers with an HTTP error status.
scrape-example:
	curl -sS -X POST "$(BASE_URL)/scrape" \
		-H 'Content-Type: application/json' \
		-d '{"url":"https://example.com"}'

# Run the smoke-test script (also invoked by CI via `make smoke`).
smoke:
	./scripts/smoke.sh

0 commit comments

Comments
 (0)