diff --git a/examples/deployment/docker-compose.yml b/examples/deployment/docker-compose.yml index e22e093c..3774a619 100644 --- a/examples/deployment/docker-compose.yml +++ b/examples/deployment/docker-compose.yml @@ -1,12 +1,17 @@ services: html2rss-web: - image: html2rss/web:latest + image: html2rss/web:1 restart: unless-stopped env_file: - path: .env required: false environment: PORT: 4000 + BOTASAURUS_SCRAPER_URL: http://botasaurus:4010 + + botasaurus: + image: html2rss/botasaurus-scrape-api:latest + restart: unless-stopped caddy: image: caddy:2-alpine @@ -30,6 +35,7 @@ services: depends_on: - html2rss-web - caddy + - botasaurus command: - --cleanup - --interval diff --git a/src/components/docs/DockerComposeSnippet.astro b/src/components/docs/DockerComposeSnippet.astro index 2a37676e..0ced6c48 100644 --- a/src/components/docs/DockerComposeSnippet.astro +++ b/src/components/docs/DockerComposeSnippet.astro @@ -1,6 +1,6 @@ --- import { Code } from "@astrojs/starlight/components"; -import { browserlessImage, caddyImage, watchtowerImage, webImage } from "../../data/docker"; +import { botasaurusImage, browserlessImage, caddyImage, watchtowerImage, webImage } from "../../data/docker"; interface Props { variant: "minimal" | "productionCaddy" | "secure" | "watchtower" | "resourceGuardrails"; @@ -21,13 +21,16 @@ const snippets: Record = { environment: RACK_ENV: production PORT: 4000 - BUILD_TAG: \${BUILD_TAG:-local} - GIT_SHA: \${GIT_SHA:-local} HTML2RSS_SECRET_KEY: \${HTML2RSS_SECRET_KEY:?set HTML2RSS_SECRET_KEY} HEALTH_CHECK_TOKEN: \${HEALTH_CHECK_TOKEN:?set HEALTH_CHECK_TOKEN} SENTRY_DSN: \${SENTRY_DSN:-} BROWSERLESS_IO_WEBSOCKET_URL: ws://browserless:4002 BROWSERLESS_IO_API_TOKEN: \${BROWSERLESS_IO_API_TOKEN:?set BROWSERLESS_IO_API_TOKEN} + BOTASAURUS_SCRAPER_URL: http://botasaurus:4010 + + botasaurus: + image: ${botasaurusImage} + restart: unless-stopped browserless: image: "${browserlessImage}" @@ -64,13 +67,16 @@ const snippets: Record = { environment: RACK_ENV: 
production PORT: 4000 - BUILD_TAG: \${BUILD_TAG:-local} - GIT_SHA: \${GIT_SHA:-local} HTML2RSS_SECRET_KEY: \${HTML2RSS_SECRET_KEY:?set HTML2RSS_SECRET_KEY} HEALTH_CHECK_TOKEN: \${HEALTH_CHECK_TOKEN:?set HEALTH_CHECK_TOKEN} SENTRY_DSN: \${SENTRY_DSN:-} BROWSERLESS_IO_WEBSOCKET_URL: ws://browserless:4002 BROWSERLESS_IO_API_TOKEN: \${BROWSERLESS_IO_API_TOKEN:?set BROWSERLESS_IO_API_TOKEN} + BOTASAURUS_SCRAPER_URL: http://botasaurus:4010 + + botasaurus: + image: ${botasaurusImage} + restart: unless-stopped browserless: image: "${browserlessImage}" @@ -92,13 +98,16 @@ volumes: environment: RACK_ENV: production PORT: 4000 - BUILD_TAG: \${BUILD_TAG:-local} - GIT_SHA: \${GIT_SHA:-local} HTML2RSS_SECRET_KEY: \${HTML2RSS_SECRET_KEY:?set HTML2RSS_SECRET_KEY} HEALTH_CHECK_TOKEN: \${HEALTH_CHECK_TOKEN:?set HEALTH_CHECK_TOKEN} SENTRY_DSN: \${SENTRY_DSN:-} BROWSERLESS_IO_WEBSOCKET_URL: ws://browserless:4002 BROWSERLESS_IO_API_TOKEN: \${BROWSERLESS_IO_API_TOKEN:?set BROWSERLESS_IO_API_TOKEN} + BOTASAURUS_SCRAPER_URL: http://botasaurus:4010 + + botasaurus: + image: ${botasaurusImage} + restart: unless-stopped browserless: image: "${browserlessImage}" @@ -115,7 +124,7 @@ volumes: - /var/run/docker.sock:/var/run/docker.sock:ro # Optional for private registries only: # - "\${HOME}/.docker/config.json:/config.json:ro" - command: --cleanup --interval 7200 html2rss-web browserless caddy`, + command: --cleanup --interval 7200 html2rss-web botasaurus browserless caddy`, resourceGuardrails: `services: html2rss-web: image: ${webImage} diff --git a/src/content/docs/common-use-cases.mdx b/src/content/docs/common-use-cases.mdx index 27e10760..4f58cc49 100644 --- a/src/content/docs/common-use-cases.mdx +++ b/src/content/docs/common-use-cases.mdx @@ -1,51 +1,47 @@ --- title: "Common Use Cases" -description: "See how people use html2rss to stay updated with their favorite websites. Real examples for personal and business use cases." 
+description: "Use html2rss for common tracking and monitoring workflows." --- -Discover how people are using html2rss to take control of their web content consumption. These real-world examples show the power and flexibility of creating custom RSS feeds. - ---- +Use html2rss when you want updates in a reader instead of checking websites by hand. ## Personal Use Cases ### Following Your Favorite Bloggers -Many bloggers don't offer RSS feeds, but you can create them with html2rss. Follow writers you love without relying on social media algorithms. +Many blogs and creator sites do not publish feeds. -**Example:** Create a feed for a personal blog that only posts to social media. +**Example:** Follow a newsroom, company blog, or publication section from your own `html2rss-web` deployment. ### Job Hunting Track job postings from multiple company websites in one place. Never miss an opportunity again. -**Example:** Follow job boards, company career pages, and industry-specific job sites. +**Example:** Track a company careers page or a narrower role-specific listing. ### Local News Follow your local newspaper or community website to stay informed about your neighborhood. -**Example:** Create feeds for local news sites, community forums, and city government updates. +**Example:** Subscribe to local news sites, community forums, and city government updates from one reader. ### Academic Research Follow new papers and research in your field from multiple sources. -**Example:** Track arXiv submissions, journal publications, and conference proceedings. +**Example:** Track publication pages, research blogs, and conference updates. ### Product Updates Get notified when software you use releases updates, new features, or security patches. -**Example:** Follow product blogs, changelog pages, and release notes. +**Example:** Track release notes, changelog pages, and product blogs. ### Hobby Communities Follow forums, communities, and websites related to your hobbies and interests. 
-**Example:** Track gaming forums, photography communities, or cooking blogs. - ---- +**Example:** Track gaming forums, photography communities, or cooking blogs without manually checking each site. ## Business Use Cases @@ -59,21 +55,19 @@ Track what your competitors are posting about - new products, features, or annou Follow multiple industry publications in one feed to stay ahead of trends. -**Example:** Aggregate news from industry blogs, trade publications, and thought leaders. +**Example:** Aggregate trade publications, company blogs, and research updates in one reader. ### Customer Support Monitor customer feedback and support requests across different platforms. -**Example:** Track support forums, review sites, and social media mentions. +**Example:** Track support forums, review sites, and product-update pages that affect your users. ### Content Marketing Follow industry influencers and competitors for content inspiration. -**Example:** Track competitor blogs, industry newsletters, and thought leadership content. - ---- +**Example:** Track competitor blogs, industry newsletters, and thought leadership content in one place. ## Technical Use Cases @@ -95,20 +89,8 @@ Follow multiple open source projects and their updates. **Example:** Track project blogs, release notes, and community discussions. ---- - -## Getting Started with Your Use Case - -1. **Identify the websites** you want to follow -2. **Check our [Feed Directory](/feed-directory/)** to see if feeds already exist -3. **Try the [Web App](/web-application/getting-started)** to create feeds easily -4. **Learn advanced techniques** with our [Config Guide](/creating-custom-feeds/) - ---- - -## Need Help? 
+## Next Steps -- **Can't find what you're looking for?** [Browse our Feed Directory](/feed-directory/) -- **Want to create custom feeds?** [Try the Web App](/web-application/getting-started) -- **Need advanced features?** [Check our Ruby Gem docs](/ruby-gem/) -- **Have questions?** [Join our community discussions](https://github.com/orgs/html2rss/discussions) +- **[Run html2rss-web with Docker](/web-application/getting-started)** to verify your own instance. +- **[Use automatic feed generation](/web-application/how-to/use-automatic-feed-generation/)** when you want direct page-URL conversion. +- **[Create custom feeds](/creating-custom-feeds/)** when you need stable, reviewable extraction rules. diff --git a/src/content/docs/creating-custom-feeds.mdx b/src/content/docs/creating-custom-feeds.mdx index 24a38fed..0d782f8d 100644 --- a/src/content/docs/creating-custom-feeds.mdx +++ b/src/content/docs/creating-custom-feeds.mdx @@ -7,22 +7,13 @@ sidebar: import { Aside, Code } from "@astrojs/starlight/components"; -When auto-sourcing isn't enough, you can write your own configuration files to create custom RSS feeds for any website. This guide shows you how to take full control with YAML configs. +When existing feeds or auto-sourcing are not enough, write a YAML config for the site you want to follow. **Prerequisites:** You should be familiar with the [Getting Started](/getting-started) guide before diving into custom configurations. - - --- @@ -37,10 +28,6 @@ When auto-sourcing isn't enough, you can write your own configuration files to c - **The website has complex structure** that requires custom selectors - **You want to combine data** from multiple sources -**Don't need custom configs?** Check the [Feed Directory](/feed-directory/) first - there might already be a working feed for your website. - ---- - ## Recommended Workflow 1. 
**Inspect the live page** in your browser developer tools @@ -48,7 +35,7 @@ When auto-sourcing isn't enough, you can write your own configuration files to c 3. **Validate the config** with `html2rss validate your-config.yml` 4. **Render the feed** with `html2rss feed your-config.yml` 5. **Add it to `html2rss-web`** so you can use it through your normal instance -6. **Escalate to `browserless`** if the content is rendered by JavaScript +6. **Escalate request strategy when needed**: use a browser-based rendering strategy only when troubleshooting requires it This order keeps iteration fast and makes it easier to see whether the problem is the page structure, your selectors, or the fetch strategy. @@ -210,7 +197,7 @@ there. - **No items found?** Check your selectors with browser tools (F12) - the `items.selector` might not match the page structure - **Invalid YAML?** Use spaces, not tabs, and ensure proper indentation - **Website not loading?** Check the URL and try accessing it in your browser -- **Missing content?** Some websites load content with JavaScript - you may need to use the `browserless` strategy +- **Missing content?** Try a browser-based rendering strategy during troubleshooting - **Wrong data extracted?** Verify your selectors are pointing to the right elements **Need more help?** See our [comprehensive troubleshooting guide](/troubleshooting/troubleshooting) or ask in [GitHub Discussions](https://github.com/orgs/html2rss/discussions). @@ -225,7 +212,6 @@ there. **For Beginners:** -- **[Browse the Feed Directory](/feed-directory/)** - See real-world examples - **[Run html2rss-web with Docker](/web-application/getting-started)** - Use the newest integrated behavior - **[Learn more about selectors](/ruby-gem/reference/selectors/)** - Master CSS selectors - **[Submit your config via GitHub Web](https://github.com/html2rss/html2rss-configs)** - No Git knowledge required! @@ -234,5 +220,5 @@ there. 
- **[Browse existing configs](https://github.com/html2rss/html2rss-configs/tree/master/lib/html2rss/configs)** - See real examples - **[Join discussions](https://github.com/orgs/html2rss/discussions)** - Connect with other users -- **[Learn about strategies](/ruby-gem/reference/strategy/)** - Decide when to use `browserless` +- **[Learn about strategies](/ruby-gem/reference/strategy/)** - Decide when to use static vs JavaScript/browser-based extraction - **[Learn advanced features](/ruby-gem/how-to/advanced-features/)** - Take your configs to the next level diff --git a/src/content/docs/getting-started.mdx b/src/content/docs/getting-started.mdx index d0061e77..aa60c728 100644 --- a/src/content/docs/getting-started.mdx +++ b/src/content/docs/getting-started.mdx @@ -1,6 +1,6 @@ --- title: "Getting Started" -description: "Start html2rss-web locally, verify a working included feed from your self-hosted instance, and decide when to enable automatic generation or move to custom configs." +description: "Start html2rss-web locally, verify one feed, and decide when to enable automatic generation or move to custom configs." 
sidebar: order: 1 --- @@ -17,13 +17,12 @@ That guide is the canonical setup flow for: - running `html2rss-web` locally - confirming the interface is working -- opening a first included feed URL +- opening a known feed URL - deciding when to use automatic generation or custom configs ## Quick Shortcuts - **[Run html2rss-web with Docker](/web-application/getting-started)**: recommended first step -- **[Browse working feed examples](/feed-directory/)**: see what successful outputs look like - **[Use automatic feed generation](/web-application/how-to/use-automatic-feed-generation/)**: enable direct feed creation from a page URL when you want that workflow - **[Create Custom Feeds](/creating-custom-feeds)**: write configs when you need more control - **[Troubleshooting Guide](/troubleshooting/troubleshooting)**: fix startup or extraction problems @@ -34,6 +33,8 @@ If you are working directly with the gem instead of `html2rss-web`, start with: +For strategy behavior and manual overrides, see the [Strategy reference](/ruby-gem/reference/strategy). + If the target site is unusually redirect-heavy or needs extra follow-up requests, the CLI also supports: diff --git a/src/content/docs/index.mdx b/src/content/docs/index.mdx index 8374aca4..a98d87de 100644 --- a/src/content/docs/index.mdx +++ b/src/content/docs/index.mdx @@ -1,9 +1,9 @@ --- title: "Turn Any Website Into an RSS Feed" -description: "Run html2rss-web with Docker, verify a working included feed from your self-hosted instance, then consciously enable automatic generation or move to custom configs when you need more control." +description: "Run html2rss-web with Docker, verify one feed, then enable automatic generation or move to custom configs when you need more control." --- -Run `html2rss-web` with Docker, verify a working included feed from your self-hosted instance, and only then decide whether to enable automatic generation or move to custom configs. 
+Run `html2rss-web` with Docker, verify one feed from your own instance, then decide whether you need automatic generation or custom configs. ## Start Here @@ -13,14 +13,8 @@ That guide is the canonical onboarding flow for: - starting a local instance - verifying the web interface -- opening a first included feed URL -- deciding when to consciously enable automatic generation or move to custom configs - -## How It Works - -1. **Run your own local instance** with Docker -2. **Open a built-in feed URL** from your own instance -3. **Copy the feed URL into your reader** +- opening a known feed URL +- choosing the next path ## What is html2rss? @@ -36,14 +30,13 @@ Most people should start with the web application: ### I want a working instance first 1. **[Run html2rss-web with Docker](/web-application/getting-started)**: recommended starting path -2. **[Use the included configs](/web-application/how-to/use-included-configs/)**: use real embedded feeds from your own instance -3. **[Browse working feed examples](/feed-directory/)**: see what working outputs look like +2. **[Use the included configs](/web-application/how-to/use-included-configs/)**: optional guide for the embedded feed set ### I need more control 1. **[Creating Custom Feeds](/creating-custom-feeds)**: write and test your own configs 2. **[Selectors Reference](/ruby-gem/reference/selectors/)**: learn the matching rules -3. **[Strategy Reference](/ruby-gem/reference/strategy/)**: decide when `browserless` is justified +3. **[Strategy Reference](/ruby-gem/reference/strategy/)**: choose the right extraction strategy for static vs JavaScript-heavy pages ### I'm building or integrating @@ -62,7 +55,7 @@ Most people should start with the web application: ## Practical Notes - Start with Docker, not a public instance. -- Use an included feed to verify the deployment first. +- Verify the deployment with one known feed first. 
- Enable automatic generation only when you want the direct page-URL workflow and are ready to allow it on your self-hosted instance. - Move to custom configs when you need a stable, reviewable setup. diff --git a/src/content/docs/ruby-gem/how-to/advanced-features.mdx b/src/content/docs/ruby-gem/how-to/advanced-features.mdx index ab429c29..f89cc46c 100644 --- a/src/content/docs/ruby-gem/how-to/advanced-features.mdx +++ b/src/content/docs/ruby-gem/how-to/advanced-features.mdx @@ -16,7 +16,7 @@ html2rss uses parallel processing in auto-source discovery. This happens automat 1. **Use appropriate selectors:** More specific selectors reduce processing time 2. **Limit items when possible:** Use CSS selectors that target only the content you need 3. **Cache responses:** The web application caches responses automatically -4. **Choose the right strategy:** Use `faraday` for static content, `browserless` only when JavaScript is required +4. **Choose the right strategy:** Use static HTTP fetching for simple pages, and move to a JavaScript/browser-based extraction strategy when rendering or anti-bot handling is required ## Memory Optimization diff --git a/src/content/docs/ruby-gem/how-to/custom-http-requests.mdx b/src/content/docs/ruby-gem/how-to/custom-http-requests.mdx index e361f410..365ef6ca 100644 --- a/src/content/docs/ruby-gem/how-to/custom-http-requests.mdx +++ b/src/content/docs/ruby-gem/how-to/custom-http-requests.mdx @@ -11,7 +11,7 @@ Keep this structure in mind: - `headers` stays top-level - `strategy` stays top-level -- request-specific controls such as budgets and Browserless options live under `request` +- request-specific controls such as budgets and strategy-specific options live under `request` ## When You Need Custom Headers @@ -74,6 +74,7 @@ Request budgets are configured under `request`, not as top-level keys: - `request.max_redirects` limits redirect hops - `request.max_requests` limits the total request budget for the feed build - `request.browserless.*` 
is reserved for Browserless-only behavior such as preload actions +- `request.botasaurus.*` is reserved for Botasaurus-only behavior such as navigation mode and retries ## Common Use Cases diff --git a/src/content/docs/ruby-gem/how-to/handling-dynamic-content.mdx b/src/content/docs/ruby-gem/how-to/handling-dynamic-content.mdx index f7b739ad..ffae2482 100644 --- a/src/content/docs/ruby-gem/how-to/handling-dynamic-content.mdx +++ b/src/content/docs/ruby-gem/how-to/handling-dynamic-content.mdx @@ -1,15 +1,17 @@ --- title: Handling Dynamic Content -description: "Learn how to handle JavaScript-heavy websites and dynamic content with html2rss. Use browserless strategy for sites that load content dynamically." +description: "Learn how to handle JavaScript-heavy websites and dynamic content with html2rss using browser-based extraction strategies." --- import { Code } from "@astrojs/starlight/components"; -Some websites load their content dynamically using JavaScript. The default `html2rss` strategy might not see this content. +Some websites load their content dynamically using JavaScript. Static fetch paths may not see this content reliably. ## Solution -Use the [`browserless` strategy](/ruby-gem/reference/strategy) to render JavaScript-heavy websites with a headless browser. +Use a [browser-based extraction strategy](/ruby-gem/reference/strategy) when JavaScript-heavy pages do not work with default static fetching. + +`browserless` is common for this workflow, and `botasaurus` is an alternate browser-based strategy when you run a Botasaurus scrape API. 
Keep the strategy at the top level and put request-specific options under `request`: @@ -36,9 +38,9 @@ Keep the strategy at the top level and put request-specific options under `reque lang="yaml" /> -## When to Use Browserless +## When to Use Browser-Based Extraction -The `browserless` strategy is necessary when: +A browser-based extraction strategy is necessary when: - **Content loads after page load** - JavaScript fetches data from APIs - **Single Page Applications (SPAs)** - React, Vue, Angular apps @@ -100,13 +102,13 @@ These preload steps can be combined in a single config when a site needs several ## Performance Considerations -The `browserless` strategy is slower than the default `faraday` strategy because it: +Browser-based extraction is slower than default static HTTP fetching because it: - Launches a headless Chrome browser - Renders the full page with JavaScript - Takes more memory and CPU resources -**Use `faraday` for static content** and only switch to `browserless` when necessary. +**Use static HTTP fetching for static content** and switch to browser-based extraction when needed. See the [Strategy Reference](/ruby-gem/reference/strategy) for concrete transports, defaults, and environment requirements. ## Related Topics diff --git a/src/content/docs/ruby-gem/reference/cli-reference.mdx b/src/content/docs/ruby-gem/reference/cli-reference.mdx index 8e4ba22f..01c77e50 100644 --- a/src/content/docs/ruby-gem/reference/cli-reference.mdx +++ b/src/content/docs/ruby-gem/reference/cli-reference.mdx @@ -22,8 +22,8 @@ Automatically discovers items from a page and prints the generated RSS feed to s + +If you see a Botasaurus configuration error, check: + +- `BOTASAURUS_SCRAPER_URL` is set +- `BOTASAURUS_SCRAPER_URL` is a valid URL +- the Botasaurus scrape API is reachable from the shell environment running `html2rss` + ### Feed Loads a YAML config, builds the feed, and prints the RSS XML to stdout. 
@@ -105,7 +131,9 @@ Loads a YAML config, builds the feed, and prints the RSS XML to stdout. code={` html2rss feed single.yml ; \ html2rss feed feeds.yml my-first-feed ; \ + html2rss feed single.yml --strategy auto ; \ html2rss feed single.yml --strategy browserless ; \ + BOTASAURUS_SCRAPER_URL="http://localhost:4010" html2rss feed single.yml --strategy botasaurus ; \ html2rss feed single.yml --max-redirects 5 --max-requests 6 ; \ html2rss feed single.yml --params id:42 foo:bar `} diff --git a/src/content/docs/ruby-gem/reference/strategy.mdx b/src/content/docs/ruby-gem/reference/strategy.mdx index f35f383b..c8219ccd 100644 --- a/src/content/docs/ruby-gem/reference/strategy.mdx +++ b/src/content/docs/ruby-gem/reference/strategy.mdx @@ -1,18 +1,26 @@ --- title: Strategy -description: "Learn about different strategies for fetching website content with html2rss. Choose between faraday and browserless strategies for optimal performance." +description: "Learn how html2rss chooses request strategies by default with auto fallback, and when to override with faraday, botasaurus, or browserless." --- import { Code } from "@astrojs/starlight/components"; The `strategy` key defines how `html2rss` fetches a website's content. -- **`faraday`** (default): Makes a direct HTTP request. It is fast but does not execute JavaScript. +- **`auto`** (default): Tries concrete strategies in order: `faraday` -> `botasaurus` -> `browserless`. +- **`faraday`**: Makes a direct HTTP request. It is fast but does not execute JavaScript. - **`browserless`**: Renders the website in a headless Chrome browser, which is necessary for JavaScript-heavy sites. +- **`botasaurus`**: Delegates fetching to a Botasaurus scrape API. This is opt-in and requires `BOTASAURUS_SCRAPER_URL`. `strategy` is a top-level config key. Request-specific controls live under `request`. -Use `faraday` first for direct newsroom/listing/changelog pages. 
Prefer `browserless` when the target is client-rendered, protected by anti-bot checks, or otherwise requires JavaScript to expose article links. +`auto` falls back to the next strategy when the current attempt errors or extracts zero items. Use explicit `--strategy ...` only when you need to force a specific transport for troubleshooting or reproducibility. + +## `auto` (default) + +The default strategy chain is: + +`faraday` -> `botasaurus` -> `browserless` ## `browserless` @@ -62,11 +70,12 @@ Set the `strategy` at the top level of your feed configuration and put request c Use this split consistently: -- `strategy`: selects `faraday` or `browserless` +- `strategy`: selects `auto`, `faraday`, `browserless`, or `botasaurus` - `headers`: top-level headers shared by all strategies - `request.max_redirects`: redirect limit for the request session - `request.max_requests`: total request budget for the whole feed build - `request.browserless.*`: Browserless-only options +- `request.botasaurus.*`: Botasaurus-only options Example: @@ -153,6 +162,58 @@ Check these first: For custom Browserless websocket endpoints, `BROWSERLESS_IO_API_TOKEN` is mandatory. The local default endpoint (`ws://127.0.0.1:3000`) can use the default local token `6R0W53R135510`. +## `botasaurus` + +`botasaurus` delegates page fetching to a Botasaurus scrape API endpoint. 
This strategy is explicit opt-in and requires: + +- `strategy: botasaurus` +- `BOTASAURUS_SCRAPER_URL` set to your Botasaurus scrape API base URL (for example `http://localhost:4010`) + +### Configuration + + + +Supported `request.botasaurus` options: + +- `navigation_mode` (`auto`, `get`, `google_get`, `google_get_bypass`) +- `max_retries` (`0..3`) +- `wait_for_selector` +- `wait_timeout_seconds` +- `block_images` +- `block_images_and_css` +- `wait_for_complete_page_load` +- `headless` +- `proxy` +- `user_agent` +- `window_size` (two integers, for example `[1920, 1080]`) +- `lang` + +### Command-Line Usage + + + --- For detailed documentation on the Ruby API, see the [official YARD documentation](https://www.rubydoc.info/gems/html2rss). diff --git a/src/content/docs/troubleshooting/troubleshooting.mdx b/src/content/docs/troubleshooting/troubleshooting.mdx index a02a7290..dcb11e2f 100644 --- a/src/content/docs/troubleshooting/troubleshooting.mdx +++ b/src/content/docs/troubleshooting/troubleshooting.mdx @@ -39,7 +39,7 @@ If your feed is empty, check the following: - **URL:** Ensure the `url` in your configuration is correct and accessible. - **`items.selector`:** Verify that the `items.selector` matches the elements on the page. - **Website Changes:** Websites change their HTML structure frequently. Your selectors may be outdated. -- **JavaScript Content:** If the content is loaded via JavaScript, use the `browserless` strategy instead of `faraday`. +- **JavaScript Content:** If the content is loaded via JavaScript, use a browser-based rendering strategy. - **Authentication:** Some sites require authentication — check if you need to add headers or use a different strategy. 
### `No scrapers found` Failure Taxonomy (`auto`) @@ -47,10 +47,8 @@ If your feed is empty, check the following: `auto` classifies no-scraper failures with actionable hints: - **Blocked surface likely (anti-bot or interstitial):** - - retry with `--strategy browserless` - try a more specific public listing URL - **App-shell surface detected:** - - retry with `--strategy browserless` - target a direct listing/update page instead of homepage/shell entrypoint - **Unsupported extraction surface for auto mode:** - switch to listing/changelog/category URLs @@ -58,6 +56,10 @@ If your feed is empty, check the following: Known anti-bot interstitial patterns (for example Cloudflare challenge pages) are surfaced as blocked-surface errors instead of silent empty extraction results. +When all auto fallback tiers complete but still extract zero items, html2rss raises `No RSS feed items extracted after auto fallback ...`. + +If failures continue after URL/surface fixes, retry with an explicit browser-based override (`--strategy browserless`), or `--strategy botasaurus` when `BOTASAURUS_SCRAPER_URL` is configured. + ### Browserless Connection / Setup Failures If you receive `Browserless connection failed (...)`: @@ -91,7 +93,9 @@ For custom websocket endpoints, `BROWSERLESS_IO_API_TOKEN` is required. Common configuration-related errors: - **`UnsupportedResponseContentType`:** The website returned content that html2rss can't parse (not HTML or JSON). -- **`UnsupportedStrategy`:** The specified strategy is not available. Use `faraday` or `browserless`. +- **`UnsupportedStrategy`:** The specified strategy is not available. Use `auto`, `faraday`, `browserless`, or `botasaurus`. +- **`BOTASAURUS_SCRAPER_URL is required for strategy=botasaurus.`:** Set `BOTASAURUS_SCRAPER_URL` to your Botasaurus scrape API base URL when using `--strategy botasaurus`. +- **`BOTASAURUS_SCRAPER_URL is invalid`:** Fix the URL format and retry. 
- **`Configuration must include at least 'selectors' or 'auto_source'`:** You need to specify either manual selectors or enable auto-source. - **`stylesheet.type invalid`:** Only `text/css` and `text/xsl` are supported for stylesheets. @@ -101,7 +105,7 @@ If parts of your items (e.g., title, link) are missing, check the following: - **Selector:** Ensure the selector for the missing part is correct and relative to the `items.selector`. - **Extractor:** Verify that you are using the correct `extractor` (e.g., `text`, `href`, `attribute`). -- **Dynamic Content:** `faraday` does not render JavaScript. If content loads dynamically, run with `--strategy browserless` (with the Browserless service available) so the page can be rendered before extraction. +- **Dynamic Content:** `faraday` does not render JavaScript. If content loads dynamically, run with `--strategy browserless` (with Browserless available) or `--strategy botasaurus` (with `BOTASAURUS_SCRAPER_URL` configured) so the page can be rendered before extraction. ### Date/Time Parsing Errors diff --git a/src/content/docs/web-application/getting-started.mdx b/src/content/docs/web-application/getting-started.mdx index 1beb338a..00025fb6 100644 --- a/src/content/docs/web-application/getting-started.mdx +++ b/src/content/docs/web-application/getting-started.mdx @@ -10,7 +10,7 @@ import { Code } from "@astrojs/starlight/components"; import AutoGenerationOptional from "../../../components/docs/AutoGenerationOptional.astro"; import MinimalDockerCompose from "../../../components/docs/MinimalDockerCompose.astro"; -Run `html2rss-web` locally with Docker, open the web interface, and verify that your instance can serve a working included feed before you enable direct feed generation. +Run `html2rss-web` locally with Docker and verify one included feed before enabling direct feed generation. 
## What You Will Have When This Works @@ -23,7 +23,7 @@ After this guide, you should have: ## Installation Guide -This guide walks you through a local Docker setup that gives you the most reliable starting point. +This guide uses a local Docker Compose stack. ### What You'll Need @@ -50,7 +50,7 @@ Create a file called `docker-compose.yml` in that folder and start with the mini -This minimal stack intentionally proves the included-feed path first. Add automatic updates, reverse proxying, or your own config file only after this first run works. +Add automatic updates, reverse proxying, or your own config file after this first run works. ### Step 3: Start html2rss-web @@ -73,8 +73,6 @@ Then run: ## First Success Check -At this point, `html2rss-web` should be running. - 1. Open `http://localhost:4000` 2. Confirm the web interface loads 3. Open one of the included feed URLs from your own instance: @@ -82,19 +80,9 @@ At this point, `html2rss-web` should be running. - `http://localhost:4000/phys.org/weekly.rss` - `http://localhost:4000/softwareleadweekly.com/issues.rss` 4. Confirm the feed opens -5. Copy that feed URL into your reader - -If that works, the deployment, included-config path, and reader subscription path are working together. - -## What To Do First - -Start with an included config from your own instance: - -1. open a known included feed URL -2. copy that feed URL into your reader -3. confirm your reader can subscribe successfully +5. Copy that feed URL into your reader if you want to keep it -That proves the lowest-friction path before you invest in automatic generation or custom configs. +If that works, the local app and included-feed path are ready. 
## What Changes If You Enable Feed Generation diff --git a/src/content/docs/web-application/how-to/automatic-updates.mdx b/src/content/docs/web-application/how-to/automatic-updates.mdx index cda50295..242b296b 100644 --- a/src/content/docs/web-application/how-to/automatic-updates.mdx +++ b/src/content/docs/web-application/how-to/automatic-updates.mdx @@ -1,6 +1,6 @@ --- title: "Automatic Updates" -description: "Learn how to set up automatic updates for html2rss-web using watchtower. Keep your Docker containers updated automatically with the latest features." +description: "Use Watchtower to keep html2rss-web updated within the Docker tag you selected." sidebar: order: 10 --- @@ -9,7 +9,9 @@ import { Code } from "@astrojs/starlight/components"; import DockerComposeSnippet from "../../../../components/docs/DockerComposeSnippet.astro"; -Use [watchtower](https://containrrr.dev/watchtower/) to periodically pull and restart containers when newer images are available. +Use [Watchtower](https://containrrr.dev/watchtower/) to pull newer images and restart the selected containers. + +Updates follow the tag in your Compose file. With the documented `html2rss/web:1` image, Watchtower stays on major version `1`. Add this service to your existing `docker-compose.yml`: @@ -19,9 +21,9 @@ Then restart the stack: -Operational note: +Operational notes: - Keep the Docker socket mount (read-only in this example). - Add the optional Docker config mount only if you pull private images that require registry auth. -- The shown Watchtower command updates all running containers by default. +- The shown command scopes updates to `html2rss-web`, `browserless`, and `caddy`; change the service names if your stack differs. - Check `docker compose logs watchtower` to confirm scans and update runs. 
diff --git a/src/content/docs/web-application/how-to/deployment.mdx b/src/content/docs/web-application/how-to/deployment.mdx index 8e55d075..3fd26161 100644 --- a/src/content/docs/web-application/how-to/deployment.mdx +++ b/src/content/docs/web-application/how-to/deployment.mdx @@ -7,11 +7,9 @@ import { Code } from "@astrojs/starlight/components"; import DockerComposeSnippet from "../../../../components/docs/DockerComposeSnippet.astro"; -html2rss-web ships on Docker Hub, so you can launch this self-hosted service wherever Docker runs. Start with the official [`docker-compose.yml`](https://github.com/html2rss/html2rss-web/blob/main/docker-compose.yml) as your baseline, and treat the [Getting Started guide](/web-application/getting-started) as the required first proof that your instance can already serve included feeds locally. +html2rss-web ships on Docker Hub. Start with the [Getting Started guide](/web-application/getting-started), then add the production pieces below. -If you have not yet created a local instance, complete the [Getting Started guide](/web-application/getting-started) first. It walks through the one-time project directory setup, creating a minimal compose file, and confirming the application locally, which gives you the right baseline before exposing a self-hosted instance publicly. - -Already running html2rss-web on your workstation? The sections below focus on what changes when you take that setup to production. +The examples use `html2rss/web:1`, the recommended major-version tag. Pin an exact release if your deployment process requires it. ## Choose Your Production Scope First @@ -40,54 +38,37 @@ If you plan to enable automatic feed generation, also prepare: ### Why a Reverse Proxy? -A reverse proxy accepts public HTTPS traffic, terminates TLS, and forwards requests to html2rss-web running on your private network. +A reverse proxy terminates public HTTPS traffic and forwards requests to html2rss-web on your private Docker network. 
### Option A: Caddy (Automatic HTTPS) -Caddy handles certificates and redirects with almost no configuration. +Caddy handles certificates and redirects. -- Create a `.env` file beside your compose file with the following variables: - - + HEALTH_CHECK_TOKEN= + BROWSERLESS_IO_API_TOKEN= +`} + lang="dotenv" +/> - BUILD_TAG= +Before starting the stack: - # Recommended for production traceability (compose defaults to local) +- Set `CADDY_HOST` for your domain. +- Generate `HTML2RSS_SECRET_KEY` with `openssl rand -hex 32`. +- Set a strong `HEALTH_CHECK_TOKEN` when you use authenticated `GET /api/v1/health`; liveness/readiness probes can use `/api/v1/health/live` and `/api/v1/health/ready` without it. +- Leave `BUILD_TAG` and `GIT_SHA` unset unless you intentionally override image metadata in logs. +- Adjust optional knobs such as `AUTO_SOURCE_ENABLED` and `SENTRY_DSN` as needed; refer to the [environment reference](/web-application/reference/env-variables) for details. - GIT_SHA= +After `docker compose up -d`, run `docker compose logs caddy --tail 20`; look for `certificate obtained`. - `} - lang="dotenv" - /> - -- Update your `.env` before starting the stack: - - Set `CADDY_HOST` for your domain. - - Generate a production secret (`openssl rand -hex 32`) and assign it to `HTML2RSS_SECRET_KEY`. - - Set a strong `HEALTH_CHECK_TOKEN` when you use authenticated `GET /api/v1/health`; liveness/readiness probes can use `/api/v1/health/live` and `/api/v1/health/ready` without it. - - Set `BUILD_TAG` and `GIT_SHA` to real release metadata for production. - - Adjust optional knobs such as `AUTO_SOURCE_ENABLED` and `SENTRY_DSN` as needed; refer to the [environment reference](/web-application/reference/env-variables) for details. -- After `docker compose up -d`, run `docker compose logs caddy --tail 20`; look for `certificate obtained`. -- Re-test after DNS changes with [SSL Labs](https://www.ssllabs.com/ssltest/). -- Want automatic updates? Add the Watchtower service shown below. 
+Re-test after DNS changes with [SSL Labs](https://www.ssllabs.com/ssltest/). ## Secure Your Instance @@ -107,14 +88,14 @@ Keep the instance healthy once it is in production: - Monitor `https://yourdomain.com/api/v1/health` with the configured bearer token for authenticated health checks - Review `docker compose logs` regularly for feed errors or certificate renewals -- Enable automatic image updates so security patches roll out quickly +- Enable automatic image updates for the Docker tag you selected - Right-size CPU and memory to avoid starvation when parsing large feeds ### Auto-update with Watchtower -This Watchtower shape scopes updates to `html2rss-web`, `browserless`, and `caddy`; replace service names if your stack differs. +This Watchtower shape scopes updates to `html2rss-web`, `browserless`, and `caddy`; change the service names if your stack differs. Check `docker compose logs watchtower` occasionally to confirm updates are applied. @@ -122,7 +103,7 @@ Check `docker compose logs watchtower` occasionally to confirm updates are appli -Adjust the limits to match your host capacity. Increase memory if you process many large feeds. +Adjust limits to match host capacity. Increase memory for large feeds. ## Share & Support diff --git a/src/content/docs/web-application/how-to/use-automatic-feed-generation.mdx b/src/content/docs/web-application/how-to/use-automatic-feed-generation.mdx index 15c7c06a..c6d7fe5d 100644 --- a/src/content/docs/web-application/how-to/use-automatic-feed-generation.mdx +++ b/src/content/docs/web-application/how-to/use-automatic-feed-generation.mdx @@ -42,7 +42,7 @@ Then restart the stack: 1. Open your instance at `http://localhost:4000` 2. Paste a page URL into `Create a feed` 3. Add a valid access token when prompted -4. Choose a strategy if needed, then submit +4. Submit the request 5. 
Copy the generated feed URL or open it directly ## What Success Looks Like @@ -59,23 +59,21 @@ That is enough to confirm the self-hosted flow is working. ## Strategy Behavior -- `faraday` is the default strategy and should be your first try for most pages. -- During the feed-creation API request (`POST /api/v1/feeds`) from the web UI, a `faraday` submission may be retried once with `browserless` when the first failure looks retryable. -- If that fallback attempt fails, or if the first failure is clearly auth/URL/unsupported-strategy related, the UI stops and shows an error. -- This retry behavior is scoped to feed creation. It is not a general retry layer for later feed rendering (`GET /api/v1/feeds/:token`) or preview loading. +- Feed creation uses the backend default strategy behavior. +- If feed creation fails, the UI surfaces structured retry/error guidance rather than exposing low-level strategy controls. ## Input URL Guidance (Quality First) Automatic generation is most successful when the input URL is already a listing/update surface. - Higher-success inputs: -- newsroom/press listing pages -- category/tag/archive/listing pages -- changelog/release/update pages + - newsroom/press listing pages + - category/tag/archive/listing pages + - changelog/release/update pages - Lower-success inputs: -- generic homepages -- search pages -- app-shell entrypoints (client-rendered shells) + - generic homepages + - search pages + - app-shell entrypoints (client-rendered shells) If output quality is poor, switch the input to a direct listing/update URL before assuming the feature is broken. 
diff --git a/src/content/docs/web-application/index.mdx b/src/content/docs/web-application/index.mdx index c39dac1b..a69ecebd 100644 --- a/src/content/docs/web-application/index.mdx +++ b/src/content/docs/web-application/index.mdx @@ -1,12 +1,12 @@ --- title: "Web Application" -description: "html2rss-web is the self-hosted web interface and feed server for running included feeds first, then enabling direct generation only when needed." +description: "html2rss-web is the self-hosted web interface and feed server for included feeds, direct generation, and custom configs." sidebar: label: "Overview" order: 1 --- -`html2rss-web` is the recommended way to get started. Run it locally with Docker, verify a working included feed from your own instance, and only then decide whether you need token-gated direct generation or custom configs. +`html2rss-web` is the recommended way to get started. Run it locally with Docker, verify one feed from your own instance, then decide whether you need token-gated direct generation or custom configs. ## Get Started @@ -14,7 +14,6 @@ Start with **[Getting Started](/web-application/getting-started)** to: - run your own local instance - verify the web interface -- open a first included feed URL - choose the right next step for your site ## What The Web App Gives You @@ -31,10 +30,9 @@ The scraping and feed-building engine is provided by the Ruby gem [`html2rss`](h ## Recommended Flow 1. **[Getting Started](/web-application/getting-started)**: run the app locally -2. **[Use the included configs](/web-application/how-to/use-included-configs/)**: start with embedded feed paths from your own instance -3. **[Browse working feed examples](/feed-directory/)**: compare against existing outputs -4. **[Use automatic feed generation](/web-application/how-to/use-automatic-feed-generation/)**: enable direct page-URL conversion when you want that workflow -5. **[Create Custom Feeds](/creating-custom-feeds)**: build a stable custom setup when needed +2. 
**[Use the included configs](/web-application/how-to/use-included-configs/)**: use the embedded feed set when it covers your site +3. **[Use automatic feed generation](/web-application/how-to/use-automatic-feed-generation/)**: enable direct page-URL conversion when you want that workflow +4. **[Create Custom Feeds](/creating-custom-feeds)**: build a stable custom setup when needed ## For Integrations diff --git a/src/content/docs/web-application/reference/env-variables.mdx b/src/content/docs/web-application/reference/env-variables.mdx index 42b357b4..dd1944b1 100644 --- a/src/content/docs/web-application/reference/env-variables.mdx +++ b/src/content/docs/web-application/reference/env-variables.mdx @@ -9,8 +9,8 @@ description: "Configuration reference for html2rss-web environment variables." | --------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `HTML2RSS_SECRET_KEY` | required in production; development/test gets a temporary default | | `HEALTH_CHECK_TOKEN` | bearer token for authenticated `GET /api/v1/health`; optional unless you use that endpoint (the documented Compose stack includes it); `/api/v1/health/live` and `/api/v1/health/ready` do not require it | -| `BUILD_TAG` | defaults to `local` in the Compose stack; set release metadata explicitly in production | -| `GIT_SHA` | defaults to `local` in the Compose stack; set deployed commit SHA explicitly in production | +| `BUILD_TAG` | release metadata used in logs; published Docker images set this to the release version | +| `GIT_SHA` | deployed commit metadata used in logs; published Docker images set this to the released commit | | `SENTRY_DSN` | optional; enables Sentry errors/logs when set | | `BROWSERLESS_IO_WEBSOCKET_URL` | optional; Browserless websocket endpoint for `browserless` strategy | | `BROWSERLESS_IO_API_TOKEN` 
| required by this site's Compose stack and by custom websocket endpoints; standalone `html2rss` local defaults can omit it | diff --git a/src/content/docs/web-application/reference/versioning-and-releases.mdx b/src/content/docs/web-application/reference/versioning-and-releases.mdx index 168aa8d6..46edfa9a 100644 --- a/src/content/docs/web-application/reference/versioning-and-releases.mdx +++ b/src/content/docs/web-application/reference/versioning-and-releases.mdx @@ -5,11 +5,19 @@ description: Learn about versioning and release strategy for html2rss-web import { dockerHubRepository, dockerHubUrl } from "../../../../data/docker"; -This web application is distributed in a [rolling release](https://en.wikipedia.org/wiki/Rolling_release) fashion from the `main` branch. +html2rss-web publishes versioned Docker images to Docker Hub: {dockerHubRepository}. -For the latest commit passing GitHub CI/CD on the main branch, an updated Docker image will be pushed to Docker Hub: {dockerHubRepository}. -The [SBOM](https://en.wikipedia.org/wiki/Software_supply_chain) is embedded in the Docker image. +For release `1.2.3`, we publish these tags: -GitHub's @dependabot is enabled for dependency updates and they are automatically merged to the `main` branch when the CI gives the green light. +- `html2rss/web:1.2.3`: exact release +- `html2rss/web:1`: latest release in major version `1` +- `html2rss/web:latest`: newest published release +- `html2rss/web:<git-sha>`: release image pinned by source commit -If you use Docker, you should update to the latest image automatically by [setting up _watchtower_ as described](/web-application/how-to/automatic-updates). +Use `html2rss/web:1` for normal deployments. It receives newer releases for major version `1` without moving across a future major release. + +Use an exact version tag when you need fully pinned deploys. Use `latest` only when you intentionally want the newest published release.
Use the commit SHA tag when you need to trace or reproduce one released build. + +Release images include SBOM and provenance metadata. The image build also sets `BUILD_TAG` to the release version and `GIT_SHA` to the released commit. + +If you use Docker Compose, [set up Watchtower](/web-application/how-to/automatic-updates) to pull updates for the tag you selected. diff --git a/src/data/docker.ts b/src/data/docker.ts index dabe362c..014f2d69 100644 --- a/src/data/docker.ts +++ b/src/data/docker.ts @@ -1,6 +1,7 @@ export const dockerHubRepository = 'html2rss/web'; export const dockerHubUrl = `https://hub.docker.com/r/${dockerHubRepository}`; -export const webImage = `${dockerHubRepository}:latest`; +export const webImage = `${dockerHubRepository}:1`; export const browserlessImage = 'ghcr.io/browserless/chromium'; export const caddyImage = 'caddy:2-alpine'; export const watchtowerImage = 'containrrr/watchtower'; +export const botasaurusImage = 'html2rss/botasaurus-scrape-api:latest';