Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion examples/deployment/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
services:
html2rss-web:
image: html2rss/web:latest
image: html2rss/web:1
restart: unless-stopped
env_file:
- path: .env
required: false
environment:
PORT: 4000
BOTASAURUS_SCRAPER_URL: http://botasaurus:4010

botasaurus:
image: html2rss/botasaurus-scrape-api:latest
restart: unless-stopped

caddy:
image: caddy:2-alpine
Expand All @@ -30,6 +35,7 @@ services:
depends_on:
- html2rss-web
- caddy
- botasaurus
command:
- --cleanup
- --interval
Expand Down
25 changes: 17 additions & 8 deletions src/components/docs/DockerComposeSnippet.astro
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
import { Code } from "@astrojs/starlight/components";
import { browserlessImage, caddyImage, watchtowerImage, webImage } from "../../data/docker";
import { botasaurusImage, browserlessImage, caddyImage, watchtowerImage, webImage } from "../../data/docker";

interface Props {
variant: "minimal" | "productionCaddy" | "secure" | "watchtower" | "resourceGuardrails";
Expand All @@ -21,13 +21,16 @@ const snippets: Record<Props["variant"], string> = {
environment:
RACK_ENV: production
PORT: 4000
BUILD_TAG: \${BUILD_TAG:-local}
GIT_SHA: \${GIT_SHA:-local}
HTML2RSS_SECRET_KEY: \${HTML2RSS_SECRET_KEY:?set HTML2RSS_SECRET_KEY}
HEALTH_CHECK_TOKEN: \${HEALTH_CHECK_TOKEN:?set HEALTH_CHECK_TOKEN}
SENTRY_DSN: \${SENTRY_DSN:-}
BROWSERLESS_IO_WEBSOCKET_URL: ws://browserless:4002
BROWSERLESS_IO_API_TOKEN: \${BROWSERLESS_IO_API_TOKEN:?set BROWSERLESS_IO_API_TOKEN}
BOTASAURUS_SCRAPER_URL: http://botasaurus:4010

botasaurus:
image: ${botasaurusImage}
restart: unless-stopped

browserless:
image: "${browserlessImage}"
Expand Down Expand Up @@ -64,13 +67,16 @@ const snippets: Record<Props["variant"], string> = {
environment:
RACK_ENV: production
PORT: 4000
BUILD_TAG: \${BUILD_TAG:-local}
GIT_SHA: \${GIT_SHA:-local}
HTML2RSS_SECRET_KEY: \${HTML2RSS_SECRET_KEY:?set HTML2RSS_SECRET_KEY}
HEALTH_CHECK_TOKEN: \${HEALTH_CHECK_TOKEN:?set HEALTH_CHECK_TOKEN}
SENTRY_DSN: \${SENTRY_DSN:-}
BROWSERLESS_IO_WEBSOCKET_URL: ws://browserless:4002
BROWSERLESS_IO_API_TOKEN: \${BROWSERLESS_IO_API_TOKEN:?set BROWSERLESS_IO_API_TOKEN}
BOTASAURUS_SCRAPER_URL: http://botasaurus:4010

botasaurus:
image: ${botasaurusImage}
restart: unless-stopped

browserless:
image: "${browserlessImage}"
Expand All @@ -92,13 +98,16 @@ volumes:
environment:
RACK_ENV: production
PORT: 4000
BUILD_TAG: \${BUILD_TAG:-local}
GIT_SHA: \${GIT_SHA:-local}
HTML2RSS_SECRET_KEY: \${HTML2RSS_SECRET_KEY:?set HTML2RSS_SECRET_KEY}
HEALTH_CHECK_TOKEN: \${HEALTH_CHECK_TOKEN:?set HEALTH_CHECK_TOKEN}
SENTRY_DSN: \${SENTRY_DSN:-}
BROWSERLESS_IO_WEBSOCKET_URL: ws://browserless:4002
BROWSERLESS_IO_API_TOKEN: \${BROWSERLESS_IO_API_TOKEN:?set BROWSERLESS_IO_API_TOKEN}
BOTASAURUS_SCRAPER_URL: http://botasaurus:4010

botasaurus:
image: ${botasaurusImage}
restart: unless-stopped

browserless:
image: "${browserlessImage}"
Expand All @@ -115,7 +124,7 @@ volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
# Optional for private registries only:
# - "\${HOME}/.docker/config.json:/config.json:ro"
command: --cleanup --interval 7200 html2rss-web browserless caddy`,
command: --cleanup --interval 7200 html2rss-web botasaurus browserless caddy`,
resourceGuardrails: `services:
html2rss-web:
image: ${webImage}
Expand Down
50 changes: 16 additions & 34 deletions src/content/docs/common-use-cases.mdx
Original file line number Diff line number Diff line change
@@ -1,51 +1,47 @@
---
title: "Common Use Cases"
description: "See how people use html2rss to stay updated with their favorite websites. Real examples for personal and business use cases."
description: "Use html2rss for common tracking and monitoring workflows."
---

Discover how people are using html2rss to take control of their web content consumption. These real-world examples show the power and flexibility of creating custom RSS feeds.

---
Use html2rss when you want updates in a reader instead of checking websites by hand.

## Personal Use Cases

### Following Your Favorite Bloggers

Many bloggers don't offer RSS feeds, but you can create them with html2rss. Follow writers you love without relying on social media algorithms.
Many blogs and creator sites do not publish feeds.

**Example:** Create a feed for a personal blog that only posts to social media.
**Example:** Follow a newsroom, company blog, or publication section from your own `html2rss-web` deployment.

### Job Hunting

Track job postings from multiple company websites in one place. Never miss an opportunity again.

**Example:** Follow job boards, company career pages, and industry-specific job sites.
**Example:** Track a company careers page or a narrower role-specific listing.

### Local News

Follow your local newspaper or community website to stay informed about your neighborhood.

**Example:** Create feeds for local news sites, community forums, and city government updates.
**Example:** Subscribe to local news sites, community forums, and city government updates from one reader.

### Academic Research

Follow new papers and research in your field from multiple sources.

**Example:** Track arXiv submissions, journal publications, and conference proceedings.
**Example:** Track publication pages, research blogs, and conference updates.

### Product Updates

Get notified when software you use releases updates, new features, or security patches.

**Example:** Follow product blogs, changelog pages, and release notes.
**Example:** Track release notes, changelog pages, and product blogs.

### Hobby Communities

Follow forums, communities, and websites related to your hobbies and interests.

**Example:** Track gaming forums, photography communities, or cooking blogs.

---
**Example:** Track gaming forums, photography communities, or cooking blogs without manually checking each site.

## Business Use Cases

Expand All @@ -59,21 +55,19 @@ Track what your competitors are posting about - new products, features, or annou

Follow multiple industry publications in one feed to stay ahead of trends.

**Example:** Aggregate news from industry blogs, trade publications, and thought leaders.
**Example:** Aggregate trade publications, company blogs, and research updates in one reader.

### Customer Support

Monitor customer feedback and support requests across different platforms.

**Example:** Track support forums, review sites, and social media mentions.
**Example:** Track support forums, review sites, and product-update pages that affect your users.

### Content Marketing

Follow industry influencers and competitors for content inspiration.

**Example:** Track competitor blogs, industry newsletters, and thought leadership content.

---
**Example:** Track competitor blogs, industry newsletters, and thought leadership content in one place.

## Technical Use Cases

Expand All @@ -95,20 +89,8 @@ Follow multiple open source projects and their updates.

**Example:** Track project blogs, release notes, and community discussions.

---

## Getting Started with Your Use Case

1. **Identify the websites** you want to follow
2. **Check our [Feed Directory](/feed-directory/)** to see if feeds already exist
3. **Try the [Web App](/web-application/getting-started)** to create feeds easily
4. **Learn advanced techniques** with our [Config Guide](/creating-custom-feeds/)

---

## Need Help?
## Next Steps

- **Can't find what you're looking for?** [Browse our Feed Directory](/feed-directory/)
- **Want to create custom feeds?** [Try the Web App](/web-application/getting-started)
- **Need advanced features?** [Check our Ruby Gem docs](/ruby-gem/)
- **Have questions?** [Join our community discussions](https://github.com/orgs/html2rss/discussions)
- **[Run html2rss-web with Docker](/web-application/getting-started)** to verify your own instance.
- **[Use automatic feed generation](/web-application/how-to/use-automatic-feed-generation/)** when you want direct page-URL conversion.
- **[Create custom feeds](/creating-custom-feeds/)** when you need stable, reviewable extraction rules.
26 changes: 6 additions & 20 deletions src/content/docs/creating-custom-feeds.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,13 @@ sidebar:

import { Aside, Code } from "@astrojs/starlight/components";

When auto-sourcing isn't enough, you can write your own configuration files to create custom RSS feeds for any website. This guide shows you how to take full control with YAML configs.
When existing feeds or auto-sourcing are not enough, write a YAML config for the site you want to follow.

**Prerequisites:** You should be familiar with the [Getting Started](/getting-started) guide before diving into custom configurations.

<Aside type="note" title="Release note">
This guide tracks the current documentation tree and may describe features that have not yet shipped in the
latest released `html2rss` gem. If you want the newest integrated behavior, prefer running
[`html2rss-web`](/web-application/getting-started) via Docker. The web application ships as a rolling
release and usually reflects the latest development state of the gem first. See [Versioning and
releases](/web-application/reference/versioning-and-releases/) for details.
</Aside>

<Aside type="tip" title="Use this guide when you need more control">
Start with included feeds first. If your site is not covered, try [automatic feed
generation](/web-application/how-to/use-automatic-feed-generation/) next. Reach for a custom config when you
need a stable, reviewable setup or the generated feed misses important content.
Reach for a custom config when you need stable, reviewable extraction rules or when the generated output
misses important content.
</Aside>

---
Expand All @@ -37,18 +28,14 @@ When auto-sourcing isn't enough, you can write your own configuration files to c
- **The website has complex structure** that requires custom selectors
- **You want to combine data** from multiple sources

**Don't need custom configs?** Check the [Feed Directory](/feed-directory/) first - there might already be a working feed for your website.

---

## Recommended Workflow

1. **Inspect the live page** in your browser developer tools
2. **Write the smallest useful config** that extracts items, titles, and links
3. **Validate the config** with `html2rss validate your-config.yml`
4. **Render the feed** with `html2rss feed your-config.yml`
5. **Add it to `html2rss-web`** so you can use it through your normal instance
6. **Escalate to `browserless`** if the content is rendered by JavaScript
6. **Escalate the request strategy when needed**: switch to a browser-based rendering strategy only when the content is rendered by JavaScript or troubleshooting shows that static fetching is not enough

This order keeps iteration fast and makes it easier to see whether the problem is the page structure, your
selectors, or the fetch strategy.
Expand Down Expand Up @@ -210,7 +197,7 @@ there.
- **No items found?** Check your selectors with browser tools (F12) - the `items.selector` might not match the page structure
- **Invalid YAML?** Use spaces, not tabs, and ensure proper indentation
- **Website not loading?** Check the URL and try accessing it in your browser
- **Missing content?** Some websites load content with JavaScript - you may need to use the `browserless` strategy
- **Missing content?** Some websites load content with JavaScript - try a browser-based rendering strategy while troubleshooting
- **Wrong data extracted?** Verify your selectors are pointing to the right elements

**Need more help?** See our [comprehensive troubleshooting guide](/troubleshooting/troubleshooting) or ask in [GitHub Discussions](https://github.com/orgs/html2rss/discussions).
Expand All @@ -225,7 +212,6 @@ there.

**For Beginners:**

- **[Browse the Feed Directory](/feed-directory/)** - See real-world examples
- **[Run html2rss-web with Docker](/web-application/getting-started)** - Use the newest integrated behavior
- **[Learn more about selectors](/ruby-gem/reference/selectors/)** - Master CSS selectors
- **[Submit your config via GitHub Web](https://github.com/html2rss/html2rss-configs)** - No Git knowledge required!
Expand All @@ -234,5 +220,5 @@ there.

- **[Browse existing configs](https://github.com/html2rss/html2rss-configs/tree/master/lib/html2rss/configs)** - See real examples
- **[Join discussions](https://github.com/orgs/html2rss/discussions)** - Connect with other users
- **[Learn about strategies](/ruby-gem/reference/strategy/)** - Decide when to use `browserless`
- **[Learn about strategies](/ruby-gem/reference/strategy/)** - Decide when to use static vs JavaScript/browser-based extraction
- **[Learn advanced features](/ruby-gem/how-to/advanced-features/)** - Take your configs to the next level
7 changes: 4 additions & 3 deletions src/content/docs/getting-started.mdx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
title: "Getting Started"
description: "Start html2rss-web locally, verify a working included feed from your self-hosted instance, and decide when to enable automatic generation or move to custom configs."
description: "Start html2rss-web locally, verify one feed, and decide when to enable automatic generation or move to custom configs."
sidebar:
order: 1
---
Expand All @@ -17,13 +17,12 @@ That guide is the canonical setup flow for:

- running `html2rss-web` locally
- confirming the interface is working
- opening a first included feed URL
- opening a known feed URL
- deciding when to use automatic generation or custom configs

## Quick Shortcuts

- **[Run html2rss-web with Docker](/web-application/getting-started)**: recommended first step
- **[Browse working feed examples](/feed-directory/)**: see what successful outputs look like
- **[Use automatic feed generation](/web-application/how-to/use-automatic-feed-generation/)**: enable direct feed creation from a page URL when you want that workflow
- **[Create Custom Feeds](/creating-custom-feeds)**: write configs when you need more control
- **[Troubleshooting Guide](/troubleshooting/troubleshooting)**: fix startup or extraction problems
Expand All @@ -34,6 +33,8 @@ If you are working directly with the gem instead of `html2rss-web`, start with:

<Code code={`html2rss auto https://example.com/blog`} lang="bash" />

For strategy behavior and manual overrides, see the [Strategy reference](/ruby-gem/reference/strategy/).

If the target site is unusually redirect-heavy or needs extra follow-up requests, the CLI also supports:

<Code code={`html2rss auto https://example.com/blog --max-redirects 10 --max-requests 5`} lang="bash" />
Expand Down
21 changes: 7 additions & 14 deletions src/content/docs/index.mdx
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
---
title: "Turn Any Website Into an RSS Feed"
description: "Run html2rss-web with Docker, verify a working included feed from your self-hosted instance, then consciously enable automatic generation or move to custom configs when you need more control."
description: "Run html2rss-web with Docker, verify one feed, then enable automatic generation or move to custom configs when you need more control."
---

Run `html2rss-web` with Docker, verify a working included feed from your self-hosted instance, and only then decide whether to enable automatic generation or move to custom configs.
Run `html2rss-web` with Docker, verify one feed from your own instance, then decide whether you need automatic generation or custom configs.

## Start Here

Expand All @@ -13,14 +13,8 @@ That guide is the canonical onboarding flow for:

- starting a local instance
- verifying the web interface
- opening a first included feed URL
- deciding when to consciously enable automatic generation or move to custom configs

## How It Works

1. **Run your own local instance** with Docker
2. **Open a built-in feed URL** from your own instance
3. **Copy the feed URL into your reader**
- opening a known feed URL
- choosing the next path

## What is html2rss?

Expand All @@ -36,14 +30,13 @@ Most people should start with the web application:
### I want a working instance first

1. **[Run html2rss-web with Docker](/web-application/getting-started)**: recommended starting path
2. **[Use the included configs](/web-application/how-to/use-included-configs/)**: use real embedded feeds from your own instance
3. **[Browse working feed examples](/feed-directory/)**: see what working outputs look like
2. **[Use the included configs](/web-application/how-to/use-included-configs/)**: optional guide for the embedded feed set

### I need more control

1. **[Creating Custom Feeds](/creating-custom-feeds)**: write and test your own configs
2. **[Selectors Reference](/ruby-gem/reference/selectors/)**: learn the matching rules
3. **[Strategy Reference](/ruby-gem/reference/strategy/)**: decide when `browserless` is justified
3. **[Strategy Reference](/ruby-gem/reference/strategy/)**: choose the right extraction strategy for static vs JavaScript-heavy pages

### I'm building or integrating

Expand All @@ -62,7 +55,7 @@ Most people should start with the web application:
## Practical Notes

- Start with Docker, not a public instance.
- Use an included feed to verify the deployment first.
- Verify the deployment with one known feed first.
- Enable automatic generation only when you want the direct page-URL workflow and are ready to allow it on your self-hosted instance.
- Move to custom configs when you need a stable, reviewable setup.

Expand Down
2 changes: 1 addition & 1 deletion src/content/docs/ruby-gem/how-to/advanced-features.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ html2rss uses parallel processing in auto-source discovery. This happens automat
1. **Use appropriate selectors:** More specific selectors reduce processing time
2. **Limit items when possible:** Use CSS selectors that target only the content you need
3. **Cache responses:** The web application caches responses automatically
4. **Choose the right strategy:** Use `faraday` for static content, `browserless` only when JavaScript is required
4. **Choose the right strategy:** Use static HTTP fetching for simple pages, and move to a JavaScript/browser-based extraction strategy when rendering or anti-bot handling is required

## Memory Optimization

Expand Down
3 changes: 2 additions & 1 deletion src/content/docs/ruby-gem/how-to/custom-http-requests.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Keep this structure in mind:

- `headers` stays top-level
- `strategy` stays top-level
- request-specific controls such as budgets and Browserless options live under `request`
- request-specific controls such as budgets and strategy-specific options live under `request`

## When You Need Custom Headers

Expand Down Expand Up @@ -74,6 +74,7 @@ Request budgets are configured under `request`, not as top-level keys:
- `request.max_redirects` limits redirect hops
- `request.max_requests` limits the total request budget for the feed build
- `request.browserless.*` is reserved for Browserless-only behavior such as preload actions
- `request.botasaurus.*` is reserved for Botasaurus-only behavior such as navigation mode and retries

## Common Use Cases

Expand Down
Loading
Loading