heyblog · ligeaaa · Jun 11, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/alembic/versions/20260602_01_add_blog_acceptance_status.py b/alembic/versions/20260602_01_add_blog_acceptance_status.py
@@ -0,0 +1,115 @@
+"""Split blog acceptance from crawl execution status.
+
+Revision ID: 20260602_01
+Revises: 20260601_01
+Create Date: 2026-06-02 21:30:29 BST
+"""
+
+from __future__ import annotations
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# Revision identifiers, used by Alembic to place this migration in the chain.
+revision = "20260602_01"
+down_revision = "20260601_01"
+branch_labels = None
+depends_on = None
+
+
+def _columns(table_name: str) -> set[str]:
+    """Collect the column names that currently exist on one table.
+
+    Args:
+        table_name: Name of the database table to reflect.
+
+    Returns:
+        The name of every column the migration's connection reports for the
+        table.
+    """
+    inspector = sa.inspect(op.get_bind())
+    reflected = inspector.get_columns(table_name)
+    return {entry["name"] for entry in reflected}
+
+
+def upgrade() -> None:
+    """Add acceptance and crawl-error fields, then backfill accepted graph rows.
+
+    Columns are only added when they are missing, and every backfill statement
+    only touches rows still marked ``UNKNOWN``, so rows resolved by an earlier
+    statement (or an earlier run) are not overwritten.
+
+    The backfill runs in this order, and the order matters:
+      1. blogs whose normalized URL matches a successful raw discovery row;
+      2. blogs that no edge points at, recorded as accepted by ``seed``;
+      3. remaining blogs that take part in any edge, recorded as ``graph``.
+
+    Args:
+        None.
+
+    Returns:
+        None. The migration mutates the active database schema in place.
+    """
+    # Snapshot taken once; the names added below are all distinct, so the
+    # snapshot stays valid for every membership check in this function.
+    blog_columns = _columns("blogs")
+    if "acceptance_status" not in blog_columns:
+        # NOT NULL with a server default so existing rows start as UNKNOWN.
+        op.add_column(
+            "blogs",
+            sa.Column("acceptance_status", sa.Text(), nullable=False, server_default="UNKNOWN"),
+        )
+    for column_name in (
+        "accepted_by",
+        "crawl_error_kind",
+        "crawl_error_message",
+    ):
+        if column_name not in blog_columns:
+            op.add_column("blogs", sa.Column(column_name, sa.Text(), nullable=True))
+    for column_name in (
+        "accepted_at",
+        "last_crawl_attempt_at",
+        "successful_crawl_at",
+    ):
+        if column_name not in blog_columns:
+            op.add_column("blogs", sa.Column(column_name, sa.DateTime(timezone=True), nullable=True))
+
+    # Step 1: accept blogs backed by a successful raw discovery row.
+    op.execute(
+        """
+        UPDATE blogs b
+        SET acceptance_status = 'ACCEPTED',
+            accepted_by = COALESCE(b.accepted_by, r.accepted_by, 'unknown'),
+            accepted_at = COALESCE(b.accepted_at, r.updated_at, b.created_at)
+        FROM raw_discovered_urls r
+        WHERE b.normalized_url = r.normalized_url
+          AND r.status = 'success'
+          AND b.acceptance_status = 'UNKNOWN'
+        """
+    )
+    # Step 2: blogs with no inbound edge are treated as seeds.
+    # NOT EXISTS is used instead of NOT IN because a single NULL to_blog_id
+    # would make ``NOT IN (subquery)`` match no rows at all.
+    op.execute(
+        """
+        UPDATE blogs
+        SET acceptance_status = 'ACCEPTED',
+            accepted_by = COALESCE(accepted_by, 'seed'),
+            accepted_at = COALESCE(accepted_at, created_at)
+        WHERE acceptance_status = 'UNKNOWN'
+          AND NOT EXISTS (
+              SELECT 1 FROM edges WHERE edges.to_blog_id = blogs.blog_id
+          )
+        """
+    )
+    # Step 3: anything still UNKNOWN that appears on either side of an edge.
+    op.execute(
+        """
+        UPDATE blogs
+        SET acceptance_status = 'ACCEPTED',
+            accepted_by = COALESCE(accepted_by, 'graph'),
+            accepted_at = COALESCE(accepted_at, created_at)
+        WHERE acceptance_status = 'UNKNOWN'
+          AND blog_id IN (SELECT from_blog_id FROM edges UNION SELECT to_blog_id FROM edges)
+        """
+    )
+
+
+def downgrade() -> None:
+    """Remove acceptance and crawl-error fields.
+
+    Only columns that are actually present are dropped, so the downgrade can
+    run against a database where the upgrade was applied partially.
+
+    Args:
+        None.
+
+    Returns:
+        None. The migration mutates the active database schema in place.
+    """
+    # Reflect the table once instead of once per column: the names below are
+    # distinct, so dropping one never changes whether another is present.
+    existing_columns = _columns("blogs")
+    for column_name in (
+        "successful_crawl_at",
+        "last_crawl_attempt_at",
+        "crawl_error_message",
+        "crawl_error_kind",
+        "accepted_at",
+        "accepted_by",
+        "acceptance_status",
+    ):
+        if column_name in existing_columns:
+            op.drop_column("blogs", column_name)
diff --git a/backend/main.py b/backend/main.py
@@ -3,11 +3,14 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+import ipaddress
+import socket
 from threading import Thread
 from time import sleep
 from typing import Any
 from typing import Callable
 from typing import NoReturn
+from urllib.parse import urlsplit
 
 import httpx
 from fastapi import Depends, FastAPI, HTTPException, Request
@@ -75,6 +78,145 @@ class CreateBlogLabelTagRequest(BaseModel):
 
 
 ACTIVE_CRAWLER_RUNNER_STATUSES = frozenset({"starting", "running", "stopping"})
+# Largest icon payload, in bytes, that the icon proxy will buffer and return.
+ICON_PROXY_MAX_BYTES = 1_000_000
+# URL schemes the icon proxy accepts for remote icon URLs.
+ICON_PROXY_ALLOWED_SCHEMES = frozenset({"http", "https"})
+# Path suffixes accepted as images when the remote content type is generic binary.
+ICON_PROXY_IMAGE_EXTENSIONS = (".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp", ".gif", ".avif")
+
+
+def _is_private_icon_proxy_host(hostname: str) -> bool:
+    """Return whether one hostname resolves to local or private network space.
+
+    The check fails closed: a hostname that cannot be resolved, cannot be
+    encoded for resolution, resolves to nothing, or resolves to an address
+    that cannot be parsed is reported as unsafe.
+
+    Args:
+        hostname: Parsed URL hostname to validate before proxying.
+
+    Returns:
+        True when the hostname itself or any resolved address is unsafe for the
+        public icon proxy.
+    """
+    try:
+        # IP literals are checked directly without touching DNS.
+        ip_addresses = [ipaddress.ip_address(hostname)]
+    except ValueError:
+        try:
+            resolved = socket.getaddrinfo(hostname, None, type=socket.SOCK_STREAM)
+        except (socket.gaierror, UnicodeError):
+            # gaierror: the name does not resolve. UnicodeError: the IDNA
+            # codec rejected the name (for example an over-long label).
+            return True
+        ip_addresses = []
+        for item in resolved:
+            try:
+                ip_addresses.append(ipaddress.ip_address(item[4][0]))
+            except ValueError:
+                return True
+
+    if not ip_addresses:
+        # Nothing to vet means nothing proven safe.
+        return True
+
+    # ``not is_global`` also rejects ranges that are neither private nor
+    # public, such as the 100.64.0.0/10 shared address space.
+    return any(
+        not address.is_global
+        or address.is_private
+        or address.is_loopback
+        or address.is_link_local
+        or address.is_multicast
+        or address.is_reserved
+        or address.is_unspecified
+        for address in ip_addresses
+    )
+
+
+def _validate_icon_proxy_url(url: str) -> str:
+    """Normalize and validate a remote icon URL before proxying it.
+
+    Args:
+        url: User-supplied absolute URL.
+
+    Returns:
+        The trimmed URL when it is an allowed public HTTP(S) URL.
+
+    Raises:
+        HTTPException: With status 422 if the URL is malformed, uses an
+            unsupported scheme, has no hostname, or points at unsafe address
+            space.
+    """
+    clean_url = url.strip()
+    try:
+        parsed = urlsplit(clean_url)
+        hostname = parsed.hostname
+    except ValueError as exc:
+        # urlsplit raises ValueError for a malformed authority, such as an
+        # unclosed IPv6 bracket; report it as a client error, not a crash.
+        raise HTTPException(status_code=422, detail="unsupported_icon_url") from exc
+    if parsed.scheme.lower() not in ICON_PROXY_ALLOWED_SCHEMES or not hostname:
+        raise HTTPException(status_code=422, detail="unsupported_icon_url")
+    if _is_private_icon_proxy_host(hostname):
+        raise HTTPException(status_code=422, detail="unsafe_icon_url")
+    return clean_url
+
+
+def _is_image_like_icon_response(response: httpx.Response) -> bool:
+    """Decide whether a remote HTTP response can be served as an icon image.
+
+    Args:
+        response: HTTP response received from the remote icon URL.
+
+    Returns:
+        True for any ``image/*`` content type. A generic binary content type
+        is accepted only when the response URL path ends in a known image
+        extension. Everything else is rejected.
+    """
+    raw_header = response.headers.get("content-type", "")
+    # Drop parameters such as "; charset=..." before comparing.
+    media_type = raw_header.partition(";")[0].strip().lower()
+    if media_type.startswith("image/"):
+        return True
+    if media_type not in {"application/octet-stream", "binary/octet-stream"}:
+        return False
+    remote_path = urlsplit(str(response.url)).path.lower()
+    return remote_path.endswith(ICON_PROXY_IMAGE_EXTENSIONS)
+
+
+def _fetch_icon_proxy_response(url: str) -> Response:
+    """Fetch one remote icon and return it as a same-origin image response.
+
+    Redirects are followed by hand, with at most four requests in total, so
+    that every hop is passed through ``_validate_icon_proxy_url`` before it is
+    requested. The body is streamed and abandoned as soon as it exceeds
+    ``ICON_PROXY_MAX_BYTES``.
+
+    Args:
+        url: Validated public HTTP(S) icon URL.
+
+    Returns:
+        FastAPI response containing the icon bytes, the remote content type,
+        and a one-day public cache header.
+
+    Raises:
+        HTTPException: If the remote URL cannot be fetched, is malformed,
+            times out, is too large, redirects too many times, or does not
+            return an image-like response.
+    """
+    # NOTE(review): each host is vetted by resolving it up front, but httpx
+    # resolves it again when connecting, so a DNS answer that changes in
+    # between is not covered here. Connecting to the vetted address would
+    # close that gap.
+    try:
+        current_url = url
+        for _ in range(4):
+            with httpx.stream(
+                "GET",
+                current_url,
+                follow_redirects=False,
+                timeout=8.0,
+                headers={"User-Agent": "HeyBlogBot/0.1 (+https://example.invalid/heyblog)"},
+            ) as response:
+                redirect_target = response.headers.get("location")
+                if response.status_code in {301, 302, 303, 307, 308} and redirect_target:
+                    # Resolve relative locations, then re-validate the hop.
+                    next_url = httpx.URL(str(response.url)).join(redirect_target)
+                    current_url = _validate_icon_proxy_url(str(next_url))
+                    continue
+                response.raise_for_status()
+                if not _is_image_like_icon_response(response):
+                    raise HTTPException(status_code=502, detail="icon_proxy_not_image")
+
+                # Cheap early rejection when the server declares its size.
+                declared_length = response.headers.get("content-length")
+                if declared_length is not None:
+                    try:
+                        declared_size = int(declared_length)
+                    except ValueError:
+                        # Unparseable header: rely on the streamed-size guard.
+                        declared_size = None
+                    if declared_size is not None and declared_size > ICON_PROXY_MAX_BYTES:
+                        raise HTTPException(status_code=502, detail="icon_proxy_too_large")
+
+                # The declared size is not trusted; count what actually arrives.
+                chunks: list[bytes] = []
+                size = 0
+                for chunk in response.iter_bytes():
+                    size += len(chunk)
+                    if size > ICON_PROXY_MAX_BYTES:
+                        raise HTTPException(status_code=502, detail="icon_proxy_too_large")
+                    chunks.append(chunk)
+                content_type = response.headers.get("content-type", "image/x-icon")
+                return Response(
+                    content=b"".join(chunks),
+                    media_type=content_type,
+                    headers={"cache-control": "public, max-age=86400"},
+                )
+        raise HTTPException(status_code=502, detail="icon_proxy_too_many_redirects")
+    except HTTPException:
+        # Already in the shape the API returns; pass through untouched.
+        raise
+    except httpx.TimeoutException as exc:
+        raise HTTPException(status_code=504, detail="icon_proxy_timeout") from exc
+    except httpx.HTTPStatusError as exc:
+        raise HTTPException(status_code=502, detail=f"icon_proxy_http_{exc.response.status_code}") from exc
+    except httpx.RequestError as exc:
+        raise HTTPException(status_code=502, detail="icon_proxy_fetch_failed") from exc
+    except httpx.InvalidURL as exc:
+        # InvalidURL is not a RequestError, so a malformed redirect target
+        # would otherwise escape as an unhandled server error.
+        raise HTTPException(status_code=502, detail="icon_proxy_fetch_failed") from exc
 
 
 def _crawler_runtime_is_active(runtime: dict[str, Any]) -> bool:
@@ -436,6 +578,7 @@ def get_blogs_catalog(
         has_title: str | None = None,
         has_icon: str | None = None,
         min_connections: str | None = None,
+        acceptance_status: str | None = "ACCEPTED",
     ) -> dict[str, Any]:
         return _call_upstream_with_http_error_translation(
             lambda: get_state().persistence.list_blogs_catalog(
@@ -450,6 +593,7 @@ def get_blogs_catalog(
                 has_title=has_title,
                 has_icon=has_icon,
                 min_connections=min_connections,
+                acceptance_status=acceptance_status,
             )
         )
 
@@ -459,6 +603,18 @@ def lookup_blog_candidates(url: str) -> dict[str, Any]:
             lambda: get_state().persistence.lookup_blog_candidates(url=url)
         )
 
+    @app.get("/api/icons/proxy")
+    def proxy_icon(url: str) -> Response:
+        """Serve one remote icon from the backend origin for graph textures.
+
+        Args:
+            url: Absolute HTTP(S) icon URL to fetch.
+
+        Returns:
+            Image response with cache headers when the remote resource is valid.
+        """
+        # Validate first so only a vetted URL is ever fetched.
+        safe_url = _validate_icon_proxy_url(url)
+        return _fetch_icon_proxy_response(safe_url)
+
     @app.post("/api/auth/register")
     def register_user(payload: UserAuthRequest) -> dict[str, Any]:
         return _call_upstream_with_http_error_translation(

diff --git a/crawler/crawling/bootstrap.py b/crawler/crawling/bootstrap.py
@@ -57,6 +57,7 @@ def bootstrap_seeds(self, seed_path: Path) -> dict[str, Any]:
                     url=raw_url,
                     normalized_url=normalized.normalized_url,
                     domain=normalized.domain,
+                    accepted_by="seed",
                 )
                 created += int(inserted)
         self.logger.bootstrap_success(seed_path)

diff --git a/crawler/crawling/fetching/base.py b/crawler/crawling/fetching/base.py
@@ -88,3 +88,16 @@ def fetch_many(
             ``FetchAttempt`` result.
         """
         ...
+
+    def validate_icon_url(self, url: str, *, timeout_seconds: float | None = None) -> str | None:
+        """Return a reachable final icon URL, or ``None`` when unusable.
+
+        This is an interface declaration only (the body is ``...``); concrete
+        fetchers are expected to supply the actual check.
+
+        Args:
+            url: Absolute HTTP(S) icon candidate URL to verify.
+            timeout_seconds: Optional per-request timeout override in seconds.
+                Keyword-only. NOTE(review): the timeout applied when this is
+                ``None`` is not defined here; confirm against implementations.
+
+        Returns:
+            Final URL after redirects when the candidate is reachable and looks
+            like an image resource; otherwise ``None``.
+        """
+        ...