Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
d9040c3
🎈 perf: 可视化图谱优化
ligeaaa Jun 2, 2026
e3e1683
🦄 refactor: 重构数据库,将爬虫状态和博客状态拆分,使其语义不混淆
ligeaaa Jun 2, 2026
db61b2a
🎈 perf: 优化icon_url的提取逻辑
ligeaaa Jun 2, 2026
bdffd60
🐞 fix: 修复了图谱中不加载icon的问题
ligeaaa Jun 2, 2026
40704e8
📃 docs:
ligeaaa Jun 3, 2026
bd022ba
🐞 fix: 修改了有些博客之间的边没有正常持久化的问题
ligeaaa Jun 4, 2026
ed9d044
✨ feat: 添加了可视化图谱测试
ligeaaa Jun 5, 2026
63406a2
🐞 fix: ruff
ligeaaa Jun 5, 2026
d7f18c1
📃 docs: readme
ligeaaa Jun 5, 2026
4880a8e
✨ feat: 清理首页
ligeaaa Jun 5, 2026
4856a02
🎈 perf: 优化渲染ticks选择
ligeaaa Jun 6, 2026
8482837
✨ feat: 可视化新增“精简”模式
ligeaaa Jun 6, 2026
69a193a
🦄 refactor: 首页重构
ligeaaa Jun 6, 2026
8b0c34f
✨ feat: 博客详情页初版
ligeaaa Jun 6, 2026
5927259
✨ feat: 完善了博客搜索,现在没搜索到的可以选择直接加入网络了
ligeaaa Jun 6, 2026
911f8d0
✨ feat: 博客详情页新增“发现路径”“博客关联图”
ligeaaa Jun 7, 2026
da18deb
🐞 fix: ruff
ligeaaa Jun 7, 2026
69e0f24
🐞 fix: 对应的修改测试案例(修改测试案例好怪
ligeaaa Jun 7, 2026
00ae89b
✨ feat: 随机博客界面新增博客详情入口
ligeaaa Jun 7, 2026
f461766
✨ feat: 添加用户打开博客详情和外部链接的数据统计
ligeaaa Jun 7, 2026
b9b51b7
🎈 perf: 现在随机博客界面打开博客详情会另开一个标签页了
ligeaaa Jun 7, 2026
2aaa8da
🎈 perf: 优化随机博客权重为不考虑blog label
ligeaaa Jun 7, 2026
242d5b7
🐳 chore: 修改seed.csv
ligeaaa Jun 7, 2026
adfcc44
🦄 refactor: 清理掉早期的“申请添加博客”“重试过滤链”相关代码和数据库
ligeaaa Jun 7, 2026
e784808
🦄 refactor: 清理掉blog_interactions和recommendation_impressions表中的blog_id字段
ligeaaa Jun 7, 2026
d3ed85d
🐳 chore: 修改seed
ligeaaa Jun 7, 2026
3f33a2e
✨ feat: 博客详情页添加当前博客爬虫状态
ligeaaa Jun 7, 2026
61562fc
🐞 fix: ruff
ligeaaa Jun 7, 2026
46f87ad
🎈 perf: 优化可视化图谱
ligeaaa Jun 9, 2026
401420d
Delete tracker directory
ligeaaa Jun 9, 2026
e58aa56
✨ feat: 完善用户系统
ligeaaa Jun 11, 2026
9ae18e5
Merge branch 'version/v0.3' of github.com:ligeaaa/HeyBlog into versio…
ligeaaa Jun 11, 2026
effbe83
✨ feat: 添加统计信息
ligeaaa Jun 11, 2026
62867de
🎈 perf: 默认关闭token暴露
ligeaaa Jun 11, 2026
c494823
🐞 fix: pytest
ligeaaa Jun 11, 2026
13aa4e3
✨ feat: 添加每小时自动开启爬虫
ligeaaa Jun 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions alembic/versions/20260602_01_add_blog_acceptance_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Split blog acceptance from crawl execution status.

Revision ID: 20260602_01
Revises: 20260601_01
Create Date: 2026-06-02 21:30:29 BST
"""

from __future__ import annotations

from alembic import op
import sqlalchemy as sa


# Alembic revision identifiers: this migration applies on top of 20260601_01.
revision = "20260602_01"
down_revision = "20260601_01"
# No branch labels or cross-revision dependencies are declared here.
branch_labels = None
depends_on = None


def _columns(table_name: str) -> set[str]:
    """List the column names that exist right now on one table.

    Args:
        table_name: Name of the table to reflect through the migration's bind.

    Returns:
        The names of every column the inspector reports for that table.
    """
    inspector = sa.inspect(op.get_bind())
    names: set[str] = set()
    for column_info in inspector.get_columns(table_name):
        names.add(column_info["name"])
    return names


def upgrade() -> None:
    """Add acceptance and crawl-error fields, then backfill accepted graph rows.

    Columns are only added when they are missing from ``blogs``, so a table
    that already carries some of them is left alone for those columns. The
    three backfill statements are order-dependent: each one only touches rows
    that are still ``UNKNOWN`` by the time it runs.

    Args:
        None.

    Returns:
        None. The migration mutates the active database schema in place.
    """
    # Snapshot the existing columns once; every add below checks against it.
    blog_columns = _columns("blogs")
    if "acceptance_status" not in blog_columns:
        # NOT NULL with a server default of 'UNKNOWN'; the backfills below
        # select on that value.
        op.add_column(
            "blogs",
            sa.Column("acceptance_status", sa.Text(), nullable=False, server_default="UNKNOWN"),
        )
    # Nullable free-text columns.
    for column_name in (
        "accepted_by",
        "crawl_error_kind",
        "crawl_error_message",
    ):
        if column_name not in blog_columns:
            op.add_column("blogs", sa.Column(column_name, sa.Text(), nullable=True))
    # Nullable timezone-aware timestamps.
    for column_name in (
        "accepted_at",
        "last_crawl_attempt_at",
        "successful_crawl_at",
    ):
        if column_name not in blog_columns:
            op.add_column("blogs", sa.Column(column_name, sa.DateTime(timezone=True), nullable=True))

    # Backfill 1: blogs whose normalized_url matches a raw_discovered_urls row
    # with status 'success' become ACCEPTED. accepted_by / accepted_at keep any
    # value already on the blog, then fall back to the raw row, then to
    # 'unknown' / the blog's created_at.
    # NOTE(review): "UPDATE <table> <alias> ... FROM" looks like PostgreSQL
    # syntax -- confirm this migration only targets that dialect.
    op.execute(
        """
UPDATE blogs b
SET acceptance_status = 'ACCEPTED',
accepted_by = COALESCE(b.accepted_by, r.accepted_by, 'unknown'),
accepted_at = COALESCE(b.accepted_at, r.updated_at, b.created_at)
FROM raw_discovered_urls r
WHERE b.normalized_url = r.normalized_url
AND r.status = 'success'
AND b.acceptance_status = 'UNKNOWN'
"""
    )
    # Backfill 2: still-UNKNOWN blogs that never appear as an edge target are
    # attributed to 'seed'.
    # NOTE(review): NOT IN matches nothing if edges.to_blog_id can be NULL --
    # confirm that column is NOT NULL.
    op.execute(
        """
UPDATE blogs
SET acceptance_status = 'ACCEPTED',
accepted_by = COALESCE(accepted_by, 'seed'),
accepted_at = COALESCE(accepted_at, created_at)
WHERE acceptance_status = 'UNKNOWN'
AND blog_id NOT IN (SELECT to_blog_id FROM edges)
"""
    )
    # Backfill 3: still-UNKNOWN blogs that appear on either end of an edge are
    # attributed to 'graph'.
    op.execute(
        """
UPDATE blogs
SET acceptance_status = 'ACCEPTED',
accepted_by = COALESCE(accepted_by, 'graph'),
accepted_at = COALESCE(accepted_at, created_at)
WHERE acceptance_status = 'UNKNOWN'
AND blog_id IN (SELECT from_blog_id FROM edges UNION SELECT to_blog_id FROM edges)
"""
    )


def downgrade() -> None:
    """Remove acceptance and crawl-error fields.

    Each column is dropped only when it is present, so a partially migrated
    ``blogs`` table can still be downgraded. ``acceptance_status`` is dropped
    last.

    Args:
        None.

    Returns:
        None. The migration mutates the active database schema in place.
    """
    # Reflect the table once, as upgrade() does. The names below are distinct,
    # so dropping one column cannot change whether a later one exists.
    existing_columns = _columns("blogs")
    for column_name in (
        "successful_crawl_at",
        "last_crawl_attempt_at",
        "crawl_error_message",
        "crawl_error_kind",
        "accepted_at",
        "accepted_by",
        "acceptance_status",
    ):
        if column_name in existing_columns:
            op.drop_column("blogs", column_name)
156 changes: 156 additions & 0 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
from __future__ import annotations

from dataclasses import dataclass
import ipaddress
import socket
from threading import Thread
from time import sleep
from typing import Any
from typing import Callable
from typing import NoReturn
from urllib.parse import urlsplit

import httpx
from fastapi import Depends, FastAPI, HTTPException, Request
Expand Down Expand Up @@ -75,6 +78,145 @@ class CreateBlogLabelTagRequest(BaseModel):


# Runner status values that count as an active crawler.
ACTIVE_CRAWLER_RUNNER_STATUSES = frozenset(("starting", "running", "stopping"))
# Largest icon payload, in bytes, that the proxy will relay.
ICON_PROXY_MAX_BYTES = 1000 * 1000
# URL schemes the icon proxy is willing to fetch.
ICON_PROXY_ALLOWED_SCHEMES = frozenset(("http", "https"))
# Path suffixes accepted as images when the server only reports a generic
# binary content type.
ICON_PROXY_IMAGE_EXTENSIONS = (
    ".ico",
    ".png",
    ".jpg",
    ".jpeg",
    ".svg",
    ".webp",
    ".gif",
    ".avif",
)


def _is_private_icon_proxy_host(hostname: str) -> bool:
    """Return whether one hostname resolves to local or private network space.

    An IP literal is checked directly; any other hostname is resolved and
    every returned address is checked. The check fails closed: a hostname that
    cannot be resolved, resolves to nothing, or yields an unparseable address
    is reported as unsafe.

    Args:
        hostname: Parsed URL hostname to validate before proxying.

    Returns:
        True when the hostname itself or any resolved address is unsafe for the
        public icon proxy.
    """
    try:
        ip_addresses = [ipaddress.ip_address(hostname)]
    except ValueError:
        # Not an IP literal: resolve it and vet every address it maps to.
        try:
            resolved = socket.getaddrinfo(hostname, None, type=socket.SOCK_STREAM)
        except (socket.gaierror, UnicodeError):
            # UnicodeError: the hostname failed IDNA encoding (for example a
            # label longer than 63 characters) before any lookup happened.
            return True
        ip_addresses = []
        for item in resolved:
            address = item[4][0]
            try:
                ip_addresses.append(ipaddress.ip_address(address))
            except ValueError:
                return True

    if not ip_addresses:
        # Nothing to vet means nothing was proven safe.
        return True

    return any(
        # ``is_private`` is False for the 100.64.0.0/10 shared address space,
        # so also require the address to be globally routable.
        not address.is_global
        or address.is_private
        or address.is_loopback
        or address.is_link_local
        or address.is_multicast
        or address.is_reserved
        or address.is_unspecified
        for address in ip_addresses
    )


def _validate_icon_proxy_url(url: str) -> str:
    """Normalize and validate a remote icon URL before proxying it.

    Args:
        url: User-supplied absolute URL.

    Returns:
        The trimmed URL when it is an allowed public HTTP(S) URL.

    Raises:
        HTTPException: 422 ``unsupported_icon_url`` when the URL cannot be
            parsed, uses a scheme outside ``ICON_PROXY_ALLOWED_SCHEMES``, or
            has no hostname; 422 ``unsafe_icon_url`` when the hostname points
            at unsafe address space.
    """
    clean_url = url.strip()
    try:
        parsed = urlsplit(clean_url)
        # Reading ``port`` forces validation of a non-numeric or out-of-range
        # port, which would otherwise only fail later inside the HTTP client.
        parsed.port
    except ValueError as exc:
        # urlsplit rejects malformed authorities such as an unbalanced "[".
        raise HTTPException(status_code=422, detail="unsupported_icon_url") from exc
    if parsed.scheme.lower() not in ICON_PROXY_ALLOWED_SCHEMES or not parsed.hostname:
        raise HTTPException(status_code=422, detail="unsupported_icon_url")
    if _is_private_icon_proxy_host(parsed.hostname):
        raise HTTPException(status_code=422, detail="unsafe_icon_url")
    return clean_url


def _is_image_like_icon_response(response: httpx.Response) -> bool:
    """Decide whether a fetched response plausibly carries an icon image.

    Args:
        response: HTTP response from the remote icon URL.

    Returns:
        True for any ``image/*`` content type, or for a generic binary content
        type whose final URL path ends in a known image extension; False for
        everything else.
    """
    raw_header = response.headers.get("content-type", "")
    # Keep only the media type; parameters such as charset are irrelevant.
    media_type = raw_header.partition(";")[0].strip().lower()
    if media_type.startswith("image/"):
        return True
    if media_type not in {"application/octet-stream", "binary/octet-stream"}:
        return False
    final_path = urlsplit(str(response.url)).path
    return final_path.lower().endswith(ICON_PROXY_IMAGE_EXTENSIONS)


def _fetch_icon_proxy_response(url: str) -> Response:
    """Fetch one remote icon and return it as a same-origin image response.

    Redirects are followed by hand, with at most four requests in total, so
    that every redirect target goes through ``_validate_icon_proxy_url``
    before it is requested. The body is streamed and rejected once it exceeds
    ``ICON_PROXY_MAX_BYTES``.

    Args:
        url: Validated public HTTP(S) icon URL.

    Returns:
        FastAPI response containing the icon bytes.

    Raises:
        HTTPException: If the remote URL cannot be fetched, is too large, or
            does not return an image-like response.
    """
    try:
        current_url = url
        for _ in range(4):
            # NOTE(review): flagged as P1 in code review -- the HTTP client
            # resolves the hostname again here, after
            # _validate_icon_proxy_url already did its own lookup. With DNS
            # rebinding an attacker-controlled host could answer with a public
            # address during validation and a loopback/link-local address for
            # this connection, so SSRF against internal services is still
            # possible. The fetch should connect to the validated address (or
            # otherwise prevent a second, unsafe resolution) on every hop.
            with httpx.stream(
                "GET",
                current_url,
                follow_redirects=False,
                timeout=8.0,
                headers={"User-Agent": "HeyBlogBot/0.1 (+https://example.invalid/heyblog)"},
            ) as response:
                if response.status_code in {301, 302, 303, 307, 308} and response.headers.get("location"):
                    # Resolve a possibly relative Location against the current
                    # URL, then re-validate before the next request.
                    current_url = _validate_icon_proxy_url(str(httpx.URL(str(response.url)).join(response.headers["location"])))
                    continue
                response.raise_for_status()
                if not _is_image_like_icon_response(response):
                    raise HTTPException(status_code=502, detail="icon_proxy_not_image")
                # Cheap early rejection based on the declared size.
                content_length = response.headers.get("content-length")
                if content_length is not None:
                    try:
                        if int(content_length) > ICON_PROXY_MAX_BYTES:
                            raise HTTPException(status_code=502, detail="icon_proxy_too_large")
                    except ValueError:
                        # Unparseable Content-Length is ignored; the streamed
                        # size cap below still applies.
                        pass
                # Enforce the cap on the bytes actually received.
                chunks: list[bytes] = []
                size = 0
                for chunk in response.iter_bytes():
                    size += len(chunk)
                    if size > ICON_PROXY_MAX_BYTES:
                        raise HTTPException(status_code=502, detail="icon_proxy_too_large")
                    chunks.append(chunk)
                # Relay the upstream content type, defaulting to an .ico type.
                content_type = response.headers.get("content-type", "image/x-icon")
                return Response(
                    content=b"".join(chunks),
                    media_type=content_type,
                    headers={"cache-control": "public, max-age=86400"},
                )
        # Every attempt ended in another redirect.
        raise HTTPException(status_code=502, detail="icon_proxy_too_many_redirects")
    except HTTPException:
        # Errors raised above already carry the intended status and detail.
        raise
    except httpx.TimeoutException as exc:
        raise HTTPException(status_code=504, detail="icon_proxy_timeout") from exc
    except httpx.HTTPStatusError as exc:
        raise HTTPException(status_code=502, detail=f"icon_proxy_http_{exc.response.status_code}") from exc
    except httpx.RequestError as exc:
        raise HTTPException(status_code=502, detail="icon_proxy_fetch_failed") from exc


def _crawler_runtime_is_active(runtime: dict[str, Any]) -> bool:
Expand Down Expand Up @@ -436,6 +578,7 @@ def get_blogs_catalog(
has_title: str | None = None,
has_icon: str | None = None,
min_connections: str | None = None,
acceptance_status: str | None = "ACCEPTED",
) -> dict[str, Any]:
return _call_upstream_with_http_error_translation(
lambda: get_state().persistence.list_blogs_catalog(
Expand All @@ -450,6 +593,7 @@ def get_blogs_catalog(
has_title=has_title,
has_icon=has_icon,
min_connections=min_connections,
acceptance_status=acceptance_status,
)
)

Expand All @@ -459,6 +603,18 @@ def lookup_blog_candidates(url: str) -> dict[str, Any]:
lambda: get_state().persistence.lookup_blog_candidates(url=url)
)

@app.get("/api/icons/proxy")
def proxy_icon(url: str) -> Response:
    """Serve one remote icon from the backend origin for graph textures.

    Args:
        url: Absolute HTTP(S) icon URL to fetch.

    Returns:
        Image response with cache headers when the remote resource is valid.
    """
    # Validate first; only the vetted, trimmed URL is handed to the fetcher.
    safe_url = _validate_icon_proxy_url(url)
    return _fetch_icon_proxy_response(safe_url)

@app.post("/api/auth/register")
def register_user(payload: UserAuthRequest) -> dict[str, Any]:
return _call_upstream_with_http_error_translation(
Expand Down
1 change: 1 addition & 0 deletions crawler/crawling/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def bootstrap_seeds(self, seed_path: Path) -> dict[str, Any]:
url=raw_url,
normalized_url=normalized.normalized_url,
domain=normalized.domain,
accepted_by="seed",
)
created += int(inserted)
self.logger.bootstrap_success(seed_path)
Expand Down
13 changes: 13 additions & 0 deletions crawler/crawling/fetching/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,16 @@ def fetch_many(
``FetchAttempt`` result.
"""
...

def validate_icon_url(self, url: str, *, timeout_seconds: float | None = None) -> str | None:
    """Return a reachable final icon URL, or ``None`` when unusable.

    This is an interface stub: the body is ``...`` and carries no behavior of
    its own.
    NOTE(review): the enclosing class header is outside this view --
    presumably a fetcher protocol or base class; confirm that every concrete
    fetcher implements this method.

    Args:
        url: Absolute HTTP(S) icon candidate URL to verify.
        timeout_seconds: Optional per-request timeout override in seconds.
            Keyword-only.

    Returns:
        Final URL after redirects when the candidate is reachable and looks
        like an image resource; otherwise ``None``.
    """
    ...
Loading
Loading