diff --git a/.env.example b/.env.example index a8eb718..56c6868 100644 --- a/.env.example +++ b/.env.example @@ -20,6 +20,21 @@ HEYBLOG_DOCKER_EXPORT_DIR=/data/exports HEYBLOG_DOCKER_SEARCH_CACHE_DIR=/data/search-cache HEYBLOG_DOCKER_LOG_DIR=/data/logs +# Public frontend URL used in verification and password reset links +HEYBLOG_PUBLIC_BASE_URL=http://127.0.0.1:3000 + +# Email delivery. Keep disabled for local dev unless SMTP credentials are set. +HEYBLOG_EMAIL_PROVIDER=disabled +HEYBLOG_EMAIL_FROM=no-reply@example.com +HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS=false +HEYBLOG_SMTP_HOST=smtp.example.com +HEYBLOG_SMTP_PORT=587 +HEYBLOG_SMTP_USERNAME= +HEYBLOG_SMTP_PASSWORD= +HEYBLOG_SMTP_USE_TLS=true +HEYBLOG_SMTP_USE_SSL=false +HEYBLOG_SMTP_TIMEOUT_SECONDS=10.0 + # Logging HEYBLOG_LOG_DIR=/file/path/logs HEYBLOG_LOG_LEVEL=INFO @@ -44,6 +59,8 @@ HEYBLOG_MAX_PATH_PROBES_PER_BLOG=50 HEYBLOG_MAX_CANDIDATE_LINKS_PER_PAGE=50 HEYBLOG_CANDIDATE_PAGE_FETCH_CONCURRENCY=4 HEYBLOG_RUNTIME_WORKER_COUNT=3 +# Idle runtime auto-start check interval. Default is one hour. +HEYBLOG_RUNTIME_AUTO_START_INTERVAL_SECONDS=3600 # Set HEYBLOG_RAW_DISCOVERED_URL_LIMIT=-1 to disable the crawler raw URL limit. 
HEYBLOG_RAW_DISCOVERED_URL_LIMIT=1000000 diff --git a/alembic/versions/20260423_02_add_blog_business_key_schema.py b/alembic/versions/20260423_02_add_blog_business_key_schema.py index 6fa077b..c002400 100644 --- a/alembic/versions/20260423_02_add_blog_business_key_schema.py +++ b/alembic/versions/20260423_02_add_blog_business_key_schema.py @@ -20,10 +20,7 @@ BLOG_FK_REWRITES = ( ("edges", "edges_from_blog_id_fkey", "from_blog_id", "CASCADE"), ("edges", "edges_to_blog_id_fkey", "to_blog_id", "CASCADE"), - ("ingestion_requests", "ingestion_requests_seed_blog_id_fkey", "seed_blog_id", "SET NULL"), - ("ingestion_requests", "ingestion_requests_matched_blog_id_fkey", "matched_blog_id", "SET NULL"), ("blog_label_assignments", "blog_label_assignments_blog_id_fkey", "blog_id", "CASCADE"), - ("blog_dedup_scan_run_items", "blog_dedup_scan_run_items_survivor_blog_id_fkey", "survivor_blog_id", "SET NULL"), ) diff --git a/alembic/versions/20260602_01_add_blog_acceptance_status.py b/alembic/versions/20260602_01_add_blog_acceptance_status.py new file mode 100644 index 0000000..f1090d5 --- /dev/null +++ b/alembic/versions/20260602_01_add_blog_acceptance_status.py @@ -0,0 +1,115 @@ +"""Split blog acceptance from crawl execution status. + +Revision ID: 20260602_01 +Revises: 20260601_01 +Create Date: 2026-06-02 21:30:29 BST +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260602_01" +down_revision = "20260601_01" +branch_labels = None +depends_on = None + + +def _columns(table_name: str) -> set[str]: + """Return currently present column names for one table. + + Args: + table_name: Database table name to inspect. + + Returns: + Set of column names currently present in the database. + """ + return {column["name"] for column in sa.inspect(op.get_bind()).get_columns(table_name)} + + +def upgrade() -> None: + """Add acceptance and crawl-error fields, then backfill accepted graph rows. + + Args: + None. + + Returns: + None. 
The migration mutates the active database schema in place. + """ + blog_columns = _columns("blogs") + if "acceptance_status" not in blog_columns: + op.add_column( + "blogs", + sa.Column("acceptance_status", sa.Text(), nullable=False, server_default="UNKNOWN"), + ) + for column_name in ( + "accepted_by", + "crawl_error_kind", + "crawl_error_message", + ): + if column_name not in blog_columns: + op.add_column("blogs", sa.Column(column_name, sa.Text(), nullable=True)) + for column_name in ( + "accepted_at", + "last_crawl_attempt_at", + "successful_crawl_at", + ): + if column_name not in blog_columns: + op.add_column("blogs", sa.Column(column_name, sa.DateTime(timezone=True), nullable=True)) + + op.execute( + """ + UPDATE blogs b + SET acceptance_status = 'ACCEPTED', + accepted_by = COALESCE(b.accepted_by, r.accepted_by, 'unknown'), + accepted_at = COALESCE(b.accepted_at, r.updated_at, b.created_at) + FROM raw_discovered_urls r + WHERE b.normalized_url = r.normalized_url + AND r.status = 'success' + AND b.acceptance_status = 'UNKNOWN' + """ + ) + op.execute( + """ + UPDATE blogs + SET acceptance_status = 'ACCEPTED', + accepted_by = COALESCE(accepted_by, 'seed'), + accepted_at = COALESCE(accepted_at, created_at) + WHERE acceptance_status = 'UNKNOWN' + AND blog_id NOT IN (SELECT to_blog_id FROM edges) + """ + ) + op.execute( + """ + UPDATE blogs + SET acceptance_status = 'ACCEPTED', + accepted_by = COALESCE(accepted_by, 'graph'), + accepted_at = COALESCE(accepted_at, created_at) + WHERE acceptance_status = 'UNKNOWN' + AND blog_id IN (SELECT from_blog_id FROM edges UNION SELECT to_blog_id FROM edges) + """ + ) + + +def downgrade() -> None: + """Remove acceptance and crawl-error fields. + + Args: + None. + + Returns: + None. The migration mutates the active database schema in place. 
+ """ + for column_name in ( + "successful_crawl_at", + "last_crawl_attempt_at", + "crawl_error_message", + "crawl_error_kind", + "accepted_at", + "accepted_by", + "acceptance_status", + ): + if column_name in _columns("blogs"): + op.drop_column("blogs", column_name) diff --git a/alembic/versions/20260606_01_add_seed_table.py b/alembic/versions/20260606_01_add_seed_table.py new file mode 100644 index 0000000..2b53969 --- /dev/null +++ b/alembic/versions/20260606_01_add_seed_table.py @@ -0,0 +1,74 @@ +"""Persist imported seed CSV rows in a dedicated table. + +Revision ID: 20260606_01 +Revises: 20260602_01 +Create Date: 2026-06-06 16:27:56 BST +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260606_01" +down_revision = "20260602_01" +branch_labels = None +depends_on = None + + +def _tables() -> set[str]: + """Return currently present database table names. + + Args: + None. + + Returns: + Set of table names currently present in the migration target. + """ + + return set(sa.inspect(op.get_bind()).get_table_names()) + + +def upgrade() -> None: + """Create the durable seed import table. + + Args: + None. + + Returns: + None. The migration mutates the active database schema in place. 
+ """ + + if "seeds" in _tables(): + return + op.create_table( + "seeds", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("url", sa.Text(), nullable=False), + sa.Column("normalized_url", sa.Text(), nullable=False), + sa.Column("domain", sa.Text(), nullable=False), + sa.Column("source_path", sa.Text(), nullable=True), + sa.Column("source_row", sa.Integer(), nullable=True), + sa.Column("blog_id", sa.Integer(), sa.ForeignKey("blogs.blog_id", ondelete="SET NULL"), nullable=True), + sa.Column("imported_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.UniqueConstraint("normalized_url", name="uq_seeds_normalized_url"), + ) + op.create_index("ix_seeds_normalized_url", "seeds", ["normalized_url"]) + + +def downgrade() -> None: + """Drop the durable seed import table. + + Args: + None. + + Returns: + None. The migration mutates the active database schema in place. + """ + + if "seeds" not in _tables(): + return + op.drop_index("ix_seeds_normalized_url", table_name="seeds") + op.drop_table("seeds") diff --git a/alembic/versions/20260607_01_add_recommendation_event_tables.py b/alembic/versions/20260607_01_add_recommendation_event_tables.py new file mode 100644 index 0000000..48f766a --- /dev/null +++ b/alembic/versions/20260607_01_add_recommendation_event_tables.py @@ -0,0 +1,174 @@ +"""Add local recommendation request, impression, and interaction tables. + +Revision ID: 20260607_01 +Revises: 20260606_01 +Create Date: 2026-06-07 14:21:29 BST +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260607_01" +down_revision = "20260606_01" +branch_labels = None +depends_on = None + + +def _tables() -> set[str]: + """Return currently present database table names. + + Args: + None. + + Returns: + Set of table names currently present in the migration target. 
+ """ + + return set(sa.inspect(op.get_bind()).get_table_names()) + + +def upgrade() -> None: + """Create the recommendation event substrate tables. + + Args: + None. + + Returns: + None. The migration mutates the active database schema in place. + """ + + tables = _tables() + if "recommendation_requests" not in tables: + op.create_table( + "recommendation_requests", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("request_uuid", sa.Text(), nullable=False), + sa.Column("surface", sa.Text(), nullable=False), + sa.Column("strategy", sa.Text(), nullable=False), + sa.Column("strategy_version", sa.Text(), nullable=False, server_default="v1"), + sa.Column("visitor_id", sa.Text(), nullable=False), + sa.Column("user_id", sa.Integer(), sa.ForeignKey("users.id", ondelete="SET NULL"), nullable=True), + sa.Column("session_id", sa.Text(), nullable=False), + sa.Column("source", sa.Text(), nullable=True), + sa.Column("page_url", sa.Text(), nullable=True), + sa.Column("requested_count", sa.Integer(), nullable=False), + sa.Column("served_count", sa.Integer(), nullable=False), + sa.Column("context_json", sa.JSON(), nullable=False, server_default=sa.text("'{}'")), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.UniqueConstraint("request_uuid", name="uq_recommendation_requests_request_uuid"), + ) + op.create_index("ix_recommendation_requests_request_uuid", "recommendation_requests", ["request_uuid"]) + op.create_index("ix_recommendation_requests_surface", "recommendation_requests", ["surface"]) + op.create_index("ix_recommendation_requests_user_id", "recommendation_requests", ["user_id"]) + op.create_index("ix_recommendation_requests_visitor_id", "recommendation_requests", ["visitor_id"]) + op.create_index("ix_recommendation_requests_session_id", "recommendation_requests", ["session_id"]) + op.create_index( + "ix_recommendation_requests_surface_created", + "recommendation_requests", + ["surface", "created_at"], + 
) + op.create_index( + "ix_recommendation_requests_strategy_created", + "recommendation_requests", + ["strategy", "strategy_version", "created_at"], + ) + + tables = _tables() + if "recommendation_impressions" not in tables: + op.create_table( + "recommendation_impressions", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column( + "request_id", + sa.Integer(), + sa.ForeignKey("recommendation_requests.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("normalized_url", sa.Text(), nullable=False), + sa.Column("position", sa.Integer(), nullable=False), + sa.Column("score", sa.Integer(), nullable=True), + sa.Column("reason_json", sa.JSON(), nullable=False, server_default=sa.text("'{}'")), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.UniqueConstraint("request_id", "position", name="uq_recommendation_impression_request_position"), + sa.UniqueConstraint("request_id", "normalized_url", name="uq_recommendation_impression_request_url"), + ) + op.create_index("ix_recommendation_impressions_request_id", "recommendation_impressions", ["request_id"]) + op.create_index("ix_recommendation_impressions_normalized_url", "recommendation_impressions", ["normalized_url"]) + op.create_index( + "ix_recommendation_impressions_url_created", + "recommendation_impressions", + ["normalized_url", "created_at"], + ) + + tables = _tables() + if "blog_interactions" not in tables: + op.create_table( + "blog_interactions", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("event_uuid", sa.Text(), nullable=False), + sa.Column( + "request_id", + sa.Integer(), + sa.ForeignKey("recommendation_requests.id", ondelete="SET NULL"), + nullable=True, + ), + sa.Column( + "impression_id", + sa.Integer(), + sa.ForeignKey("recommendation_impressions.id", ondelete="SET NULL"), + nullable=True, + ), + sa.Column("normalized_url", sa.Text(), nullable=False), + sa.Column("event_type", sa.Text(), nullable=False), + 
sa.Column("position", sa.Integer(), nullable=True), + sa.Column("entrance_kind", sa.Text(), nullable=False), + sa.Column("entrance_url", sa.Text(), nullable=False), + sa.Column("interaction_order", sa.Integer(), nullable=False, server_default="1"), + sa.Column("visitor_id", sa.Text(), nullable=False), + sa.Column("user_id", sa.Integer(), sa.ForeignKey("users.id", ondelete="SET NULL"), nullable=True), + sa.Column("session_id", sa.Text(), nullable=False), + sa.Column("client_event_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("attributes_json", sa.JSON(), nullable=False, server_default=sa.text("'{}'")), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.UniqueConstraint("event_uuid", name="uq_blog_interactions_event_uuid"), + ) + op.create_index("ix_blog_interactions_event_uuid", "blog_interactions", ["event_uuid"]) + op.create_index("ix_blog_interactions_request_id", "blog_interactions", ["request_id"]) + op.create_index("ix_blog_interactions_impression_id", "blog_interactions", ["impression_id"]) + op.create_index("ix_blog_interactions_normalized_url", "blog_interactions", ["normalized_url"]) + op.create_index("ix_blog_interactions_event_type", "blog_interactions", ["event_type"]) + op.create_index("ix_blog_interactions_entrance_kind", "blog_interactions", ["entrance_kind"]) + op.create_index("ix_blog_interactions_entrance_url", "blog_interactions", ["entrance_url"]) + op.create_index("ix_blog_interactions_visitor_id", "blog_interactions", ["visitor_id"]) + op.create_index("ix_blog_interactions_user_id", "blog_interactions", ["user_id"]) + op.create_index("ix_blog_interactions_session_id", "blog_interactions", ["session_id"]) + op.create_index( + "ix_blog_interactions_url_event_created", + "blog_interactions", + ["normalized_url", "event_type", "created_at"], + ) + op.create_index("ix_blog_interactions_request_event", "blog_interactions", ["request_id", "event_type"]) + + +def downgrade() -> 
None: + """Drop the recommendation event substrate tables. + + Args: + None. + + Returns: + None. The migration mutates the active database schema in place. + """ + + tables = _tables() + if "blog_interactions" in tables: + op.drop_table("blog_interactions") + tables = _tables() + if "recommendation_impressions" in tables: + op.drop_table("recommendation_impressions") + tables = _tables() + if "recommendation_requests" in tables: + op.drop_table("recommendation_requests") diff --git a/alembic/versions/20260607_02_add_blog_interaction_entrance_fields.py b/alembic/versions/20260607_02_add_blog_interaction_entrance_fields.py new file mode 100644 index 0000000..2cf8f0d --- /dev/null +++ b/alembic/versions/20260607_02_add_blog_interaction_entrance_fields.py @@ -0,0 +1,93 @@ +"""Add entrance metadata columns to existing blog interaction tables. + +Revision ID: 20260607_02 +Revises: 20260607_01 +Create Date: 2026-06-07 15:08:00 BST +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260607_02" +down_revision = "20260607_01" +branch_labels = None +depends_on = None + + +def _table_columns(table_name: str) -> set[str]: + """Return current column names for one table. + + Args: + table_name: Table to inspect. + + Returns: + Set of column names currently present on the table. + """ + + inspector = sa.inspect(op.get_bind()) + if table_name not in set(inspector.get_table_names()): + return set() + return {column["name"] for column in inspector.get_columns(table_name)} + + +def upgrade() -> None: + """Backfill entrance metadata columns added after the first event migration. + + Args: + None. + + Returns: + None. The active database schema is mutated in place. 
+ """ + + columns = _table_columns("blog_interactions") + if not columns: + return + if "entrance_kind" not in columns: + op.add_column( + "blog_interactions", + sa.Column("entrance_kind", sa.Text(), nullable=False, server_default="legacy_unknown"), + ) + op.alter_column("blog_interactions", "entrance_kind", server_default=None) + if "entrance_url" not in columns: + op.add_column( + "blog_interactions", + sa.Column("entrance_url", sa.Text(), nullable=False, server_default="legacy_unknown"), + ) + op.alter_column("blog_interactions", "entrance_url", server_default=None) + op.create_index( + "ix_blog_interactions_entrance_kind", + "blog_interactions", + ["entrance_kind"], + if_not_exists=True, + ) + op.create_index( + "ix_blog_interactions_entrance_url", + "blog_interactions", + ["entrance_url"], + if_not_exists=True, + ) + + +def downgrade() -> None: + """Drop entrance metadata columns from blog interaction rows. + + Args: + None. + + Returns: + None. The active database schema is mutated in place. + """ + + columns = _table_columns("blog_interactions") + if not columns: + return + op.drop_index("ix_blog_interactions_entrance_url", table_name="blog_interactions", if_exists=True) + op.drop_index("ix_blog_interactions_entrance_kind", table_name="blog_interactions", if_exists=True) + if "entrance_url" in columns: + op.drop_column("blog_interactions", "entrance_url") + if "entrance_kind" in columns: + op.drop_column("blog_interactions", "entrance_kind") diff --git a/alembic/versions/20260607_03_drop_deprecated_ingestion_and_dedup_tables.py b/alembic/versions/20260607_03_drop_deprecated_ingestion_and_dedup_tables.py new file mode 100644 index 0000000..3ac83f6 --- /dev/null +++ b/alembic/versions/20260607_03_drop_deprecated_ingestion_and_dedup_tables.py @@ -0,0 +1,129 @@ +"""Drop deprecated ingestion request and blog dedup scan tables. 
+ +Revision ID: 20260607_03 +Revises: 20260607_02 +Create Date: 2026-06-07 16:30:00 BST +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260607_03" +down_revision = "20260607_02" +branch_labels = None +depends_on = None + + +def _table_names() -> set[str]: + """Return the current database table names. + + Args: + None. + + Returns: + Set of table names visible to the active migration connection. + """ + + return set(sa.inspect(op.get_bind()).get_table_names()) + + +def upgrade() -> None: + """Remove persistence tables for deprecated ingestion and dedup scan features. + + Args: + None. + + Returns: + None. Existing deprecated tables are dropped when present. + """ + + tables = _table_names() + for table_name in ( + "blog_dedup_scan_run_items", + "blog_dedup_scan_runs", + "ingestion_requests", + ): + if table_name in tables: + op.drop_table(table_name) + + +def downgrade() -> None: + """Recreate the deprecated tables with their final historical schema. + + Args: + None. + + Returns: + None. The removed tables are recreated for migration rollback only. 
+ """ + + tables = _table_names() + if "ingestion_requests" not in tables: + op.create_table( + "ingestion_requests", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("requested_url", sa.Text(), nullable=False), + sa.Column("normalized_url", sa.Text(), nullable=False), + sa.Column("identity_key", sa.Text(), nullable=True), + sa.Column("identity_reason_codes", sa.Text(), nullable=True), + sa.Column("identity_ruleset_version", sa.Text(), nullable=True), + sa.Column("requester_email", sa.Text(), nullable=False), + sa.Column("status", sa.Text(), nullable=False), + sa.Column("priority", sa.Integer(), nullable=False, server_default="100"), + sa.Column("seed_blog_id", sa.Integer(), nullable=True), + sa.Column("matched_blog_id", sa.Integer(), nullable=True), + sa.Column("request_token", sa.Text(), nullable=False), + sa.Column("expires_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("error_message", sa.Text(), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.ForeignKeyConstraint(["seed_blog_id"], ["blogs.blog_id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint(["matched_blog_id"], ["blogs.blog_id"], ondelete="SET NULL"), + ) + op.create_index("ix_ingestion_requests_identity_key", "ingestion_requests", ["identity_key"]) + op.create_index("ix_ingestion_requests_status", "ingestion_requests", ["status"]) + op.create_index("ix_ingestion_requests_seed_blog_id", "ingestion_requests", ["seed_blog_id"]) + op.create_index("ix_ingestion_requests_matched_blog_id", "ingestion_requests", ["matched_blog_id"]) + + if "blog_dedup_scan_runs" not in tables: + op.create_table( + "blog_dedup_scan_runs", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("status", sa.Text(), nullable=False), + sa.Column("ruleset_version", sa.Text(), nullable=False), + sa.Column("total_count", 
sa.Integer(), nullable=False, server_default="0"), + sa.Column("scanned_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("removed_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("kept_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("crawler_was_running", sa.Boolean(), nullable=False, server_default=sa.false()), + sa.Column("crawler_restart_attempted", sa.Boolean(), nullable=False, server_default=sa.false()), + sa.Column("crawler_restart_succeeded", sa.Boolean(), nullable=False, server_default=sa.false()), + sa.Column("search_reindexed", sa.Boolean(), nullable=False, server_default=sa.false()), + sa.Column("error_message", sa.Text(), nullable=True), + sa.Column("started_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("completed_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("duration_ms", sa.Integer(), nullable=True), + ) + op.create_index("ix_blog_dedup_scan_runs_status", "blog_dedup_scan_runs", ["status"]) + + if "blog_dedup_scan_run_items" not in tables: + op.create_table( + "blog_dedup_scan_run_items", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("run_id", sa.Integer(), nullable=False), + sa.Column("survivor_blog_id", sa.Integer(), nullable=True), + sa.Column("removed_blog_id", sa.Integer(), nullable=True), + sa.Column("survivor_identity_key", sa.Text(), nullable=True), + sa.Column("removed_identity_key", sa.Text(), nullable=True), + sa.Column("removed_url", sa.Text(), nullable=False), + sa.Column("reason_code", sa.Text(), nullable=False), + sa.Column("reason_codes", sa.Text(), nullable=True), + sa.Column("survivor_selection_basis", sa.Text(), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.ForeignKeyConstraint(["run_id"], ["blog_dedup_scan_runs.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["survivor_blog_id"], ["blogs.blog_id"], 
ondelete="SET NULL"), + ) + op.create_index("ix_blog_dedup_scan_run_items_run_id", "blog_dedup_scan_run_items", ["run_id"]) diff --git a/alembic/versions/20260607_04_drop_recommendation_blog_id_columns.py b/alembic/versions/20260607_04_drop_recommendation_blog_id_columns.py new file mode 100644 index 0000000..8f5215e --- /dev/null +++ b/alembic/versions/20260607_04_drop_recommendation_blog_id_columns.py @@ -0,0 +1,187 @@ +"""Drop blog_id columns from recommendation event tables. + +Revision ID: 20260607_04 +Revises: 20260607_03 +Create Date: 2026-06-07 16:35:00 BST +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260607_04" +down_revision = "20260607_03" +branch_labels = None +depends_on = None + + +def _table_names() -> set[str]: + """Return currently present table names. + + Args: + None. + + Returns: + Set of table names visible through the active migration connection. + """ + + return set(sa.inspect(op.get_bind()).get_table_names()) + + +def _column_names(table_name: str) -> set[str]: + """Return column names for one table. + + Args: + table_name: Table to inspect. + + Returns: + Set of existing column names, or an empty set when the table is absent. + """ + + inspector = sa.inspect(op.get_bind()) + if table_name not in set(inspector.get_table_names()): + return set() + return {column["name"] for column in inspector.get_columns(table_name)} + + +def _index_names(table_name: str) -> set[str]: + """Return index names for one table. + + Args: + table_name: Table to inspect. + + Returns: + Set of existing index names, or an empty set when the table is absent. + """ + + inspector = sa.inspect(op.get_bind()) + if table_name not in set(inspector.get_table_names()): + return set() + return {index["name"] for index in inspector.get_indexes(table_name)} + + +def _unique_constraint_names(table_name: str) -> set[str]: + """Return named unique constraints for one table. + + Args: + table_name: Table to inspect. 
+ + Returns: + Set of existing unique constraint names, or an empty set when absent. + """ + + inspector = sa.inspect(op.get_bind()) + if table_name not in set(inspector.get_table_names()): + return set() + return { + constraint["name"] + for constraint in inspector.get_unique_constraints(table_name) + if constraint["name"] + } + + +def upgrade() -> None: + """Remove blog foreign-key columns from recommendation event tables. + + Args: + None. + + Returns: + None. Existing rows keep their durable `normalized_url` attribution. + """ + + tables = _table_names() + if "recommendation_impressions" in tables: + columns = _column_names("recommendation_impressions") + indexes = _index_names("recommendation_impressions") + unique_constraints = _unique_constraint_names("recommendation_impressions") + with op.batch_alter_table("recommendation_impressions") as batch_op: + if "ix_recommendation_impressions_blog_created" in indexes: + batch_op.drop_index("ix_recommendation_impressions_blog_created") + if "ix_recommendation_impressions_blog_id" in indexes: + batch_op.drop_index("ix_recommendation_impressions_blog_id") + if "uq_recommendation_impression_request_blog" in unique_constraints: + batch_op.drop_constraint("uq_recommendation_impression_request_blog", type_="unique") + if "uq_recommendation_impression_request_url" not in unique_constraints: + batch_op.create_unique_constraint( + "uq_recommendation_impression_request_url", + ["request_id", "normalized_url"], + ) + if "ix_recommendation_impressions_url_created" not in indexes: + batch_op.create_index( + "ix_recommendation_impressions_url_created", + ["normalized_url", "created_at"], + ) + if "blog_id" in columns: + batch_op.drop_column("blog_id") + + if "blog_interactions" in tables: + columns = _column_names("blog_interactions") + indexes = _index_names("blog_interactions") + with op.batch_alter_table("blog_interactions") as batch_op: + if "ix_blog_interactions_blog_event_created" in indexes: + 
batch_op.drop_index("ix_blog_interactions_blog_event_created") + if "ix_blog_interactions_blog_id" in indexes: + batch_op.drop_index("ix_blog_interactions_blog_id") + if "ix_blog_interactions_url_event_created" not in indexes: + batch_op.create_index( + "ix_blog_interactions_url_event_created", + ["normalized_url", "event_type", "created_at"], + ) + if "blog_id" in columns: + batch_op.drop_column("blog_id") + + +def downgrade() -> None: + """Recreate removed blog_id columns for rollback. + + Args: + None. + + Returns: + None. Recreated values are nullable because historical URL-keyed event + rows cannot always be relinked after a graph reset. + """ + + tables = _table_names() + if "recommendation_impressions" in tables: + columns = _column_names("recommendation_impressions") + indexes = _index_names("recommendation_impressions") + unique_constraints = _unique_constraint_names("recommendation_impressions") + with op.batch_alter_table("recommendation_impressions") as batch_op: + if "blog_id" not in columns: + batch_op.add_column(sa.Column("blog_id", sa.Integer(), nullable=True)) + if "ix_recommendation_impressions_url_created" in indexes: + batch_op.drop_index("ix_recommendation_impressions_url_created") + if "uq_recommendation_impression_request_url" in unique_constraints: + batch_op.drop_constraint("uq_recommendation_impression_request_url", type_="unique") + if "uq_recommendation_impression_request_blog" not in unique_constraints: + batch_op.create_unique_constraint( + "uq_recommendation_impression_request_blog", + ["request_id", "blog_id"], + ) + if "ix_recommendation_impressions_blog_id" not in indexes: + batch_op.create_index("ix_recommendation_impressions_blog_id", ["blog_id"]) + if "ix_recommendation_impressions_blog_created" not in indexes: + batch_op.create_index( + "ix_recommendation_impressions_blog_created", + ["blog_id", "created_at"], + ) + + if "blog_interactions" in tables: + columns = _column_names("blog_interactions") + indexes = 
_index_names("blog_interactions") + with op.batch_alter_table("blog_interactions") as batch_op: + if "blog_id" not in columns: + batch_op.add_column(sa.Column("blog_id", sa.Integer(), nullable=True)) + if "ix_blog_interactions_url_event_created" in indexes: + batch_op.drop_index("ix_blog_interactions_url_event_created") + if "ix_blog_interactions_blog_id" not in indexes: + batch_op.create_index("ix_blog_interactions_blog_id", ["blog_id"]) + if "ix_blog_interactions_blog_event_created" not in indexes: + batch_op.create_index( + "ix_blog_interactions_blog_event_created", + ["blog_id", "event_type", "created_at"], + ) diff --git a/alembic/versions/20260609_01_extend_user_system.py b/alembic/versions/20260609_01_extend_user_system.py new file mode 100644 index 0000000..26472e2 --- /dev/null +++ b/alembic/versions/20260609_01_extend_user_system.py @@ -0,0 +1,103 @@ +"""Extend user auth lifecycle fields. + +Revision ID: 20260609_01 +Revises: 20260607_04 +Create Date: 2026-06-09 16:40:00 UTC +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260609_01" +down_revision = "20260607_04" +branch_labels = None +depends_on = None + + +def _table_names() -> set[str]: + """Return table names currently visible to Alembic.""" + return set(sa.inspect(op.get_bind()).get_table_names()) + + +def _column_names(table_name: str) -> set[str]: + """Return column names for one table currently visible to Alembic.""" + return {column["name"] for column in sa.inspect(op.get_bind()).get_columns(table_name)} + + +def upgrade() -> None: + """Add user lifecycle columns, token rows, and audit rows.""" + existing_tables = _table_names() + if "users" in existing_tables: + user_columns = _column_names("users") + if "role" not in user_columns: + op.add_column("users", sa.Column("role", sa.Text(), nullable=False, server_default="user")) + if "is_active" not in user_columns: + op.add_column("users", sa.Column("is_active", sa.Boolean(), 
nullable=False, server_default=sa.true())) + if "email_verified_at" not in user_columns: + op.add_column("users", sa.Column("email_verified_at", sa.DateTime(timezone=True), nullable=True)) + if "password_changed_at" not in user_columns: + op.add_column("users", sa.Column("password_changed_at", sa.DateTime(timezone=True), nullable=True)) + if "last_login_at" not in user_columns: + op.add_column("users", sa.Column("last_login_at", sa.DateTime(timezone=True), nullable=True)) + op.create_check_constraint( + "ck_users_role", + "users", + "role IN ('admin', 'user')", + ) + + if "user_verification_tokens" not in existing_tables: + op.create_table( + "user_verification_tokens", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("user_id", sa.Integer(), sa.ForeignKey("users.id", ondelete="CASCADE"), nullable=False), + sa.Column("token_hash", sa.Text(), nullable=False), + sa.Column("purpose", sa.Text(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("expires_at", sa.DateTime(timezone=True), nullable=False), + sa.Column("consumed_at", sa.DateTime(timezone=True), nullable=True), + sa.UniqueConstraint("token_hash", name="uq_user_verification_tokens_token_hash"), + ) + op.create_index("ix_user_verification_tokens_user_id", "user_verification_tokens", ["user_id"]) + op.create_index("ix_user_verification_tokens_token_hash", "user_verification_tokens", ["token_hash"]) + op.create_index("ix_user_verification_tokens_purpose", "user_verification_tokens", ["purpose"]) + + if "user_audit_events" not in existing_tables: + op.create_table( + "user_audit_events", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("user_id", sa.Integer(), sa.ForeignKey("users.id", ondelete="SET NULL"), nullable=True), + sa.Column("event_type", sa.Text(), nullable=False), + sa.Column("details", sa.JSON(), nullable=False, server_default=sa.text("'{}'")), + sa.Column("created_at", 
sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + ) + op.create_index("ix_user_audit_events_user_id", "user_audit_events", ["user_id"]) + op.create_index("ix_user_audit_events_event_type", "user_audit_events", ["event_type"]) + + +def downgrade() -> None: + """Remove user lifecycle columns, token rows, and audit rows.""" + existing_tables = _table_names() + if "user_audit_events" in existing_tables: + op.drop_index("ix_user_audit_events_event_type", table_name="user_audit_events") + op.drop_index("ix_user_audit_events_user_id", table_name="user_audit_events") + op.drop_table("user_audit_events") + if "user_verification_tokens" in existing_tables: + op.drop_index("ix_user_verification_tokens_purpose", table_name="user_verification_tokens") + op.drop_index("ix_user_verification_tokens_token_hash", table_name="user_verification_tokens") + op.drop_index("ix_user_verification_tokens_user_id", table_name="user_verification_tokens") + op.drop_table("user_verification_tokens") + if "users" in existing_tables: + user_columns = _column_names("users") + op.drop_constraint("ck_users_role", "users", type_="check") + for column_name in ( + "last_login_at", + "password_changed_at", + "email_verified_at", + "is_active", + "role", + ): + if column_name in user_columns: + op.drop_column("users", column_name) diff --git a/alembic/versions/20260611_01_add_pending_user_registrations.py b/alembic/versions/20260611_01_add_pending_user_registrations.py new file mode 100644 index 0000000..67723b7 --- /dev/null +++ b/alembic/versions/20260611_01_add_pending_user_registrations.py @@ -0,0 +1,51 @@ +"""Add pending user registration table. 
+ +Revision ID: 20260611_01 +Revises: 20260609_01 +Create Date: 2026-06-11 00:00:00 UTC +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260611_01" +down_revision = "20260609_01" +branch_labels = None +depends_on = None + + +def _table_names() -> set[str]: + """Return table names currently visible to Alembic.""" + return set(sa.inspect(op.get_bind()).get_table_names()) + + +def upgrade() -> None: + """Create pending registrations for verify-before-persist signup.""" + if "pending_user_registrations" in _table_names(): + return + op.create_table( + "pending_user_registrations", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("email", sa.Text(), nullable=False), + sa.Column("password_hash", sa.Text(), nullable=False), + sa.Column("token_hash", sa.Text(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("expires_at", sa.DateTime(timezone=True), nullable=False), + sa.Column("consumed_at", sa.DateTime(timezone=True), nullable=True), + sa.UniqueConstraint("email", name="uq_pending_user_registrations_email"), + sa.UniqueConstraint("token_hash", name="uq_pending_user_registrations_token_hash"), + ) + op.create_index("ix_pending_user_registrations_email", "pending_user_registrations", ["email"]) + op.create_index("ix_pending_user_registrations_token_hash", "pending_user_registrations", ["token_hash"]) + + +def downgrade() -> None: + """Drop pending registrations.""" + if "pending_user_registrations" not in _table_names(): + return + op.drop_index("ix_pending_user_registrations_token_hash", table_name="pending_user_registrations") + op.drop_index("ix_pending_user_registrations_email", table_name="pending_user_registrations") + op.drop_table("pending_user_registrations") diff --git a/alembic/versions/20260611_02_add_admin_hourly_stats.py b/alembic/versions/20260611_02_add_admin_hourly_stats.py new file mode 100644 
index 0000000..b96bcd4 --- /dev/null +++ b/alembic/versions/20260611_02_add_admin_hourly_stats.py @@ -0,0 +1,75 @@ +"""Add hourly admin statistics snapshots. + +Revision ID: 20260611_02 +Revises: 20260611_01 +Create Date: 2026-06-11 00:00:00 BST +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260611_02" +down_revision = "20260611_01" +branch_labels = None +depends_on = None + + +def _tables() -> set[str]: + """Return currently present database table names. + + Args: + None. + + Returns: + Set of table names currently present in the migration target. + """ + + return set(sa.inspect(op.get_bind()).get_table_names()) + + +def upgrade() -> None: + """Create the hourly admin statistics snapshot table. + + Args: + None. + + Returns: + None. The migration mutates the active database schema in place. + """ + + if "admin_hourly_stats" in _tables(): + return + op.create_table( + "admin_hourly_stats", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("hour_start", sa.DateTime(timezone=True), nullable=False), + sa.Column("user_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("random_request_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("random_impression_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("detail_open_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("external_open_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("detail_ctr", sa.Float(), nullable=False, server_default="0"), + sa.Column("external_ctr", sa.Float(), nullable=False, server_default="0"), + sa.Column("total_click_ctr", sa.Float(), nullable=False, server_default="0"), + sa.Column("refreshed_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.UniqueConstraint("hour_start", 
name="uq_admin_hourly_stats_hour_start"), + ) + op.create_index("ix_admin_hourly_stats_hour_start", "admin_hourly_stats", ["hour_start"]) + + +def downgrade() -> None: + """Drop the hourly admin statistics snapshot table. + + Args: + None. + + Returns: + None. The migration mutates the active database schema in place. + """ + + if "admin_hourly_stats" in _tables(): + op.drop_table("admin_hourly_stats") diff --git a/backend/main.py b/backend/main.py index 3a86d0b..f1d5ca8 100644 --- a/backend/main.py +++ b/backend/main.py @@ -3,11 +3,12 @@ from __future__ import annotations from dataclasses import dataclass -from threading import Thread +import ipaddress +import socket from time import sleep from typing import Any from typing import Callable -from typing import NoReturn +from urllib.parse import urlsplit import httpx from fastapi import Depends, FastAPI, HTTPException, Request @@ -45,9 +46,8 @@ class RunBatchRequest(BaseModel): max_nodes: int -class CreateIngestionRequest(BaseModel): +class CreateUserSeedRequest(BaseModel): homepage_url: str - email: str class UserAuthRequest(BaseModel): @@ -55,6 +55,23 @@ class UserAuthRequest(BaseModel): password: str +class EmailRequest(BaseModel): + email: str + + +class TokenRequest(BaseModel): + token: str + + +class PasswordResetRequest(BaseModel): + token: str + password: str + + +class UpdateUserRoleRequest(BaseModel): + role: str + + class ReplaceBlogLabelsRequest(BaseModel): tag_ids: list[int] | None = None label_id: dict[str, int] | None = None @@ -66,6 +83,31 @@ class IncrementBlogUserLabelRequest(BaseModel): previous_label: str | None = None +class CreateRandomRecommendationBatchRequest(BaseModel): + count: int = 9 + visitor_id: str + session_id: str + source: str | None = None + page_url: str | None = None + context: dict[str, Any] | None = None + + +class RecordRecommendationEventRequest(BaseModel): + event_uuid: str + event_type: str + blog_id: int + visitor_id: str + session_id: str + entrance_kind: str + 
entrance_url: str + request_uuid: str | None = None + impression_id: int | None = None + position: int | None = None + interaction_order: int = 1 + client_event_at: str | None = None + attributes: dict[str, Any] | None = None + + class BlogLabelTitlePreviewRequest(BaseModel): url: str @@ -75,6 +117,145 @@ class CreateBlogLabelTagRequest(BaseModel): ACTIVE_CRAWLER_RUNNER_STATUSES = frozenset({"starting", "running", "stopping"}) +ICON_PROXY_MAX_BYTES = 1_000_000 +ICON_PROXY_ALLOWED_SCHEMES = frozenset({"http", "https"}) +ICON_PROXY_IMAGE_EXTENSIONS = (".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp", ".gif", ".avif") + + +def _is_private_icon_proxy_host(hostname: str) -> bool: + """Return whether one hostname resolves to local or private network space. + + Args: + hostname: Parsed URL hostname to validate before proxying. + + Returns: + True when the hostname itself or any resolved address is unsafe for the + public icon proxy. + """ + try: + ip_addresses = [ipaddress.ip_address(hostname)] + except ValueError: + try: + resolved = socket.getaddrinfo(hostname, None, type=socket.SOCK_STREAM) + except socket.gaierror: + return True + ip_addresses = [] + for item in resolved: + address = item[4][0] + try: + ip_addresses.append(ipaddress.ip_address(address)) + except ValueError: + return True + + return any( + address.is_private + or address.is_loopback + or address.is_link_local + or address.is_multicast + or address.is_reserved + or address.is_unspecified + for address in ip_addresses + ) + + +def _validate_icon_proxy_url(url: str) -> str: + """Normalize and validate a remote icon URL before proxying it. + + Args: + url: User-supplied absolute URL. + + Returns: + The trimmed URL when it is an allowed public HTTP(S) URL. + + Raises: + HTTPException: If the URL is unsupported or points at unsafe address + space. 
+ """ + clean_url = url.strip() + parsed = urlsplit(clean_url) + if parsed.scheme.lower() not in ICON_PROXY_ALLOWED_SCHEMES or not parsed.hostname: + raise HTTPException(status_code=422, detail="unsupported_icon_url") + if _is_private_icon_proxy_host(parsed.hostname): + raise HTTPException(status_code=422, detail="unsafe_icon_url") + return clean_url + + +def _is_image_like_icon_response(response: httpx.Response) -> bool: + """Return whether one HTTP response looks like an icon image. + + Args: + response: HTTP response from the remote icon URL. + + Returns: + True when the content type is image-like, or a generic binary response + has a known image file extension. + """ + content_type = response.headers.get("content-type", "").split(";", 1)[0].strip().lower() + if content_type.startswith("image/"): + return True + if content_type in {"application/octet-stream", "binary/octet-stream"}: + return urlsplit(str(response.url)).path.lower().endswith(ICON_PROXY_IMAGE_EXTENSIONS) + return False + + +def _fetch_icon_proxy_response(url: str) -> Response: + """Fetch one remote icon and return it as a same-origin image response. + + Args: + url: Validated public HTTP(S) icon URL. + + Returns: + FastAPI response containing the icon bytes. + + Raises: + HTTPException: If the remote URL cannot be fetched, is too large, or + does not return an image-like response. 
+ """ + try: + current_url = url + for _ in range(4): + with httpx.stream( + "GET", + current_url, + follow_redirects=False, + timeout=8.0, + headers={"User-Agent": "HeyBlogBot/0.1 (+https://example.invalid/heyblog)"}, + ) as response: + if response.status_code in {301, 302, 303, 307, 308} and response.headers.get("location"): + current_url = _validate_icon_proxy_url(str(httpx.URL(str(response.url)).join(response.headers["location"]))) + continue + response.raise_for_status() + if not _is_image_like_icon_response(response): + raise HTTPException(status_code=502, detail="icon_proxy_not_image") + content_length = response.headers.get("content-length") + if content_length is not None: + try: + if int(content_length) > ICON_PROXY_MAX_BYTES: + raise HTTPException(status_code=502, detail="icon_proxy_too_large") + except ValueError: + pass + chunks: list[bytes] = [] + size = 0 + for chunk in response.iter_bytes(): + size += len(chunk) + if size > ICON_PROXY_MAX_BYTES: + raise HTTPException(status_code=502, detail="icon_proxy_too_large") + chunks.append(chunk) + content_type = response.headers.get("content-type", "image/x-icon") + return Response( + content=b"".join(chunks), + media_type=content_type, + headers={"cache-control": "public, max-age=86400"}, + ) + raise HTTPException(status_code=502, detail="icon_proxy_too_many_redirects") + except HTTPException: + raise + except httpx.TimeoutException as exc: + raise HTTPException(status_code=504, detail="icon_proxy_timeout") from exc + except httpx.HTTPStatusError as exc: + raise HTTPException(status_code=502, detail=f"icon_proxy_http_{exc.response.status_code}") from exc + except httpx.RequestError as exc: + raise HTTPException(status_code=502, detail="icon_proxy_fetch_failed") from exc def _crawler_runtime_is_active(runtime: dict[str, Any]) -> bool: @@ -235,98 +416,6 @@ def build_backend_state(settings: Settings | None = None) -> BackendState: ) -def _execute_blog_dedup_scan_in_background( - state: BackendState, - *, - 
run_id: int, - crawler_was_running: bool, -) -> None: - restart_attempted = False - restart_succeeded = False - search_reindexed = False - error_message: str | None = None - try: - state.persistence.execute_blog_dedup_scan_run(run_id=run_id) - search_reindexed = _best_effort_search_reindex(state.search) - except httpx.HTTPStatusError as exc: - error_message = str(_upstream_error_detail(exc)) - except Exception as exc: # noqa: BLE001 - error_message = str(exc) - finally: - if crawler_was_running: - restart_attempted = True - try: - state.crawler.start() - restart_succeeded = True - except Exception: # noqa: BLE001 - restart_succeeded = False - try: - state.persistence.finalize_blog_dedup_scan_run( - run_id=run_id, - crawler_restart_attempted=restart_attempted, - crawler_restart_succeeded=restart_succeeded, - search_reindexed=search_reindexed, - error_message=error_message, - ) - except Exception: # noqa: BLE001 - pass - state.maintenance_in_progress = False - - -def _start_maintenance_background_task( - state: BackendState, - *, - prepare_run: Callable[[bool], tuple[dict[str, Any], dict[str, Any]]], - on_http_error: Callable[[httpx.HTTPStatusError], NoReturn], - on_http_exception: Callable[[HTTPException], NoReturn], - on_unexpected_error: Callable[[Exception], NoReturn], - target: Callable[..., None], -) -> dict[str, Any]: - """Start one maintenance-mode background task with shared exception handling.""" - crawler_was_running = _enter_maintenance(state) - try: - payload, thread_kwargs = prepare_run(crawler_was_running) - except httpx.HTTPStatusError as exc: - on_http_error(exc) - except HTTPException as exc: - on_http_exception(exc) - except Exception as exc: # noqa: BLE001 - on_unexpected_error(exc) - Thread( - target=target, - kwargs={"state": state, **thread_kwargs}, - daemon=True, - ).start() - return payload - - -def _build_maintenance_start_error_handlers( - *, - cleanup: Callable[[str], None], - unexpected_detail: str, -) -> tuple[ - 
Callable[[httpx.HTTPStatusError], NoReturn], - Callable[[HTTPException], NoReturn], - Callable[[Exception], NoReturn], -]: - """Build the shared error-handler skeleton for maintenance start routes.""" - - def on_http_error(exc: httpx.HTTPStatusError) -> NoReturn: - detail = _upstream_error_detail(exc) - cleanup(str(detail)) - _raise_upstream_http_error(exc, detail_override=detail) - - def on_http_exception(exc: HTTPException) -> NoReturn: - cleanup(str(exc.detail)) - raise exc - - def on_unexpected_error(exc: Exception) -> NoReturn: - cleanup(str(exc)) - raise HTTPException(status_code=500, detail=unexpected_detail) from exc - - return on_http_error, on_http_exception, on_unexpected_error - - def create_app(state: BackendState | None = None) -> FastAPI: """Create the public backend app.""" settings = Settings.from_env() @@ -350,16 +439,29 @@ def require_admin_access(request: Request) -> None: state = get_state() if state.admin_dev_bypass: return - if not state.admin_token: - raise HTTPException(status_code=503, detail="admin_auth_not_configured") authorization = request.headers.get("authorization", "").strip() if not authorization: raise HTTPException(status_code=401, detail="admin_auth_required") scheme, _, token = authorization.partition(" ") if scheme.lower() != "bearer" or not token: raise HTTPException(status_code=401, detail="admin_auth_required") - if token != state.admin_token: + if state.admin_token and token == state.admin_token: + return + get_user_by_session_token = getattr(state.persistence, "get_user_by_session_token", None) + if get_user_by_session_token is None: + if not state.admin_token: + raise HTTPException(status_code=503, detail="admin_auth_not_configured") + raise HTTPException(status_code=403, detail="admin_auth_invalid") + try: + user = get_user_by_session_token(token=token) + except httpx.HTTPStatusError as exc: + _raise_upstream_http_error(exc, default="admin_auth_invalid", detail_override="admin_auth_invalid") + if user is None: + if not 
state.admin_token: + raise HTTPException(status_code=503, detail="admin_auth_not_configured") raise HTTPException(status_code=403, detail="admin_auth_invalid") + if user.get("role") != "admin" or not user.get("is_active") or not user.get("email_verified"): + raise HTTPException(status_code=403, detail="admin_auth_forbidden") def optional_user(request: Request) -> dict[str, Any] | None: authorization = request.headers.get("authorization", "").strip() @@ -436,6 +538,7 @@ def get_blogs_catalog( has_title: str | None = None, has_icon: str | None = None, min_connections: str | None = None, + acceptance_status: str | None = "ACCEPTED", ) -> dict[str, Any]: return _call_upstream_with_http_error_translation( lambda: get_state().persistence.list_blogs_catalog( @@ -450,6 +553,7 @@ def get_blogs_catalog( has_title=has_title, has_icon=has_icon, min_connections=min_connections, + acceptance_status=acceptance_status, ) ) @@ -459,6 +563,63 @@ def lookup_blog_candidates(url: str) -> dict[str, Any]: lambda: get_state().persistence.lookup_blog_candidates(url=url) ) + @app.post("/api/recommendations/random-blog-batches") + def post_random_recommendation_batch( + payload: CreateRandomRecommendationBatchRequest, + user: dict[str, Any] | None = Depends(optional_user), + ) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.create_random_recommendation_batch( + **payload.model_dump(), + user_id=int(user["id"]) if user is not None else None, + ) + ) + + @app.post("/api/recommendation-events") + def post_recommendation_event( + payload: RecordRecommendationEventRequest, + user: dict[str, Any] | None = Depends(optional_user), + ) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.record_blog_interaction( + **payload.model_dump(), + user_id=int(user["id"]) if user is not None else None, + ) + ) + + @app.get("/api/blogs/{blog_id}/stats") + def get_blog_recommendation_stats(blog_id: 
int) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.get_blog_recommendation_stats(blog_id) + ) + + @app.get("/api/admin/recommendation-stats") + def get_admin_recommendation_stats(_: None = Depends(require_admin_access)) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.get_recommendation_strategy_stats() + ) + + @app.get("/api/admin/hourly-stats") + def get_admin_hourly_stats( + limit: int = 24, + _: None = Depends(require_admin_access), + ) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.get_admin_hourly_stats(limit=limit) + ) + + @app.get("/api/icons/proxy") + def proxy_icon(url: str) -> Response: + """Return one remote icon through the backend origin for graph textures. + + Args: + url: Absolute HTTP(S) icon URL to fetch. + + Returns: + Image response with cache headers when the remote resource is valid. + """ + return _fetch_icon_proxy_response(_validate_icon_proxy_url(url)) + @app.post("/api/auth/register") def register_user(payload: UserAuthRequest) -> dict[str, Any]: return _call_upstream_with_http_error_translation( @@ -483,6 +644,30 @@ def logout_user(request: Request, user: dict[str, Any] = Depends(require_user)) lambda: get_state().persistence.revoke_user_session(token=token) ) + @app.post("/api/auth/email/verify/request") + def request_email_verification(payload: EmailRequest) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.request_email_verification(email=payload.email) + ) + + @app.post("/api/auth/email/verify/confirm") + def confirm_email_verification(payload: TokenRequest) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.confirm_email_verification(token=payload.token) + ) + + @app.post("/api/auth/password/forgot") + def request_password_reset(payload: EmailRequest) 
-> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.request_password_reset(email=payload.email) + ) + + @app.post("/api/auth/password/reset") + def reset_user_password(payload: PasswordResetRequest) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.reset_user_password(token=payload.token, password=payload.password) + ) + @app.get("/api/me/label-selections") def get_my_label_selections( limit: int = 50, @@ -498,6 +683,26 @@ def get_my_label_stats(user: dict[str, Any] = Depends(require_user)) -> dict[str lambda: get_state().persistence.get_user_label_stats(user_id=int(user["id"])) ) + @app.get("/api/admin/users") + def list_admin_users( + page: int = 1, + page_size: int = 50, + _: None = Depends(require_admin_access), + ) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.list_users(page=page, page_size=page_size) + ) + + @app.patch("/api/admin/users/{user_id}/role") + def patch_admin_user_role( + user_id: int, + payload: UpdateUserRoleRequest, + _: None = Depends(require_admin_access), + ) -> dict[str, Any]: + return _call_upstream_with_http_error_translation( + lambda: get_state().persistence.update_user_role(user_id=user_id, role=payload.role) + ) + @app.get("/api/admin/blog-labeling/candidates") def get_blog_labeling_candidates( page: int = 1, @@ -676,92 +881,21 @@ def run_crawl(max_nodes: int | None = None, _: None = Depends(require_admin_acce lambda: state.crawler.run(max_nodes=max_nodes), ) - @app.post("/api/ingestion-requests") - def create_ingestion_request(payload: CreateIngestionRequest) -> dict[str, Any]: + @app.post("/api/blogs/user-seeds") + def create_user_seed(payload: CreateUserSeedRequest) -> dict[str, Any]: result = _call_upstream_with_http_error_translation( - lambda: get_state().persistence.create_ingestion_request(**payload.model_dump()) + lambda: 
get_state().persistence.create_user_seed(**payload.model_dump()) ) log_event( LOGGER, - event="ingestion.request.created", - message="ingestion request created", + event="blog.user_seed.created", + message="user seed created", stage="ingestion", - run_id=result.get("request_id"), + run_id=result.get("blog_id"), url=payload.homepage_url, ) return result - @app.get("/api/ingestion-requests") - def list_priority_ingestion_requests() -> list[dict[str, Any]]: - return _call_upstream_with_http_error_translation( - lambda: get_state().persistence.list_priority_ingestion_requests() - ) - - @app.get("/api/ingestion-requests/{request_id}") - def get_ingestion_request(request_id: int, request_token: str) -> dict[str, Any]: - return _call_upstream_with_http_error_translation( - lambda: get_state().persistence.get_ingestion_request( - request_id=request_id, - request_token=request_token, - ) - ) - - @app.post("/api/admin/blog-dedup-scans") - def run_blog_dedup_scan(_: None = Depends(require_admin_access)) -> dict[str, Any]: - state = get_state() - - def prepare_run(crawler_was_running: bool) -> tuple[dict[str, Any], dict[str, Any]]: - _stop_active_crawler( - state, - crawler_was_running=crawler_was_running, - wait_for_idle=ensure_runtime_idle, - ) - payload = state.persistence.create_blog_dedup_scan_run(crawler_was_running=crawler_was_running) - log_event( - LOGGER, - event="maintenance.blog_dedup.started", - message="blog dedup scan started", - stage="blog_dedup", - run_id=int(payload["id"]), - crawler_was_running=crawler_was_running, - ) - return payload, { - "run_id": int(payload["id"]), - "crawler_was_running": crawler_was_running, - } - - def cleanup(_: str) -> None: - _leave_maintenance(state) - - on_http_error, on_http_exception, on_unexpected_error = _build_maintenance_start_error_handlers( - cleanup=cleanup, - unexpected_detail="blog_dedup_scan_failed", - ) - - return _start_maintenance_background_task( - state, - prepare_run=prepare_run, - on_http_error=on_http_error, 
- on_http_exception=on_http_exception, - on_unexpected_error=on_unexpected_error, - target=_execute_blog_dedup_scan_in_background, - ) - - @app.get("/api/admin/blog-dedup-scans/latest") - def get_latest_blog_dedup_scan_run(_: None = Depends(require_admin_access)) -> dict[str, Any]: - return _call_upstream_with_http_error_translation( - lambda: get_state().persistence.latest_blog_dedup_scan_run() - ) - - @app.get("/api/admin/blog-dedup-scans/{run_id}/items") - def get_blog_dedup_scan_run_items( - run_id: int, - _: None = Depends(require_admin_access), - ) -> list[dict[str, Any]]: - return _call_upstream_with_http_error_translation( - lambda: get_state().persistence.list_blog_dedup_scan_run_items(run_id) - ) - @app.get("/api/admin/runtime/status") def runtime_status(_: None = Depends(require_admin_access)) -> dict[str, Any]: payload = get_state().crawler.runtime_status() diff --git a/crawler/README.md b/crawler/README.md index ab71788..d6a49b2 100644 --- a/crawler/README.md +++ b/crawler/README.md @@ -90,6 +90,7 @@ crawler/ - 通过 `Settings.from_env()` 读取运行配置。 - 构建 `PersistenceHttpClient`,说明 crawler 服务默认不直接连数据库,而是通过 persistence-api 通信。 - 构建 `CrawlPipeline` 和 `CrawlerRuntimeService`。 +- 启动内置 runtime auto scheduler:默认每小时检查一次,若 runtime 已空闲则自动调用 start,以便用户新提交的 `WAITING` 博客被继续抓取。 - 暴露内部接口: - `GET /internal/health` - `POST /internal/crawl/bootstrap` @@ -325,7 +326,7 @@ crawler/ - 启动 / 停止后台抓取线程 - 维护多个 worker 的运行状态快照 -- 控制 priority queue 和 normal queue 的公平 claim +- 按等待队列领取并分发待抓取 blog - 聚合本次 runtime 的 processed / discovered / failed - 对外提供 `/internal/runtime/*` 需要的状态 diff --git a/crawler/crawling/bootstrap.py b/crawler/crawling/bootstrap.py index 5d150b5..91b2727 100644 --- a/crawler/crawling/bootstrap.py +++ b/crawler/crawling/bootstrap.py @@ -14,9 +14,10 @@ class BootstrapService: """Import crawler seed URLs into persistence storage. 
- The bootstrap flow reads the configured seed CSV, normalizes each URL, and - upserts the result into the blog repository so later crawl runs have an - initial queue to process. + The bootstrap flow first replays any durable seeds already stored in + persistence. When no durable seeds exist yet, it reads the configured seed + CSV, normalizes each URL, stores it in the seed table, and upserts the blog + queue row. """ def __init__(self, repository: RepositoryProtocol, logger: CrawlerLogger) -> None: @@ -35,20 +36,68 @@ def __init__(self, repository: RepositoryProtocol, logger: CrawlerLogger) -> Non self.logger = logger def bootstrap_seeds(self, seed_path: Path) -> dict[str, Any]: - """Import seed URLs from a CSV file into the blogs table. + """Import seed URLs into the blogs table. Args: - seed_path: Filesystem path to the CSV file containing a ``url`` - column of initial crawl targets. + seed_path: Filesystem path to the fallback CSV file containing a + ``url`` column of initial crawl targets. The CSV is only read + when the durable seed table is empty. Returns: A small result payload containing the imported seed file path and the number of newly created blog rows. """ + existing_seeds = self.repository.list_seeds() + if existing_seeds: + created = self._bootstrap_from_seed_rows(existing_seeds) + self.logger.bootstrap_success(seed_path) + return {"seed_path": str(seed_path), "imported": created} + created = self._bootstrap_from_csv(seed_path) + self.logger.bootstrap_success(seed_path) + return {"seed_path": str(seed_path), "imported": created} + + def _bootstrap_from_seed_rows(self, seeds: list[dict[str, Any]]) -> int: + """Replay persisted seed rows into the blog queue. + + Args: + seeds: Durable seed payloads loaded from persistence. + + Returns: + Number of newly inserted blog rows. 
+ """ + + created = 0 + for seed in seeds: + raw_url = str(seed.get("url") or "").strip() + normalized_url = str(seed.get("normalized_url") or "").strip() + domain = str(seed.get("domain") or "").strip() + if not raw_url or not normalized_url or not domain: + continue + _, inserted = self.repository.upsert_blog( + url=raw_url, + normalized_url=normalized_url, + domain=domain, + accepted_by="seed", + seed_source_path=seed.get("source_path"), + seed_source_row=seed.get("source_row"), + ) + created += int(inserted) + return created + + def _bootstrap_from_csv(self, seed_path: Path) -> int: + """Load fallback CSV seed rows into seeds and blogs. + + Args: + seed_path: Filesystem path to the seed CSV file. + + Returns: + Number of newly inserted blog rows. + """ + created = 0 with seed_path.open("r", encoding="utf-8") as handle: reader = csv.DictReader(handle) - for row in reader: + for row_number, row in enumerate(reader, start=2): raw_url = (row.get("url") or "").strip() if not raw_url: continue @@ -57,7 +106,9 @@ def bootstrap_seeds(self, seed_path: Path) -> dict[str, Any]: url=raw_url, normalized_url=normalized.normalized_url, domain=normalized.domain, + accepted_by="seed", + seed_source_path=str(seed_path), + seed_source_row=row_number, ) created += int(inserted) - self.logger.bootstrap_success(seed_path) - return {"seed_path": str(seed_path), "imported": created} + return created diff --git a/crawler/crawling/decisions/chain.py b/crawler/crawling/decisions/chain.py index 3dd252a..a740e84 100644 --- a/crawler/crawling/decisions/chain.py +++ b/crawler/crawling/decisions/chain.py @@ -94,6 +94,41 @@ def _build_rss_discovery_filter(settings: Settings) -> BaseUrlFilter: return RssDiscoveryFilter() +def _build_implicit_success_deciders( + settings: Settings, + *, + configured_filter_kinds: set[str], + disabled_filter_kinds: set[str], +) -> list[BaseUrlFilter]: + """Append optional success deciders controlled only by settings toggles. 
+ + Args: + settings: Runtime settings that enable or disable optional deciders. + configured_filter_kinds: Filter kinds mentioned by the TOML config. + disabled_filter_kinds: Filter kinds explicitly disabled in the TOML + config. + + Returns: + Success decider filters that should be appended after deterministic + rule filters because the config omitted them and the corresponding + runtime toggle is enabled. + """ + implicit_filters: list[BaseUrlFilter] = [] + if ( + settings.rss_discovery_enabled + and "rss_discovery" not in configured_filter_kinds + and "rss_discovery" not in disabled_filter_kinds + ): + implicit_filters.append(_build_rss_discovery_filter(settings)) + if ( + settings.decision_model_consensus_enabled + and "model_consensus" not in configured_filter_kinds + and "model_consensus" not in disabled_filter_kinds + ): + implicit_filters.append(_build_model_consensus_filter(settings)) + return implicit_filters + + FILTER_REGISTRY: dict[str, FilterFactory] = { "duplicate_url": _static_filter_factory(DuplicateUrlFilter), "non_http_scheme": _static_filter_factory(NonHttpSchemeFilter), @@ -167,10 +202,16 @@ def steps(self) -> tuple[BaseUrlFilter, ...]: def from_settings(cls, settings: Settings) -> "ConfiguredUrlFilterChain": """Build a filter chain using the configured TOML ordering.""" loaded_filters: list[BaseUrlFilter] = [] + configured_filter_kinds: set[str] = set() + disabled_filter_kinds: set[str] = set() for item in _load_filter_chain_config(settings.filter_chain_config_path): + kind = str(item.get("kind", "")).strip() + if kind: + configured_filter_kinds.add(kind) if not bool(item.get("enabled", True)): + if kind: + disabled_filter_kinds.add(kind) continue - kind = str(item.get("kind", "")).strip() if kind == "model_consensus" and not settings.decision_model_consensus_enabled: continue if kind == "rss_discovery" and not settings.rss_discovery_enabled: @@ -179,6 +220,13 @@ def from_settings(cls, settings: Settings) -> "ConfiguredUrlFilterChain": if 
factory is None: raise ValueError(f"unknown_filter_kind:{kind}") loaded_filters.append(factory(settings)) + loaded_filters.extend( + _build_implicit_success_deciders( + settings, + configured_filter_kinds=configured_filter_kinds, + disabled_filter_kinds=disabled_filter_kinds, + ) + ) return cls(filters=tuple(loaded_filters)) @property diff --git a/crawler/crawling/decisions/rule_helpers.py b/crawler/crawling/decisions/rule_helpers.py index b7a4019..efc3e18 100644 --- a/crawler/crawling/decisions/rule_helpers.py +++ b/crawler/crawling/decisions/rule_helpers.py @@ -43,7 +43,7 @@ "/rss", "/search", } -BLOCKED_TLDS = (".gov", ".gov.cn", ".org", ".edu") +BLOCKED_TLDS = (".gov", ".gov.cn", ".edu") FILE_SUFFIX_BLOCKLIST = ( ".7z", ".css", diff --git a/crawler/crawling/fetching/base.py b/crawler/crawling/fetching/base.py index 99484a0..be8c5f9 100644 --- a/crawler/crawling/fetching/base.py +++ b/crawler/crawling/fetching/base.py @@ -88,3 +88,16 @@ def fetch_many( ``FetchAttempt`` result. """ ... + + def validate_icon_url(self, url: str, *, timeout_seconds: float | None = None) -> str | None: + """Return a reachable final icon URL, or ``None`` when unusable. + + Args: + url: Absolute HTTP(S) icon candidate URL to verify. + timeout_seconds: Optional per-request timeout override in seconds. + + Returns: + Final URL after redirects when the candidate is reachable and looks + like an image resource; otherwise ``None``. + """ + ... diff --git a/crawler/crawling/fetching/httpx_fetcher.py b/crawler/crawling/fetching/httpx_fetcher.py index c5106ad..41b1f7d 100644 --- a/crawler/crawling/fetching/httpx_fetcher.py +++ b/crawler/crawling/fetching/httpx_fetcher.py @@ -4,6 +4,7 @@ import asyncio from typing import Any +from urllib.parse import urlsplit import httpx @@ -99,6 +100,35 @@ def fetch_many( "Use 'await fetch_many_async(...)' instead." 
) + def validate_icon_url(self, url: str, *, timeout_seconds: float | None = None) -> str | None: + """Return the final URL for a reachable image-like favicon candidate. + + Args: + url: Absolute HTTP(S) icon candidate URL. + timeout_seconds: Optional request timeout override. + + Returns: + Final URL after redirects when the candidate responds successfully + with an image-like content type; otherwise ``None``. + """ + parsed = urlsplit(url) + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + return None + request_kwargs: dict[str, Any] = {} + if timeout_seconds is not None: + request_kwargs["timeout"] = timeout_seconds + headers = {"Range": "bytes=0-0"} + try: + response = self.client.head(url, **request_kwargs) + if response.status_code in {405, 501} or response.status_code >= 400: + response = self.client.get(url, headers=headers, **request_kwargs) + response.raise_for_status() + except httpx.HTTPError: + return None + if not self._is_icon_response(response): + return None + return str(response.url) + async def fetch_many_async( self, urls: list[str], @@ -312,3 +342,20 @@ def _raise_if_content_length_too_large(self, response: httpx.Response) -> None: raise PageTooLargeError( f"page exceeded max size limit ({size} > {self.max_page_bytes} bytes): {response.url}" ) + + def _is_icon_response(self, response: httpx.Response) -> bool: + """Return whether an HTTP response looks like a usable icon image. + + Args: + response: HTTPX response returned by a HEAD or lightweight GET + request. + + Returns: + True for successful image-like responses; false otherwise. 
+ """ + content_type = response.headers.get("content-type", "").split(";", 1)[0].strip().lower() + if content_type.startswith("image/"): + return True + if content_type in {"application/octet-stream", "binary/octet-stream"}: + return urlsplit(str(response.url)).path.lower().endswith((".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp")) + return False diff --git a/crawler/crawling/metadata.py b/crawler/crawling/metadata.py index d79f1c9..2e936b0 100644 --- a/crawler/crawling/metadata.py +++ b/crawler/crawling/metadata.py @@ -8,6 +8,7 @@ from bs4 import BeautifulSoup from bs4 import Tag +from extract_favicon import from_html as extract_favicons_from_html from crawler.utils import clean_text @@ -102,14 +103,56 @@ def _pick_icon_url(page_url: str, soup: BeautifulSoup) -> str | None: continue ranked_candidates.append((priority, index, resolved_href)) - if ranked_candidates: - ranked_candidates.sort(key=lambda item: (item[0], item[1])) - return ranked_candidates[0][2] + if not ranked_candidates: + return None + ranked_candidates.sort(key=lambda item: (item[0], item[1])) + return ranked_candidates[0][2] + + +def _favicon_score(favicon: object, ordinal: int) -> tuple[int, int]: + """Return a sortable quality score for one `extract-favicon` result. + + Args: + favicon: Favicon-like object returned by `extract-favicon`. + ordinal: Original candidate order used as a stable tie-breaker. + + Returns: + Tuple where larger values are preferred. + """ + width = int(getattr(favicon, "width", 0) or 0) + height = int(getattr(favicon, "height", 0) or 0) + area = width * height + return (area, -ordinal) - if not _is_http_url(page_url): + +def _pick_icon_url_with_library(page_url: str, html: str) -> str | None: + """Pick the best explicit icon URL using `extract-favicon`. + + Args: + page_url: Final fetched page URL used to resolve relative icon paths. + html: Raw homepage HTML to parse. 
+ + Returns: + Best HTTP(S) icon candidate from page metadata, or ``None`` when the + page declares no usable icon. + """ + favicons = extract_favicons_from_html(html, root_url=page_url, include_fallbacks=False) + candidates = [ + (favicon, str(getattr(favicon, "url", "") or "").strip()) + for favicon in sorted(favicons, key=lambda item: str(getattr(item, "url", ""))) + ] + usable = [ + (favicon, url) + for favicon, url in candidates + if url and _is_http_url(url) + ] + if not usable: return None - parsed = urlsplit(page_url) - return f"{parsed.scheme}://{parsed.netloc}/favicon.ico" + _favicon, url = max( + enumerate(usable), + key=lambda item: _favicon_score(item[1][0], item[0]), + )[1] + return url def extract_site_metadata(page_url: str, html: str) -> SiteMetadata: @@ -128,4 +171,5 @@ def extract_site_metadata(page_url: str, html: str) -> SiteMetadata: if soup.title is not None: title = clean_text(soup.title.get_text(" ", strip=True)) or None - return SiteMetadata(title=title, icon_url=_pick_icon_url(page_url, soup)) + icon_url = _pick_icon_url_with_library(page_url, html) or _pick_icon_url(page_url, soup) + return SiteMetadata(title=title, icon_url=icon_url) diff --git a/crawler/crawling/orchestrator.py b/crawler/crawling/orchestrator.py index e9ec5f2..bda782f 100644 --- a/crawler/crawling/orchestrator.py +++ b/crawler/crawling/orchestrator.py @@ -87,6 +87,7 @@ def crawl_blog(self, blog: dict[str, object]) -> int: timeout_seconds=self._remaining_timeout_seconds(deadline, blog_record.url), ) metadata = extract_site_metadata(homepage.url, homepage.text) + validated_icon_url = self._validated_icon_url(metadata.icon_url, deadline, blog_record.url) candidate_pages = self._discover_candidate_pages(homepage) discovered_count = self._crawl_candidate_pages( blog_record, @@ -100,11 +101,28 @@ def crawl_blog(self, blog: dict[str, object]) -> int: status_code=homepage.status_code, discovered_count=discovered_count, title=metadata.title, - icon_url=metadata.icon_url, + 
icon_url=validated_icon_url, ), ) return discovered_count + def _validated_icon_url(self, icon_url: str | None, deadline: float, blog_url: str) -> str | None: + """Return a reachable icon URL, or ``None`` when no candidate validates. + + Args: + icon_url: Candidate icon URL extracted from homepage metadata. + deadline: Monotonic crawl deadline shared by the current blog crawl. + blog_url: Source blog URL used in timeout error messages. + + Returns: + Final reachable icon URL when validation succeeds; otherwise + ``None``. + """ + if not icon_url: + return None + remaining = self._remaining_timeout_seconds(deadline, blog_url) + return self.fetcher.validate_icon_url(icon_url, timeout_seconds=remaining) + def _discover_candidate_pages(self, homepage: FetchResult) -> list[str]: """Discover friend-link candidate pages starting from the homepage. @@ -220,6 +238,12 @@ def _store_page_links( ) raw_record_id = int(raw_record["id"]) if raw_record["status"] == "rule:duplicate_url": + stored_count += self._store_duplicate_target_edge( + blog=blog, + normalized_url=normalized.normalized_url, + link=link, + seen_normalized=seen_normalized, + ) continue decision = self._evaluate_link(blog, normalized.normalized_url, link, deadline=deadline) status = str(decision.status or "success") @@ -242,6 +266,7 @@ def _store_page_links( normalized_url=normalized.normalized_url, domain=normalized.domain, feed_url=decision.feed_url, + accepted_by=decision.accepted_by, ) edge = FriendLinkEdge( from_blog_id=blog.id, @@ -259,6 +284,42 @@ def _store_page_links( return stored_count + def _store_duplicate_target_edge( + self, + *, + blog: BlogNode, + normalized_url: str, + link: ExtractedLink, + seen_normalized: set[str], + ) -> int: + """Persist an edge for a duplicate raw URL that already maps to a blog. + + Args: + blog: Source blog currently being crawled. + normalized_url: Normalized target URL already seen by an earlier + raw discovery. 
+ link: Extracted source-page link carrying raw URL and text. + seen_normalized: Per-source crawl de-duplication set for targets. + + Returns: + ``1`` when an edge write was attempted for an existing target blog, + otherwise ``0``. + """ + + if normalized_url in seen_normalized: + return 0 + existing_blog_id = self.repository.find_blog_id_by_normalized_url(normalized_url=normalized_url) + if existing_blog_id is None: + return 0 + seen_normalized.add(normalized_url) + self.repository.add_edge( + from_blog_id=blog.id, + to_blog_id=existing_blog_id, + link_url_raw=link.url, + link_text=link.text, + ) + return 1 + def _evaluate_link( self, blog: BlogNode, diff --git a/crawler/crawling/pipeline.py b/crawler/crawling/pipeline.py index 54fabe1..f65ca62 100644 --- a/crawler/crawling/pipeline.py +++ b/crawler/crawling/pipeline.py @@ -26,6 +26,29 @@ ShouldStopHook = Callable[[], bool] +def _crawl_error_kind(error: Exception) -> str: + """Return a stable crawl failure category for persistence. + + Args: + error: Exception raised while processing one blog. + + Returns: + Machine-readable failure kind used to separate retryable crawl errors + from blog acceptance semantics. + """ + + if isinstance(error, PageTooLargeError): + return "page_too_large" + if isinstance(error, TimeoutError): + return "timeout" + error_name = type(error).__name__.lower() + if "http" in error_name: + return "http_status" + if "request" in error_name: + return "request_error" + return "crawl_error" + + class CrawlPipeline: """Coordinate seed bootstrap, one-shot crawl batches, and export writing. 
@@ -110,7 +133,6 @@ def run_once( """ stats = CrawlRunStats() limit = max_nodes or self.settings.max_nodes_per_run - normal_slots_remaining = 0 stop_reason: str | None = None while stats.processed < limit: @@ -121,11 +143,7 @@ def run_once( if not capacity.allowed: stop_reason = capacity.reason break - # Claiming and processing stay in one loop so the batch result - # always reflects the same queue-fairness rules as runtime mode. - row, _claimed_priority, normal_slots_remaining = self._claim_next_scheduled_blog( - normal_slots_remaining=normal_slots_remaining - ) + row = self._get_next_waiting_blog() if row is None: break result = self.process_blog_row( @@ -172,10 +190,6 @@ def process_blog_row( failed. """ blog = BlogNode.from_row(row) - if hasattr(self.repository, "mark_ingestion_request_crawling"): - # Priority ingestion requests need a state transition before the - # actual crawl starts so UI callers can observe progress promptly. - self.repository.mark_ingestion_request_crawling(blog_id=blog.id) if on_blog_start is not None: on_blog_start(blog.callback_payload()) try: @@ -197,69 +211,14 @@ def write_exports(self) -> dict[str, Any]: """ return self.export_service.write_exports() - def _claim_next_scheduled_blog(self, *, normal_slots_remaining: int) -> tuple[dict[str, Any] | None, bool, int]: - """Claim the next eligible blog while enforcing priority fairness. - - Args: - normal_slots_remaining: Remaining count in the current fairness - window that allows normal-queue blogs after a priority claim. - - Returns: - A tuple of ``(row, claimed_priority, next_normal_slots_remaining)`` - describing the claimed blog, whether it came from the priority - queue, and the updated fairness-window counter. - """ - priority_slots = max(1, self.settings.priority_seed_normal_queue_slots) - if normal_slots_remaining <= 0: - # A priority seed wins immediately when its turn comes up. 
- row = self._get_next_priority_blog() - if row is not None: - return row, True, priority_slots - - include_priority = normal_slots_remaining <= 0 - row = self._get_next_waiting_blog(include_priority=include_priority) - if row is not None: - # After one priority seed is claimed, let a bounded number of normal - # queue items run before checking the priority queue again. - next_remaining = max(0, normal_slots_remaining - 1) if normal_slots_remaining > 0 else 0 - return row, False, next_remaining - - if normal_slots_remaining > 0: - # If the normal queue is empty during a fairness window, do not make - # the priority seed wait for the remaining normal slots to expire. - row = self._get_next_priority_blog() - if row is not None: - return row, True, priority_slots - return None, False, 0 - - def _get_next_priority_blog(self) -> dict[str, Any] | None: - """Return the next priority blog row if the repository supports it. - - Returns: - The claimed priority blog row, or ``None`` when no priority queue is - available or no priority blog is waiting. - """ - getter = getattr(self.repository, "get_next_priority_blog", None) - if getter is None: - return None - return getter() - - def _get_next_waiting_blog(self, *, include_priority: bool) -> dict[str, Any] | None: + def _get_next_waiting_blog(self) -> dict[str, Any] | None: """Return the next waiting blog row from the main queue. - Args: - include_priority: Whether repository implementations should allow - priority rows to be returned from the general waiting query. - Returns: The next claimed waiting blog row, or ``None`` when the queue is empty. """ - getter = self.repository.get_next_waiting_blog - try: - return getter(include_priority=include_priority) - except TypeError: - return getter() + return self.repository.get_next_waiting_blog() def _crawl_blog(self, blog: dict[str, Any]) -> int: """Crawl one blog row through the orchestrator. 
@@ -292,6 +251,8 @@ def _mark_blog_failed(self, blog_id: int, error: Exception) -> None: crawl_status=state.status, status_code=state.status_code, friend_links_count=state.friend_links_count, + crawl_error_kind=_crawl_error_kind(error), + crawl_error_message=str(error)[:1000], ) self.logger.crawl_error(blog_id=blog_id, error=error) diff --git a/crawler/filters.py b/crawler/filters.py index abd5f76..956e88d 100644 --- a/crawler/filters.py +++ b/crawler/filters.py @@ -6,8 +6,6 @@ from urllib.parse import urlparse from crawler.crawling.decisions.rule_helpers import BLOCKED_TLDS -from crawler.crawling.decisions.rule_helpers import FILE_SUFFIX_BLOCKLIST -from crawler.crawling.decisions.rule_helpers import PATH_BLOCKLIST from crawler.crawling.decisions.rule_helpers import PLATFORM_BLOCKLIST from crawler.crawling.decisions.rule_helpers import has_asset_suffix from crawler.crawling.decisions.rule_helpers import has_extra_location_parts diff --git a/crawler/main.py b/crawler/main.py index e5665cc..2910ed7 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -72,7 +72,7 @@ def build_crawler_state(settings: Settings | None = None) -> CrawlerState: runtime=CrawlerRuntimeService( pipeline, worker_count=resolved.runtime_worker_count, - priority_seed_normal_queue_slots=resolved.priority_seed_normal_queue_slots, + auto_start_interval_seconds=resolved.runtime_auto_start_interval_seconds, ), ) @@ -100,6 +100,32 @@ def create_app(state: CrawlerState | None = None) -> FastAPI: app.add_middleware(RequestIdMiddleware, service=SERVICE_NAME) app.state.crawler_state = state or build_crawler_state() + @app.on_event("startup") + def start_runtime_auto_scheduler() -> None: + """Start runtime auto scheduling when the ASGI app starts serving.""" + scheduler_result = app.state.crawler_state.runtime.start_auto_scheduler() + log_event( + LOGGER, + event="crawler.runtime.auto_scheduler.started", + message="crawler runtime auto scheduler started", + stage="runtime", + 
accepted=scheduler_result.get("accepted"), + interval_seconds=scheduler_result.get("interval_seconds"), + reason=scheduler_result.get("reason"), + ) + + @app.on_event("shutdown") + def stop_runtime_auto_scheduler() -> None: + """Stop runtime auto scheduling when the ASGI app shuts down.""" + scheduler_result = app.state.crawler_state.runtime.stop_auto_scheduler() + log_event( + LOGGER, + event="crawler.runtime.auto_scheduler.stopped", + message="crawler runtime auto scheduler stopped", + stage="runtime", + accepted=scheduler_result.get("accepted"), + ) + def get_state() -> CrawlerState: """Return the app-scoped crawler state container. diff --git a/crawler/runtime/service.py b/crawler/runtime/service.py index d4d51e8..9d4bb46 100644 --- a/crawler/runtime/service.py +++ b/crawler/runtime/service.py @@ -34,8 +34,6 @@ class CrawlerRuntimeService: pipeline: Crawl pipeline used to process individual blog rows. executor: Thread launcher used for background runtime execution. worker_count: Number of runtime workers to run in parallel. - priority_seed_normal_queue_slots: Number of normal queue claims allowed - after a priority seed claim. """ def __init__( @@ -44,7 +42,7 @@ def __init__( executor: SerialRuntimeExecutor | None = None, *, worker_count: int = 1, - priority_seed_normal_queue_slots: int = 2, + auto_start_interval_seconds: float = 3600.0, ) -> None: """Initialize runtime state, workers, and synchronization primitives. @@ -52,8 +50,7 @@ def __init__( pipeline: Crawl pipeline reused by synchronous and background runs. executor: Optional executor used to start the background thread. worker_count: Requested number of runtime workers. - priority_seed_normal_queue_slots: Fairness window size after a - priority queue claim. + auto_start_interval_seconds: Seconds between idle runtime checks. Returns: ``None``. 
The runtime service stores its dependencies and prepares @@ -62,14 +59,13 @@ def __init__( self.pipeline = pipeline self.executor = executor or SerialRuntimeExecutor() self.worker_count = max(1, worker_count) - self.priority_seed_normal_queue_slots = max(1, priority_seed_normal_queue_slots) + self.auto_start_interval_seconds = max(0.001, auto_start_interval_seconds) self.capacity_gate = getattr(pipeline, "capacity_gate", None) if self.capacity_gate is None: self.capacity_gate = CrawlerCapacityGate( pipeline.repository, raw_discovered_url_limit=-1, ) - self._normal_slots_remaining_after_priority = 0 self._snapshot = RuntimeSnapshot( worker_count=self.worker_count, workers=[ @@ -81,6 +77,8 @@ def __init__( self._claim_lock = Lock() self._stop_event = Event() self._thread: Thread | None = None + self._scheduler_thread: Thread | None = None + self._scheduler_stop_event = Event() def status(self) -> dict[str, Any]: """Return the current full runtime snapshot. @@ -140,6 +138,42 @@ def start(self) -> dict[str, Any]: self._thread = self.executor.start(self._run_background_loop) return self._snapshot.as_dict() + def start_auto_scheduler(self) -> dict[str, Any]: + """Start the periodic idle-runtime wakeup loop if needed. + + Returns: + Payload describing whether the scheduler is running. + """ + with self._lock: + if self._scheduler_thread is not None and self._scheduler_thread.is_alive(): + return { + "accepted": False, + "reason": "scheduler_already_running", + "interval_seconds": self.auto_start_interval_seconds, + } + self._scheduler_stop_event.clear() + self._scheduler_thread = Thread( + target=self._run_auto_scheduler, + daemon=True, + name="crawler-auto-scheduler", + ) + self._scheduler_thread.start() + return { + "accepted": True, + "interval_seconds": self.auto_start_interval_seconds, + } + + def stop_auto_scheduler(self) -> dict[str, Any]: + """Request the periodic idle-runtime wakeup loop to stop. 
+ + Returns: + Payload describing whether a scheduler stop was requested. + """ + self._scheduler_stop_event.set() + with self._lock: + running = self._scheduler_thread is not None and self._scheduler_thread.is_alive() + return {"accepted": running} + def stop(self) -> dict[str, Any]: """Request the background loop to stop at the next safe checkpoint. @@ -183,7 +217,6 @@ def run_batch(self, max_nodes: int) -> dict[str, Any]: "runtime": self._snapshot.as_dict(), } self._stop_event.clear() - self._normal_slots_remaining_after_priority = 0 self._begin_run_locked("running") try: @@ -210,7 +243,6 @@ def _run_background_loop(self) -> None: """ with self._lock: self._snapshot.runner_status = "running" - self._normal_slots_remaining_after_priority = 0 try: result = self._run_worker_pool(max_nodes=None) @@ -223,6 +255,21 @@ def _run_background_loop(self) -> None: self._finish_run_locked(result) self._stop_event.clear() + def _run_auto_scheduler(self) -> None: + """Periodically start the runtime when it is idle. + + Returns: + ``None``. The loop exits only when ``stop_auto_scheduler`` is + called or the process stops. + """ + while not self._scheduler_stop_event.is_set(): + with self._lock: + should_start = self._snapshot.runner_status in {"idle", "error"} + if should_start: + self.start() + if self._scheduler_stop_event.wait(self.auto_start_interval_seconds): + return + def _run_worker_pool(self, *, max_nodes: int | None) -> dict[str, Any]: """Run one worker-pool execution and aggregate the results. @@ -384,61 +431,22 @@ def _release_budget_slot(self, budget: dict[str, int | None]) -> None: budget["remaining"] = remaining + 1 def _claim_next_waiting_blog(self) -> dict[str, Any] | None: - """Claim one waiting blog while enforcing runtime fairness rules. + """Claim one waiting blog from the repository. Returns: The next claimed blog row, or ``None`` when no eligible work remains. 
""" with self._claim_lock: - if self._normal_slots_remaining_after_priority <= 0: - priority_row = self._get_next_priority_blog() - if priority_row is not None: - # One claimed priority seed opens a bounded fairness window - # for normal queue items before the next priority check. - self._normal_slots_remaining_after_priority = self.priority_seed_normal_queue_slots - return priority_row - - row = self._get_next_waiting_blog(include_priority=self._normal_slots_remaining_after_priority <= 0) - if row is not None: - if self._normal_slots_remaining_after_priority > 0: - self._normal_slots_remaining_after_priority -= 1 - return row - - if self._normal_slots_remaining_after_priority > 0: - priority_row = self._get_next_priority_blog() - if priority_row is not None: - self._normal_slots_remaining_after_priority = self.priority_seed_normal_queue_slots - return priority_row - self._normal_slots_remaining_after_priority = 0 - return None - - def _get_next_priority_blog(self) -> dict[str, Any] | None: - """Return the next priority blog from the repository, if supported. + return self._get_next_waiting_blog() - Returns: - The next priority blog row, or ``None`` when unavailable. - """ - getter = getattr(self.pipeline.repository, "get_next_priority_blog", None) - if getter is None: - return None - return getter() - - def _get_next_waiting_blog(self, *, include_priority: bool) -> dict[str, Any] | None: + def _get_next_waiting_blog(self) -> dict[str, Any] | None: """Return the next waiting blog from the repository. - Args: - include_priority: Whether the repository should allow priority rows - in its general waiting query. - Returns: The next waiting blog row, or ``None`` when the queue is empty. 
""" - getter = self.pipeline.repository.get_next_waiting_blog - try: - return getter(include_priority=include_priority) - except TypeError: - return getter() + return self.pipeline.repository.get_next_waiting_blog() def _on_blog_start(self, worker_index: int, blog: dict[str, Any]) -> None: """Record that a worker has started crawling one blog. diff --git a/doc/api-docs.md b/doc/api-docs.md index 7f5408e..84386ad 100644 --- a/doc/api-docs.md +++ b/doc/api-docs.md @@ -63,7 +63,7 @@ ### 2.1 Public API -Public API 由 `backend` 服务统一暴露,供 public 浏览、图谱与 ingestion 流程使用: +Public API 由 `backend` 服务统一暴露,供 public 浏览、图谱与用户 seed 提交流程使用: - `GET /` - `GET /internal/health` @@ -72,20 +72,26 @@ Public API 由 `backend` 服务统一暴露,供 public 浏览、图谱与 inge - `POST /api/auth/login` - `GET /api/auth/me` - `POST /api/auth/logout` +- `POST /api/auth/email/verify/request` +- `POST /api/auth/email/verify/confirm` +- `POST /api/auth/password/forgot` +- `POST /api/auth/password/reset` - `GET /api/me/label-selections` - `GET /api/blogs/catalog` +- `POST /api/recommendations/random-blog-batches` +- `POST /api/recommendation-events` +- `GET /api/blogs/{blog_id}/stats` +- `POST /api/blogs/user-seeds` - `POST /api/blogs/{blog_id}/user-labels` - `GET /api/blogs/lookup` - `GET /api/blogs/{blog_id}` +- `GET /api/icons/proxy` - `GET /api/graph/views/core` - `GET /api/graph/nodes/{blog_id}/neighbors` - `GET /api/graph/snapshots/latest` - `GET /api/graph/snapshots/{version}` - `GET /api/stats` - `GET /api/filter-stats` -- `GET /api/ingestion-requests` -- `POST /api/ingestion-requests` -- `GET /api/ingestion-requests/{request_id}` 源码位置: [backend/main.py](../backend/main.py) @@ -98,7 +104,7 @@ Public API 由 `backend` 服务统一暴露,供 public 浏览、图谱与 inge ### 2.2 Admin API -Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并要求 `Authorization: Bearer `: +Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并要求 `Authorization: Bearer `。该 token 可以是 legacy `HEYBLOG_ADMIN_TOKEN`,也可以是已登录、已验证邮箱且 `role=admin` 的用户 session token: - `GET 
/api/admin/runtime/status` - `GET /api/admin/runtime/current` @@ -119,9 +125,10 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - `GET /api/admin/blog-labeling/parquet-export` - `POST /api/admin/blog-labeling/title-preview` - `PUT /api/admin/blog-labeling/labels/{blog_id}` -- `POST /api/admin/blog-dedup-scans` -- `GET /api/admin/blog-dedup-scans/latest` -- `GET /api/admin/blog-dedup-scans/{run_id}/items` +- `GET /api/admin/recommendation-stats` +- `GET /api/admin/hourly-stats` +- `GET /api/admin/users` +- `PATCH /api/admin/users/{user_id}/role` 补充脚本: @@ -131,9 +138,11 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 认证语义: +- Admin API 接受 legacy `HEYBLOG_ADMIN_TOKEN`,也接受已登录、已验证邮箱且 `role=admin` 的用户 session token。 - 未提供 token:`401 admin_auth_required` - token 不合法:`403 admin_auth_invalid` -- 未配置 `HEYBLOG_ADMIN_TOKEN` 且未开启 `HEYBLOG_ADMIN_DEV_BYPASS=true`:`503 admin_auth_not_configured` +- token 属于普通用户或未验证 admin 候选账号:`403 admin_auth_forbidden` +- 未配置 `HEYBLOG_ADMIN_TOKEN` 且未开启 `HEYBLOG_ADMIN_DEV_BYPASS=true`,同时请求也不是合法 admin 用户 session:`503 admin_auth_not_configured` ### 2.2 内部服务 API @@ -286,7 +295,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 #### `POST /api/auth/register` -用途:使用邮箱和密码注册普通用户,并立即创建登录会话。当前版本不做邮箱验证。 +用途:提交邮箱和密码并发送验证邮件。该接口只创建临时待验证注册记录,不创建登录 session,也不会把用户账号写入 `users`。只有用户通过验证码/验证链接完成 `/api/auth/email/verify/confirm` 后,系统才会创建持久化用户账号。游客无需入库;未登录请求即游客身份。 请求体: @@ -301,23 +310,20 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 ```json { - "token": "session-token", - "expires_at": "2026-06-25T00:00:00+00:00", - "user": { - "id": 1, - "email": "user@example.com", - "display_name": "user", - "created_at": "2026-05-26T22:04:50+00:00", - "updated_at": "2026-05-26T22:04:50+00:00" - } + "sent": true, + "verification_token": "dev-verification-token", + "verification_url": "http://127.0.0.1:3000/profile?verify_token=dev-verification-token", + "expires_at": "2026-06-12T00:00:00+00:00" } ``` 错误语义: - `409 email_already_registered` +- `409 
email_registration_pending` - `422 invalid_email` - `422 password_too_short` +- `502 email_delivery_failed` #### `POST /api/auth/login` @@ -344,6 +350,100 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 用途:注销当前 session token。请求头同 `/api/auth/me`。 +#### `POST /api/auth/email/verify/request` + +用途:为已经创建但尚未验证的普通用户或 admin 用户重新生成邮箱验证 token。未知邮箱和仍处于注册待验证阶段、尚未持久化的邮箱返回中性成功语义,避免暴露账号是否存在;待验证新注册应继续使用注册邮件中的链接完成账号创建。 + +邮件通道由 `persistence-api` 的 `HEYBLOG_EMAIL_PROVIDER` 控制。默认 `disabled` 模式不会连接 SMTP,API 响应默认只返回发送状态和过期时间,不暴露明文 token。需要本地调试或手动验证时,可显式设置 `HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS=true` 让响应体返回一次性验证 token/link。设置 `HEYBLOG_EMAIL_PROVIDER=smtp` 后,系统会把验证链接发送到用户邮箱。 + +请求体: + +```json +{ + "email": "user@example.com" +} +``` + +成功响应: + +```json +{ + "sent": true, + "verification_token": "dev-verification-token", + "verification_url": "http://127.0.0.1:3000/profile?verify_token=dev-verification-token", + "expires_at": "2026-06-10T00:00:00+00:00" +} +``` + +生产 SMTP 且关闭 dev token 暴露后的成功响应: + +```json +{ + "sent": true, + "expires_at": "2026-06-10T00:00:00+00:00" +} +``` + +错误语义: + +- `502 email_delivery_failed` + +#### `POST /api/auth/email/verify/confirm` + +用途:消费邮箱验证邮件链接中的一次性 token。对于新注册 token,该接口先创建持久化用户账号,再返回已验证用户资料;对于历史未验证账号 token,该接口把已有用户标记为已验证。token 只保存 hash,过期或已消费后不可复用。浏览器打开 `/profile?verify_token=...` 时,前端会自动调用该接口完成验证,随后提示用户登录。 + +请求体: + +```json +{ + "token": "dev-verification-token" +} +``` + +返回:创建或更新后的用户资料。新注册用户默认 `role=user`、`email_verified=true`,不会自动创建登录 session。 + +#### `POST /api/auth/password/forgot` + +用途:请求密码重置 token。未知邮箱返回中性成功语义。 + +请求体: + +```json +{ + "email": "user@example.com" +} +``` + +默认响应隐藏明文 reset token,只返回发送状态和过期时间。需要本地调试或手动验证时,可显式设置 `HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS=true` 返回可直接使用的 `reset_token` 与 `reset_url`。设置 `HEYBLOG_EMAIL_PROVIDER=smtp` 后,系统会把 reset link 发送到用户邮箱。后端始终只持久化 token hash。 + +生产 SMTP 且关闭 dev token 暴露后的成功响应: + +```json +{ + "sent": true, + "expires_at": "2026-06-10T00:00:00+00:00" +} +``` + +错误语义: + +- `502 email_delivery_failed` + 
+#### `POST /api/auth/password/reset` + +用途:消费一次性密码重置 token,设置新密码,并撤销该用户所有旧 session。 + +请求体: + +```json +{ + "token": "dev-reset-token", + "password": "new long enough" +} +``` + +返回:更新后的用户资料。 + #### `GET /api/me/label-selections` 用途:返回当前登录用户最近的随机博客标注选择。 @@ -395,6 +495,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - `url`: URL 筛选,匹配 `url` / `normalized_url` - `status`: 抓取状态精确筛选;会先做 `trim + uppercase`,仅允许 `WAITING`、`PROCESSING`、`FINISHED`、`FAILED` - `statuses`: 多状态筛选,逗号分隔;会对每个值做 `trim + uppercase`,仅允许 `WAITING`、`PROCESSING`、`FINISHED`、`FAILED` +- `acceptance_status`: 博客接受状态筛选,默认 `ACCEPTED`;允许 `ACCEPTED`、`UNKNOWN`、`REJECTED`。该字段表示 URL 是否已被 seed、RSS 或模型确认为博客,独立于 `crawl_status` - `sort`: 排序方式,允许 `id_asc`、`id_desc`、`recent_activity`、`connections`、`recently_discovered`、`random` - `has_title`: 是否要求有标题;支持布尔值,也接受 `1/0`、`true/false`、`yes/no` - `has_icon`: 是否要求有 icon;支持布尔值,也接受 `1/0`、`true/false`、`yes/no` @@ -405,8 +506,10 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 空白字符串会被视为未传参 - 非法 `status` 返回 `422` - 非法 `statuses` 返回 `422` +- 非法 `acceptance_status` 返回 `422` - 非法 `sort` 返回 `422` - 当 `statuses` 存在时优先于 `status`,用于同时查询多个 `crawl_status` +- 默认只返回 `acceptance_status=ACCEPTED` 的 URL;`crawl_status=FAILED` 只表示最近一次抓取尝试失败,不表示该 URL 不是博客 - `has_title` / `has_icon` 仅在传入真值时启用过滤;传入假值会保留参数值但不额外筛掉空字段记录 - `id_asc` 按业务 `blog_id ASC` - `recent_activity` 按 `activity_at DESC, connection_count DESC, blog_id DESC` @@ -420,6 +523,298 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 当前前端使用方式: +- 首页搜索框使用 `page=1&page_size=30&url=<输入 URL>&sort=id_desc` 查询已发现博客,并把返回项渲染为可滚动结果列表。 + +#### `POST /api/recommendations/random-blog-batches` + +用途:随机博客页请求一组新的推荐卡片,并把本次刷新作为一条可追踪的 recommendation request 持久化。服务端会同时写入有序 impression 记录,所以后续点击、详情打开和标注事件可以归因到“哪次刷新中的第几个 URL”。 + +请求体: + +```json +{ + "count": 9, + "visitor_id": "visitor_lx7...", + "session_id": "session_lx7...", + "source": "random_page", + "page_url": "http://localhost:3000/random", + "context": { + "refresh_kind": "manual" + 
} +} +``` + +认证说明: + +- 未登录也可调用;`visitor_id` 与 `session_id` 由前端本地生成,用于匿名统计。 +- 登录后可带 `Authorization: Bearer <token>`;backend 会把用户 ID 转发给 persistence 以便后续用户维度分析。 + +行为说明: + +- 当前 surface 固定为 `random_blog_page` +- 当前 strategy 固定为 `weighted_random`,`strategy_version = v1` +- 只返回 `crawl_status=FINISHED` 且 `acceptance_status=ACCEPTED` 的博客 +- 随机排序复用 catalog 的 `sort=random` 权重逻辑:管理员非 blog 标签会过滤,用户公开反馈会影响权重 +- `count` 当前允许 `1..50`;随机页默认请求 `9` + +成功响应示例: + +```json +{ + "request_uuid": "r_abc", + "surface": "random_blog_page", + "strategy": "weighted_random", + "strategy_version": "v1", + "visitor_id": "visitor_lx7", + "session_id": "session_lx7", + "requested_count": 9, + "served_count": 9, + "created_at": "2026-06-07T13:30:00+00:00", + "items": [ + { + "id": 12, + "url": "https://blog.example.com/", + "normalized_url": "https://blog.example.com/", + "request_uuid": "r_abc", + "impression_id": 101, + "position": 1 + } + ] +} +``` + +错误语义: + +- `401`: bearer token 非法或过期 +- `422`: count、visitor/session ID 或 JSON context 非法 + +#### `POST /api/recommendation-events` + +用途:记录随机博客卡片上的用户行为。事件以 `event_uuid` 幂等写入,适合前端在详情跳转、外链打开、标注选择等动作发生时尽力而为上报。 + +请求体: + +```json +{ + "event_uuid": "event_lx7...", + "event_type": "detail_open", + "blog_id": 12, + "visitor_id": "visitor_lx7...", + "session_id": "session_lx7...", + "entrance_kind": "random_blog_page", + "entrance_url": "http://localhost:3000/random", + "request_uuid": "r_abc", + "impression_id": 101, + "position": 1, + "interaction_order": 1, + "client_event_at": "2026-06-07T13:31:00.000Z", + "attributes": { + "label": "blog" + } +} +``` + +支持的 `event_type`: + +- `click` +- `detail_open` +- `external_open` +- `label_select` +- `refresh` +- `dismiss` +- `copy_url` + +行为说明: + +- 同一个 `event_uuid` 重复上报时不会重复计数,响应中会返回 `duplicate: true` +- `entrance_kind` 与 `entrance_url` 为必填字段。`entrance_kind` 使用稳定、可聚合的路口种类,例如 `random_blog_page`、`home_search_result`、`blog_detail_discovery_path`、`blog_detail_relation_graph`;`entrance_url` 保留触发动作时的原始页面 URL 或上下文 
URL,便于追溯具体来源。 +- 若传入 `request_uuid` 或 `impression_id`,服务端会校验它们存在且与当前 blog 的 `normalized_url` 匹配 +- 前端不应因为事件上报失败而阻塞用户跳转或标注主流程 +- 持久化时事件落到 `blog_interactions`,以 `normalized_url` 作为博客归因键;其中 `entrance_kind` 与 `entrance_url` 单独存列并建立索引,便于按稳定路口维度统计详情打开、外链打开和标签选择。 + +错误语义: + +- `404`: 目标 blog 不存在 +- `401`: bearer token 非法或过期 +- `422`: event type、request/impression 归因或 JSON attributes 非法 + +#### `GET /api/blogs/{blog_id}/stats` + +用途:返回单个博客在推荐系统中的曝光和交互统计,供详情页或后续运营面板展示。 + +成功响应示例: + +```json +{ + "blog_id": 12, + "normalized_url": "https://blog.example.com/", + "impressions": 20, + "clicks": 1, + "detail_opens": 3, + "external_opens": 0, + "label_selects": 2, + "unique_visitors": 5, + "ctr": 0.2, + "last_interaction_at": "2026-06-07T13:31:00+00:00", + "by_event_type": { + "detail_open": 3, + "label_select": 2 + } +} +``` + +错误语义: + +- `404`: 目标 blog 不存在 + +#### `GET /api/admin/recommendation-stats` + +用途:返回推荐请求、曝光和交互的策略级汇总。该接口位于 admin API 下,需要 `Authorization: Bearer <HEYBLOG_ADMIN_TOKEN>`。 + +成功响应示例: + +```json +{ + "total_requests": 10, + "total_impressions": 90, + "total_interactions": 12, + "by_strategy": [ + { + "surface": "random_blog_page", + "strategy": "weighted_random", + "strategy_version": "v1", + "requests": 10, + "impressions": 90, + "clicks": 8, + "unique_visitors": 6, + "ctr": 0.0888888889 + } + ] +} +``` + +#### `GET /api/admin/hourly-stats` + +用途:返回后台统计小时快照,并在读取时刷新当前自然小时的数据。该接口位于 admin API 下,需要 `Authorization: Bearer <HEYBLOG_ADMIN_TOKEN>` 或已验证 admin 用户 session token。 + +查询参数: + +- `limit`: 返回最近多少个自然小时快照,默认 `24`,最大 `168` + +统计语义: + +- 数据写入 `admin_hourly_stats` 表,每条记录对应一个 UTC 自然小时窗口 `[hour_start, hour_start + 1h)` +- `user_count`: 当前 active 用户总数 +- `random_request_count`: 该小时内 random blog 推荐请求数 +- `random_impression_count`: 该小时内 random blog 推荐曝光数;随机页每次通常请求 9 个 +- `detail_open_count`: 该小时内 random blog 卡片详情打开次数 +- `external_open_count`: 该小时内 random blog 卡片外链打开次数 +- `detail_ctr`: `detail_open_count / random_impression_count` +- `external_ctr`: `external_open_count / random_impression_count` +- 
`total_click_ctr`: `(detail_open_count + external_open_count) / random_impression_count` + +成功响应示例: + +```json +{ + "current_hour": { + "id": 1, + "hour_start": "2026-06-11T10:00:00+00:00", + "user_count": 12, + "random_request_count": 3, + "random_impression_count": 27, + "detail_open_count": 4, + "external_open_count": 5, + "detail_ctr": 0.1481481481, + "external_ctr": 0.1851851852, + "total_click_ctr": 0.3333333333, + "refreshed_at": "2026-06-11T10:05:00+00:00", + "created_at": "2026-06-11T10:05:00+00:00" + }, + "latest": { + "id": 1, + "hour_start": "2026-06-11T10:00:00+00:00", + "user_count": 12, + "random_request_count": 3, + "random_impression_count": 27, + "detail_open_count": 4, + "external_open_count": 5, + "detail_ctr": 0.1481481481, + "external_ctr": 0.1851851852, + "total_click_ctr": 0.3333333333, + "refreshed_at": "2026-06-11T10:05:00+00:00", + "created_at": "2026-06-11T10:05:00+00:00" + }, + "items": [] +} +``` + +#### `POST /api/blogs/user-seeds` + +用途:当首页 URL 搜索没有命中时,允许用户提交一个完整博客链接作为用户来源 seed。该接口只执行确定性规则过滤,跳过 RSS discovery 与模型共识;规则通过后会把 URL 同时写入 `blogs` 与 `seeds`。 + +请求体: + +```json +{ + "homepage_url": "https://blog.example.com" +} +``` + +成功语义: + +- URL 先按当前 identity/canonicalization 规则归一化 +- 只运行过滤链中的 rule filters;不会因为缺少 RSS、模型未加载或模型判非博客而拒绝 +- 规则通过后,`blogs.acceptance_status = ACCEPTED` +- `blogs.accepted_by = user` +- 新建或历史 `FAILED` 博客会处于 `crawl_status = WAITING`,因此可被 crawler 领取并抓取友链 +- 已经 `FINISHED` 的博客不会被强制重置为 `WAITING` +- 同一 URL 会 upsert 到 `seeds` 表,当前用 `source_path = user` 标记用户来源 + +成功响应示例: + +```json +{ + "status": "QUEUED", + "blog_id": 123, + "inserted": true, + "blog": { + "id": 123, + "blog_id": 123, + "url": "https://blog.example.com/", + "normalized_url": "https://blog.example.com/", + "domain": "blog.example.com", + "acceptance_status": "ACCEPTED", + "accepted_by": "user", + "crawl_status": "WAITING" + } +} +``` + +错误语义: + +- URL 格式无法归一化或规则过滤拒绝时返回 `422` + +#### `GET /api/icons/proxy` + +用途:把已知 icon URL 作为同源图片返回,供 3D 图谱 WebGL texture 
加载使用。 + +查询参数: + +- `url`: 绝对 `http` / `https` 图片 URL。前端通常传入 `icon_url` 或 favicon API fallback URL。 + +响应: + +- 成功时返回远端图片字节,`Content-Type` 沿用远端图片 MIME,并设置 `Cache-Control: public, max-age=86400` +- 仅允许公网 HTTP(S) URL;localhost、私网、link-local、reserved 等地址会返回 `422` +- 远端超时返回 `504` +- 远端非 2xx、非图片 MIME、或响应超过 1MB 时返回 `502` + +说明: + +- 该接口不改变 `blogs.icon_url` 的持久化语义,只解决浏览器 WebGL 对跨域 texture 的 CORS 要求 +- 普通 `<img>` 展示仍可直接使用 `icon_url` 或前端 favicon fallback;图谱纹理建议统一使用该代理后的同源 URL + #### `POST /api/blogs/{blog_id}/user-labels` 用途:随机博客页为单个博客 URL 增加一次公共用户标注。该接口写入 `blog_labels_userlabel`,表结构和 `blog_labels` 一致,均按 `normalized_url` 存储 `title` 与 `label_id` 计数字典;不会修改训练用的 `blog_labels`。 @@ -443,7 +838,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - `label` 只接受随机博客页使用的四类标签:`blog`、`company`、`other`、`unknown` - `previous_label` 可选;用于随机博客页内的单 URL 单选择切换。若传入且与 `label` 不同,服务端会先把旧 label 计数减 `1`,再把新 label 计数加 `1` - 前端同一张 URL 卡片重复点击已选中的 label 不会再次请求接口,也不会重复累加计数 -- 随机博客加权时,所有 URL 默认权重为 `10`;设用户表中 `blog` 计数为 `x`、非 `blog` 计数为 `y`,权重为 `(10 + x) / (1 + y)`,最高不超过 `10` +- 随机博客加权时,所有 URL 默认权重为 `10`;设用户表中非 `blog` 计数为 `y`,权重为 `10 / (1 + y)`;`blog` 正反馈不再提升随机权重 - 权重只影响 `sort=random` 的随机排序;管理员训练标签只负责过滤非 blog,不会被用户标注改变 错误语义: @@ -492,7 +887,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 匹配阶梯: -- 先复用 ingestion 的 canonicalization / identity 规则,把输入归一化为 `normalized_query_url` +- 先复用 blog identity canonicalization 规则,把输入归一化为 `normalized_query_url` - 优先按 canonical homepage identity 精确匹配 - 若 identity 未命中,再回退到 `normalized_url` 精确相等匹配 - 若仍未命中,则返回空数组;当前不做 substring / domain contains 型广义搜索 @@ -523,12 +918,18 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 若 blog 不存在,返回 `404` - 返回内容基于单 blog 记录扩展了 `incoming_edges` 与 `outgoing_edges` +- 返回 `discovery_path` 描述该博客进入网络的路径:手动 seed/user 添加,或由 crawler 沿友链逐级发现 +- 返回 `relation_graphs` 描述详情页“博客关联”模块使用的两层入链/出链关系图;两层深度内的入链/出链关系完整返回,不按节点或边数量裁剪 额外字段: +- `crawl_status`: 当前抓取执行状态,例如 `WAITING`、`PROCESSING`、`FAILED`、`FINISHED`;详情页会直接展示该字段 +- `crawl_error_kind`: 
最近一次抓取失败分类;当 `crawl_status=FAILED` 时,详情页会把该字段作为失败原因展示,例如 `timeout`、`page_too_large`、`http_status`、`request_error` - `incoming_edges`: 所有 `to_blog_id == blog_id` 的边,每条边额外携带 `neighbor_blog` - `outgoing_edges`: 所有 `from_blog_id == blog_id` 的边,每条边额外携带 `neighbor_blog` - `recommended_blogs`: “朋友的朋友”推荐列表,规则是“当前博客的友链认识、但当前博客还没直接认识的博客” +- `discovery_path`: 发现路径。`mode=manual` 表示该博客由 `accepted_by=seed/user` 手动进入网络;`mode=crawled` 表示通过 `raw_discovered_urls` 从当前博客逐级追溯 source blog,直到 seed/user 源头、无法继续追溯或检测到循环;正常长路径会完整返回,不按固定深度截断 +- `relation_graphs`: `{ incoming, outgoing }`,两个图默认各包含从当前博客出发的 2 层关系;`incoming` 沿入链向上追溯,`outgoing` 沿出链向下展开;两层深度内不按节点或边数量裁剪 其中 `neighbor_blog` 是详情页使用的邻居摘要,字段为: @@ -875,7 +1276,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - `strategy`: `degree` 或 `seed` - `limit`: 默认子图规模上限,当前最大允许 `10000` -- `sample_mode`: `off` / `count` / `percent` +- `sample_mode`: `off` / `count` / `percent`;图谱页默认不启用采样 - `sample_value`: 当采样开启时的数量或百分比;`count` 会先用固定随机种子选择一个起点,再按 BFS 扩展到目标节点数,避免返回大量互不相连的随机点 - `sample_seed`: 固定随机种子,便于复现随机起点与补充分量顺序 @@ -913,6 +1314,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - `nodes` 元素沿用 `BlogRecord`,并额外携带 `x`、`y`、`degree`、`incoming_count`、`outgoing_count`、`priority_score`、`component_id` - 当 `has_stable_positions` 为 `true` 时,前端会优先使用这些坐标直接渲染,而不是首次实时跑力导布局 +- 当前图谱页用 0 到当前 blog 总数的滑块选择 `N`,默认值为 `min(200, total_blogs)`;点击确认后请求 `strategy=seed&limit=N`,直接按 blog id 升序选择前 N 个 blog 节点,并只返回这些节点之间的边。图谱节点不按 `crawl_status` 过滤,因为发现关系本身可能来自抓取失败或尚未完成的父节点;只要边的两端 blog 仍存在,就会参与图谱投影 - 当 `sample_mode != off` 时,会返回可复现的随机起点 BFS 子图;若起点所在连通分量不足目标规模,会按同一随机序列继续从其他分量 BFS 补足 - 服务在返回前会检查底层 graph 是否已变化;若当前仓库数据与最新 snapshot 不一致,会先重建 snapshot,再返回最新视图 - `snapshot_namespace` 用于区分当前 view 依赖的 snapshot 来源;当前默认值为 `legacy` @@ -974,100 +1376,30 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 当前 public API 已不再暴露 legacy 的 `/api/logs` 与 `/api/search`。 - 运行日志统一由 `shared.observability` 输出到类型目录,默认是 `logs/app/`、`logs/error/`、`logs/access/`;每个类型目录下再按服务分目录,保存 
`-YYYYMMDD-HH.log` 小时切片,Docker Compose 中对应 `volumes/logs`。 - legacy `/internal/logs` 仍保留兼容入口,但当前不会把 crawl log 写入业务数据库。 -- blog dedup scan 这类维护任务的进度属于 domain event,仍通过各自 run/event 接口持久化,不混入通用 application log。 - `search` 服务仍保留为内部可重建索引组件,供 health 检查与 reindex 维护链路使用,并在缓存为空时回退到 `persistence-api /internal/search-snapshot`。 - 浏览器当前没有直接依赖的 public 搜索页;public 发现主路径已经收敛到 `catalog / lookup / detail / graph views`。 -#### `GET /api/ingestion-requests` - -用途:返回统一 discovery 页“优先处理博客清单”所需的公开优先录入请求列表。 - -返回约束: - -- 固定最多返回 `20` 条 -- inclusion rule: 先返回 active request(`QUEUED`、`CRAWLING_SEED`),再补最近创建的 terminal request,直到达到上限 -- 排序固定为 `active-first -> created_at DESC -> request_id DESC` - -公开字段: - -- `request_id` -- `requested_url` -- `normalized_url` -- `status` -- `seed_blog_id` -- `matched_blog_id` -- `blog_id` -- `error_message` -- `created_at` -- `updated_at` -- `blog` - -隐私边界: - -- 该公开列表不会返回 `email` -- 该公开列表不会返回 `request_token` - -补充说明: - -- `blog` 是裁剪后的公开摘要,至少包含 `id`、`url`、`domain`、`title`、`icon_url`、`crawl_status` -- 该列表接口服务统一页的优先队列面板,不替代单条状态查询接口 - -#### `POST /api/ingestion-requests` - -用途:当搜索未命中时,由最终用户提交博客首页 URL 与联系邮箱,触发优先录入请求。 - -请求体: - -```json -{ - "homepage_url": "https://example.com/", - "email": "owner@example.com" -} -``` - -响应分两类: - -- 已收录时:直接返回 `DEDUPED_EXISTING` 与现有 `blog_id` -- 未收录时:返回请求状态、`request_id`、`request_token`、seed blog 关联信息 - -补充说明: - -- 后端会先做 URL normalize 与 email 基础校验 -- 当前去重主键已经扩展为 `identity_key`;它会忽略 `http/https`、主页默认首页路径、`www.`,以及白名单博客别名子域(如 `blog.`) -- 活跃请求会按 `identity_key + ACTIVE_INGESTION_REQUEST_STATUSES` 复用,而不是重复创建 crawl -- `request_token` 仍作为自助提交状态查询的轻量凭证;当前账号系统暂未接管 ingestion request 的所有权 - -#### `GET /api/ingestion-requests/{request_id}?request_token=...` - -用途:查询某个自助录入请求的当前状态。 - -当前返回字段重点包括: - -- `status`: 当前请求状态,常见值有 `QUEUED`、`CRAWLING_SEED`、`COMPLETED`、`FAILED` -- `seed_blog_id`: 当前请求绑定的 seed blog -- `matched_blog_id`: 若已完成并命中 blog,则返回该 blog id -- `blog`: 当前关联 blog 的摘要信息 -- `request_token`: 创建请求时返回的状态查询 token - -补充说明: - -- 当前 ingestion 
request 状态查询仍依赖 `request_id + request_token` -- 若 `request_token` 不匹配,返回 `404` -- 统一 discovery 页的公开优先队列列表不会暴露该 `request_token`;只有创建者通过该单条接口查询时才会使用它 - ### 3.5 管理员爬取执行接口 #### `POST /api/admin/crawl/bootstrap` -用途:从 `seed.csv` 导入种子博客。 +用途:导入种子博客。若 `seeds` 表已有记录,则直接以 `seeds` 表为来源回灌 `blogs`;仅当 `seeds` 表为空时才从 `seed.csv` 初始化。 调用链: - `backend` -> `crawler /internal/crawl/bootstrap` +持久化行为: + +- 每个 seed URL 会 upsert 到 `blogs`,并标记 `accepted_by=seed` +- 当 `seeds` 表为空时,会从 `seed.csv` 读取非空 URL,并同步 upsert 到 `seeds` 表,记录原始 URL、规范化 URL、domain、关联 `blog_id`、来源 CSV 路径与 CSV 数据行号 +- 当 `seeds` 表不为空时,导入动作直接 replay `seeds` 表记录到 `blogs`,不会读取 `seed.csv` +- `seeds.normalized_url` 唯一;重复导入同一个 seed 会刷新记录,不会创建重复 seed 行 +- 管理员数据库 reset 会保留 `seeds` 表数据,只清空其旧 `blog_id` 关联;下一次导入会重新把 seed 行关联到新建或复用的 blog + 响应字段: -- `seed_path`: 实际导入的种子文件路径 +- `seed_path`: 配置的种子 CSV 文件路径;当 `seeds` 表不为空时,该字段仅表示 fallback CSV 路径 - `imported`: 新导入的 blog 数量 #### `POST /api/admin/crawl/run` @@ -1108,81 +1440,6 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 ### 3.6 数据维护接口 -#### `POST /api/admin/blog-dedup-scans` - -用途:管理员手动触发一次基于当前 `UrlDecisionChain` 的全库已收录 blog URL 重评估扫描。 - -行为说明: - -- backend 会先读取 crawler runtime;若扫描前 crawler 正在运行,则先停爬并等待 `idle` -- 扫描期间 backend 会打开 `maintenance_in_progress` 维护锁 -- `POST` 请求现在只负责创建一个 `RUNNING` scan run 并启动后台任务,因此前端会立刻收到可轮询的 run 摘要 -- 维护窗口内新的 `POST /api/admin/runtime/start` 与 `POST /api/admin/runtime/run-batch` 会返回 `409 maintenance_in_progress` -- 当前实现会复用 crawler 的共享 `UrlDecisionChain` builder,对数据库里已存的 `blogs.url` 重新跑一遍完整 URL 过滤逻辑 -- 被当前决策链拒绝的 blog 会连同其相关 edge 一起删除,并清空相关 ingestion 引用,避免残留悬挂关系 -- 扫描 summary 中的 `total_count / scanned_count / kept_count / removed_count` 对应的是已存 blog URL 数量 -- 扫描成功后 backend 会尝试调用 search reindex -- 若扫描前 crawler 原本在运行,backend 会在结束后尝试恢复 crawler,并把恢复结果写回 run summary -- 前端可通过 `GET /api/admin/blog-dedup-scans/latest` 轮询实时进度,并在需要明细时继续请求 `GET /api/admin/blog-dedup-scans/{run_id}/items`;其中 `scanned_count / total_count` 表示“已扫描 URL / 总共 URL” - -返回字段重点包括: - -- `id` -- 
`status` -- `ruleset_version` -- `total_count` -- `scanned_count` -- `removed_count` -- `kept_count` -- `crawler_was_running` -- `crawler_restart_attempted` -- `crawler_restart_succeeded` -- `search_reindexed` -- `error_message` -- `started_at` / `completed_at` / `duration_ms` - -#### `GET /api/admin/blog-dedup-scans/latest` - -用途:返回最近一次扫描摘要。 - -#### `GET /api/admin/blog-dedup-scans/{run_id}/items` - -用途:返回该次扫描中被决策链移除的 blog 明细与原因。 - -每条 item 至少包含: - -- `survivor_blog_id` - 当前通常为 `null`;历史字段名保留用于兼容 -- `removed_blog_id` - 当前表示被规则重扫移除的 blog id -- `survivor_identity_key` - 当前承载被扫描 blog 的 identity key 供排查使用 -- `removed_url` -- `reason_code` -- `reason_codes` -- `survivor_selection_basis` - 当前承载 scanned blog id 与 decision score 等辅助调试信息 - -#### `POST /api/admin/blogs/requeue-failed` - -用途:把所有 `FAILED` 状态的 blog 重新放回 crawler 待处理队列。 - -行为说明: - -- 仅允许在 crawler 运行器不处于 `starting/running/stopping` 时调用 -- 若运行器忙碌,返回 `409`,错误详情为 `crawler_busy` -- 会把当前所有 `crawl_status=FAILED` 的 blog 改为 `WAITING` -- 会清空这些 blog 的 `status_code`,并更新 `updated_at` -- 若对应 ingestion request 处于 `FAILED`,会同步改回 `QUEUED` 并清空 `error_message` - -成功响应示例: - -```json -{ - "requeued": 733 -} -``` - #### `POST /api/admin/database/reset` 用途:重置数据库中的 crawler 相关数据,便于测试和开发时快速回到初始状态。 @@ -1191,8 +1448,8 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 仅允许在 crawler 运行器不处于 `starting/running/stopping` 时调用 - 若运行器忙碌,返回 `409`,错误详情为 `crawler_busy` -- 会清空 `blogs`、`edges`、`raw_discovered_urls`、`ingestion_requests` 和维护任务记录 -- 不会删除人工 label 相关数据:`blog_labels(normalized_url, title, label_id, created_time, updated_time)` 和 `blog_label_tags` 会被保留 +- 会清空 `blogs`、`edges`、`raw_discovered_urls` +- 不会删除 users、sessions、seeds、人工 label、recommendation 事件等其它表;`seeds.blog_id` 会置空以解除到 `blogs` 的引用 - backend 在数据库重置后会尝试调用 `search /internal/search/reindex` - 即使 search 重建失败,数据库重置结果仍会返回,并附带 `search_reindexed=false` @@ -1203,13 +1460,8 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 "ok": true, "blogs_deleted": 12, "edges_deleted": 34, + 
"raw_discovered_urls_deleted": 56, "logs_deleted": 0, - "blog_link_labels_deleted": 0, - "blog_label_tags_deleted": 0, - "blog_labels_preserved": 8, - "blog_label_subjects_preserved": 0, - "blog_link_labels_preserved": 13, - "blog_label_tags_preserved": 6, "search_reindexed": true, "search": { "blogs": 0, @@ -1264,7 +1516,6 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 若当前已在 `starting/running/stopping`,直接返回当前快照 - 成功启动后会创建新的 `active_run_id` -- 若 backend 当前处于 blog dedup 维护窗口,返回 `409 maintenance_in_progress` #### `POST /api/admin/runtime/stop` @@ -1281,7 +1532,6 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 补充说明: -- 若 backend 当前处于 blog dedup 维护窗口,返回 `409 maintenance_in_progress` 请求体: @@ -1344,7 +1594,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 ### `POST /internal/crawl/bootstrap` -用途:导入种子数据。 +用途:导入种子数据。该流程优先 replay `seeds` 表到 `blogs`;只有 `seeds` 表为空时才读取 seed CSV 并同步维护 `blogs` 与 `seeds`。 实际执行:`CrawlPipeline.bootstrap_seeds()` @@ -1474,6 +1724,30 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 支持 `statuses` 多状态过滤与 `id_asc` 排序,供统一 discovery 队列视图使用 - blog 行数据会直接带上连接度、活跃度和身份完整度等派生字段 +### `POST /internal/recommendations/random-blog-batches` + +用途:为 backend 创建随机博客推荐批次,并写入 `recommendation_requests` 与 `recommendation_impressions`;曝光表以 `normalized_url` 持久归因,不保存 `blog_id`。 + +请求体字段与 `POST /api/recommendations/random-blog-batches` 一致,额外允许 backend 传入已解析的 `user_id`。 + +### `POST /internal/recommendation-events` + +用途:为 backend 写入幂等推荐交互事件,数据落到 `blog_interactions`,并以 `normalized_url` 持久归因。 + +请求体字段与 `POST /api/recommendation-events` 一致,额外允许 backend 传入已解析的 `user_id`。其中 `entrance_kind` 与 `entrance_url` 仍为必填字段,persistence-api 会清洗长度并写入 `blog_interactions.entrance_kind` / `blog_interactions.entrance_url`。 + +### `GET /internal/blogs/{blog_id}/recommendation-stats` + +用途:返回单个博客的推荐曝光、点击/详情打开、标注选择、独立访客和 CTR 统计。 + +返回结构与 `GET /api/blogs/{blog_id}/stats` 一致。 + +### `GET /internal/recommendation-stats` + +用途:返回 strategy/surface/version 维度的推荐请求、曝光、交互和 CTR 
汇总。 + +返回结构与 `GET /api/admin/recommendation-stats` 一致。 + ### `GET /internal/blogs/lookup?url=...` 用途:为 backend 提供数据库权威的博客 URL 存在性查询。 @@ -1484,25 +1758,32 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 命中顺序固定为 `identity_key -> normalized_url -> empty` - `match_reason` 只允许 `identity_key`、`normalized_url` 或 `null` -### `GET /internal/queue/next` +### `GET /internal/blogs/by-normalized-url?normalized_url=...` -用途:取出下一个待处理 blog,并立即将其状态更新为 `PROCESSING`。 +用途:为 crawler 在遇到重复 raw URL 时解析已存在的目标 blog id。 -行为说明: +响应: -- 只从 `crawl_status = 'WAITING'` 中选择 -- 默认允许包含 priority seed;也可以通过 `include_priority=false` 只领取普通队列 -- 选中后立刻更新为 `PROCESSING` +```json +{ + "id": 1 +} +``` + +补充说明: -### `GET /internal/queue/priority-next` +- 未找到已接受 blog 时返回 `{ "id": null }` +- 该接口不改变 raw URL 去重语义;crawler 仍可把重复 URL 标记为 `rule:duplicate_url`,但会用这里返回的 id 补写新的源博客到目标博客的边 +- 主要用于保留 A->C 已存在后,B 后续发现 C 时的 B->C 关系 -用途:只领取由 `ingestion_requests` 驱动的高优先级 seed blog。 +### `GET /internal/queue/next` + +用途:取出下一个待处理 blog,并立即将其状态更新为 `PROCESSING`。 行为说明: -- 仅选择仍处于 `QUEUED` 的请求对应 seed -- 按 `priority DESC, created_at ASC, blog.id ASC` 领取 -- 选中后立刻把 blog 更新为 `PROCESSING` +- 只从 `crawl_status = 'WAITING'` 中选择 +- 选中后立刻更新为 `PROCESSING` ### `GET /internal/blogs/{blog_id}/detail` @@ -1717,10 +1998,9 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 行为说明: -- 清空 `blogs`、`edges`、`raw_discovered_urls`、`ingestion_requests` 和维护任务记录 -- 保留 URL-keyed 人工 label 数据与 tag 定义 +- 清空 `blogs`、`edges`、`raw_discovered_urls` +- 不删除其它表;`seeds.blog_id` 会置空以解除到 `blogs` 的引用 - `logs_deleted` 固定返回 `0` -- 重置主键计数器 响应: @@ -1729,90 +2009,11 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 "ok": true, "blogs_deleted": 12, "edges_deleted": 34, - "logs_deleted": 0, - "blog_link_labels_deleted": 0, - "blog_label_tags_deleted": 0, - "blog_labels_preserved": 8, - "blog_label_subjects_preserved": 0, - "blog_link_labels_preserved": 13, - "blog_label_tags_preserved": 6 + "raw_discovered_urls_deleted": 56, + "logs_deleted": 0 } ``` -补充说明: - -- 若该 
blog 是某个活跃 `ingestion_request` 的 seed,写回结果时会同步推进请求状态为 `COMPLETED` 或 `FAILED` - -### `POST /internal/ingestion-requests` - -用途:创建或复用一个用户自助优先录入请求。 - -请求体: - -```json -{ - "homepage_url": "https://example.com/", - "email": "owner@example.com" -} -``` - -返回: - -- 已收录时:`DEDUPED_EXISTING` -- 新建或复用请求时:请求 payload,包含 `request_id`、`request_token`、`status`、`seed_blog_id` - -补充说明: - -- 去重与复用当前按 `identity_key` 执行,而不再只看 `normalized_url` -- 返回 payload 会附带 `identity_key`、`identity_reason_codes` 与 `identity_ruleset_version` -- 对满足“tenant-like homepage 子域”启发式的 URL,`normalized_url` 与 seed blog URL 会直接收敛到 registrable root 的 canonical URL;例如 `*.66law.cn` 会统一收敛到 `https://66law.cn/`。`*.github.io`、`*.gitee.io` 等显式排除域名不会被这样归并。 - -### `GET /internal/ingestion-requests/{request_id}` - -用途:通过 `request_id + request_token` 查询请求状态。 - -查询参数: - -- `request_token`: 创建请求时生成的查询 token - -### `GET /internal/ingestion-requests` - -用途:为 backend 提供统一 discovery 页“优先处理博客清单”所需的公开优先录入请求列表。 - -补充说明: - -- 返回范围、排序与公开字段约束与 `GET /api/ingestion-requests` 一致 -- internal/public 两层都不会在该列表 payload 中暴露 `email` 与 `request_token` - -### `POST /internal/ingestion-requests/by-blog/{blog_id}/crawling` - -用途:当 crawler 真正开始处理某个 seed blog 时,把关联请求推进到 `CRAWLING_SEED`。 - -### `POST /internal/blog-dedup-scans/runs` - -用途:创建一个 `RUNNING` 的规则重扫 run,并立即返回初始摘要,供 backend 异步编排使用。 - -查询参数: - -- `crawler_was_running`: backend 透传的预扫描 runtime 状态 - -### `POST /internal/blog-dedup-scans/{run_id}/execute` - -用途:执行指定 run 的 persistence 侧规则重扫逻辑,并在执行过程中持续更新 `total_count`、`scanned_count`、`removed_count`、`kept_count`。 -当前四个计数字段都以已存 blog URL 数为口径。 - -### `POST /internal/blog-dedup-scans/{run_id}/finalize` - -用途:由 backend 在扫描编排完成后回写 crawler 恢复和 search reindex 结果。 - -### `GET /internal/blog-dedup-scans/latest` - -用途:返回最近一次 run summary。 - -### `GET /internal/blog-dedup-scans/{run_id}/items` - -用途:返回指定 run 中被决策链移除的 blog 明细。 - ## 5. 
数据模型整理 以下字段来自当前仓库实现与前端类型定义,适合作为现阶段统一理解口径。 @@ -1837,8 +2038,15 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 | `domain` | `string` | 域名 | | `email` | `string \| null` | 博主联系邮箱;仅在用户自助优先录入时写入,默认 `null` | | `title` | `string \| null` | 站点主页解析出的 `<title>`,缺失时为 `null` | -| `icon_url` | `string \| null` | 站点标签页 icon URL;优先使用页面声明的 icon 链接,缺失时可能回退为 `${origin}/favicon.ico` | +| `icon_url` | `string \| null` | 站点标签页 icon URL;仅在 crawler 从页面 metadata 提取并验证可访问后持久化,缺失或验证失败时为 `null`。前端可使用第三方 favicon API 做展示兜底,但不回写该字段 | | `status_code` | `number \| null` | 最近抓取 HTTP 状态码 | +| `acceptance_status` | `string` | 博客接受状态,当前主要使用 `ACCEPTED` 与 `UNKNOWN`;该字段决定“是否被确认为博客” | +| `accepted_by` | `string \| null` | 接受来源,例如 `seed`、`rss`、`model` | +| `accepted_at` | `string \| null` | URL 被确认为博客的时间 | +| `crawl_error_kind` | `string \| null` | 最近一次抓取失败分类,例如 `timeout`、`page_too_large`、`http_status` | +| `crawl_error_message` | `string \| null` | 最近一次抓取失败详情摘要 | +| `last_crawl_attempt_at` | `string \| null` | 最近一次抓取尝试时间 | +| `successful_crawl_at` | `string \| null` | 最近一次成功完成抓取时间 | | `crawl_status` | `string` | 当前抓取状态,常见值有 `WAITING` `PROCESSING` `FAILED` `FINISHED` | | `friend_links_count` | `number` | 最近一次抓取发现的友链数 | | `last_crawled_at` | `string \| null` | 最近抓取时间 | @@ -1878,37 +2086,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 | `filters.min_connections` | `number` | 最小连接度阈值 | | `sort` | `string` | 当前生效排序;与 `filters.sort` 保持一致 | -### 5.3 IngestionRequestPayload - -来源: - -- [persistence_api/repository.py](persistence_api/repository.py) -- [frontend/src/lib/api.ts](frontend/src/lib/api.ts) - -字段: - -| 字段 | 类型 | 说明 | -| --- | --- | --- | -| `id` / `request_id` | `number` | 请求主键 | -| `requested_url` | `string` | 用户提交的原始首页 URL | -| `normalized_url` | `string` | 归一化后的 URL | -| `identity_key` | `string` | 当前请求命中的 blog 身份键 | -| `identity_reason_codes` | `string[]` | 当前 identity 解析原因码 | -| `identity_ruleset_version` | `string` | 当前 identity 规则版本 | -| `email` | `string` | 用户提交的联系邮箱 | -| `status` | 
`string` | 请求状态 | -| `priority` | `number` | 当前固定优先级值 | -| `seed_blog_id` | `number \| null` | 绑定的 seed blog | -| `matched_blog_id` | `number \| null` | 已完成时关联的最终 blog | -| `blog_id` | `number \| null` | 便于前端跳转的当前关联 blog id | -| `request_token` | `string` | 无账号状态查询 token | -| `seed_blog` | `BlogRecord \| null` | seed blog 摘要 | -| `matched_blog` | `BlogRecord \| null` | 已匹配 blog 摘要 | -| `blog` | `BlogRecord \| null` | 前端使用的当前 blog 视图 | -| `error_message` | `string \| null` | 失败时的错误摘要 | -| `created_at` / `updated_at` | `string` | 请求创建/更新时间 | - -### 5.4 BlogDetailPayload +### 5.3 BlogDetailPayload 字段: @@ -1918,12 +2096,17 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 | `incoming_edges` | `BlogRelationRecord[]` | 指向当前博客的关系列表 | | `outgoing_edges` | `BlogRelationRecord[]` | 当前博客指向外部的关系列表 | | `recommended_blogs` | `BlogRecommendationRecord[]` | “朋友的朋友”推荐列表 | +| `discovery_path` | `BlogDiscoveryPath` | 发现路径摘要,从源头博客到当前博客的有序步骤 | +| `relation_graphs` | `{ incoming, outgoing }` | 两层入链/出链关系图,供详情页“博客关联”模块展示;两层深度内不按节点或边数量裁剪 | 其中: - `BlogRelationRecord = EdgeRecord + { neighbor_blog: BlogNeighborSummary \| null }` - `BlogRecommendationRecord = { blog, reason, mutual_connection_count, via_blogs }` - `BlogNeighborSummary` 字段为 `id`、`domain`、`title`、`icon_url` +- `BlogDiscoveryPath = { mode, origin_source, origin_label, target_source, truncated, steps }`,其中 `truncated` 为历史兼容字段,当前始终为 `false` +- `BlogDiscoveryStep` 包含 `blog` 邻居摘要、`blog_id`、`url`、`domain`、`accepted_by`、`accepted_label`、`raw_id`、`raw_source_blog_id`、`raw_accepted_by`、`discovered_at` +- `BlogRelationGraph = { direction, focus_blog_id, depth, nodes, edges }`,其中 `direction` 为 `incoming` 或 `outgoing`,`depth` 默认是 `2` ### 5.4 EdgeRecord @@ -2019,7 +2202,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 ### 6.1 读接口调用链 - 前端 -> `backend /api/*` -- `backend` -> `persistence-api` 获取 blog catalog、blog detail、graph views、graph snapshots、stats、ingestion request 与 dedup summary +- `backend` -> `persistence-api` 获取 blog 
catalog、blog detail、graph views、graph snapshots 与 stats - `backend` -> `crawler` 获取运行时状态 ### 6.2 写接口调用链 @@ -2028,8 +2211,9 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 管理员前端/调用方 -> `POST /api/admin/crawl/bootstrap` - `backend` -> `crawler /internal/crawl/bootstrap` -- `crawler` 读取 `seed.csv` -- `crawler` -> `persistence-api /internal/blogs/upsert` +- `crawler` -> `persistence-api /internal/seeds` 检查是否已有持久化 seed +- 若已有 seed:`crawler` replay `seeds` 表到 `blogs` +- 若没有 seed:`crawler` 读取 `seed.csv`,再通过 `persistence-api /internal/blogs/upsert` 同时写入/刷新 `blogs` 与 `seeds` - `crawler` -> 结构化日志管线 #### 单次 crawl 运行 @@ -2056,7 +2240,7 @@ Admin API 同样由 `backend` 暴露,但统一位于 `/api/admin/*` 下,并 - 对外协议以 `backend /api/*` 为准,前端不要直接依赖内部服务接口 - 内部服务接口已经比较清晰,但目前没有统一版本号,也没有显式 OpenAPI schema 文档归档 -- legacy 的 raw blog/edge/graph/log/search 公共读取端点已经移除,当前对外建议继续围绕 catalog、detail、graph view、ingestion 和 admin runtime 组织能力 +- legacy 的 raw blog/edge/graph/log/search 公共读取端点已经移除,当前对外建议继续围绕 catalog、detail、graph view、user seed 和 admin runtime 组织能力 - `/api/admin/crawl/run` 使用 query 参数 `max_nodes`,而 `/api/admin/runtime/run-batch` 使用 JSON body `max_nodes`,风格不完全一致,后续可统一 - `search` 当前是轻量缓存式实现,属于可重建索引,不是强一致检索服务 - `services/*` 只是兼容入口,后续文档与新开发应优先引用顶层目录 `backend/`、`crawler/`、`search/`、`persistence_api/` diff --git a/doc/config-reference.md b/doc/config-reference.md index db54bbc..fb25226 100644 --- a/doc/config-reference.md +++ b/doc/config-reference.md @@ -44,6 +44,17 @@ Docker Compose 也会从仓库根目录的 `.env` 读取变量。 | `HEYBLOG_LOG_CONSOLE_ENABLED` | `true` | 全部 Python 服务 | 是否同时输出到控制台,方便 Docker logs 查看 | | `HEYBLOG_LOG_RETENTION_DAYS` | `7` | 全部 Python 服务 | 自动清理超过该天数的小时切片日志 | | `HEYBLOG_BACKEND_BASE_URL` | `http://127.0.0.1:8000` | `frontend` | 浏览器代理层转发到公共 API 的目标地址 | +| `HEYBLOG_PUBLIC_BASE_URL` | `http://127.0.0.1:3000` | `persistence-api` | 生成邮箱验证与密码重置链接时使用的公开前端基准地址 | +| `HEYBLOG_EMAIL_PROVIDER` | `disabled` | `persistence-api` | 用户生命周期邮件 provider。可选 `disabled`/`noop` 或 `smtp`;默认不连接邮件服务 | +| 
`HEYBLOG_EMAIL_FROM` | 空 | `persistence-api` | SMTP 邮件发件人地址;启用 `smtp` 时必须设置 | +| `HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS` | `false` | `persistence-api` | 是否在验证/重置 API 响应中暴露 raw token/link。仅本地调试需要手动设置为 `true` | +| `HEYBLOG_SMTP_HOST` | 空 | `persistence-api` | SMTP 服务器主机名 | +| `HEYBLOG_SMTP_PORT` | `587` | `persistence-api` | SMTP 服务器端口 | +| `HEYBLOG_SMTP_USERNAME` | 未设置 | `persistence-api` | SMTP 用户名;为空时不执行登录 | +| `HEYBLOG_SMTP_PASSWORD` | 未设置 | `persistence-api` | SMTP 密码;为空时不执行登录 | +| `HEYBLOG_SMTP_USE_TLS` | `true` | `persistence-api` | 是否在普通 SMTP 连接上使用 STARTTLS | +| `HEYBLOG_SMTP_USE_SSL` | `false` | `persistence-api` | 是否使用隐式 SMTP-over-SSL 连接 | +| `HEYBLOG_SMTP_TIMEOUT_SECONDS` | `10.0` | `persistence-api` | SMTP 连接与发送超时时间 | | `HEYBLOG_CRAWLER_BASE_URL` | `http://127.0.0.1:8010` | `backend` | `backend` 调用 `crawler` 的内部地址 | | `HEYBLOG_SEARCH_BASE_URL` | `http://127.0.0.1:8020` | `backend` | `backend` 调用 `search` 的内部地址 | | `HEYBLOG_PERSISTENCE_BASE_URL` | `http://127.0.0.1:8030` | `backend`、`crawler`、`search` | 三个服务访问持久化边界的内部地址 | @@ -53,8 +64,9 @@ Docker Compose 也会从仓库根目录的 `.env` 读取变量。 | `HEYBLOG_MAX_PATH_PROBES_PER_BLOG` | `50` | `crawler` | 单站点路径探测上限 | | `HEYBLOG_CANDIDATE_PAGE_FETCH_CONCURRENCY` | `4` | `crawler` | 友链候选页抓取并发度,最小为 `1` | | `HEYBLOG_RUNTIME_WORKER_COUNT` | `3` | `crawler` | runtime 持续抓取的 worker 数 | +| `HEYBLOG_RUNTIME_AUTO_START_INTERVAL_SECONDS` | `3600` | `crawler` | crawler 服务内置 idle 检测间隔;到点时若 runtime 不在工作则自动调用 runtime start | | `HEYBLOG_RAW_DISCOVERED_URL_LIMIT` | `1000000` | `crawler` | `raw_discovered_urls` 行数达到该值后拒绝启动 crawler,并让正在运行的 runtime 在下一次 claim 前自动停止;设为 `-1` 表示不限制 | -| `HEYBLOG_MAX_FETCHED_PAGE_BYTES` | `2000000` | `crawler` | 单个页面允许读取的最大字节数;超限后当前 blog 直接记为 `FAILED`,超大页不会继续进入解析阶段 | +| `HEYBLOG_MAX_FETCHED_PAGE_BYTES` | `2000000` | `crawler` | 单个页面允许读取的最大字节数;超限后当前 crawl attempt 记为 `FAILED` 并记录错误分类,超大页不会继续进入解析阶段;这不会撤销已接受博客的 `acceptance_status` | | `HEYBLOG_FRIEND_LINK_DOMAIN_BLOCKLIST` | 空 | `crawler` | 逗号分隔的域名黑名单 | | 
`HEYBLOG_FRIEND_LINK_TLD_BLOCKLIST` | 空 | `crawler` | 逗号分隔的顶级域黑名单 | | `HEYBLOG_FRIEND_LINK_EXACT_URL_BLOCKLIST` | 空 | `crawler` | 逗号分隔的精确 URL 黑名单 | @@ -86,6 +98,7 @@ Docker Compose 也会从仓库根目录的 `.env` 读取变量。 | `persistence-api` | `HEYBLOG_DB_DSN` | 启用 PostgreSQL 后端 | | `persistence-api` | `HEYBLOG_DOCKER_DECISION_MODEL_ROOT` | 全库规则重扫读取的容器内运行时模型根目录 | | `persistence-api` | `HEYBLOG_DECISION_MODEL_CONSENSUS_STRATEGY` / `HEYBLOG_DECISION_MODEL_CONSENSUS_THRESHOLD` | 全库规则重扫使用的模型共识策略与 weighted 阈值 | +| `persistence-api` | `HEYBLOG_EMAIL_PROVIDER` / `HEYBLOG_EMAIL_FROM` / `HEYBLOG_SMTP_*` | 发送邮箱验证与密码重置邮件 | ## 3.1 运行时资源目录约定 diff --git a/doc/crawler-url-filtering.md b/doc/crawler-url-filtering.md index 2f6c770..39becfd 100644 --- a/doc/crawler-url-filtering.md +++ b/doc/crawler-url-filtering.md @@ -105,15 +105,9 @@ crawler 的两种主要运行方式: 6. 每个博客结束后累计 `processed / discovered / failed` 7. 本轮结束后执行 `write_exports()` -### 3.3 队列公平策略 +### 3.3 队列领取策略 -`CrawlPipeline._claim_next_scheduled_blog()` 当前不是简单 FIFO,而是带优先队列公平窗口: - -- 优先种子队列优先级更高 -- 但不会无限饿死普通 waiting 队列 -- 参数 `priority_seed_normal_queue_slots` 控制“处理一个 priority 后,允许多少个 normal queue 项穿插执行” - -这套逻辑也被 runtime 模式复用。 +`CrawlPipeline` 与 runtime 模式都会通过 `persistence-api /internal/queue/next` 领取下一个 `WAITING` blog,并在领取时把状态切换为 `PROCESSING`。 ## 4. 单博客抓取链路 @@ -137,9 +131,10 @@ crawler 的两种主要运行方式: - `status_code=首页 HTTP 状态码` - `friend_links_count=本次接受的外链博客数` - `title` -- `icon_url` +- `icon_url`,仅当页面 metadata 提取出的 icon 候选能通过轻量 HTTP 验证时写入;无候选或验证失败时保持 `NULL` 如果超时或异常,则由 `CrawlPipeline._mark_blog_failed()` 标记为 `FAILED`。 +`FAILED` 只表示最近一次抓取尝试没有完整结束,不会撤销 `acceptance_status=ACCEPTED` 的博客判定;RSS、模型或 seed 接受来源会保留在 `accepted_by` / `accepted_at` 中。 ## 5. 
首页友链页发现逻辑 @@ -556,7 +551,7 @@ identity 输出里会记录: - 一个博客的整次 crawl 有总超时预算 - 候选页可并发抓取 - 单页超出字节上限会触发 `PageTooLargeError` -- 若候选页超大,当前 blog 会被标记为 `FAILED` +- 若候选页超大,当前 blog 会被标记为 `FAILED`,并记录 `crawl_error_kind=page_too_large`;这只影响抓取生命周期,不表示该 URL 不是博客 因此“没有抓到友链”不一定是过滤规则问题,也可能是: diff --git a/doc/public-admin-boundary.md b/doc/public-admin-boundary.md index c7b806d..61beecd 100644 --- a/doc/public-admin-boundary.md +++ b/doc/public-admin-boundary.md @@ -19,7 +19,8 @@ Public capabilities: - browse discovered blogs - inspect blog detail and graph relationships - search by blog/site/relation clues -- submit ingestion requests and check request status +- submit user seed blog URLs for crawling +- register, log in, verify email, reset password, and save personal label selections ### Admin @@ -36,8 +37,8 @@ Admin capabilities: - crawler runtime control - manual crawl/bootstrap triggers - database maintenance -- dedup scans - blog labeling +- user list and simple role management ## API Boundary @@ -52,9 +53,15 @@ Admin capabilities: - `GET /api/graph/snapshots/latest` - `GET /api/graph/snapshots/{version}` - `GET /api/stats` -- `POST /api/ingestion-requests` -- `GET /api/ingestion-requests` -- `GET /api/ingestion-requests/{request_id}` +- `POST /api/blogs/user-seeds` +- `POST /api/auth/register` +- `POST /api/auth/login` +- `GET /api/auth/me` +- `POST /api/auth/logout` +- `POST /api/auth/email/verify/request` +- `POST /api/auth/email/verify/confirm` +- `POST /api/auth/password/forgot` +- `POST /api/auth/password/reset` ### Admin API @@ -70,13 +77,18 @@ Admin capabilities: - `GET /api/admin/blog-labeling/tags` - `POST /api/admin/blog-labeling/tags` - `PUT /api/admin/blog-labeling/labels/{blog_id}` -- `POST /api/admin/blog-dedup-scans` -- `GET /api/admin/blog-dedup-scans/latest` -- `GET /api/admin/blog-dedup-scans/{run_id}/items` +- `GET /api/admin/hourly-stats` +- `GET /api/admin/users` +- `PATCH /api/admin/users/{user_id}/role` ## Auth -- Admin API requires `Authorization: Bearer 
<HEYBLOG_ADMIN_TOKEN>` unless `HEYBLOG_ADMIN_DEV_BYPASS=true` is explicitly enabled. +- HeyBlog has three identities: guest, regular user, and admin. +- Guest is any request without a valid user session. +- Regular users are stored with `role=user`. +- Admin users are stored with `role=admin` and must have a verified email to access admin APIs. +- Admin API accepts either `Authorization: Bearer <HEYBLOG_ADMIN_TOKEN>` as a migration/bootstrap fallback, or an admin user session token. - Missing token returns `401 admin_auth_required`. - Invalid token returns `403 admin_auth_invalid`. -- Unconfigured admin auth returns `503 admin_auth_not_configured`. +- Non-admin or unverified user tokens return `403 admin_auth_forbidden`. +- Unconfigured legacy admin auth with no valid admin user session returns `503 admin_auth_not_configured`. diff --git a/doc/service-architecture.md b/doc/service-architecture.md index e979491..1f3ad44 100644 --- a/doc/service-architecture.md +++ b/doc/service-architecture.md @@ -181,7 +181,7 @@ backend health / crawl-run / runtime-run-batch / database-reset -> backend 先读取 crawler /internal/runtime/status -> 若 crawler 忙碌则返回 409 crawler_busy -> 否则 backend -> persistence-api /internal/database/reset - -> persistence-api 清空 blogs / edges 和维护任务记录 + -> persistence-api 清空 blogs / edges / raw_discovered_urls -> backend 再尽力调用 search /internal/search/reindex ``` @@ -191,7 +191,6 @@ backend health / crawl-run / runtime-run-batch / database-reset | --- | --- | --- | | `blogs` / `edges` | `persistence-api` | 系统事实来源 | | application/access/error logs | 统一日志目录 | 默认按类型和服务写到 `logs/app/<service>`、`logs/error/<service>`、`logs/access/<service>` 的小时切片,Docker 中映射到 `volumes/logs` | -| maintenance run events | `persistence-api` | blog dedup scan 等后台维护进度 | | `stats` / `graph` / graph snapshots | `persistence-api` | 基于事实数据组装出的读模型 | | `search-index.json` | `search` | 可重建缓存 | | `RuntimeSnapshot` | `crawler` | 进程内内存态,不是持久化状态 | diff --git a/docker-compose.yml 
b/docker-compose.yml index 031a7c2..370e17e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -79,6 +79,7 @@ services: HEYBLOG_MAX_PATH_PROBES_PER_BLOG: ${HEYBLOG_MAX_PATH_PROBES_PER_BLOG:-50} HEYBLOG_CANDIDATE_PAGE_FETCH_CONCURRENCY: ${HEYBLOG_CANDIDATE_PAGE_FETCH_CONCURRENCY:-4} HEYBLOG_RUNTIME_WORKER_COUNT: ${HEYBLOG_RUNTIME_WORKER_COUNT:-3} + HEYBLOG_RUNTIME_AUTO_START_INTERVAL_SECONDS: ${HEYBLOG_RUNTIME_AUTO_START_INTERVAL_SECONDS:-3600} HEYBLOG_RAW_DISCOVERED_URL_LIMIT: ${HEYBLOG_RAW_DISCOVERED_URL_LIMIT:-1000000} HEYBLOG_USER_AGENT: ${HEYBLOG_USER_AGENT:-HeyBlogBot/0.1 (+https://example.invalid/heyblog)} HEYBLOG_FRIEND_LINK_DOMAIN_BLOCKLIST: ${HEYBLOG_FRIEND_LINK_DOMAIN_BLOCKLIST:-} @@ -157,6 +158,17 @@ services: HEYBLOG_DECISION_MODEL_ROOT: ${HEYBLOG_DOCKER_DECISION_MODEL_ROOT:-/app/runtime_resources/models/url_decision/current} HEYBLOG_DECISION_MODEL_CONSENSUS_STRATEGY: ${HEYBLOG_DECISION_MODEL_CONSENSUS_STRATEGY:-weighted_average} HEYBLOG_DECISION_MODEL_CONSENSUS_THRESHOLD: ${HEYBLOG_DECISION_MODEL_CONSENSUS_THRESHOLD:-0.4} + HEYBLOG_PUBLIC_BASE_URL: ${HEYBLOG_PUBLIC_BASE_URL:-http://127.0.0.1:3000} + HEYBLOG_EMAIL_PROVIDER: ${HEYBLOG_EMAIL_PROVIDER:-disabled} + HEYBLOG_EMAIL_FROM: ${HEYBLOG_EMAIL_FROM:-} + HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS: ${HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS:-false} + HEYBLOG_SMTP_HOST: ${HEYBLOG_SMTP_HOST:-} + HEYBLOG_SMTP_PORT: ${HEYBLOG_SMTP_PORT:-587} + HEYBLOG_SMTP_USERNAME: ${HEYBLOG_SMTP_USERNAME:-} + HEYBLOG_SMTP_PASSWORD: ${HEYBLOG_SMTP_PASSWORD:-} + HEYBLOG_SMTP_USE_TLS: ${HEYBLOG_SMTP_USE_TLS:-true} + HEYBLOG_SMTP_USE_SSL: ${HEYBLOG_SMTP_USE_SSL:-false} + HEYBLOG_SMTP_TIMEOUT_SECONDS: ${HEYBLOG_SMTP_TIMEOUT_SECONDS:-10.0} HEYBLOG_LOG_DIR: ${HEYBLOG_DOCKER_LOG_DIR:-/data/logs} HEYBLOG_LOG_LEVEL: ${HEYBLOG_LOG_LEVEL:-INFO} HEYBLOG_LOG_FORMAT: ${HEYBLOG_LOG_FORMAT:-json} diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 303682f..6458c7d 100644 --- a/frontend/package-lock.json +++ 
b/frontend/package-lock.json @@ -14,6 +14,7 @@ "lucide-react": "^0.487.0", "react": "^19.1.0", "react-dom": "^19.1.0", + "react-force-graph-2d": "^1.29.1", "react-force-graph-3d": "^1.29.0", "react-router-dom": "^7.7.1", "sonner": "^2.0.3", @@ -2517,6 +2518,16 @@ "node": ">= 0.6.0" } }, + "node_modules/bezier-js": { + "version": "6.1.4", + "resolved": "https://registry.npmmirror.com/bezier-js/-/bezier-js-6.1.4.tgz", + "integrity": "sha512-PA0FW9ZpcHbojUCMu28z9Vg/fNkwTj5YhusSAjHHDfHDGLxJ6YUKrAN2vk1fP2MMOxVw4Oko16FMlRGVBGqLKg==", + "license": "MIT", + "funding": { + "type": "individual", + "url": "https://github.com/Pomax/bezierjs/blob/master/FUNDING.md" + } + }, "node_modules/bubblesets-js": { "version": "2.3.4", "resolved": "https://registry.npmmirror.com/bubblesets-js/-/bubblesets-js-2.3.4.tgz", @@ -2533,6 +2544,18 @@ "node": ">=8" } }, + "node_modules/canvas-color-tracker": { + "version": "1.3.2", + "resolved": "https://registry.npmmirror.com/canvas-color-tracker/-/canvas-color-tracker-1.3.2.tgz", + "integrity": "sha512-ryQkDX26yJ3CXzb3hxUVNlg1NKE4REc5crLBq661Nxzr8TNd236SaEf2ffYLXyI5tSABSeguHLqcVq4vf9L3Zg==", + "license": "MIT", + "dependencies": { + "tinycolor2": "^1.6.0" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/chai": { "version": "5.3.3", "resolved": "https://registry.npmmirror.com/chai/-/chai-5.3.3.tgz", @@ -3357,6 +3380,32 @@ "node": ">=12" } }, + "node_modules/force-graph": { + "version": "1.51.4", + "resolved": "https://registry.npmmirror.com/force-graph/-/force-graph-1.51.4.tgz", + "integrity": "sha512-TdJ2KbkoiDQ7NIRx8IPGD0mAXXpLhamS7c+b7W98b0MHG7lphnda1VOQX/98UDTsttIAdH4TcP0l0MauSnLK8w==", + "license": "MIT", + "dependencies": { + "@tweenjs/tween.js": "18 - 25", + "accessor-fn": "1", + "bezier-js": "3 - 6", + "canvas-color-tracker": "^1.3", + "d3-array": "1 - 3", + "d3-drag": "2 - 3", + "d3-force-3d": "2 - 3", + "d3-scale": "1 - 4", + "d3-scale-chromatic": "1 - 3", + "d3-selection": "2 - 3", + "d3-zoom": "2 - 3", + "float-tooltip": 
"^1.7", + "index-array-by": "1", + "kapsule": "^1.16", + "lodash-es": "4" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/fsevents": { "version": "2.3.3", "resolved": "https://registry.npmmirror.com/fsevents/-/fsevents-2.3.3.tgz", @@ -3470,6 +3519,15 @@ "node": ">=8" } }, + "node_modules/index-array-by": { + "version": "1.4.2", + "resolved": "https://registry.npmmirror.com/index-array-by/-/index-array-by-1.4.2.tgz", + "integrity": "sha512-SP23P27OUKzXWEC/TOyWlwLviofQkCSCKONnc62eItjp69yCZZPqDQtr3Pw5gJDnPeUMqExmKydNZaJO0FU9pw==", + "license": "MIT", + "engines": { + "node": ">=12" + } + }, "node_modules/internmap": { "version": "2.0.3", "resolved": "https://registry.npmmirror.com/internmap/-/internmap-2.0.3.tgz", @@ -4205,6 +4263,23 @@ "react": "^19.2.4" } }, + "node_modules/react-force-graph-2d": { + "version": "1.29.1", + "resolved": "https://registry.npmmirror.com/react-force-graph-2d/-/react-force-graph-2d-1.29.1.tgz", + "integrity": "sha512-1Rl/1Z3xy2iTHKj6a0jRXGyiI86xUti81K+jBQZ+Oe46csaMikp47L5AjrzA9hY9fNGD63X8ffrqnvaORukCuQ==", + "license": "MIT", + "dependencies": { + "force-graph": "^1.51", + "prop-types": "15", + "react-kapsule": "^2.5" + }, + "engines": { + "node": ">=12" + }, + "peerDependencies": { + "react": "*" + } + }, "node_modules/react-force-graph-3d": { "version": "1.29.0", "resolved": "https://registry.npmmirror.com/react-force-graph-3d/-/react-force-graph-3d-1.29.0.tgz", diff --git a/frontend/package.json b/frontend/package.json index c9eee0d..4286310 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -16,6 +16,7 @@ "lucide-react": "^0.487.0", "react": "^19.1.0", "react-dom": "^19.1.0", + "react-force-graph-2d": "^1.29.1", "react-force-graph-3d": "^1.29.0", "react-router-dom": "^7.7.1", "sonner": "^2.0.3", diff --git a/frontend/public/benchmarks/blog-community-graph.json b/frontend/public/benchmarks/blog-community-graph.json new file mode 100644 index 0000000..fe0059d --- /dev/null +++ 
b/frontend/public/benchmarks/blog-community-graph.json @@ -0,0 +1,4529 @@ +{ + "nodes": [ + { + "id": 1, + "url": "https://benchmark.heyblog.local/indie-web-01/", + "domain": "indie-web-01.benchmark.heyblog.local", + "title": "Indie Web Notes 01", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 7, + "degree": 13, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -485.248, + "y": -260.0, + "z": -27.653 + }, + { + "id": 2, + "url": "https://benchmark.heyblog.local/indie-web-02/", + "domain": "indie-web-02.benchmark.heyblog.local", + "title": "Indie Web Notes 02", + "icon_url": null, + "incoming_count": 7, + "outgoing_count": 5, + "degree": 12, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -561.694, + "y": -221.805, + "z": 9.57 + }, + { + "id": 3, + "url": "https://benchmark.heyblog.local/indie-web-03/", + "domain": "indie-web-03.benchmark.heyblog.local", + "title": "Indie Web Notes 03", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 4, + "degree": 9, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -514.041, + "y": -327.899, + "z": -4.206 + }, + { + "id": 4, + "url": "https://benchmark.heyblog.local/indie-web-04/", + "domain": "indie-web-04.benchmark.heyblog.local", + "title": "Indie Web Notes 04", + "icon_url": null, + "incoming_count": 9, + "outgoing_count": 5, + "degree": 14, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -471.039, + "y": -196.138, + "z": 18.91 + }, + { + "id": 5, + "url": "https://benchmark.heyblog.local/indie-web-05/", + "domain": "indie-web-05.benchmark.heyblog.local", + "title": "Indie Web Notes 05", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -626.876, + "y": -278.905, + "z": -4.065 + }, + { + "id": 6, + "url": "https://benchmark.heyblog.local/indie-web-06/", + 
"domain": "indie-web-06.benchmark.heyblog.local", + "title": "Indie Web Notes 06", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 6, + "degree": 12, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -482.029, + "y": -284.154, + "z": -23.273 + }, + { + "id": 7, + "url": "https://benchmark.heyblog.local/indie-web-07/", + "domain": "indie-web-07.benchmark.heyblog.local", + "title": "Indie Web Notes 07", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -534.637, + "y": -205.551, + "z": -7.773 + }, + { + "id": 8, + "url": "https://benchmark.heyblog.local/indie-web-08/", + "domain": "indie-web-08.benchmark.heyblog.local", + "title": "Indie Web Notes 08", + "icon_url": null, + "incoming_count": 8, + "outgoing_count": 4, + "degree": 12, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -554.081, + "y": -325.621, + "z": -1.666 + }, + { + "id": 9, + "url": "https://benchmark.heyblog.local/indie-web-09/", + "domain": "indie-web-09.benchmark.heyblog.local", + "title": "Indie Web Notes 09", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -432.629, + "y": -228.092, + "z": -5.715 + }, + { + "id": 10, + "url": "https://benchmark.heyblog.local/indie-web-10/", + "domain": "indie-web-10.benchmark.heyblog.local", + "title": "Indie Web Notes 10", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 7, + "degree": 10, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -614.579, + "y": -220.959, + "z": -15.78 + }, + { + "id": 11, + "url": "https://benchmark.heyblog.local/indie-web-11/", + "domain": "indie-web-11.benchmark.heyblog.local", + "title": "Indie Web Notes 11", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 6, + 
"degree": 11, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -504.745, + "y": -292.6, + "z": -23.746 + }, + { + "id": 12, + "url": "https://benchmark.heyblog.local/indie-web-12/", + "domain": "indie-web-12.benchmark.heyblog.local", + "title": "Indie Web Notes 12", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 4, + "degree": 7, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -502.776, + "y": -205.087, + "z": -2.887 + }, + { + "id": 13, + "url": "https://benchmark.heyblog.local/indie-web-13/", + "domain": "indie-web-13.benchmark.heyblog.local", + "title": "Indie Web Notes 13", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 5, + "degree": 10, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -585.047, + "y": -297.696, + "z": 35.821 + }, + { + "id": 14, + "url": "https://benchmark.heyblog.local/indie-web-14/", + "domain": "indie-web-14.benchmark.heyblog.local", + "title": "Indie Web Notes 14", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 5, + "degree": 9, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -438.853, + "y": -277.84, + "z": -26.944 + }, + { + "id": 15, + "url": "https://benchmark.heyblog.local/indie-web-15/", + "domain": "indie-web-15.benchmark.heyblog.local", + "title": "Indie Web Notes 15", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 7, + "degree": 11, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -578.153, + "y": -177.283, + "z": 1.251 + }, + { + "id": 16, + "url": "https://benchmark.heyblog.local/indie-web-16/", + "domain": "indie-web-16.benchmark.heyblog.local", + "title": "Indie Web Notes 16", + "icon_url": null, + "incoming_count": 1, + "outgoing_count": 5, + "degree": 6, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -526.023, + "y": -306.476, + "z": 1.643 + }, + { + "id": 17, + 
"url": "https://benchmark.heyblog.local/indie-web-17/", + "domain": "indie-web-17.benchmark.heyblog.local", + "title": "Indie Web Notes 17", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 5, + "degree": 10, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -470.825, + "y": -218.555, + "z": -34.816 + }, + { + "id": 18, + "url": "https://benchmark.heyblog.local/indie-web-18/", + "domain": "indie-web-18.benchmark.heyblog.local", + "title": "Indie Web Notes 18", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 5, + "degree": 11, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -591.377, + "y": -257.048, + "z": 28.352 + }, + { + "id": 19, + "url": "https://benchmark.heyblog.local/indie-web-19/", + "domain": "indie-web-19.benchmark.heyblog.local", + "title": "Indie Web Notes 19", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 6, + "degree": 9, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -463.993, + "y": -315.734, + "z": -7.653 + }, + { + "id": 20, + "url": "https://benchmark.heyblog.local/indie-web-20/", + "domain": "indie-web-20.benchmark.heyblog.local", + "title": "Indie Web Notes 20", + "icon_url": null, + "incoming_count": 7, + "outgoing_count": 2, + "degree": 9, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -524.976, + "y": -152.383, + "z": -23.053 + }, + { + "id": 21, + "url": "https://benchmark.heyblog.local/indie-web-21/", + "domain": "indie-web-21.benchmark.heyblog.local", + "title": "Indie Web Notes 21", + "icon_url": null, + "incoming_count": 9, + "outgoing_count": 4, + "degree": 13, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -543.562, + "y": -288.235, + "z": 3.581 + }, + { + "id": 22, + "url": "https://benchmark.heyblog.local/indie-web-22/", + "domain": "indie-web-22.benchmark.heyblog.local", + "title": "Indie Web Notes 22", + "icon_url": 
null, + "incoming_count": 3, + "outgoing_count": 8, + "degree": 11, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -471.124, + "y": -253.424, + "z": 30.062 + }, + { + "id": 23, + "url": "https://benchmark.heyblog.local/indie-web-23/", + "domain": "indie-web-23.benchmark.heyblog.local", + "title": "Indie Web Notes 23", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -579.691, + "y": -218.469, + "z": 20.868 + }, + { + "id": 24, + "url": "https://benchmark.heyblog.local/indie-web-24/", + "domain": "indie-web-24.benchmark.heyblog.local", + "title": "Indie Web Notes 24", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 5, + "degree": 11, + "component_id": "indie-web", + "benchmark_community_label": "Indie Web", + "x": -502.561, + "y": -337.519, + "z": -32.523 + }, + { + "id": 25, + "url": "https://benchmark.heyblog.local/engineering-01/", + "domain": "engineering-01.benchmark.heyblog.local", + "title": "Engineering Notes 01", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 7, + "degree": 9, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 573.139, + "y": -167.265, + "z": -2.006 + }, + { + "id": 26, + "url": "https://benchmark.heyblog.local/engineering-02/", + "domain": "engineering-02.benchmark.heyblog.local", + "title": "Engineering Notes 02", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 1, + "degree": 5, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 472.691, + "y": -275.093, + "z": 28.025 + }, + { + "id": 27, + "url": "https://benchmark.heyblog.local/engineering-03/", + "domain": "engineering-03.benchmark.heyblog.local", + "title": "Engineering Notes 03", + "icon_url": null, + "incoming_count": 8, + "outgoing_count": 4, + "degree": 12, + "component_id": "engineering", + "benchmark_community_label": 
"Engineering", + "x": 570.488, + "y": -283.327, + "z": 16.877 + }, + { + "id": 28, + "url": "https://benchmark.heyblog.local/engineering-04/", + "domain": "engineering-04.benchmark.heyblog.local", + "title": "Engineering Notes 04", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 3, + "degree": 8, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 490.999, + "y": -190.703, + "z": 12.287 + }, + { + "id": 29, + "url": "https://benchmark.heyblog.local/engineering-05/", + "domain": "engineering-05.benchmark.heyblog.local", + "title": "Engineering Notes 05", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 6, + "degree": 10, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 492.317, + "y": -336.967, + "z": 27.342 + }, + { + "id": 30, + "url": "https://benchmark.heyblog.local/engineering-06/", + "domain": "engineering-06.benchmark.heyblog.local", + "title": "Engineering Notes 06", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 4, + "degree": 7, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 614.342, + "y": -210.416, + "z": -20.321 + }, + { + "id": 31, + "url": "https://benchmark.heyblog.local/engineering-07/", + "domain": "engineering-07.benchmark.heyblog.local", + "title": "Engineering Notes 07", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 3, + "degree": 9, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 485.672, + "y": -250.951, + "z": 12.808 + }, + { + "id": 32, + "url": "https://benchmark.heyblog.local/engineering-08/", + "domain": "engineering-08.benchmark.heyblog.local", + "title": "Engineering Notes 08", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 6, + "degree": 10, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 553.668, + "y": -312.362, + "z": -12.726 + }, + { + "id": 33, + "url": 
"https://benchmark.heyblog.local/engineering-09/", + "domain": "engineering-09.benchmark.heyblog.local", + "title": "Engineering Notes 09", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 532.13, + "y": -189.422, + "z": 13.888 + }, + { + "id": 34, + "url": "https://benchmark.heyblog.local/engineering-10/", + "domain": "engineering-10.benchmark.heyblog.local", + "title": "Engineering Notes 10", + "icon_url": null, + "incoming_count": 8, + "outgoing_count": 3, + "degree": 11, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 446.316, + "y": -317.065, + "z": -8.99 + }, + { + "id": 35, + "url": "https://benchmark.heyblog.local/engineering-11/", + "domain": "engineering-11.benchmark.heyblog.local", + "title": "Engineering Notes 11", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 3, + "degree": 9, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 628.742, + "y": -269.009, + "z": 17.532 + }, + { + "id": 36, + "url": "https://benchmark.heyblog.local/engineering-12/", + "domain": "engineering-12.benchmark.heyblog.local", + "title": "Engineering Notes 12", + "icon_url": null, + "incoming_count": 8, + "outgoing_count": 8, + "degree": 16, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 492.373, + "y": -230.136, + "z": 30.217 + }, + { + "id": 37, + "url": "https://benchmark.heyblog.local/engineering-13/", + "domain": "engineering-13.benchmark.heyblog.local", + "title": "Engineering Notes 13", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 520.305, + "y": -322.624, + "z": -27.247 + }, + { + "id": 38, + "url": "https://benchmark.heyblog.local/engineering-14/", + "domain": "engineering-14.benchmark.heyblog.local", + 
"title": "Engineering Notes 14", + "icon_url": null, + "incoming_count": 1, + "outgoing_count": 4, + "degree": 5, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 565.794, + "y": -209.519, + "z": -18.772 + }, + { + "id": 39, + "url": "https://benchmark.heyblog.local/engineering-15/", + "domain": "engineering-15.benchmark.heyblog.local", + "title": "Engineering Notes 15", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 7, + "degree": 12, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 431.124, + "y": -268.237, + "z": 9.782 + }, + { + "id": 40, + "url": "https://benchmark.heyblog.local/engineering-16/", + "domain": "engineering-16.benchmark.heyblog.local", + "title": "Engineering Notes 16", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 6, + "degree": 9, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 595.39, + "y": -317.218, + "z": 6.472 + }, + { + "id": 41, + "url": "https://benchmark.heyblog.local/engineering-17/", + "domain": "engineering-17.benchmark.heyblog.local", + "title": "Engineering Notes 17", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 511.479, + "y": -213.161, + "z": -23.505 + }, + { + "id": 42, + "url": "https://benchmark.heyblog.local/engineering-18/", + "domain": "engineering-18.benchmark.heyblog.local", + "title": "Engineering Notes 18", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 5, + "degree": 7, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 486.51, + "y": -313.218, + "z": 21.707 + }, + { + "id": 43, + "url": "https://benchmark.heyblog.local/engineering-19/", + "domain": "engineering-19.benchmark.heyblog.local", + "title": "Engineering Notes 19", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 6, + "degree": 11, 
+ "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 591.907, + "y": -240.293, + "z": 32.612 + }, + { + "id": 44, + "url": "https://benchmark.heyblog.local/engineering-20/", + "domain": "engineering-20.benchmark.heyblog.local", + "title": "Engineering Notes 20", + "icon_url": null, + "incoming_count": 8, + "outgoing_count": 7, + "degree": 15, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 443.241, + "y": -220.609, + "z": -6.126 + }, + { + "id": 45, + "url": "https://benchmark.heyblog.local/engineering-21/", + "domain": "engineering-21.benchmark.heyblog.local", + "title": "Engineering Notes 21", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 7, + "degree": 11, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 554.187, + "y": -352.213, + "z": -23.958 + }, + { + "id": 46, + "url": "https://benchmark.heyblog.local/engineering-22/", + "domain": "engineering-22.benchmark.heyblog.local", + "title": "Engineering Notes 22", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 4, + "degree": 9, + "component_id": "engineering", + "benchmark_community_label": "Engineering", + "x": 536.488, + "y": -219.498, + "z": -10.669 + }, + { + "id": 47, + "url": "https://benchmark.heyblog.local/design-01/", + "domain": "design-01.benchmark.heyblog.local", + "title": "Design Notes 01", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 2, + "degree": 5, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -570.698, + "y": 275.973, + "z": -15.539 + }, + { + "id": 48, + "url": "https://benchmark.heyblog.local/design-02/", + "domain": "design-02.benchmark.heyblog.local", + "title": "Design Notes 02", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 2, + "degree": 6, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -453.52, + "y": 279.503, + "z": -12.521 + }, + { + "id": 49, + "url": 
"https://benchmark.heyblog.local/design-03/", + "domain": "design-03.benchmark.heyblog.local", + "title": "Design Notes 03", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -561.402, + "y": 370.645, + "z": -6.008 + }, + { + "id": 50, + "url": "https://benchmark.heyblog.local/design-04/", + "domain": "design-04.benchmark.heyblog.local", + "title": "Design Notes 04", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 3, + "degree": 5, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -540.85, + "y": 202.907, + "z": 9.596 + }, + { + "id": 51, + "url": "https://benchmark.heyblog.local/design-05/", + "domain": "design-05.benchmark.heyblog.local", + "title": "Design Notes 05", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 3, + "degree": 5, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -482.696, + "y": 326.499, + "z": 10.269 + }, + { + "id": 52, + "url": "https://benchmark.heyblog.local/design-06/", + "domain": "design-06.benchmark.heyblog.local", + "title": "Design Notes 06", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 5, + "degree": 7, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -574.155, + "y": 306.749, + "z": -2.115 + }, + { + "id": 53, + "url": "https://benchmark.heyblog.local/design-07/", + "domain": "design-07.benchmark.heyblog.local", + "title": "Design Notes 07", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 3, + "degree": 7, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -471.145, + "y": 242.603, + "z": -1.46 + }, + { + "id": 54, + "url": "https://benchmark.heyblog.local/design-08/", + "domain": "design-08.benchmark.heyblog.local", + "title": "Design Notes 08", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 4, + "degree": 6, + "component_id": "design", + 
"benchmark_community_label": "Design", + "x": -516.807, + "y": 387.548, + "z": 34.33 + }, + { + "id": 55, + "url": "https://benchmark.heyblog.local/design-09/", + "domain": "design-09.benchmark.heyblog.local", + "title": "Design Notes 09", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 2, + "degree": 5, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -586.781, + "y": 232.235, + "z": -21.918 + }, + { + "id": 56, + "url": "https://benchmark.heyblog.local/design-10/", + "domain": "design-10.benchmark.heyblog.local", + "title": "Design Notes 10", + "icon_url": null, + "incoming_count": 1, + "outgoing_count": 3, + "degree": 4, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -482.005, + "y": 301.943, + "z": 4.554 + }, + { + "id": 57, + "url": "https://benchmark.heyblog.local/design-11/", + "domain": "design-11.benchmark.heyblog.local", + "title": "Design Notes 11", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -560.363, + "y": 333.35, + "z": 29.597 + }, + { + "id": 58, + "url": "https://benchmark.heyblog.local/design-12/", + "domain": "design-12.benchmark.heyblog.local", + "title": "Design Notes 12", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 2, + "degree": 6, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -509.899, + "y": 227.603, + "z": -3.746 + }, + { + "id": 59, + "url": "https://benchmark.heyblog.local/design-13/", + "domain": "design-13.benchmark.heyblog.local", + "title": "Design Notes 13", + "icon_url": null, + "incoming_count": 7, + "outgoing_count": 4, + "degree": 11, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -474.537, + "y": 366.026, + "z": -35.333 + }, + { + "id": 60, + "url": "https://benchmark.heyblog.local/design-14/", + "domain": "design-14.benchmark.heyblog.local", + "title": "Design Notes 
14", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 7, + "degree": 9, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -614.325, + "y": 278.296, + "z": 32.354 + }, + { + "id": 61, + "url": "https://benchmark.heyblog.local/design-15/", + "domain": "design-15.benchmark.heyblog.local", + "title": "Design Notes 15", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, + "degree": 8, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -485.328, + "y": 280.356, + "z": 27.902 + }, + { + "id": 62, + "url": "https://benchmark.heyblog.local/design-16/", + "domain": "design-16.benchmark.heyblog.local", + "title": "Design Notes 16", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 3, + "degree": 9, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -537.501, + "y": 353.946, + "z": 8.124 + }, + { + "id": 63, + "url": "https://benchmark.heyblog.local/design-17/", + "domain": "design-17.benchmark.heyblog.local", + "title": "Design Notes 17", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 3, + "degree": 9, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -551.293, + "y": 231.393, + "z": 31.6 + }, + { + "id": 64, + "url": "https://benchmark.heyblog.local/design-18/", + "domain": "design-18.benchmark.heyblog.local", + "title": "Design Notes 18", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 4, + "degree": 6, + "component_id": "design", + "benchmark_community_label": "Design", + "x": -441.014, + "y": 333.51, + "z": 15.436 + }, + { + "id": 65, + "url": "https://benchmark.heyblog.local/data-ai-01/", + "domain": "data-ai-01.benchmark.heyblog.local", + "title": "Data & AI Notes 01", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 5, + "degree": 11, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 422.243, + "y": 334.624, + "z": -11.591 + }, + { + "id": 66, + "url": 
"https://benchmark.heyblog.local/data-ai-02/", + "domain": "data-ai-02.benchmark.heyblog.local", + "title": "Data & AI Notes 02", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 5, + "degree": 10, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 536.171, + "y": 269.594, + "z": 1.955 + }, + { + "id": 67, + "url": "https://benchmark.heyblog.local/data-ai-03/", + "domain": "data-ai-03.benchmark.heyblog.local", + "title": "Data & AI Notes 03", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 6, + "degree": 9, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 533.193, + "y": 351.058, + "z": 9.399 + }, + { + "id": 68, + "url": "https://benchmark.heyblog.local/data-ai-04/", + "domain": "data-ai-04.benchmark.heyblog.local", + "title": "Data & AI Notes 04", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 6, + "degree": 10, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 458.789, + "y": 260.219, + "z": -29.844 + }, + { + "id": 69, + "url": "https://benchmark.heyblog.local/data-ai-05/", + "domain": "data-ai-05.benchmark.heyblog.local", + "title": "Data & AI Notes 05", + "icon_url": null, + "incoming_count": 9, + "outgoing_count": 6, + "degree": 15, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 611.447, + "y": 284.743, + "z": -20.898 + }, + { + "id": 70, + "url": "https://benchmark.heyblog.local/data-ai-06/", + "domain": "data-ai-06.benchmark.heyblog.local", + "title": "Data & AI Notes 06", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 7, + "degree": 11, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 461.87, + "y": 374.307, + "z": -11.953 + }, + { + "id": 71, + "url": "https://benchmark.heyblog.local/data-ai-07/", + "domain": "data-ai-07.benchmark.heyblog.local", + "title": "Data & AI Notes 07", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 4, 
+ "degree": 8, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 516.969, + "y": 261.116, + "z": 6.55 + }, + { + "id": 72, + "url": "https://benchmark.heyblog.local/data-ai-08/", + "domain": "data-ai-08.benchmark.heyblog.local", + "title": "Data & AI Notes 08", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 4, + "degree": 7, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 567.287, + "y": 344.175, + "z": -24.876 + }, + { + "id": 73, + "url": "https://benchmark.heyblog.local/data-ai-09/", + "domain": "data-ai-09.benchmark.heyblog.local", + "title": "Data & AI Notes 09", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 3, + "degree": 7, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 451.144, + "y": 299.328, + "z": 33.496 + }, + { + "id": 74, + "url": "https://benchmark.heyblog.local/data-ai-10/", + "domain": "data-ai-10.benchmark.heyblog.local", + "title": "Data & AI Notes 10", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 5, + "degree": 10, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 588.21, + "y": 238.727, + "z": -15.243 + }, + { + "id": 75, + "url": "https://benchmark.heyblog.local/data-ai-11/", + "domain": "data-ai-11.benchmark.heyblog.local", + "title": "Data & AI Notes 11", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 2, + "degree": 7, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 510.386, + "y": 398.503, + "z": 17.685 + }, + { + "id": 76, + "url": "https://benchmark.heyblog.local/data-ai-12/", + "domain": "data-ai-12.benchmark.heyblog.local", + "title": "Data & AI Notes 12", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 3, + "degree": 7, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 497.126, + "y": 269.555, + "z": -14.242 + }, + { + "id": 77, + "url": 
"https://benchmark.heyblog.local/data-ai-13/", + "domain": "data-ai-13.benchmark.heyblog.local", + "title": "Data & AI Notes 13", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 4, + "degree": 9, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 572.02, + "y": 309.726, + "z": 23.079 + }, + { + "id": 78, + "url": "https://benchmark.heyblog.local/data-ai-14/", + "domain": "data-ai-14.benchmark.heyblog.local", + "title": "Data & AI Notes 14", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 5, + "degree": 9, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 456.631, + "y": 339.447, + "z": -4.464 + }, + { + "id": 79, + "url": "https://benchmark.heyblog.local/data-ai-15/", + "domain": "data-ai-15.benchmark.heyblog.local", + "title": "Data & AI Notes 15", + "icon_url": null, + "incoming_count": 7, + "outgoing_count": 5, + "degree": 12, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 541.957, + "y": 221.388, + "z": -13.295 + }, + { + "id": 80, + "url": "https://benchmark.heyblog.local/data-ai-16/", + "domain": "data-ai-16.benchmark.heyblog.local", + "title": "Data & AI Notes 16", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 4, + "degree": 7, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 567.816, + "y": 394.305, + "z": -35.145 + }, + { + "id": 81, + "url": "https://benchmark.heyblog.local/data-ai-17/", + "domain": "data-ai-17.benchmark.heyblog.local", + "title": "Data & AI Notes 17", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 5, + "degree": 11, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 479.297, + "y": 284.684, + "z": -0.06 + }, + { + "id": 82, + "url": "https://benchmark.heyblog.local/data-ai-18/", + "domain": "data-ai-18.benchmark.heyblog.local", + "title": "Data & AI Notes 18", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 3, + 
"degree": 6, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 568.43, + "y": 280.559, + "z": 3.997 + }, + { + "id": 83, + "url": "https://benchmark.heyblog.local/data-ai-19/", + "domain": "data-ai-19.benchmark.heyblog.local", + "title": "Data & AI Notes 19", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 6, + "degree": 10, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 487.855, + "y": 366.981, + "z": 33.056 + }, + { + "id": 84, + "url": "https://benchmark.heyblog.local/data-ai-20/", + "domain": "data-ai-20.benchmark.heyblog.local", + "title": "Data & AI Notes 20", + "icon_url": null, + "incoming_count": 6, + "outgoing_count": 3, + "degree": 9, + "component_id": "data-ai", + "benchmark_community_label": "Data & AI", + "x": 493.239, + "y": 211.673, + "z": 14.359 + }, + { + "id": 85, + "url": "https://benchmark.heyblog.local/culture-01/", + "domain": "culture-01.benchmark.heyblog.local", + "title": "Culture Notes 01", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 4, + "degree": 7, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": 89.757, + "y": 93.193, + "z": 513.975 + }, + { + "id": 86, + "url": "https://benchmark.heyblog.local/culture-02/", + "domain": "culture-02.benchmark.heyblog.local", + "title": "Culture Notes 02", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 2, + "degree": 5, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": -40.174, + "y": 48.422, + "z": 530.82 + }, + { + "id": 87, + "url": "https://benchmark.heyblog.local/culture-03/", + "domain": "culture-03.benchmark.heyblog.local", + "title": "Culture Notes 03", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 3, + "degree": 6, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": 31.055, + "y": -3.269, + "z": 521.511 + }, + { + "id": 88, + "url": "https://benchmark.heyblog.local/culture-04/", + 
"domain": "culture-04.benchmark.heyblog.local", + "title": "Culture Notes 04", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 1, + "degree": 3, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": 8.33, + "y": 109.602, + "z": 535.363 + }, + { + "id": 89, + "url": "https://benchmark.heyblog.local/culture-05/", + "domain": "culture-05.benchmark.heyblog.local", + "title": "Culture Notes 05", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 3, + "degree": 7, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": -64.94, + "y": -15.824, + "z": 551.158 + }, + { + "id": 90, + "url": "https://benchmark.heyblog.local/culture-06/", + "domain": "culture-06.benchmark.heyblog.local", + "title": "Culture Notes 06", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 2, + "degree": 5, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": 94.81, + "y": 37.006, + "z": 545.355 + }, + { + "id": 91, + "url": "https://benchmark.heyblog.local/culture-07/", + "domain": "culture-07.benchmark.heyblog.local", + "title": "Culture Notes 07", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 3, + "degree": 7, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": -26.072, + "y": 65.444, + "z": 494.282 + }, + { + "id": 92, + "url": "https://benchmark.heyblog.local/culture-08/", + "domain": "culture-08.benchmark.heyblog.local", + "title": "Culture Notes 08", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 4, + "degree": 7, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": 3.275, + "y": -18.463, + "z": 497.011 + }, + { + "id": 93, + "url": "https://benchmark.heyblog.local/culture-09/", + "domain": "culture-09.benchmark.heyblog.local", + "title": "Culture Notes 09", + "icon_url": null, + "incoming_count": 4, + "outgoing_count": 3, + "degree": 7, + "component_id": "culture", + "benchmark_community_label": 
"Culture", + "x": 42.82, + "y": 92.343, + "z": 505.261 + }, + { + "id": 94, + "url": "https://benchmark.heyblog.local/culture-10/", + "domain": "culture-10.benchmark.heyblog.local", + "title": "Culture Notes 10", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 3, + "degree": 6, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": -91.729, + "y": 26.746, + "z": 492.576 + }, + { + "id": 95, + "url": "https://benchmark.heyblog.local/culture-11/", + "domain": "culture-11.benchmark.heyblog.local", + "title": "Culture Notes 11", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 5, + "degree": 7, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": 80.365, + "y": -14.76, + "z": 554.686 + }, + { + "id": 96, + "url": "https://benchmark.heyblog.local/culture-12/", + "domain": "culture-12.benchmark.heyblog.local", + "title": "Culture Notes 12", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 5, + "degree": 8, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": -8.027, + "y": 74.124, + "z": 509.271 + }, + { + "id": 97, + "url": "https://benchmark.heyblog.local/culture-13/", + "domain": "culture-13.benchmark.heyblog.local", + "title": "Culture Notes 13", + "icon_url": null, + "incoming_count": 5, + "outgoing_count": 1, + "degree": 6, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": -25.844, + "y": -6.138, + "z": 511.06 + }, + { + "id": 98, + "url": "https://benchmark.heyblog.local/culture-14/", + "domain": "culture-14.benchmark.heyblog.local", + "title": "Culture Notes 14", + "icon_url": null, + "incoming_count": 3, + "outgoing_count": 2, + "degree": 5, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": 70.956, + "y": 63.402, + "z": 532.942 + }, + { + "id": 99, + "url": "https://benchmark.heyblog.local/culture-15/", + "domain": "culture-15.benchmark.heyblog.local", + "title": "Culture Notes 15", + 
"icon_url": null, + "incoming_count": 1, + "outgoing_count": 4, + "degree": 5, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": -82.37, + "y": 77.086, + "z": 554.04 + }, + { + "id": 100, + "url": "https://benchmark.heyblog.local/culture-16/", + "domain": "culture-16.benchmark.heyblog.local", + "title": "Culture Notes 16", + "icon_url": null, + "incoming_count": 2, + "outgoing_count": 3, + "degree": 5, + "component_id": "culture", + "benchmark_community_label": "Culture", + "x": 37.575, + "y": -47.38, + "z": 518.86 + } + ], + "edges": [ + { + "from_blog_id": 1, + "to_blog_id": 2, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-02/", + "id": "benchmark-edge-001" + }, + { + "from_blog_id": 1, + "to_blog_id": 3, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-03/", + "id": "benchmark-edge-002" + }, + { + "from_blog_id": 1, + "to_blog_id": 7, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-07/", + "id": "benchmark-edge-003" + }, + { + "from_blog_id": 1, + "to_blog_id": 8, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-08/", + "id": "benchmark-edge-004" + }, + { + "from_blog_id": 1, + "to_blog_id": 10, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-10/", + "id": "benchmark-edge-005" + }, + { + "from_blog_id": 1, + "to_blog_id": 19, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-19/", + "id": "benchmark-edge-006" + }, + { + "from_blog_id": 1, + "to_blog_id": 79, + "link_text": "community bridge", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-15/", + "id": "benchmark-edge-007" + }, + { + "from_blog_id": 2, + "to_blog_id": 3, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-03/", + "id": "benchmark-edge-008" + }, + { + "from_blog_id": 2, + 
"to_blog_id": 14, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-14/", + "id": "benchmark-edge-009" + }, + { + "from_blog_id": 2, + "to_blog_id": 18, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-18/", + "id": "benchmark-edge-010" + }, + { + "from_blog_id": 2, + "to_blog_id": 21, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-011" + }, + { + "from_blog_id": 2, + "to_blog_id": 24, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-24/", + "id": "benchmark-edge-012" + }, + { + "from_blog_id": 3, + "to_blog_id": 4, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-013" + }, + { + "from_blog_id": 3, + "to_blog_id": 5, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-05/", + "id": "benchmark-edge-014" + }, + { + "from_blog_id": 3, + "to_blog_id": 14, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-14/", + "id": "benchmark-edge-015" + }, + { + "from_blog_id": 3, + "to_blog_id": 17, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-17/", + "id": "benchmark-edge-016" + }, + { + "from_blog_id": 4, + "to_blog_id": 1, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-01/", + "id": "benchmark-edge-017" + }, + { + "from_blog_id": 4, + "to_blog_id": 5, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-05/", + "id": "benchmark-edge-018" + }, + { + "from_blog_id": 4, + "to_blog_id": 20, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-20/", + "id": "benchmark-edge-019" + }, + { + "from_blog_id": 4, + "to_blog_id": 21, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-020" + }, + { + "from_blog_id": 4, + "to_blog_id": 69, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-021" + }, + { + "from_blog_id": 5, + "to_blog_id": 2, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-02/", + "id": "benchmark-edge-022" + }, + { + "from_blog_id": 5, + "to_blog_id": 6, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-06/", + "id": "benchmark-edge-023" + }, + { + "from_blog_id": 5, + "to_blog_id": 8, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-08/", + "id": "benchmark-edge-024" + }, + { + "from_blog_id": 5, + "to_blog_id": 11, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-11/", + "id": "benchmark-edge-025" + }, + { + "from_blog_id": 6, + "to_blog_id": 2, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-02/", + "id": "benchmark-edge-026" + }, + { + "from_blog_id": 6, + "to_blog_id": 5, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-05/", + "id": "benchmark-edge-027" + }, + { + "from_blog_id": 6, + "to_blog_id": 7, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-07/", + "id": "benchmark-edge-028" + }, + { + "from_blog_id": 6, + "to_blog_id": 8, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-08/", + "id": "benchmark-edge-029" + }, + { + "from_blog_id": 6, + "to_blog_id": 9, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-09/", + "id": "benchmark-edge-030" + }, + { + "from_blog_id": 6, + "to_blog_id": 18, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-18/", + "id": "benchmark-edge-031" + 
}, + { + "from_blog_id": 7, + "to_blog_id": 8, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-08/", + "id": "benchmark-edge-032" + }, + { + "from_blog_id": 7, + "to_blog_id": 12, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-12/", + "id": "benchmark-edge-033" + }, + { + "from_blog_id": 7, + "to_blog_id": 13, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-13/", + "id": "benchmark-edge-034" + }, + { + "from_blog_id": 7, + "to_blog_id": 14, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-14/", + "id": "benchmark-edge-035" + }, + { + "from_blog_id": 8, + "to_blog_id": 2, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-02/", + "id": "benchmark-edge-036" + }, + { + "from_blog_id": 8, + "to_blog_id": 9, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-09/", + "id": "benchmark-edge-037" + }, + { + "from_blog_id": 8, + "to_blog_id": 10, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-10/", + "id": "benchmark-edge-038" + }, + { + "from_blog_id": 8, + "to_blog_id": 18, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-18/", + "id": "benchmark-edge-039" + }, + { + "from_blog_id": 9, + "to_blog_id": 4, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-040" + }, + { + "from_blog_id": 9, + "to_blog_id": 5, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-05/", + "id": "benchmark-edge-041" + }, + { + "from_blog_id": 9, + "to_blog_id": 10, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-10/", + "id": "benchmark-edge-042" + }, + { + "from_blog_id": 9, + "to_blog_id": 22, + "link_text": "friend link", 
+ "link_url_raw": "https://benchmark.heyblog.local/indie-web-22/", + "id": "benchmark-edge-043" + }, + { + "from_blog_id": 10, + "to_blog_id": 2, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-02/", + "id": "benchmark-edge-044" + }, + { + "from_blog_id": 10, + "to_blog_id": 3, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-03/", + "id": "benchmark-edge-045" + }, + { + "from_blog_id": 10, + "to_blog_id": 4, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-046" + }, + { + "from_blog_id": 10, + "to_blog_id": 11, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-11/", + "id": "benchmark-edge-047" + }, + { + "from_blog_id": 10, + "to_blog_id": 18, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-18/", + "id": "benchmark-edge-048" + }, + { + "from_blog_id": 10, + "to_blog_id": 21, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-049" + }, + { + "from_blog_id": 10, + "to_blog_id": 23, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-23/", + "id": "benchmark-edge-050" + }, + { + "from_blog_id": 11, + "to_blog_id": 12, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-12/", + "id": "benchmark-edge-051" + }, + { + "from_blog_id": 11, + "to_blog_id": 15, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-15/", + "id": "benchmark-edge-052" + }, + { + "from_blog_id": 11, + "to_blog_id": 18, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-18/", + "id": "benchmark-edge-053" + }, + { + "from_blog_id": 11, + "to_blog_id": 20, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/indie-web-20/", + "id": "benchmark-edge-054" + }, + { + "from_blog_id": 11, + "to_blog_id": 23, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-23/", + "id": "benchmark-edge-055" + }, + { + "from_blog_id": 11, + "to_blog_id": 24, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-24/", + "id": "benchmark-edge-056" + }, + { + "from_blog_id": 12, + "to_blog_id": 6, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-06/", + "id": "benchmark-edge-057" + }, + { + "from_blog_id": 12, + "to_blog_id": 9, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-09/", + "id": "benchmark-edge-058" + }, + { + "from_blog_id": 12, + "to_blog_id": 13, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-13/", + "id": "benchmark-edge-059" + }, + { + "from_blog_id": 12, + "to_blog_id": 17, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-17/", + "id": "benchmark-edge-060" + }, + { + "from_blog_id": 13, + "to_blog_id": 1, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-01/", + "id": "benchmark-edge-061" + }, + { + "from_blog_id": 13, + "to_blog_id": 3, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-03/", + "id": "benchmark-edge-062" + }, + { + "from_blog_id": 13, + "to_blog_id": 14, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-14/", + "id": "benchmark-edge-063" + }, + { + "from_blog_id": 13, + "to_blog_id": 19, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-19/", + "id": "benchmark-edge-064" + }, + { + "from_blog_id": 13, + "to_blog_id": 20, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-20/", + "id": 
"benchmark-edge-065" + }, + { + "from_blog_id": 14, + "to_blog_id": 4, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-066" + }, + { + "from_blog_id": 14, + "to_blog_id": 6, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-06/", + "id": "benchmark-edge-067" + }, + { + "from_blog_id": 14, + "to_blog_id": 11, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-11/", + "id": "benchmark-edge-068" + }, + { + "from_blog_id": 14, + "to_blog_id": 15, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-15/", + "id": "benchmark-edge-069" + }, + { + "from_blog_id": 14, + "to_blog_id": 21, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-070" + }, + { + "from_blog_id": 15, + "to_blog_id": 1, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-01/", + "id": "benchmark-edge-071" + }, + { + "from_blog_id": 15, + "to_blog_id": 2, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-02/", + "id": "benchmark-edge-072" + }, + { + "from_blog_id": 15, + "to_blog_id": 4, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-073" + }, + { + "from_blog_id": 15, + "to_blog_id": 8, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-08/", + "id": "benchmark-edge-074" + }, + { + "from_blog_id": 15, + "to_blog_id": 13, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-13/", + "id": "benchmark-edge-075" + }, + { + "from_blog_id": 15, + "to_blog_id": 16, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-16/", + "id": "benchmark-edge-076" + }, + { + "from_blog_id": 15, + 
"to_blog_id": 21, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-077" + }, + { + "from_blog_id": 16, + "to_blog_id": 6, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-06/", + "id": "benchmark-edge-078" + }, + { + "from_blog_id": 16, + "to_blog_id": 7, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-07/", + "id": "benchmark-edge-079" + }, + { + "from_blog_id": 16, + "to_blog_id": 12, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-12/", + "id": "benchmark-edge-080" + }, + { + "from_blog_id": 16, + "to_blog_id": 13, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-13/", + "id": "benchmark-edge-081" + }, + { + "from_blog_id": 16, + "to_blog_id": 17, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-17/", + "id": "benchmark-edge-082" + }, + { + "from_blog_id": 17, + "to_blog_id": 4, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-083" + }, + { + "from_blog_id": 17, + "to_blog_id": 11, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-11/", + "id": "benchmark-edge-084" + }, + { + "from_blog_id": 17, + "to_blog_id": 15, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-15/", + "id": "benchmark-edge-085" + }, + { + "from_blog_id": 17, + "to_blog_id": 18, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-18/", + "id": "benchmark-edge-086" + }, + { + "from_blog_id": 17, + "to_blog_id": 20, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-20/", + "id": "benchmark-edge-087" + }, + { + "from_blog_id": 18, + "to_blog_id": 1, + "link_text": "friend link", + 
"link_url_raw": "https://benchmark.heyblog.local/indie-web-01/", + "id": "benchmark-edge-088" + }, + { + "from_blog_id": 18, + "to_blog_id": 19, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-19/", + "id": "benchmark-edge-089" + }, + { + "from_blog_id": 18, + "to_blog_id": 21, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-090" + }, + { + "from_blog_id": 18, + "to_blog_id": 22, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-22/", + "id": "benchmark-edge-091" + }, + { + "from_blog_id": 18, + "to_blog_id": 23, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-23/", + "id": "benchmark-edge-092" + }, + { + "from_blog_id": 19, + "to_blog_id": 3, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-03/", + "id": "benchmark-edge-093" + }, + { + "from_blog_id": 19, + "to_blog_id": 4, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-094" + }, + { + "from_blog_id": 19, + "to_blog_id": 6, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-06/", + "id": "benchmark-edge-095" + }, + { + "from_blog_id": 19, + "to_blog_id": 7, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-07/", + "id": "benchmark-edge-096" + }, + { + "from_blog_id": 19, + "to_blog_id": 8, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-08/", + "id": "benchmark-edge-097" + }, + { + "from_blog_id": 19, + "to_blog_id": 20, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-20/", + "id": "benchmark-edge-098" + }, + { + "from_blog_id": 20, + "to_blog_id": 1, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-01/", 
+ "id": "benchmark-edge-099" + }, + { + "from_blog_id": 20, + "to_blog_id": 21, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-100" + }, + { + "from_blog_id": 21, + "to_blog_id": 8, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-08/", + "id": "benchmark-edge-101" + }, + { + "from_blog_id": 21, + "to_blog_id": 17, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-17/", + "id": "benchmark-edge-102" + }, + { + "from_blog_id": 21, + "to_blog_id": 22, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-22/", + "id": "benchmark-edge-103" + }, + { + "from_blog_id": 21, + "to_blog_id": 24, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-24/", + "id": "benchmark-edge-104" + }, + { + "from_blog_id": 22, + "to_blog_id": 2, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-02/", + "id": "benchmark-edge-105" + }, + { + "from_blog_id": 22, + "to_blog_id": 8, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-08/", + "id": "benchmark-edge-106" + }, + { + "from_blog_id": 22, + "to_blog_id": 11, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-11/", + "id": "benchmark-edge-107" + }, + { + "from_blog_id": 22, + "to_blog_id": 13, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-13/", + "id": "benchmark-edge-108" + }, + { + "from_blog_id": 22, + "to_blog_id": 17, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-17/", + "id": "benchmark-edge-109" + }, + { + "from_blog_id": 22, + "to_blog_id": 21, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-110" + }, + { + "from_blog_id": 22, + 
"to_blog_id": 23, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-23/", + "id": "benchmark-edge-111" + }, + { + "from_blog_id": 22, + "to_blog_id": 24, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-24/", + "id": "benchmark-edge-112" + }, + { + "from_blog_id": 23, + "to_blog_id": 6, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-06/", + "id": "benchmark-edge-113" + }, + { + "from_blog_id": 23, + "to_blog_id": 20, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-20/", + "id": "benchmark-edge-114" + }, + { + "from_blog_id": 23, + "to_blog_id": 21, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-21/", + "id": "benchmark-edge-115" + }, + { + "from_blog_id": 23, + "to_blog_id": 24, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-24/", + "id": "benchmark-edge-116" + }, + { + "from_blog_id": 24, + "to_blog_id": 1, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-01/", + "id": "benchmark-edge-117" + }, + { + "from_blog_id": 24, + "to_blog_id": 4, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-118" + }, + { + "from_blog_id": 24, + "to_blog_id": 9, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-09/", + "id": "benchmark-edge-119" + }, + { + "from_blog_id": 24, + "to_blog_id": 15, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-15/", + "id": "benchmark-edge-120" + }, + { + "from_blog_id": 24, + "to_blog_id": 20, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/indie-web-20/", + "id": "benchmark-edge-121" + }, + { + "from_blog_id": 25, + "to_blog_id": 24, + "link_text": "community bridge", + 
"link_url_raw": "https://benchmark.heyblog.local/indie-web-24/", + "id": "benchmark-edge-122" + }, + { + "from_blog_id": 25, + "to_blog_id": 26, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-02/", + "id": "benchmark-edge-123" + }, + { + "from_blog_id": 25, + "to_blog_id": 28, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-04/", + "id": "benchmark-edge-124" + }, + { + "from_blog_id": 25, + "to_blog_id": 39, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-15/", + "id": "benchmark-edge-125" + }, + { + "from_blog_id": 25, + "to_blog_id": 43, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-19/", + "id": "benchmark-edge-126" + }, + { + "from_blog_id": 25, + "to_blog_id": 44, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-20/", + "id": "benchmark-edge-127" + }, + { + "from_blog_id": 25, + "to_blog_id": 46, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-22/", + "id": "benchmark-edge-128" + }, + { + "from_blog_id": 26, + "to_blog_id": 27, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-03/", + "id": "benchmark-edge-129" + }, + { + "from_blog_id": 27, + "to_blog_id": 28, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-04/", + "id": "benchmark-edge-130" + }, + { + "from_blog_id": 27, + "to_blog_id": 33, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-09/", + "id": "benchmark-edge-131" + }, + { + "from_blog_id": 27, + "to_blog_id": 36, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-12/", + "id": "benchmark-edge-132" + }, + { + "from_blog_id": 27, + "to_blog_id": 45, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/engineering-21/", + "id": "benchmark-edge-133" + }, + { + "from_blog_id": 28, + "to_blog_id": 29, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-05/", + "id": "benchmark-edge-134" + }, + { + "from_blog_id": 28, + "to_blog_id": 30, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-06/", + "id": "benchmark-edge-135" + }, + { + "from_blog_id": 28, + "to_blog_id": 34, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-10/", + "id": "benchmark-edge-136" + }, + { + "from_blog_id": 29, + "to_blog_id": 27, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-03/", + "id": "benchmark-edge-137" + }, + { + "from_blog_id": 29, + "to_blog_id": 30, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-06/", + "id": "benchmark-edge-138" + }, + { + "from_blog_id": 29, + "to_blog_id": 36, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-12/", + "id": "benchmark-edge-139" + }, + { + "from_blog_id": 29, + "to_blog_id": 37, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-13/", + "id": "benchmark-edge-140" + }, + { + "from_blog_id": 29, + "to_blog_id": 44, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-20/", + "id": "benchmark-edge-141" + }, + { + "from_blog_id": 29, + "to_blog_id": 69, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-142" + }, + { + "from_blog_id": 30, + "to_blog_id": 26, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-02/", + "id": "benchmark-edge-143" + }, + { + "from_blog_id": 30, + "to_blog_id": 31, + "link_text": "blogroll", + "link_url_raw": 
"https://benchmark.heyblog.local/engineering-07/", + "id": "benchmark-edge-144" + }, + { + "from_blog_id": 30, + "to_blog_id": 36, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-12/", + "id": "benchmark-edge-145" + }, + { + "from_blog_id": 30, + "to_blog_id": 37, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-13/", + "id": "benchmark-edge-146" + }, + { + "from_blog_id": 31, + "to_blog_id": 32, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-08/", + "id": "benchmark-edge-147" + }, + { + "from_blog_id": 31, + "to_blog_id": 40, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-16/", + "id": "benchmark-edge-148" + }, + { + "from_blog_id": 31, + "to_blog_id": 45, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-21/", + "id": "benchmark-edge-149" + }, + { + "from_blog_id": 32, + "to_blog_id": 27, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-03/", + "id": "benchmark-edge-150" + }, + { + "from_blog_id": 32, + "to_blog_id": 31, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-07/", + "id": "benchmark-edge-151" + }, + { + "from_blog_id": 32, + "to_blog_id": 33, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-09/", + "id": "benchmark-edge-152" + }, + { + "from_blog_id": 32, + "to_blog_id": 34, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-10/", + "id": "benchmark-edge-153" + }, + { + "from_blog_id": 32, + "to_blog_id": 36, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-12/", + "id": "benchmark-edge-154" + }, + { + "from_blog_id": 32, + "to_blog_id": 44, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/engineering-20/", + "id": "benchmark-edge-155" + }, + { + "from_blog_id": 33, + "to_blog_id": 34, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-10/", + "id": "benchmark-edge-156" + }, + { + "from_blog_id": 33, + "to_blog_id": 35, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-11/", + "id": "benchmark-edge-157" + }, + { + "from_blog_id": 33, + "to_blog_id": 37, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-13/", + "id": "benchmark-edge-158" + }, + { + "from_blog_id": 33, + "to_blog_id": 46, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-22/", + "id": "benchmark-edge-159" + }, + { + "from_blog_id": 34, + "to_blog_id": 27, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-03/", + "id": "benchmark-edge-160" + }, + { + "from_blog_id": 34, + "to_blog_id": 31, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-07/", + "id": "benchmark-edge-161" + }, + { + "from_blog_id": 34, + "to_blog_id": 35, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-11/", + "id": "benchmark-edge-162" + }, + { + "from_blog_id": 35, + "to_blog_id": 29, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-05/", + "id": "benchmark-edge-163" + }, + { + "from_blog_id": 35, + "to_blog_id": 36, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-12/", + "id": "benchmark-edge-164" + }, + { + "from_blog_id": 35, + "to_blog_id": 59, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-13/", + "id": "benchmark-edge-165" + }, + { + "from_blog_id": 36, + "to_blog_id": 25, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/engineering-01/", + "id": "benchmark-edge-166" + }, + { + "from_blog_id": 36, + "to_blog_id": 31, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-07/", + "id": "benchmark-edge-167" + }, + { + "from_blog_id": 36, + "to_blog_id": 34, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-10/", + "id": "benchmark-edge-168" + }, + { + "from_blog_id": 36, + "to_blog_id": 37, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-13/", + "id": "benchmark-edge-169" + }, + { + "from_blog_id": 36, + "to_blog_id": 39, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-15/", + "id": "benchmark-edge-170" + }, + { + "from_blog_id": 36, + "to_blog_id": 43, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-19/", + "id": "benchmark-edge-171" + }, + { + "from_blog_id": 36, + "to_blog_id": 44, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-20/", + "id": "benchmark-edge-172" + }, + { + "from_blog_id": 36, + "to_blog_id": 46, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-22/", + "id": "benchmark-edge-173" + }, + { + "from_blog_id": 37, + "to_blog_id": 27, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-03/", + "id": "benchmark-edge-174" + }, + { + "from_blog_id": 37, + "to_blog_id": 36, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-12/", + "id": "benchmark-edge-175" + }, + { + "from_blog_id": 37, + "to_blog_id": 38, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-14/", + "id": "benchmark-edge-176" + }, + { + "from_blog_id": 37, + "to_blog_id": 40, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/engineering-16/", + "id": "benchmark-edge-177" + }, + { + "from_blog_id": 38, + "to_blog_id": 34, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-10/", + "id": "benchmark-edge-178" + }, + { + "from_blog_id": 38, + "to_blog_id": 39, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-15/", + "id": "benchmark-edge-179" + }, + { + "from_blog_id": 38, + "to_blog_id": 41, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-17/", + "id": "benchmark-edge-180" + }, + { + "from_blog_id": 38, + "to_blog_id": 44, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-20/", + "id": "benchmark-edge-181" + }, + { + "from_blog_id": 39, + "to_blog_id": 26, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-02/", + "id": "benchmark-edge-182" + }, + { + "from_blog_id": 39, + "to_blog_id": 27, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-03/", + "id": "benchmark-edge-183" + }, + { + "from_blog_id": 39, + "to_blog_id": 28, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-04/", + "id": "benchmark-edge-184" + }, + { + "from_blog_id": 39, + "to_blog_id": 29, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-05/", + "id": "benchmark-edge-185" + }, + { + "from_blog_id": 39, + "to_blog_id": 34, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-10/", + "id": "benchmark-edge-186" + }, + { + "from_blog_id": 39, + "to_blog_id": 40, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-16/", + "id": "benchmark-edge-187" + }, + { + "from_blog_id": 39, + "to_blog_id": 46, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/engineering-22/", + "id": "benchmark-edge-188" + }, + { + "from_blog_id": 40, + "to_blog_id": 27, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-03/", + "id": "benchmark-edge-189" + }, + { + "from_blog_id": 40, + "to_blog_id": 32, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-08/", + "id": "benchmark-edge-190" + }, + { + "from_blog_id": 40, + "to_blog_id": 33, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-09/", + "id": "benchmark-edge-191" + }, + { + "from_blog_id": 40, + "to_blog_id": 35, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-11/", + "id": "benchmark-edge-192" + }, + { + "from_blog_id": 40, + "to_blog_id": 41, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-17/", + "id": "benchmark-edge-193" + }, + { + "from_blog_id": 40, + "to_blog_id": 43, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-19/", + "id": "benchmark-edge-194" + }, + { + "from_blog_id": 41, + "to_blog_id": 36, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-12/", + "id": "benchmark-edge-195" + }, + { + "from_blog_id": 41, + "to_blog_id": 39, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-15/", + "id": "benchmark-edge-196" + }, + { + "from_blog_id": 41, + "to_blog_id": 42, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-18/", + "id": "benchmark-edge-197" + }, + { + "from_blog_id": 41, + "to_blog_id": 44, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-20/", + "id": "benchmark-edge-198" + }, + { + "from_blog_id": 42, + "to_blog_id": 28, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/engineering-04/", + "id": "benchmark-edge-199" + }, + { + "from_blog_id": 42, + "to_blog_id": 31, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-07/", + "id": "benchmark-edge-200" + }, + { + "from_blog_id": 42, + "to_blog_id": 41, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-17/", + "id": "benchmark-edge-201" + }, + { + "from_blog_id": 42, + "to_blog_id": 43, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-19/", + "id": "benchmark-edge-202" + }, + { + "from_blog_id": 42, + "to_blog_id": 44, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-20/", + "id": "benchmark-edge-203" + }, + { + "from_blog_id": 43, + "to_blog_id": 27, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-03/", + "id": "benchmark-edge-204" + }, + { + "from_blog_id": 43, + "to_blog_id": 32, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-08/", + "id": "benchmark-edge-205" + }, + { + "from_blog_id": 43, + "to_blog_id": 35, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-11/", + "id": "benchmark-edge-206" + }, + { + "from_blog_id": 43, + "to_blog_id": 44, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-20/", + "id": "benchmark-edge-207" + }, + { + "from_blog_id": 43, + "to_blog_id": 45, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-21/", + "id": "benchmark-edge-208" + }, + { + "from_blog_id": 43, + "to_blog_id": 77, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-13/", + "id": "benchmark-edge-209" + }, + { + "from_blog_id": 44, + "to_blog_id": 4, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/indie-web-04/", + "id": "benchmark-edge-210" + }, + { + "from_blog_id": 44, + "to_blog_id": 28, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-04/", + "id": "benchmark-edge-211" + }, + { + "from_blog_id": 44, + "to_blog_id": 31, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-07/", + "id": "benchmark-edge-212" + }, + { + "from_blog_id": 44, + "to_blog_id": 34, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-10/", + "id": "benchmark-edge-213" + }, + { + "from_blog_id": 44, + "to_blog_id": 39, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-15/", + "id": "benchmark-edge-214" + }, + { + "from_blog_id": 44, + "to_blog_id": 43, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-19/", + "id": "benchmark-edge-215" + }, + { + "from_blog_id": 44, + "to_blog_id": 45, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-21/", + "id": "benchmark-edge-216" + }, + { + "from_blog_id": 45, + "to_blog_id": 26, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-02/", + "id": "benchmark-edge-217" + }, + { + "from_blog_id": 45, + "to_blog_id": 30, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-06/", + "id": "benchmark-edge-218" + }, + { + "from_blog_id": 45, + "to_blog_id": 32, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-08/", + "id": "benchmark-edge-219" + }, + { + "from_blog_id": 45, + "to_blog_id": 33, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-09/", + "id": "benchmark-edge-220" + }, + { + "from_blog_id": 45, + "to_blog_id": 35, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/engineering-11/", + "id": "benchmark-edge-221" + }, + { + "from_blog_id": 45, + "to_blog_id": 36, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-12/", + "id": "benchmark-edge-222" + }, + { + "from_blog_id": 45, + "to_blog_id": 46, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-22/", + "id": "benchmark-edge-223" + }, + { + "from_blog_id": 46, + "to_blog_id": 25, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/engineering-01/", + "id": "benchmark-edge-224" + }, + { + "from_blog_id": 46, + "to_blog_id": 35, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-11/", + "id": "benchmark-edge-225" + }, + { + "from_blog_id": 46, + "to_blog_id": 41, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-17/", + "id": "benchmark-edge-226" + }, + { + "from_blog_id": 46, + "to_blog_id": 42, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-18/", + "id": "benchmark-edge-227" + }, + { + "from_blog_id": 47, + "to_blog_id": 48, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-02/", + "id": "benchmark-edge-228" + }, + { + "from_blog_id": 47, + "to_blog_id": 77, + "link_text": "community bridge", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-13/", + "id": "benchmark-edge-229" + }, + { + "from_blog_id": 48, + "to_blog_id": 49, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-03/", + "id": "benchmark-edge-230" + }, + { + "from_blog_id": 48, + "to_blog_id": 61, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-15/", + "id": "benchmark-edge-231" + }, + { + "from_blog_id": 49, + "to_blog_id": 50, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-04/", + "id": 
"benchmark-edge-232" + }, + { + "from_blog_id": 49, + "to_blog_id": 55, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-09/", + "id": "benchmark-edge-233" + }, + { + "from_blog_id": 49, + "to_blog_id": 58, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-12/", + "id": "benchmark-edge-234" + }, + { + "from_blog_id": 49, + "to_blog_id": 62, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-16/", + "id": "benchmark-edge-235" + }, + { + "from_blog_id": 50, + "to_blog_id": 29, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-05/", + "id": "benchmark-edge-236" + }, + { + "from_blog_id": 50, + "to_blog_id": 51, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-05/", + "id": "benchmark-edge-237" + }, + { + "from_blog_id": 50, + "to_blog_id": 57, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-11/", + "id": "benchmark-edge-238" + }, + { + "from_blog_id": 51, + "to_blog_id": 52, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-06/", + "id": "benchmark-edge-239" + }, + { + "from_blog_id": 51, + "to_blog_id": 60, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-14/", + "id": "benchmark-edge-240" + }, + { + "from_blog_id": 51, + "to_blog_id": 62, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-16/", + "id": "benchmark-edge-241" + }, + { + "from_blog_id": 52, + "to_blog_id": 48, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-02/", + "id": "benchmark-edge-242" + }, + { + "from_blog_id": 52, + "to_blog_id": 53, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-07/", + "id": "benchmark-edge-243" + }, + { + "from_blog_id": 52, + "to_blog_id": 58, + "link_text": 
"friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-12/", + "id": "benchmark-edge-244" + }, + { + "from_blog_id": 52, + "to_blog_id": 59, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-13/", + "id": "benchmark-edge-245" + }, + { + "from_blog_id": 52, + "to_blog_id": 63, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-17/", + "id": "benchmark-edge-246" + }, + { + "from_blog_id": 53, + "to_blog_id": 47, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-01/", + "id": "benchmark-edge-247" + }, + { + "from_blog_id": 53, + "to_blog_id": 54, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-08/", + "id": "benchmark-edge-248" + }, + { + "from_blog_id": 53, + "to_blog_id": 57, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-11/", + "id": "benchmark-edge-249" + }, + { + "from_blog_id": 54, + "to_blog_id": 52, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-06/", + "id": "benchmark-edge-250" + }, + { + "from_blog_id": 54, + "to_blog_id": 55, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-09/", + "id": "benchmark-edge-251" + }, + { + "from_blog_id": 54, + "to_blog_id": 63, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-17/", + "id": "benchmark-edge-252" + }, + { + "from_blog_id": 54, + "to_blog_id": 70, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-06/", + "id": "benchmark-edge-253" + }, + { + "from_blog_id": 55, + "to_blog_id": 50, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-04/", + "id": "benchmark-edge-254" + }, + { + "from_blog_id": 55, + "to_blog_id": 56, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-10/", + "id": 
"benchmark-edge-255" + }, + { + "from_blog_id": 56, + "to_blog_id": 49, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-03/", + "id": "benchmark-edge-256" + }, + { + "from_blog_id": 56, + "to_blog_id": 57, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-11/", + "id": "benchmark-edge-257" + }, + { + "from_blog_id": 56, + "to_blog_id": 62, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-16/", + "id": "benchmark-edge-258" + }, + { + "from_blog_id": 57, + "to_blog_id": 58, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-12/", + "id": "benchmark-edge-259" + }, + { + "from_blog_id": 57, + "to_blog_id": 59, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-13/", + "id": "benchmark-edge-260" + }, + { + "from_blog_id": 57, + "to_blog_id": 61, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-15/", + "id": "benchmark-edge-261" + }, + { + "from_blog_id": 57, + "to_blog_id": 62, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-16/", + "id": "benchmark-edge-262" + }, + { + "from_blog_id": 58, + "to_blog_id": 53, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-07/", + "id": "benchmark-edge-263" + }, + { + "from_blog_id": 58, + "to_blog_id": 59, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-13/", + "id": "benchmark-edge-264" + }, + { + "from_blog_id": 59, + "to_blog_id": 48, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-02/", + "id": "benchmark-edge-265" + }, + { + "from_blog_id": 59, + "to_blog_id": 53, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-07/", + "id": "benchmark-edge-266" + }, + { + "from_blog_id": 59, + "to_blog_id": 60, + "link_text": 
"blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-14/", + "id": "benchmark-edge-267" + }, + { + "from_blog_id": 59, + "to_blog_id": 63, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-17/", + "id": "benchmark-edge-268" + }, + { + "from_blog_id": 60, + "to_blog_id": 47, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-01/", + "id": "benchmark-edge-269" + }, + { + "from_blog_id": 60, + "to_blog_id": 48, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-02/", + "id": "benchmark-edge-270" + }, + { + "from_blog_id": 60, + "to_blog_id": 49, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-03/", + "id": "benchmark-edge-271" + }, + { + "from_blog_id": 60, + "to_blog_id": 57, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-11/", + "id": "benchmark-edge-272" + }, + { + "from_blog_id": 60, + "to_blog_id": 61, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-15/", + "id": "benchmark-edge-273" + }, + { + "from_blog_id": 60, + "to_blog_id": 62, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-16/", + "id": "benchmark-edge-274" + }, + { + "from_blog_id": 60, + "to_blog_id": 63, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-17/", + "id": "benchmark-edge-275" + }, + { + "from_blog_id": 61, + "to_blog_id": 51, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-05/", + "id": "benchmark-edge-276" + }, + { + "from_blog_id": 61, + "to_blog_id": 54, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-08/", + "id": "benchmark-edge-277" + }, + { + "from_blog_id": 61, + "to_blog_id": 59, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-13/", + "id": 
"benchmark-edge-278" + }, + { + "from_blog_id": 61, + "to_blog_id": 62, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-16/", + "id": "benchmark-edge-279" + }, + { + "from_blog_id": 62, + "to_blog_id": 53, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-07/", + "id": "benchmark-edge-280" + }, + { + "from_blog_id": 62, + "to_blog_id": 63, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-17/", + "id": "benchmark-edge-281" + }, + { + "from_blog_id": 62, + "to_blog_id": 64, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-18/", + "id": "benchmark-edge-282" + }, + { + "from_blog_id": 63, + "to_blog_id": 34, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/engineering-10/", + "id": "benchmark-edge-283" + }, + { + "from_blog_id": 63, + "to_blog_id": 61, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-15/", + "id": "benchmark-edge-284" + }, + { + "from_blog_id": 63, + "to_blog_id": 64, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-18/", + "id": "benchmark-edge-285" + }, + { + "from_blog_id": 64, + "to_blog_id": 47, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/design-01/", + "id": "benchmark-edge-286" + }, + { + "from_blog_id": 64, + "to_blog_id": 49, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-03/", + "id": "benchmark-edge-287" + }, + { + "from_blog_id": 64, + "to_blog_id": 58, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-12/", + "id": "benchmark-edge-288" + }, + { + "from_blog_id": 64, + "to_blog_id": 59, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-13/", + "id": "benchmark-edge-289" + }, + { + "from_blog_id": 65, + "to_blog_id": 66, + "link_text": 
"blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-02/", + "id": "benchmark-edge-290" + }, + { + "from_blog_id": 65, + "to_blog_id": 75, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-11/", + "id": "benchmark-edge-291" + }, + { + "from_blog_id": 65, + "to_blog_id": 77, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-13/", + "id": "benchmark-edge-292" + }, + { + "from_blog_id": 65, + "to_blog_id": 82, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-18/", + "id": "benchmark-edge-293" + }, + { + "from_blog_id": 65, + "to_blog_id": 86, + "link_text": "community bridge", + "link_url_raw": "https://benchmark.heyblog.local/culture-02/", + "id": "benchmark-edge-294" + }, + { + "from_blog_id": 66, + "to_blog_id": 65, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-01/", + "id": "benchmark-edge-295" + }, + { + "from_blog_id": 66, + "to_blog_id": 67, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-03/", + "id": "benchmark-edge-296" + }, + { + "from_blog_id": 66, + "to_blog_id": 74, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-10/", + "id": "benchmark-edge-297" + }, + { + "from_blog_id": 66, + "to_blog_id": 76, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-12/", + "id": "benchmark-edge-298" + }, + { + "from_blog_id": 66, + "to_blog_id": 79, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-15/", + "id": "benchmark-edge-299" + }, + { + "from_blog_id": 67, + "to_blog_id": 65, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-01/", + "id": "benchmark-edge-300" + }, + { + "from_blog_id": 67, + "to_blog_id": 68, + "link_text": "blogroll", + "link_url_raw": 
"https://benchmark.heyblog.local/data-ai-04/", + "id": "benchmark-edge-301" + }, + { + "from_blog_id": 67, + "to_blog_id": 72, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-08/", + "id": "benchmark-edge-302" + }, + { + "from_blog_id": 67, + "to_blog_id": 75, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-11/", + "id": "benchmark-edge-303" + }, + { + "from_blog_id": 67, + "to_blog_id": 81, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-17/", + "id": "benchmark-edge-304" + }, + { + "from_blog_id": 67, + "to_blog_id": 83, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-19/", + "id": "benchmark-edge-305" + }, + { + "from_blog_id": 68, + "to_blog_id": 65, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-01/", + "id": "benchmark-edge-306" + }, + { + "from_blog_id": 68, + "to_blog_id": 69, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-307" + }, + { + "from_blog_id": 68, + "to_blog_id": 74, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-10/", + "id": "benchmark-edge-308" + }, + { + "from_blog_id": 68, + "to_blog_id": 79, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-15/", + "id": "benchmark-edge-309" + }, + { + "from_blog_id": 68, + "to_blog_id": 81, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-17/", + "id": "benchmark-edge-310" + }, + { + "from_blog_id": 68, + "to_blog_id": 84, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-20/", + "id": "benchmark-edge-311" + }, + { + "from_blog_id": 69, + "to_blog_id": 66, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-02/", + "id": "benchmark-edge-312" + 
}, + { + "from_blog_id": 69, + "to_blog_id": 68, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-04/", + "id": "benchmark-edge-313" + }, + { + "from_blog_id": 69, + "to_blog_id": 70, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-06/", + "id": "benchmark-edge-314" + }, + { + "from_blog_id": 69, + "to_blog_id": 73, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-09/", + "id": "benchmark-edge-315" + }, + { + "from_blog_id": 69, + "to_blog_id": 77, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-13/", + "id": "benchmark-edge-316" + }, + { + "from_blog_id": 69, + "to_blog_id": 83, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-19/", + "id": "benchmark-edge-317" + }, + { + "from_blog_id": 70, + "to_blog_id": 65, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-01/", + "id": "benchmark-edge-318" + }, + { + "from_blog_id": 70, + "to_blog_id": 67, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-03/", + "id": "benchmark-edge-319" + }, + { + "from_blog_id": 70, + "to_blog_id": 68, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-04/", + "id": "benchmark-edge-320" + }, + { + "from_blog_id": 70, + "to_blog_id": 69, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-321" + }, + { + "from_blog_id": 70, + "to_blog_id": 71, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-07/", + "id": "benchmark-edge-322" + }, + { + "from_blog_id": 70, + "to_blog_id": 80, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-16/", + "id": "benchmark-edge-323" + }, + { + "from_blog_id": 70, + "to_blog_id": 83, + "link_text": "friend link", + 
"link_url_raw": "https://benchmark.heyblog.local/data-ai-19/", + "id": "benchmark-edge-324" + }, + { + "from_blog_id": 71, + "to_blog_id": 66, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-02/", + "id": "benchmark-edge-325" + }, + { + "from_blog_id": 71, + "to_blog_id": 69, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-326" + }, + { + "from_blog_id": 71, + "to_blog_id": 72, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-08/", + "id": "benchmark-edge-327" + }, + { + "from_blog_id": 71, + "to_blog_id": 79, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-15/", + "id": "benchmark-edge-328" + }, + { + "from_blog_id": 72, + "to_blog_id": 70, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-06/", + "id": "benchmark-edge-329" + }, + { + "from_blog_id": 72, + "to_blog_id": 73, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-09/", + "id": "benchmark-edge-330" + }, + { + "from_blog_id": 72, + "to_blog_id": 82, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-18/", + "id": "benchmark-edge-331" + }, + { + "from_blog_id": 72, + "to_blog_id": 84, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-20/", + "id": "benchmark-edge-332" + }, + { + "from_blog_id": 73, + "to_blog_id": 68, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-04/", + "id": "benchmark-edge-333" + }, + { + "from_blog_id": 73, + "to_blog_id": 71, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-07/", + "id": "benchmark-edge-334" + }, + { + "from_blog_id": 73, + "to_blog_id": 74, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-10/", + "id": 
"benchmark-edge-335" + }, + { + "from_blog_id": 74, + "to_blog_id": 67, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-03/", + "id": "benchmark-edge-336" + }, + { + "from_blog_id": 74, + "to_blog_id": 69, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-337" + }, + { + "from_blog_id": 74, + "to_blog_id": 73, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-09/", + "id": "benchmark-edge-338" + }, + { + "from_blog_id": 74, + "to_blog_id": 75, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-11/", + "id": "benchmark-edge-339" + }, + { + "from_blog_id": 74, + "to_blog_id": 81, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-17/", + "id": "benchmark-edge-340" + }, + { + "from_blog_id": 75, + "to_blog_id": 71, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-07/", + "id": "benchmark-edge-341" + }, + { + "from_blog_id": 75, + "to_blog_id": 76, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-12/", + "id": "benchmark-edge-342" + }, + { + "from_blog_id": 76, + "to_blog_id": 69, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-343" + }, + { + "from_blog_id": 76, + "to_blog_id": 77, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-13/", + "id": "benchmark-edge-344" + }, + { + "from_blog_id": 76, + "to_blog_id": 84, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-20/", + "id": "benchmark-edge-345" + }, + { + "from_blog_id": 77, + "to_blog_id": 71, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-07/", + "id": "benchmark-edge-346" + }, + { + "from_blog_id": 77, + "to_blog_id": 78, + 
"link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-14/", + "id": "benchmark-edge-347" + }, + { + "from_blog_id": 77, + "to_blog_id": 81, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-17/", + "id": "benchmark-edge-348" + }, + { + "from_blog_id": 77, + "to_blog_id": 84, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-20/", + "id": "benchmark-edge-349" + }, + { + "from_blog_id": 78, + "to_blog_id": 66, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-02/", + "id": "benchmark-edge-350" + }, + { + "from_blog_id": 78, + "to_blog_id": 69, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-351" + }, + { + "from_blog_id": 78, + "to_blog_id": 74, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-10/", + "id": "benchmark-edge-352" + }, + { + "from_blog_id": 78, + "to_blog_id": 79, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-15/", + "id": "benchmark-edge-353" + }, + { + "from_blog_id": 78, + "to_blog_id": 92, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-08/", + "id": "benchmark-edge-354" + }, + { + "from_blog_id": 79, + "to_blog_id": 70, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-06/", + "id": "benchmark-edge-355" + }, + { + "from_blog_id": 79, + "to_blog_id": 76, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-12/", + "id": "benchmark-edge-356" + }, + { + "from_blog_id": 79, + "to_blog_id": 80, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-16/", + "id": "benchmark-edge-357" + }, + { + "from_blog_id": 79, + "to_blog_id": 81, + "link_text": "friend link", + "link_url_raw": 
"https://benchmark.heyblog.local/data-ai-17/", + "id": "benchmark-edge-358" + }, + { + "from_blog_id": 79, + "to_blog_id": 84, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-20/", + "id": "benchmark-edge-359" + }, + { + "from_blog_id": 80, + "to_blog_id": 63, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-17/", + "id": "benchmark-edge-360" + }, + { + "from_blog_id": 80, + "to_blog_id": 76, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-12/", + "id": "benchmark-edge-361" + }, + { + "from_blog_id": 80, + "to_blog_id": 78, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-14/", + "id": "benchmark-edge-362" + }, + { + "from_blog_id": 80, + "to_blog_id": 81, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-17/", + "id": "benchmark-edge-363" + }, + { + "from_blog_id": 81, + "to_blog_id": 65, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-01/", + "id": "benchmark-edge-364" + }, + { + "from_blog_id": 81, + "to_blog_id": 69, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-05/", + "id": "benchmark-edge-365" + }, + { + "from_blog_id": 81, + "to_blog_id": 78, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-14/", + "id": "benchmark-edge-366" + }, + { + "from_blog_id": 81, + "to_blog_id": 80, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-16/", + "id": "benchmark-edge-367" + }, + { + "from_blog_id": 81, + "to_blog_id": 82, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-18/", + "id": "benchmark-edge-368" + }, + { + "from_blog_id": 82, + "to_blog_id": 75, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-11/", + "id": "benchmark-edge-369" + }, 
+ { + "from_blog_id": 82, + "to_blog_id": 79, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-15/", + "id": "benchmark-edge-370" + }, + { + "from_blog_id": 82, + "to_blog_id": 83, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-19/", + "id": "benchmark-edge-371" + }, + { + "from_blog_id": 83, + "to_blog_id": 72, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-08/", + "id": "benchmark-edge-372" + }, + { + "from_blog_id": 83, + "to_blog_id": 73, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-09/", + "id": "benchmark-edge-373" + }, + { + "from_blog_id": 83, + "to_blog_id": 74, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-10/", + "id": "benchmark-edge-374" + }, + { + "from_blog_id": 83, + "to_blog_id": 75, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-11/", + "id": "benchmark-edge-375" + }, + { + "from_blog_id": 83, + "to_blog_id": 79, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-15/", + "id": "benchmark-edge-376" + }, + { + "from_blog_id": 83, + "to_blog_id": 84, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-20/", + "id": "benchmark-edge-377" + }, + { + "from_blog_id": 84, + "to_blog_id": 65, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-01/", + "id": "benchmark-edge-378" + }, + { + "from_blog_id": 84, + "to_blog_id": 66, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-02/", + "id": "benchmark-edge-379" + }, + { + "from_blog_id": 84, + "to_blog_id": 78, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/data-ai-14/", + "id": "benchmark-edge-380" + }, + { + "from_blog_id": 85, + "to_blog_id": 55, + "link_text": "community bridge", + 
"link_url_raw": "https://benchmark.heyblog.local/design-09/", + "id": "benchmark-edge-381" + }, + { + "from_blog_id": 85, + "to_blog_id": 86, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-02/", + "id": "benchmark-edge-382" + }, + { + "from_blog_id": 85, + "to_blog_id": 87, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-03/", + "id": "benchmark-edge-383" + }, + { + "from_blog_id": 85, + "to_blog_id": 89, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-05/", + "id": "benchmark-edge-384" + }, + { + "from_blog_id": 86, + "to_blog_id": 87, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-03/", + "id": "benchmark-edge-385" + }, + { + "from_blog_id": 86, + "to_blog_id": 96, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-12/", + "id": "benchmark-edge-386" + }, + { + "from_blog_id": 87, + "to_blog_id": 88, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-04/", + "id": "benchmark-edge-387" + }, + { + "from_blog_id": 87, + "to_blog_id": 93, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-09/", + "id": "benchmark-edge-388" + }, + { + "from_blog_id": 87, + "to_blog_id": 98, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-14/", + "id": "benchmark-edge-389" + }, + { + "from_blog_id": 88, + "to_blog_id": 89, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-05/", + "id": "benchmark-edge-390" + }, + { + "from_blog_id": 89, + "to_blog_id": 86, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-02/", + "id": "benchmark-edge-391" + }, + { + "from_blog_id": 89, + "to_blog_id": 88, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-04/", + "id": 
"benchmark-edge-392" + }, + { + "from_blog_id": 89, + "to_blog_id": 90, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-06/", + "id": "benchmark-edge-393" + }, + { + "from_blog_id": 90, + "to_blog_id": 91, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-07/", + "id": "benchmark-edge-394" + }, + { + "from_blog_id": 90, + "to_blog_id": 93, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-09/", + "id": "benchmark-edge-395" + }, + { + "from_blog_id": 91, + "to_blog_id": 85, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-01/", + "id": "benchmark-edge-396" + }, + { + "from_blog_id": 91, + "to_blog_id": 92, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-08/", + "id": "benchmark-edge-397" + }, + { + "from_blog_id": 91, + "to_blog_id": 94, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-10/", + "id": "benchmark-edge-398" + }, + { + "from_blog_id": 92, + "to_blog_id": 90, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-06/", + "id": "benchmark-edge-399" + }, + { + "from_blog_id": 92, + "to_blog_id": 93, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-09/", + "id": "benchmark-edge-400" + }, + { + "from_blog_id": 92, + "to_blog_id": 97, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-13/", + "id": "benchmark-edge-401" + }, + { + "from_blog_id": 92, + "to_blog_id": 100, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-16/", + "id": "benchmark-edge-402" + }, + { + "from_blog_id": 93, + "to_blog_id": 59, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/design-13/", + "id": "benchmark-edge-403" + }, + { + "from_blog_id": 93, + "to_blog_id": 92, + "link_text": 
"friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-08/", + "id": "benchmark-edge-404" + }, + { + "from_blog_id": 93, + "to_blog_id": 94, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-10/", + "id": "benchmark-edge-405" + }, + { + "from_blog_id": 94, + "to_blog_id": 89, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-05/", + "id": "benchmark-edge-406" + }, + { + "from_blog_id": 94, + "to_blog_id": 95, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-11/", + "id": "benchmark-edge-407" + }, + { + "from_blog_id": 94, + "to_blog_id": 97, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-13/", + "id": "benchmark-edge-408" + }, + { + "from_blog_id": 95, + "to_blog_id": 89, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-05/", + "id": "benchmark-edge-409" + }, + { + "from_blog_id": 95, + "to_blog_id": 91, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-07/", + "id": "benchmark-edge-410" + }, + { + "from_blog_id": 95, + "to_blog_id": 94, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-10/", + "id": "benchmark-edge-411" + }, + { + "from_blog_id": 95, + "to_blog_id": 96, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-12/", + "id": "benchmark-edge-412" + }, + { + "from_blog_id": 95, + "to_blog_id": 97, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-13/", + "id": "benchmark-edge-413" + }, + { + "from_blog_id": 96, + "to_blog_id": 85, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-01/", + "id": "benchmark-edge-414" + }, + { + "from_blog_id": 96, + "to_blog_id": 87, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-03/", + 
"id": "benchmark-edge-415" + }, + { + "from_blog_id": 96, + "to_blog_id": 93, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-09/", + "id": "benchmark-edge-416" + }, + { + "from_blog_id": 96, + "to_blog_id": 97, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-13/", + "id": "benchmark-edge-417" + }, + { + "from_blog_id": 96, + "to_blog_id": 98, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-14/", + "id": "benchmark-edge-418" + }, + { + "from_blog_id": 97, + "to_blog_id": 98, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-14/", + "id": "benchmark-edge-419" + }, + { + "from_blog_id": 98, + "to_blog_id": 91, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-07/", + "id": "benchmark-edge-420" + }, + { + "from_blog_id": 98, + "to_blog_id": 99, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-15/", + "id": "benchmark-edge-421" + }, + { + "from_blog_id": 99, + "to_blog_id": 90, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-06/", + "id": "benchmark-edge-422" + }, + { + "from_blog_id": 99, + "to_blog_id": 91, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-07/", + "id": "benchmark-edge-423" + }, + { + "from_blog_id": 99, + "to_blog_id": 96, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-12/", + "id": "benchmark-edge-424" + }, + { + "from_blog_id": 99, + "to_blog_id": 100, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-16/", + "id": "benchmark-edge-425" + }, + { + "from_blog_id": 100, + "to_blog_id": 85, + "link_text": "blogroll", + "link_url_raw": "https://benchmark.heyblog.local/culture-01/", + "id": "benchmark-edge-426" + }, + { + "from_blog_id": 100, + "to_blog_id": 95, + 
"link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-11/", + "id": "benchmark-edge-427" + }, + { + "from_blog_id": 100, + "to_blog_id": 97, + "link_text": "friend link", + "link_url_raw": "https://benchmark.heyblog.local/culture-13/", + "id": "benchmark-edge-428" + } + ], + "meta": { + "strategy": "synthetic-community-benchmark", + "limit": 100, + "source": "scripts/generate_visualization_benchmark.py", + "generated_at": "2026-06-05T18:43:51.851750+00:00", + "total_nodes": 100, + "total_edges": 428, + "selected_nodes": 100, + "selected_edges": 428, + "available_nodes": 100, + "available_edges": 428, + "benchmark": { + "seed": 42, + "model": "seeded stochastic block model inspired by LFR mixing-parameter benchmarks", + "community_sizes": { + "indie-web": 24, + "engineering": 22, + "design": 18, + "data-ai": 20, + "culture": 16 + }, + "intra_probability": 0.34, + "inter_probability": 0.002, + "estimated_mixing_rate": 0.006, + "layout": "fixed separated community centers with deterministic jitter" + } + } +} diff --git a/frontend/server.py b/frontend/server.py index add1438..75b0004 100644 --- a/frontend/server.py +++ b/frontend/server.py @@ -126,10 +126,15 @@ async def proxy_api(path: str, request: Request) -> Response: content=await request.body(), headers=headers, ) + response_headers = {} + cache_control = forwarded.headers.get("cache-control") + if cache_control: + response_headers["cache-control"] = cache_control return Response( content=forwarded.content, status_code=forwarded.status_code, media_type=forwarded.headers.get("content-type"), + headers=response_headers, ) @app.get("/{path:path}", include_in_schema=False) diff --git a/frontend/src/App.test.tsx b/frontend/src/App.test.tsx index a4c49d4..e77d10e 100644 --- a/frontend/src/App.test.tsx +++ b/frontend/src/App.test.tsx @@ -1,10 +1,52 @@ import { afterEach, beforeEach, expect, test, vi } from "vitest"; -import { act, cleanup, fireEvent, render, screen, waitFor } from 
"@testing-library/react"; +import { act, cleanup, fireEvent, render, screen, waitFor, within } from "@testing-library/react"; + +const { forceGraphProps } = vi.hoisted(() => ({ + forceGraphProps: [] as Record<string, any>[], +})); + +const { forceGraph2DProps } = vi.hoisted(() => ({ + forceGraph2DProps: [] as Record<string, any>[], +})); vi.mock("react-force-graph-3d", () => ({ - default: () => <div data-testid="force-graph-3d" />, + default: (props: Record<string, any>) => { + forceGraphProps.push(props); + return <div data-testid="force-graph-3d" />; + }, })); +vi.mock("react-force-graph-2d", async () => { + const React = await vi.importActual<typeof import("react")>("react"); + return { + default: React.forwardRef((props: Record<string, any>, ref) => { + React.useImperativeHandle(ref, () => ({ + d3Force: () => ({ + distance: vi.fn(), + strength: vi.fn(), + }), + d3ReheatSimulation: vi.fn(), + zoomToFit: vi.fn(), + })); + forceGraph2DProps.push(props); + return ( + <div data-testid="force-graph-2d"> + {props.graphData.nodes.map((node: Record<string, any>) => ( + <button + key={node.id} + type="button" + aria-label={`${node.label} ${node.url}`} + onMouseEnter={() => props.onNodeHover?.(node, null)} + onMouseLeave={() => props.onNodeHover?.(null, node)} + onClick={() => props.onNodeClick?.(node, new MouseEvent("click"))} + /> + ))} + </div> + ); + }), + }; +}); + import App from "./App"; function makeCatalogItem(id: number, crawlStatus: string, title: string) { @@ -33,6 +75,123 @@ function makeCatalogItem(id: number, crawlStatus: string, title: string) { }; } +function makeDetailPayload(item: Record<string, unknown>) { + const relatedBlog = makeCatalogItem(88, "FINISHED", "Related Blog"); + const downstreamBlog = makeCatalogItem(87, "FINISHED", "Downstream Blog"); + const recommendedBlog = makeCatalogItem(89, "FINISHED", "Recommended Blog"); + const viaBlog = makeCatalogItem(90, "FINISHED", "Mutual Blog"); + return { + ...item, + icon_url: 
`https://${String(item.domain)}/favicon.ico`, + incoming_edges: [ + { + id: "incoming-1", + from_blog_id: relatedBlog.id, + to_blog_id: item.id, + link_text: "friend", + link_url_raw: item.url, + neighbor_blog: relatedBlog, + }, + ], + outgoing_edges: [ + { + id: "outgoing-1", + from_blog_id: item.id, + to_blog_id: relatedBlog.id, + link_text: "blogroll", + link_url_raw: relatedBlog.url, + neighbor_blog: relatedBlog, + }, + { + id: "outgoing-2", + from_blog_id: item.id, + to_blog_id: downstreamBlog.id, + link_text: "next", + link_url_raw: downstreamBlog.url, + neighbor_blog: downstreamBlog, + }, + ], + recommended_blogs: [ + { + ...recommendedBlog, + via_blogs: [viaBlog], + }, + ], + discovery_path: { + mode: "crawled", + origin_source: "seed", + origin_label: "种子导入", + target_source: "rss", + truncated: false, + steps: [ + { + blog: relatedBlog, + blog_id: relatedBlog.id, + url: relatedBlog.url, + domain: relatedBlog.domain, + accepted_by: "seed", + accepted_label: "种子导入", + raw_id: null, + raw_source_blog_id: null, + raw_accepted_by: null, + discovered_at: null, + }, + { + blog: item, + blog_id: item.id, + url: String(item.url), + domain: String(item.domain), + accepted_by: null, + accepted_label: "RSS 判定", + raw_id: 22, + raw_source_blog_id: relatedBlog.id, + raw_accepted_by: "rss", + discovered_at: "2026-04-20T10:00:00Z", + }, + ], + }, + relation_graphs: { + incoming: { + direction: "incoming", + focus_blog_id: item.id, + depth: 2, + nodes: [item, relatedBlog], + edges: [ + { + id: "incoming-1", + from_blog_id: relatedBlog.id, + to_blog_id: item.id, + link_text: "friend", + link_url_raw: item.url, + }, + ], + }, + outgoing: { + direction: "outgoing", + focus_blog_id: item.id, + depth: 2, + nodes: [item, relatedBlog, downstreamBlog], + edges: [ + { + id: "outgoing-1", + from_blog_id: item.id, + to_blog_id: relatedBlog.id, + link_text: "blogroll", + link_url_raw: relatedBlog.url, + }, + { + id: "outgoing-2", + from_blog_id: item.id, + to_blog_id: 
downstreamBlog.id, + link_text: "next", + link_url_raw: downstreamBlog.url, + }, + ], + }, + }, + }; +} + function sortCatalogItems(items: Array<Record<string, unknown>>, sort: string) { const copied = [...items]; if (sort === "id_desc") { @@ -66,10 +225,36 @@ let statusPayload = { total_edges: 10, }; +class TestResizeObserver { + callback: ResizeObserverCallback; + + constructor(callback: ResizeObserverCallback) { + this.callback = callback; + } + + observe() { + this.callback( + [ + { + contentRect: { width: 960, height: 720 }, + } as ResizeObserverEntry, + ], + this, + ); + } + + unobserve() {} + + disconnect() {} +} + beforeEach(() => { cleanup(); vi.restoreAllMocks(); vi.useFakeTimers({ shouldAdvanceTime: true }); + vi.stubGlobal("ResizeObserver", TestResizeObserver); + forceGraphProps.length = 0; + forceGraph2DProps.length = 0; window.history.replaceState({}, "", "/"); catalogItems = [...baseCatalogItems, makeCatalogItem(33, "PROCESSING", "Newest Processing Blog")]; window.localStorage.clear(); @@ -83,22 +268,27 @@ beforeEach(() => { total_edges: 10, }; - const fetchMock = vi.fn(async (input: RequestInfo | URL) => { + const fetchMock = vi.fn(async (input: RequestInfo | URL, init?: RequestInit) => { const url = new URL(String(input), "http://localhost"); if (url.pathname === "/api/blogs/catalog") { const page = Number(url.searchParams.get("page") || "1"); const pageSize = Number(url.searchParams.get("page_size") || "30"); const status = url.searchParams.get("status"); const query = (url.searchParams.get("q") || "").trim().toLowerCase(); + const urlQuery = (url.searchParams.get("url") || "").trim().toLowerCase(); const sort = url.searchParams.get("sort") || "id_asc"; const filteredItems = sortCatalogItems( (status ? catalogItems.filter((item) => item.crawl_status === status) : catalogItems).filter((item) => { - if (!query) { + if (!query && !urlQuery) { return true; } const title = String(item.title ?? "").toLowerCase(); const blogUrl = String(item.url ?? 
"").toLowerCase(); - return title.includes(query) || blogUrl.includes(query); + const normalizedUrl = String(item.normalized_url ?? "").toLowerCase(); + return ( + (!query || title.includes(query) || blogUrl.includes(query)) && + (!urlQuery || blogUrl.includes(urlQuery) || normalizedUrl.includes(urlQuery)) + ); }), sort, ); @@ -117,11 +307,204 @@ beforeEach(() => { }), ); } + if (url.pathname === "/api/recommendations/random-blog-batches") { + const body = JSON.parse(String(init?.body ?? "{}")); + const count = Number(body.count || 9); + const filteredItems = sortCatalogItems( + catalogItems.filter((item) => item.crawl_status === "FINISHED"), + "random", + ).slice(0, count); + return new Response( + JSON.stringify({ + request_uuid: "request-random-1", + surface: "random_blog_page", + strategy: "weighted_random", + strategy_version: "v1", + visitor_id: body.visitor_id, + session_id: body.session_id, + requested_count: count, + served_count: filteredItems.length, + created_at: "2026-06-07T13:30:00Z", + items: filteredItems.map((item, index) => ({ + ...item, + request_uuid: "request-random-1", + impression_id: index + 101, + position: index + 1, + })), + }), + ); + } + if (url.pathname === "/api/recommendation-events") { + const body = JSON.parse(String(init?.body ?? "{}")); + return new Response( + JSON.stringify({ + id: 1, + ...body, + duplicate: false, + }), + ); + } + if (url.pathname === "/api/blogs/user-seeds") { + const body = JSON.parse(String(init?.body ?? 
"{}")); + const submittedUrl = String(body.homepage_url || "https://missing-blog.example.com/"); + if (submittedUrl === "https://blog.sayori.org/") { + return new Response(JSON.stringify({ detail: "rule:blocked_tld" }), { status: 422 }); + } + const item = { + ...makeCatalogItem(444, "WAITING", "Missing Blog"), + url: submittedUrl, + normalized_url: submittedUrl, + domain: submittedUrl.replace(/^https?:\/\//, "").replace(/\/$/, ""), + }; + return new Response( + JSON.stringify({ + status: "QUEUED", + blog_id: item.id, + inserted: true, + blog: item, + }), + ); + } + const blogDetailMatch = url.pathname.match(/^\/api\/blogs\/(\d+)$/); + if (blogDetailMatch) { + const detailItem = catalogItems.find((item) => Number(item.id) === Number(blogDetailMatch[1])); + if (!detailItem) { + return new Response(JSON.stringify({ detail: "not_found" }), { status: 404 }); + } + return new Response(JSON.stringify(makeDetailPayload(detailItem))); + } if (url.pathname === "/api/status") { return new Response(JSON.stringify(statusPayload)); } + if (url.pathname === "/api/auth/register") { + return new Response( + JSON.stringify({ + sent: true, + verification_token: "mail-token", + expires_at: "2026-06-11T00:00:00Z", + }), + ); + } + if (url.pathname === "/api/auth/email/verify/confirm") { + return new Response( + JSON.stringify({ + id: 7, + email: "new@example.com", + display_name: "new", + role: "user", + is_active: true, + email_verified: true, + email_verified_at: "2026-06-10T00:10:00Z", + created_at: "2026-06-10T00:00:00Z", + updated_at: "2026-06-10T00:10:00Z", + }), + ); + } + if (url.pathname === "/api/auth/me") { + return new Response( + JSON.stringify({ + id: 7, + email: "new@example.com", + display_name: "new", + role: "user", + is_active: true, + email_verified: false, + email_verified_at: null, + created_at: "2026-06-10T00:00:00Z", + updated_at: "2026-06-10T00:00:00Z", + }), + ); + } + if (url.pathname === "/api/me/label-stats") { + return new Response(JSON.stringify({ 
label_count: 3 })); + } if (url.pathname === "/api/stats") { - return new Response(JSON.stringify({ total_blogs: 34, total_edges: 10 })); + return new Response(JSON.stringify({ total_blogs: statusPayload.total_blogs, total_edges: statusPayload.total_edges })); + } + if (url.pathname === "/api/admin/runtime/status") { + return new Response( + JSON.stringify({ + runner_status: "idle", + active_run_id: null, + worker_count: 0, + active_workers: 0, + current_blog_id: null, + current_url: null, + current_stage: null, + elapsed_seconds: null, + maintenance_in_progress: false, + }), + ); + } + if (url.pathname === "/api/admin/runtime/current") { + return new Response( + JSON.stringify({ + runner_status: "idle", + active_run_id: null, + worker_count: 0, + active_workers: 0, + current_blog_id: null, + current_url: null, + current_stage: null, + elapsed_seconds: null, + }), + ); + } + if (url.pathname === "/api/admin/hourly-stats") { + const row = { + id: 1, + hour_start: "2026-06-11T10:00:00Z", + user_count: 12, + random_request_count: 3, + random_impression_count: 27, + detail_open_count: 4, + external_open_count: 5, + detail_ctr: 4 / 27, + external_ctr: 5 / 27, + total_click_ctr: 9 / 27, + refreshed_at: "2026-06-11T10:05:00Z", + created_at: "2026-06-11T10:05:00Z", + }; + return new Response(JSON.stringify({ current_hour: row, latest: row, items: [row] })); + } + if (url.pathname === "/api/admin/blog-labeling/counts") { + return new Response(JSON.stringify({ total_labeled: 0, by_label: {} })); + } + if (url.pathname === "/api/admin/blog-labeling/parquet-status") { + return new Response( + JSON.stringify({ + path: "/tmp/blog-label-training.parquet", + filename: "blog-label-training.parquet", + exists: false, + saved_count: 0, + total_labeled: 0, + missing_count: 0, + batch_size: 100, + rewritten: false, + message: "not ready", + updated_at: null, + }), + ); + } + if (url.pathname === "/api/admin/blog-labeling/candidates") { + return new Response( + JSON.stringify({ + items: 
[], + available_tags: [ + { id: 1, name: "blog", slug: "blog", created_at: "2026-06-11T00:00:00Z", updated_at: "2026-06-11T00:00:00Z" }, + { id: 2, name: "company", slug: "company", created_at: "2026-06-11T00:00:00Z", updated_at: "2026-06-11T00:00:00Z" }, + { id: 3, name: "other", slug: "other", created_at: "2026-06-11T00:00:00Z", updated_at: "2026-06-11T00:00:00Z" }, + { id: 4, name: "unknown", slug: "unknown", created_at: "2026-06-11T00:00:00Z", updated_at: "2026-06-11T00:00:00Z" }, + ], + page: 1, + page_size: 9, + total_items: 0, + total_pages: 1, + has_next: false, + has_prev: false, + sort: "id_desc", + }), + ); } if (url.pathname === "/api/filter-stats") { return new Response( @@ -162,11 +545,67 @@ beforeEach(() => { domain: "graph.example.com", title: "Graph Example", icon_url: null, + incoming_count: 2, + outgoing_count: 1, + }, + { + id: 2, + url: "https://two.example.com/", + domain: "two.example.com", + title: "Two Example", + icon_url: null, + incoming_count: 1, + outgoing_count: 1, + }, + { + id: 3, + url: "https://three.example.com/", + domain: "three.example.com", + title: "Three Example", + icon_url: null, + incoming_count: 1, + outgoing_count: 1, + }, + { + id: 4, + url: "https://leaf.example.com/", + domain: "leaf.example.com", + title: "Leaf Example", + icon_url: null, incoming_count: 0, - outgoing_count: 0, + outgoing_count: 1, + }, + ], + edges: [ + { + id: "edge-1-2", + from_blog_id: 1, + to_blog_id: 2, + link_text: null, + link_url_raw: "https://two.example.com/", + }, + { + id: "edge-2-3", + from_blog_id: 2, + to_blog_id: 3, + link_text: null, + link_url_raw: "https://three.example.com/", + }, + { + id: "edge-3-1", + from_blog_id: 3, + to_blog_id: 1, + link_text: null, + link_url_raw: "https://graph.example.com/", + }, + { + id: "edge-1-4", + from_blog_id: 1, + to_blog_id: 4, + link_text: null, + link_url_raw: "https://leaf.example.com/", }, ], - edges: [], meta: { strategy: "degree", limit: 200, @@ -174,6 +613,51 @@ beforeEach(() => { }), 
); } + if (url.pathname === "/benchmarks/blog-community-graph.json") { + return new Response( + JSON.stringify({ + nodes: [ + { + id: 1, + url: "https://benchmark.heyblog.local/indie-web-01/", + domain: "indie-web-01.benchmark.heyblog.local", + title: "Indie Web Notes 01", + icon_url: null, + incoming_count: 1, + outgoing_count: 1, + degree: 2, + component_id: "indie-web", + }, + { + id: 2, + url: "https://benchmark.heyblog.local/indie-web-02/", + domain: "indie-web-02.benchmark.heyblog.local", + title: "Indie Web Notes 02", + icon_url: null, + incoming_count: 1, + outgoing_count: 1, + degree: 2, + component_id: "indie-web", + }, + ], + edges: [ + { + id: "benchmark-edge-001", + from_blog_id: 1, + to_blog_id: 2, + link_text: "blogroll", + link_url_raw: "https://benchmark.heyblog.local/indie-web-02/", + }, + ], + meta: { + strategy: "synthetic-community-benchmark", + limit: 2, + total_nodes: 2, + total_edges: 1, + }, + }), + ); + } throw new Error(`Unhandled fetch: ${url.toString()}`); }); @@ -184,144 +668,352 @@ afterEach(() => { vi.useRealTimers(); }); -test("renders paginated home cards, reloads from server for filters, and refreshes statuses by polling", async () => { +test("renders the home summary with URL search while keeping queue metrics and catalog cards hidden", async () => { render(<App />); await waitFor(() => { expect(screen.getByRole("heading", { name: "HeyBlog!" 
})).toBeInTheDocument(); }); expect(screen.getByText("基于友链爬取所有博客!")).toBeInTheDocument(); + expect(screen.getByText("总节点数")).toBeInTheDocument(); + expect(screen.getByText("总连接数")).toBeInTheDocument(); + expect(screen.getByText("34")).toBeInTheDocument(); + expect(screen.getByText("10")).toBeInTheDocument(); + expect(screen.queryByText("待处理队列")).not.toBeInTheDocument(); + expect(screen.queryByText("处理中 / 失败")).not.toBeInTheDocument(); + expect(fetch).not.toHaveBeenCalledWith(expect.stringContaining("/api/status"), expect.anything()); - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_asc&status=PROCESSING"), - expect.anything(), + expect(fetch).not.toHaveBeenCalledWith(expect.stringContaining("/api/blogs/catalog"), expect.anything()); + expect(screen.queryByRole("button", { name: "ALL" })).not.toBeInTheDocument(); + expect(screen.queryByRole("button", { name: "PROCESSING" })).not.toBeInTheDocument(); + expect(screen.queryByRole("button", { name: "WAITING" })).not.toBeInTheDocument(); + expect(screen.queryByRole("button", { name: "FINISHED" })).not.toBeInTheDocument(); + expect(screen.queryByRole("button", { name: "FAILED" })).not.toBeInTheDocument(); + expect(screen.queryByText("Processing Blog")).not.toBeInTheDocument(); + expect(screen.queryByText("Waiting Blog")).not.toBeInTheDocument(); + expect(screen.queryByText("Finished Blog")).not.toBeInTheDocument(); + expect(screen.queryByText("Failed Blog")).not.toBeInTheDocument(); + expect(screen.getByPlaceholderText("输入你的博客链接,看看你的博客有没有被找到吧!")).toBeInTheDocument(); + + await act(async () => { + await vi.advanceTimersByTimeAsync(5000); + }); + + await waitFor(() => { + expect(fetch).toHaveBeenCalledWith(expect.stringContaining("/api/stats"), expect.anything()); + }); + expect(fetch).not.toHaveBeenCalledWith(expect.stringContaining("/api/blogs/catalog"), expect.anything()); + expect(fetch).not.toHaveBeenCalledWith(expect.stringContaining("/api/status"), 
expect.anything()); +}); + +test("shows the admin navigation item only for active verified admin sessions", async () => { + window.localStorage.setItem( + "heyblog_user_session", + JSON.stringify({ + token: "admin-session-token", + expiresAt: "2026-07-10T00:00:00Z", + user: { + id: 70, + email: "admin@magic-knowledge.top", + displayName: "admin", + role: "admin", + isActive: true, + emailVerified: true, + emailVerifiedAt: "2026-06-11T00:00:00Z", + createdAt: "2026-06-11T00:00:00Z", + updatedAt: "2026-06-11T00:00:00Z", + }, + }), ); - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_asc&status=WAITING"), - expect.anything(), + + render(<App />); + + expect(await screen.findByRole("link", { name: "管理" })).toHaveAttribute("href", "/admin"); +}); + +test("renders hourly admin statistics for verified admin sessions", async () => { + window.history.replaceState({}, "", "/admin"); + window.localStorage.setItem( + "heyblog_user_session", + JSON.stringify({ + token: "admin-session-token", + expiresAt: "2026-07-10T00:00:00Z", + user: { + id: 70, + email: "admin@magic-knowledge.top", + displayName: "admin", + role: "admin", + isActive: true, + emailVerified: true, + emailVerifiedAt: "2026-06-11T00:00:00Z", + createdAt: "2026-06-11T00:00:00Z", + updatedAt: "2026-06-11T00:00:00Z", + }, + }), ); - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_asc&status=FINISHED"), - expect.anything(), + + render(<App />); + + expect(await screen.findByRole("heading", { name: "后台统计" })).toBeInTheDocument(); + expect(screen.getByText("当前用户数")).toBeInTheDocument(); + expect(screen.getAllByText("12").length).toBeGreaterThan(0); + expect(screen.getByText("随机请求 / 曝光")).toBeInTheDocument(); + expect(screen.getByText("3 / 27")).toBeInTheDocument(); + expect(screen.getByText("详情点击率")).toBeInTheDocument(); + expect(screen.getByText("外链点击率")).toBeInTheDocument(); + 
expect(screen.getAllByText("14.81%").length).toBeGreaterThan(0); + expect(screen.getAllByText("18.52%").length).toBeGreaterThan(0); +}); + +test("hides admin navigation and renders 404 for non-admin direct admin URLs", async () => { + window.localStorage.setItem( + "heyblog_user_session", + JSON.stringify({ + token: "user-session-token", + expiresAt: "2026-07-10T00:00:00Z", + user: { + id: 71, + email: "1304412077@qq.com", + displayName: "user", + role: "user", + isActive: true, + emailVerified: true, + emailVerifiedAt: "2026-06-11T00:00:00Z", + createdAt: "2026-06-11T00:00:00Z", + updatedAt: "2026-06-11T00:00:00Z", + }, + }), ); - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_asc&status=FAILED"), - expect.anything(), + window.history.replaceState({}, "", "/admin"); + + render(<App />); + + expect(await screen.findByText("404")).toBeInTheDocument(); + expect(screen.getByRole("heading", { name: "页面不存在" })).toBeInTheDocument(); + expect(screen.queryByRole("link", { name: "管理" })).not.toBeInTheDocument(); + expect(screen.queryByText("管理控制台")).not.toBeInTheDocument(); +}); + +test("lets home users search normalized URLs and open the blog detail route", async () => { + catalogItems = catalogItems.map((item) => + Number(item.id) === 3 ? 
{ ...item, icon_url: "https://finished-blog.example.com/favicon.ico" } : item, ); - expect(screen.getByText("Processing Blog")).toBeInTheDocument(); - expect(screen.getByText("Newest Processing Blog")).toBeInTheDocument(); - expect(screen.getByText("Waiting Blog")).toBeInTheDocument(); - expect(screen.getByText("当前显示第 1 / 2 页,本页 30 个,共 34 个博客")).toBeInTheDocument(); - expect(screen.getByRole("button", { name: "PROCESSING" })).toBeInTheDocument(); - expect(screen.getByRole("button", { name: "FAILED" })).toBeInTheDocument(); - const titles = screen.getAllByRole("heading", { level: 3 }).map((node) => node.textContent); - expect(titles.slice(0, 4)).toEqual(["Processing Blog", "Newest Processing Blog", "Waiting Blog", "Newest Waiting Blog"]); + render(<App />); - fireEvent.click(screen.getByRole("button", { name: "FAILED" })); + const input = await screen.findByPlaceholderText("输入你的博客链接,看看你的博客有没有被找到吧!"); + fireEvent.change(input, { target: { value: "finished-blog.example.com" } }); + fireEvent.click(screen.getByRole("button", { name: "搜索博客" })); await waitFor(() => { - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_desc&status=FAILED"), - expect.anything(), - ); + const searchCall = vi + .mocked(fetch) + .mock.calls.find(([input]) => String(input).includes("/api/blogs/catalog?")); + expect(searchCall).toBeDefined(); + const requestUrl = new URL(String(searchCall![0]), "http://localhost"); + expect(requestUrl.searchParams.get("page")).toBe("1"); + expect(requestUrl.searchParams.get("page_size")).toBe("30"); + expect(requestUrl.searchParams.get("url")).toBe("finished-blog.example.com"); + expect(requestUrl.searchParams.get("sort")).toBe("id_desc"); }); - expect(screen.getByText("Failed Blog")).toBeInTheDocument(); - expect(screen.queryByText("Processing Blog")).not.toBeInTheDocument(); - expect(screen.getByText("当前显示第 1 / 1 页,本页 1 个,共 1 个博客")).toBeInTheDocument(); + expect(screen.getByText("1 
个匹配")).toBeInTheDocument(); + expect(screen.getByText("Finished Blog")).toBeInTheDocument(); + expect(screen.getByText("https://finished-blog.example.com/")).toBeInTheDocument(); + expect(screen.getAllByAltText("finished-blog.example.com icon")).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + src: "https://finished-blog.example.com/favicon.ico", + }), + ]), + ); - fireEvent.click(screen.getByRole("button", { name: "WAITING" })); + fireEvent.click(screen.getByRole("button", { name: /Finished Blog/i })); await waitFor(() => { - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_asc&status=WAITING"), - expect.anything(), - ); + expect(window.location.pathname).toBe("/blogs/3"); }); - const waitingTitles = screen.getAllByRole("heading", { level: 3 }).map((node) => node.textContent); - expect(waitingTitles.slice(0, 2)).toEqual(["Waiting Blog", "Newest Waiting Blog"]); + expect( + vi + .mocked(fetch) + .mock.calls.some( + ([input, init]) => + String(input).includes("/api/recommendation-events") && + String(init?.body).includes('"event_type":"detail_open"') && + String(init?.body).includes('"entrance_kind":"home_search_result"') && + String(init?.body).includes('"entrance_url"'), + ), + ).toBe(true); + expect(screen.queryByRole("heading", { name: "HeyBlog!" 
})).not.toBeInTheDocument(); + await waitFor(() => { + expect(screen.getByRole("heading", { name: "Finished Blog" })).toBeInTheDocument(); + }); + expect(screen.getAllByAltText("finished-blog.example.com icon")).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + src: "https://finished-blog.example.com/favicon.ico", + }), + ]), + ); + expect(screen.getByRole("heading", { name: "博客关联" })).toBeInTheDocument(); + expect(screen.getByRole("heading", { name: "发现路径" })).toBeInTheDocument(); + expect(screen.getByText("https://related-blog.example.com/")).toBeInTheDocument(); + expect(screen.getByRole("button", { name: "入链关系" })).toBeInTheDocument(); + expect(screen.getByRole("button", { name: "出链关系" })).toBeInTheDocument(); + const relatedNode = screen.getByLabelText("Related Blog https://related-blog.example.com/"); + expect(relatedNode).toBeInTheDocument(); + fireEvent.mouseEnter(relatedNode); + const tooltip = screen.getByRole("tooltip"); + expect(within(tooltip).getByText("Related Blog")).toBeInTheDocument(); + expect(within(tooltip).getByText("https://related-blog.example.com/")).toBeInTheDocument(); + fireEvent.click(screen.getByRole("button", { name: "出链关系" })); + expect(screen.getByLabelText("Downstream Blog https://downstream-blog.example.com/")).toBeInTheDocument(); + expect(screen.queryByText("种子导入")).not.toBeInTheDocument(); + expect(screen.queryByText("RSS 判定")).not.toBeInTheDocument(); + expect(screen.queryByText(/源头/)).not.toBeInTheDocument(); + expect(screen.queryByRole("heading", { name: "直接相关博客" })).not.toBeInTheDocument(); + expect(screen.queryByRole("heading", { name: "推荐博客" })).not.toBeInTheDocument(); + expect(screen.queryByRole("heading", { name: "基础信息" })).not.toBeInTheDocument(); + expect(screen.queryByText("通过 Mutual Blog 关联")).not.toBeInTheDocument(); +}); - fireEvent.click(screen.getByRole("button", { name: "ALL" })); +test("submits a user seed when home URL search has no matches", async () => { + render(<App />); + + const input = 
await screen.findByPlaceholderText("输入你的博客链接,看看你的博客有没有被找到吧!"); + fireEvent.change(input, { target: { value: "missing-blog.example.com" } }); + fireEvent.click(screen.getByRole("button", { name: "搜索博客" })); await waitFor(() => { - expect(screen.getByText("Processing Blog")).toBeInTheDocument(); + expect(screen.getByRole("dialog", { name: "当前未找到该博客,是否将该博客加入博客网络?" })).toBeInTheDocument(); }); + expect(screen.getByText("missing-blog.example.com")).toBeInTheDocument(); - fireEvent.click(screen.getByRole("button", { name: "下一页" })); + fireEvent.click(screen.getByRole("button", { name: "不是" })); await waitFor(() => { - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=60&sort=id_asc&status=PROCESSING"), - expect.anything(), - ); + expect( + screen.queryByRole("dialog", { name: "当前未找到该博客,是否将该博客加入博客网络?" }), + ).not.toBeInTheDocument(); }); - expect(screen.getByText("Failed Blog")).toBeInTheDocument(); - expect(screen.getByText("Extra Blog 32")).toBeInTheDocument(); - fireEvent.click(screen.getByRole("button", { name: "PROCESSING" })); + fireEvent.click(screen.getByRole("button", { name: "搜索博客" })); await waitFor(() => { - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_desc&status=PROCESSING"), - expect.anything(), - ); + expect(screen.getByRole("dialog", { name: "当前未找到该博客,是否将该博客加入博客网络?" 
})).toBeInTheDocument(); }); - expect(screen.getByText("Newest Processing Blog")).toBeInTheDocument(); - expect(screen.getByText("Processing Blog")).toBeInTheDocument(); - - fireEvent.click(screen.getByRole("button", { name: "ALL" })); + fireEvent.click(screen.getByRole("button", { name: "是" })); + const seedInput = screen.getByLabelText("请输入完整博客链接"); + expect(seedInput).toHaveAttribute("placeholder", "https://blog.example.com"); + fireEvent.change(seedInput, { target: { value: "https://missing-blog.example.com/" } }); + fireEvent.click(screen.getByRole("button", { name: "是" })); await waitFor(() => { - expect(screen.getByText("Waiting Blog")).toBeInTheDocument(); + const submitCall = vi.mocked(fetch).mock.calls.find(([input]) => String(input).includes("/api/blogs/user-seeds")); + expect(submitCall).toBeDefined(); + expect(JSON.parse(String(submitCall![1]?.body))).toEqual({ + homepage_url: "https://missing-blog.example.com/", + }); + expect( + screen.queryByRole("dialog", { name: "当前未找到该博客,是否将该博客加入博客网络?" }), + ).not.toBeInTheDocument(); }); +}); - catalogItems = catalogItems.map((item) => - item.id === 1 ? { ...item, crawl_status: "FINISHED", status_code: 200, last_crawled_at: "2026-04-17T10:00:00Z" } : item, - ); - statusPayload = { - ...statusPayload, - pending_tasks: 2, - processing_tasks: 1, - finished_tasks: 31, - }; +test("shows the exact rule-filter reason when user seed submission fails", async () => { + render(<App />); - await act(async () => { - await vi.advanceTimersByTimeAsync(5000); - }); + const input = await screen.findByPlaceholderText("输入你的博客链接,看看你的博客有没有被找到吧!"); + fireEvent.change(input, { target: { value: "blog.sayori.org" } }); + fireEvent.click(screen.getByRole("button", { name: "搜索博客" })); await waitFor(() => { - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_asc&status=PROCESSING"), - expect.anything(), - ); + expect(screen.getByRole("dialog", { name: "当前未找到该博客,是否将该博客加入博客网络?" 
})).toBeInTheDocument(); }); + fireEvent.click(screen.getByRole("button", { name: "是" })); + fireEvent.change(screen.getByLabelText("请输入完整博客链接"), { + target: { value: "https://blog.sayori.org/" }, + }); + fireEvent.click(screen.getByRole("button", { name: "是" })); + + expect(await screen.findByText("规则过滤未通过:域名后缀被屏蔽(rule:blocked_tld)")).toBeInTheDocument(); + expect(screen.getByRole("dialog", { name: "当前未找到该博客,是否将该博客加入博客网络?" })).toBeInTheDocument(); +}); + +test("keeps new registrations signed out until email verification", async () => { + window.history.replaceState({}, "", "/profile"); - fireEvent.click(screen.getByRole("button", { name: "PROCESSING" })); + render(<App />); + + fireEvent.click(await screen.findByRole("button", { name: "没有账号,注册一个" })); + fireEvent.change(screen.getByLabelText("邮箱"), { target: { value: "New@Example.com" } }); + fireEvent.change(screen.getByLabelText("密码"), { target: { value: "correct horse" } }); + fireEvent.click(screen.getByRole("button", { name: "注册并发送验证邮件" })); await waitFor(() => { - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&sort=id_desc&status=PROCESSING"), - expect.anything(), - ); + expect(screen.getByRole("heading", { name: "登录账号" })).toBeInTheDocument(); }); - expect(screen.getByText("Newest Processing Blog")).toBeInTheDocument(); - expect(screen.queryByText("Processing Blog")).not.toBeInTheDocument(); + expect(screen.getByText("验证邮件已发送,请验证邮箱后登录。")).toBeInTheDocument(); + expect(window.localStorage.getItem("heyblog_user_session")).toBeNull(); + expect(screen.queryByText("当前账号")).not.toBeInTheDocument(); + expect(screen.queryByRole("heading", { name: "数据标注" })).not.toBeInTheDocument(); +}); + +test("confirms email automatically when opened from a verification email link", async () => { + window.localStorage.setItem( + "heyblog_user_session", + JSON.stringify({ + token: "new-user-token", + expiresAt: "2026-07-10T00:00:00Z", + user: { + id: 7, + email: 
"new@example.com", + displayName: "new", + role: "user", + isActive: true, + emailVerified: false, + emailVerifiedAt: null, + createdAt: "2026-06-10T00:00:00Z", + updatedAt: "2026-06-10T00:00:00Z", + }, + }), + ); + window.history.replaceState({}, "", "/profile?verify_token=mail-token"); + + render(<App />); - fireEvent.change(screen.getByPlaceholderText(/输入 URL 或标题进行搜索/i), { - target: { value: "Newest" }, + await waitFor(() => { + expect(screen.getByText(/已验证/)).toBeInTheDocument(); }); - fireEvent.click(screen.getByRole("button", { name: /搜索博客/i })); + expect( + vi + .mocked(fetch) + .mock.calls.some( + ([input, init]) => + String(input).includes("/api/auth/email/verify/confirm") && + String(init?.body).includes('"token":"mail-token"'), + ), + ).toBe(true); + expect(screen.getByRole("heading", { name: "数据标注" })).toBeInTheDocument(); + expect(screen.getByText(/当前总共标注了/)).toBeInTheDocument(); +}); + +test("confirms email links without requiring a local session", async () => { + window.history.replaceState({}, "", "/profile?verify_token=mail-token"); + + render(<App />); await waitFor(() => { - expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=30&q=Newest&sort=id_desc&status=PROCESSING"), - expect.anything(), - ); + expect(screen.getByRole("heading", { name: "登录账号" })).toBeInTheDocument(); }); - expect(screen.getByText("Newest Processing Blog")).toBeInTheDocument(); - expect(screen.queryByText("Processing Blog")).not.toBeInTheDocument(); - expect(screen.queryByText("Newest Waiting Blog")).not.toBeInTheDocument(); - expect(screen.queryByText("Waiting Blog")).not.toBeInTheDocument(); - expect(screen.getByText("搜索词: Newest")).toBeInTheDocument(); + expect( + vi + .mocked(fetch) + .mock.calls.some( + ([input, init]) => + String(input).includes("/api/auth/email/verify/confirm") && + String(init?.body).includes('"token":"mail-token"'), + ), + ).toBe(true); + expect(screen.queryByText(/Token/)).not.toBeInTheDocument(); }); 
test("adds a random blog route that loads nine finished cards and refreshes them on demand", async () => { @@ -334,14 +1026,17 @@ test("adds a random blog route that loads nine finished cards and refreshes them }); expect(fetch).toHaveBeenCalledWith( - expect.stringContaining("/api/blogs/catalog?page=1&page_size=9&sort=random&status=FINISHED"), - expect.anything(), + expect.stringContaining("/api/recommendations/random-blog-batches"), + expect.objectContaining({ + method: "POST", + body: expect.stringContaining('"count":9'), + }), ); expect(screen.getByText("当前展示 9 个随机博客卡片")).toBeInTheDocument(); expect(screen.getByText("Extra Blog 32")).toBeInTheDocument(); expect(screen.getByAltText("extra-blog-32.example.com icon")).toHaveAttribute( "src", - "https://icons.duckduckgo.com/ip3/extra-blog-32.example.com.ico", + "https://t2.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL&url=https://extra-blog-32.example.com&size=64", ); fireEvent.click(screen.getByRole("button", { name: /刷新随机博客/i })); @@ -350,13 +1045,95 @@ test("adds a random blog route that loads nine finished cards and refreshes them const randomCalls = vi .mocked(fetch) .mock.calls.filter(([input]) => - String(input).includes("/api/blogs/catalog?page=1&page_size=9&sort=random&status=FINISHED"), + String(input).includes("/api/recommendations/random-blog-batches"), ); expect(randomCalls).toHaveLength(2); }); }); -test("lets visualization users choose a deterministic sampled graph size", async () => { +test("lets random blog users open one blog detail route in a new tab", async () => { + window.history.replaceState({}, "", "/random"); + const openMock = vi.fn(); + vi.stubGlobal("open", openMock); + + render(<App />); + + await waitFor(() => { + expect(screen.getByText("当前展示 9 个随机博客卡片")).toBeInTheDocument(); + }); + + fireEvent.click(screen.getAllByRole("button", { name: "查看详情" })[0]); + + await waitFor(() => { + expect(fetch).toHaveBeenCalledWith( + 
expect.stringContaining("/api/recommendation-events"), + expect.objectContaining({ + method: "POST", + body: expect.stringContaining('"event_type":"detail_open"'), + }), + ); + }); + expect( + vi + .mocked(fetch) + .mock.calls.some( + ([input, init]) => + String(input).includes("/api/recommendation-events") && + String(init?.body).includes('"entrance_kind":"random_blog_page"') && + String(init?.body).includes('"entrance_url"'), + ), + ).toBe(true); + expect(openMock).toHaveBeenCalledWith("/blogs/32", "_blank", "noopener,noreferrer"); + expect(window.location.pathname).toBe("/random"); + expect(fetch).not.toHaveBeenCalledWith(expect.stringContaining("/api/blogs/32"), expect.anything()); +}); + +test("records random blog external URL opens as recommendation interactions", async () => { + window.history.replaceState({}, "", "/random"); + + render(<App />); + + await waitFor(() => { + expect(screen.getByText("当前展示 9 个随机博客卡片")).toBeInTheDocument(); + }); + + fireEvent.click(screen.getAllByRole("link", { name: /打开 Extra Blog 32/i })[0]); + + await waitFor(() => { + expect(fetch).toHaveBeenCalledWith( + expect.stringContaining("/api/recommendation-events"), + expect.objectContaining({ + method: "POST", + body: expect.stringContaining('"event_type":"external_open"'), + }), + ); + }); + expect( + vi + .mocked(fetch) + .mock.calls.some( + ([input, init]) => + String(input).includes("/api/recommendation-events") && + String(init?.body).includes('"entrance_kind":"random_blog_page"') && + String(init?.body).includes('"entrance_url"'), + ), + ).toBe(true); +}); + +test("renders one external URL text per random blog card", async () => { + window.history.replaceState({}, "", "/random"); + + render(<App />); + + await waitFor(() => { + expect(screen.getByText("当前展示 9 个随机博客卡片")).toBeInTheDocument(); + }); + + const firstRandomBlog = catalogItems[31]; + expect(screen.getAllByText(String(firstRandomBlog.url))).toHaveLength(1); +}); + +test("lets visualization users choose a graph size 
with a blog-count slider", async () => { window.history.replaceState({}, "", "/visualization"); render(<App />); @@ -366,31 +1143,71 @@ test("lets visualization users choose a deterministic sampled graph size", async }); expect(screen.getByRole("dialog", { name: "选择图谱规模" })).toBeInTheDocument(); - expect(screen.getByRole("button", { name: "10000" })).toBeInTheDocument(); + const slider = await screen.findByRole("slider", { name: "节点数量" }); + expect(slider).toHaveAttribute("min", "0"); + expect(slider).toHaveAttribute("max", "34"); + expect(slider).toHaveValue("34"); expect(screen.queryByText(/使用固定随机种子 42 选择起点/)).not.toBeInTheDocument(); expect(screen.queryByText(/显示实际下载大小/)).not.toBeInTheDocument(); expect(screen.queryByText("该功能仍不成熟!")).not.toBeInTheDocument(); expect(screen.queryByText("数据统计")).not.toBeInTheDocument(); + expect(screen.getByRole("button", { name: "精简" })).toHaveAttribute("aria-pressed", "true"); + expect(screen.getByRole("button", { name: "全" })).toHaveAttribute("aria-pressed", "false"); - fireEvent.click(screen.getByRole("button", { name: "500" })); + fireEvent.change(slider, { target: { value: "20" } }); + expect(slider).toHaveValue("20"); + fireEvent.click(screen.getByRole("button", { name: "确认" })); await waitFor(() => { - expect(screen.queryByRole("dialog", { name: "选择图谱规模" })).not.toBeInTheDocument(); + expect(screen.getByRole("dialog", { name: "正在渲染图谱" })).toBeInTheDocument(); }); expect(fetch).toHaveBeenCalledWith( - expect.stringContaining( - "/api/graph/views/core?strategy=degree&limit=500&sample_mode=count&sample_value=500&sample_seed=42", - ), + expect.stringContaining("/api/graph/views/core?strategy=seed&limit=20"), expect.anything(), ); - expect(screen.queryByText(/当前使用固定随机种子 42 展示 500 个节点/)).not.toBeInTheDocument(); + expect(forceGraphProps.at(-1)!.graphData.nodes.map((node: { id: string }) => node.id)).toEqual(["1", "2", "3"]); + expect(forceGraphProps.at(-1)!.graphData.links).toHaveLength(3); + 
expect(screen.getByRole("progressbar")).toHaveAttribute("aria-valuenow", "12"); + await waitFor(() => { + expect(screen.getByText("预计需要 126 ticks")).toBeInTheDocument(); + }); + expect(screen.getByText("预估所需渲染时间:约 3 秒")).toBeInTheDocument(); + act(() => { + forceGraphProps.at(-1)!.onEngineTick(); + forceGraphProps.at(-1)!.onEngineTick(); + }); + expect(screen.getByRole("progressbar")).toHaveAttribute("aria-valuenow", "12"); + act(() => { + forceGraphProps.at(-1)!.onEngineStop(); + }); + await waitFor(() => { + expect(screen.queryByRole("dialog", { name: "正在渲染图谱" })).not.toBeInTheDocument(); + }); + expect(screen.queryByText(/当前使用固定随机种子 42 展示 20 个节点/)).not.toBeInTheDocument(); expect(screen.queryByText("全图最大节点数")).not.toBeInTheDocument(); expect(screen.queryByRole("button", { name: /刷新全图|返回全图/ })).not.toBeInTheDocument(); expect(screen.queryByRole("button", { name: /搜索博客/i })).not.toBeInTheDocument(); }); -test("uses cached visualization graph data for repeated sampled sizes", async () => { +test("lets visualization users load the full graph without compact filtering", async () => { + window.history.replaceState({}, "", "/visualization"); + + render(<App />); + + const fullButton = await screen.findByRole("button", { name: "全" }); + fireEvent.click(fullButton); + expect(fullButton).toHaveAttribute("aria-pressed", "true"); + + fireEvent.click(screen.getByRole("button", { name: "确认" })); + + await waitFor(() => { + expect(forceGraphProps.at(-1)!.graphData.nodes).toHaveLength(4); + }); + expect(forceGraphProps.at(-1)!.graphData.links).toHaveLength(4); +}); + +test("ignores stale cached visualization graph data and reloads sampled sizes online", async () => { window.history.replaceState({}, "", "/visualization"); window.localStorage.setItem( "heyblog:visualization:3d-v1:seed-42:limit-200", @@ -414,10 +1231,46 @@ test("uses cached visualization graph data for repeated sampled sizes", async () expect(screen.getByRole("dialog", { name: "选择图谱规模" })).toBeInTheDocument(); }); 
- fireEvent.click(screen.getByRole("button", { name: "200" })); + fireEvent.change(screen.getByRole("slider", { name: "节点数量" }), { target: { value: "20" } }); + fireEvent.click(screen.getByRole("button", { name: "确认" })); + + await waitFor(() => { + expect(fetch).toHaveBeenCalledWith( + expect.stringContaining("/api/graph/views/core?strategy=seed&limit=20"), + expect.anything(), + ); + }); +}); + +test("defaults visualization slider to two hundred when the blog count is larger", async () => { + statusPayload = { + ...statusPayload, + total_blogs: 500, + }; + window.history.replaceState({}, "", "/visualization"); + + render(<App />); + + const slider = await screen.findByRole("slider", { name: "节点数量" }); - expect(screen.queryByText(/当前使用固定随机种子 42 展示 200 个节点/)).not.toBeInTheDocument(); - expect(fetch).not.toHaveBeenCalledWith(expect.stringContaining("/api/graph/views/core"), expect.anything()); + expect(slider).toHaveAttribute("max", "500"); + expect(slider).toHaveValue("200"); +}); + +test("loads the static clustered benchmark graph through the visualization route", async () => { + window.history.replaceState({}, "", "/visualization/benchmark"); + + render(<App />); + + await waitFor(() => { + expect(fetch).toHaveBeenCalledWith( + expect.stringContaining("/benchmarks/blog-community-graph.json"), + expect.anything(), + ); + }); + expect(screen.queryByRole("dialog", { name: "选择图谱规模" })).not.toBeInTheDocument(); + expect(forceGraphProps.at(-1)!.graphData.nodes).toHaveLength(2); + expect(forceGraphProps.at(-1)!.graphData.links).toHaveLength(1); }); test("adds a public filter stats route that renders success-source split", async () => { diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 23a2772..a754c07 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -1,13 +1,32 @@ import { BrowserRouter, Navigate, Route, Routes } from "react-router-dom"; import { Toaster } from "sonner"; +import { Navigation } from "./components/Navigation"; +import { 
hasStoredAdminSession } from "./lib/auth"; import { AboutPage } from "./pages/AboutPage"; import { AdminPage } from "./pages/AdminPage"; +import { BlogDetailPage } from "./pages/BlogDetailPage"; import { FilterStatsPage } from "./pages/FilterStatsPage"; import { HomePage } from "./pages/HomePage"; import { ProfilePage } from "./pages/ProfilePage"; import { RandomBlogPage } from "./pages/RandomBlogPage"; import { VisualizationPage } from "./pages/VisualizationPage"; +function NotFoundPage() { + return ( + <div className="min-h-screen bg-slate-50"> + <Navigation /> + <main className="mx-auto max-w-3xl px-6 pt-32 text-slate-950"> + <p className="text-sm font-medium text-slate-500">404</p> + <h1 className="mt-3 text-3xl font-semibold">页面不存在</h1> + </main> + </div> + ); +} + +function AdminRoute() { + return hasStoredAdminSession() ? <AdminPage /> : <NotFoundPage />; +} + /** * Mount the routed frontend shell. * @@ -19,12 +38,14 @@ export default function App() { <Toaster position="top-right" richColors /> <Routes> <Route path="/" element={<HomePage />} /> + <Route path="/blogs/:blogId" element={<BlogDetailPage />} /> <Route path="/random" element={<RandomBlogPage />} /> <Route path="/visualization" element={<VisualizationPage />} /> + <Route path="/visualization/benchmark" element={<VisualizationPage />} /> <Route path="/filter-stats" element={<FilterStatsPage />} /> <Route path="/about" element={<AboutPage />} /> <Route path="/profile" element={<ProfilePage />} /> - <Route path="/admin" element={<AdminPage />} /> + <Route path="/admin" element={<AdminRoute />} /> <Route path="*" element={<Navigate to="/" replace />} /> </Routes> </BrowserRouter> diff --git a/frontend/src/components/BlogCard.tsx b/frontend/src/components/BlogCard.tsx index 011ac92..d0bf450 100644 --- a/frontend/src/components/BlogCard.tsx +++ b/frontend/src/components/BlogCard.tsx @@ -1,11 +1,14 @@ -import { ArrowUpRight, CheckCircle2, Clock3, XCircle } from "lucide-react"; +import { CheckCircle2, 
Clock3, XCircle } from "lucide-react"; import { useState, type ReactNode } from "react"; +import { BlogExternalLink } from "./BlogExternalLink"; import { resolveBlogIconUrl } from "../lib/icon"; import type { BlogCatalogItem } from "../types/graph"; interface BlogCardProps { blog: BlogCatalogItem; children?: ReactNode; + externalEntranceKind: string; + externalEntranceUrl: string; } function statusTone(crawlStatus: string) { @@ -41,9 +44,11 @@ function statusTone(crawlStatus: string) { * Render one catalog blog card in the example-inspired home layout. * * @param blog Catalog row returned by `/api/blogs/catalog`. + * @param externalEntranceKind Stable entry-point category for external-open tracking. + * @param externalEntranceUrl Raw entry-point URL for external-open tracking. * @returns Blog summary card. */ -export function BlogCard({ blog, children }: BlogCardProps) { +export function BlogCard({ blog, children, externalEntranceKind, externalEntranceUrl }: BlogCardProps) { const tone = statusTone(blog.crawlStatus); const ToneIcon = tone.icon; const iconUrl = resolveBlogIconUrl(blog); @@ -78,17 +83,17 @@ export function BlogCard({ blog, children }: BlogCardProps) { </span> </div> - <div className="flex items-center justify-between gap-3 rounded-2xl bg-slate-50 px-4 py-3 text-sm text-slate-600"> - <div className="min-w-0 truncate">{blog.url}</div> - <a - href={blog.url} - target="_blank" - rel="noreferrer" + <div className="rounded-2xl bg-slate-50 px-4 py-3 text-sm text-slate-600"> + <BlogExternalLink + blog={blog} + entranceKind={externalEntranceKind} + entranceUrl={externalEntranceUrl} aria-label={`打开 ${blog.title || blog.domain}`} - className="inline-flex h-8 w-8 flex-shrink-0 items-center justify-center rounded-full border border-slate-200 bg-white text-slate-500 transition-colors duration-200 hover:border-sky-300 hover:text-sky-600" + showIcon + className="inline-flex w-full min-w-0 items-center justify-between gap-3 transition-colors duration-200 
hover:text-sky-600" > - <ArrowUpRight className="h-4 w-4" /> - </a> + <span className="min-w-0 truncate">{blog.url}</span> + </BlogExternalLink> </div> {children ? <div className="mt-4">{children}</div> : null} diff --git a/frontend/src/components/BlogDetailLink.tsx b/frontend/src/components/BlogDetailLink.tsx new file mode 100644 index 0000000..75a3f0e --- /dev/null +++ b/frontend/src/components/BlogDetailLink.tsx @@ -0,0 +1,48 @@ +import type { ButtonHTMLAttributes, ReactNode } from "react"; +import { useNavigate } from "react-router-dom"; +import { openTrackedBlogDetail, type BlogInteractionEntrance } from "../lib/blogInteractions"; +import type { BlogCatalogItem, GraphNode } from "../types/graph"; + +interface BlogDetailLinkProps extends Omit<ButtonHTMLAttributes<HTMLButtonElement>, "onClick"> { + blog: BlogCatalogItem | GraphNode; + entranceKind: string; + entranceUrl: string; + children: ReactNode; + eventAttributes?: Record<string, unknown>; + openInNewTab?: boolean; +} + +/** + * Render the canonical tracked navigation control for blog detail routes. + * + * @param blog Blog target whose detail route should open. + * @param entranceKind Stable entry-point category for analytics. + * @param entranceUrl Raw entry-point URL for analytics. + * @param children Visible button content. + * @param eventAttributes Optional event metadata sent with the interaction. + * @param openInNewTab Whether to open the detail route in a new browser tab. + * @returns Button that records `detail_open` and navigates to `/blogs/:id`. + */ +export function BlogDetailLink({ + blog, + entranceKind, + entranceUrl, + children, + eventAttributes, + openInNewTab, + ...buttonProps +}: BlogDetailLinkProps) { + const navigate = useNavigate(); + const entrance: BlogInteractionEntrance = { entranceKind, entranceUrl }; + return ( + <button + {...buttonProps} + type={buttonProps.type ?? 
"button"} + onClick={() => { + openTrackedBlogDetail(navigate, blog, entrance, eventAttributes, { newTab: openInNewTab }); + }} + > + {children} + </button> + ); +} diff --git a/frontend/src/components/BlogDetailPanel.tsx b/frontend/src/components/BlogDetailPanel.tsx index af20660..9fade9a 100644 --- a/frontend/src/components/BlogDetailPanel.tsx +++ b/frontend/src/components/BlogDetailPanel.tsx @@ -1,9 +1,12 @@ -import { ArrowLeft, ArrowRight, ExternalLink, Sparkles, X } from "lucide-react"; +import { ArrowLeft, ArrowRight, Sparkles, X } from "lucide-react"; +import { BlogExternalLink } from "./BlogExternalLink"; import type { BlogDetail } from "../types/graph"; interface BlogDetailPanelProps { detail: BlogDetail; onClose: () => void; + entranceKind?: string; + entranceUrl?: string; } /** @@ -11,9 +14,16 @@ interface BlogDetailPanelProps { * * @param detail Selected blog detail payload. * @param onClose Callback used to dismiss the panel. + * @param entranceKind Optional panel entry-point category for external-open tracking. + * @param entranceUrl Optional panel entry-point URL for external-open tracking. * @returns Floating detail panel. 
*/ -export function BlogDetailPanel({ detail, onClose }: BlogDetailPanelProps) { +export function BlogDetailPanel({ + detail, + onClose, + entranceKind = "blog_detail_panel_external", + entranceUrl = window.location.href, +}: BlogDetailPanelProps) { return ( <div className="absolute right-8 top-24 z-10 max-h-[70vh] w-96 overflow-y-auto rounded-lg border-2 border-gray-200 bg-white p-6 shadow-2xl"> <div className="mb-4 flex items-start justify-between"> @@ -29,15 +39,14 @@ export function BlogDetailPanel({ detail, onClose }: BlogDetailPanelProps) { <div className="space-y-4"> <div> <div className="mb-1 text-sm text-gray-600">URL</div> - <a - href={detail.url} - target="_blank" - rel="noopener noreferrer" + <BlogExternalLink + blog={detail} + entranceKind={entranceKind} + entranceUrl={entranceUrl} className="flex items-center gap-1 break-all text-blue-600 hover:underline" > {detail.url} - <ExternalLink className="h-4 w-4 flex-shrink-0" /> - </a> + </BlogExternalLink> </div> <div className="border-t border-gray-200 pt-4"> diff --git a/frontend/src/components/BlogExternalLink.tsx b/frontend/src/components/BlogExternalLink.tsx new file mode 100644 index 0000000..37e3809 --- /dev/null +++ b/frontend/src/components/BlogExternalLink.tsx @@ -0,0 +1,48 @@ +import { ArrowUpRight } from "lucide-react"; +import type { AnchorHTMLAttributes, ReactNode } from "react"; +import { blogInteractionTarget, recordBlogInteraction, type BlogInteractionEntrance } from "../lib/blogInteractions"; +import type { BlogCatalogItem, GraphNode } from "../types/graph"; + +interface BlogExternalLinkProps extends Omit<AnchorHTMLAttributes<HTMLAnchorElement>, "href" | "target" | "rel" | "onClick"> { + blog: BlogCatalogItem | GraphNode; + entranceKind: string; + entranceUrl: string; + children?: ReactNode; + showIcon?: boolean; + eventAttributes?: Record<string, unknown>; +} + +/** + * Render the canonical tracked external link for a blog URL. 
+ * + * @param blog Blog target whose external URL should open. + * @param entranceKind Stable entry-point category for analytics. + * @param entranceUrl Raw entry-point URL for analytics. + * @param children Optional visible link content. + * @param showIcon Whether to append the external-link icon. + * @param eventAttributes Optional event metadata sent with the interaction. + * @returns External anchor that records `external_open` before opening. + */ +export function BlogExternalLink({ + blog, + entranceKind, + entranceUrl, + children, + showIcon = true, + eventAttributes, + ...anchorProps +}: BlogExternalLinkProps) { + const entrance: BlogInteractionEntrance = { entranceKind, entranceUrl }; + return ( + <a + {...anchorProps} + href={blog.url} + target="_blank" + rel="noreferrer" + onClick={() => recordBlogInteraction(blogInteractionTarget(blog), "external_open", entrance, eventAttributes)} + > + {children ?? blog.url} + {showIcon ? <ArrowUpRight className="h-4 w-4 flex-shrink-0" /> : null} + </a> + ); +} diff --git a/frontend/src/components/GraphVisualization.test.tsx b/frontend/src/components/GraphVisualization.test.tsx index 62d5d1d..aa544c9 100644 --- a/frontend/src/components/GraphVisualization.test.tsx +++ b/frontend/src/components/GraphVisualization.test.tsx @@ -1,56 +1,86 @@ -import { cleanup, fireEvent, render, screen } from "@testing-library/react"; +import { act, cleanup, fireEvent, render, screen, waitFor } from "@testing-library/react"; import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; -import { GraphVisualization } from "./GraphVisualization"; +import { estimateGraphRenderCooldownTicks, GraphVisualization } from "./GraphVisualization"; +import { tuneNaturalClusterForces } from "./GraphVisualization"; +import type { ForwardedRef } from "react"; import type { GraphData } from "../types/graph"; -const { forceGraphRenders, ForceGraph3DMock } = vi.hoisted(() => { - const forceGraphRenders: Record<string, any>[] = []; - - 
function ForceGraph3DMock(props: Record<string, any>) { - forceGraphRenders.push(props); - const { ref, onNodeClick, graphData } = props; - if (ref) { - ref.current = { - d3Force: vi.fn(() => ({ - strength: vi.fn(), - distance: vi.fn(), - })), - d3ReheatSimulation: vi.fn(), - zoomToFit: vi.fn(), - camera: vi.fn(() => ({ - position: { - clone: () => ({ - normalize: () => ({ - multiplyScalar: () => ({ x: 0, y: 0, z: 360 }), +const { chargeForce, d3ReheatSimulation, forceCalls, forceGraphRenders, ForceGraph3DMock, linkForce } = vi.hoisted( + () => { + const forceGraphRenders: Record<string, any>[] = []; + const forceCalls: Array<[string, unknown?]> = []; + const chargeForce = { + strength: vi.fn(), + distanceMax: vi.fn(), + }; + const linkForce = { + strength: vi.fn(), + distance: vi.fn(), + }; + const d3ReheatSimulation = vi.fn(); + + function ForceGraph3DMock(props: Record<string, any>, ref: ForwardedRef<Record<string, unknown>>) { + forceGraphRenders.push(props); + const { onNodeClick, graphData } = props; + const resolvedRef = ref ?? 
props.ref; + if (resolvedRef) { + const graphInstance = { + d3Force: vi.fn((name: string, force?: unknown) => { + forceCalls.push([name, force]); + if (name === "charge") { + return chargeForce; + } + if (name === "link") { + return linkForce; + } + return undefined; + }), + d3ReheatSimulation, + zoomToFit: vi.fn(), + camera: vi.fn(() => ({ + position: { + clone: () => ({ + normalize: () => ({ + multiplyScalar: () => ({ x: 0, y: 0, z: 360 }), + }), }), - }), - length: () => 360, - copy: vi.fn(), - }, - })), - controls: vi.fn(() => ({ update: vi.fn() })), - cameraPosition: vi.fn(), - }; + length: () => 360, + copy: vi.fn(), + }, + })), + controls: vi.fn(() => ({ update: vi.fn() })), + cameraPosition: vi.fn(), + refresh: vi.fn(), + }; + if (typeof resolvedRef === "function") { + resolvedRef(graphInstance); + } else { + resolvedRef.current = graphInstance; + } + } + + return ( + <button + type="button" + data-testid="force-graph-3d" + onClick={() => onNodeClick?.(graphData.nodes[1], new MouseEvent("click"))} + > + 3D graph + </button> + ); } - return ( - <button - type="button" - data-testid="force-graph-3d" - onClick={() => onNodeClick?.(graphData.nodes[1], new MouseEvent("click"))} - > - 3D graph - </button> - ); - } + return { chargeForce, d3ReheatSimulation, forceCalls, forceGraphRenders, ForceGraph3DMock, linkForce }; + }, +); - return { forceGraphRenders, ForceGraph3DMock }; +vi.mock("react-force-graph-3d", async () => { + const React = await vi.importActual<typeof import("react")>("react"); + return { + default: React.forwardRef(ForceGraph3DMock), + }; }); -vi.mock("react-force-graph-3d", () => ({ - default: ForceGraph3DMock, -})); - vi.mock("three", async () => { const actual = await vi.importActual<typeof import("three")>("three"); return { @@ -58,10 +88,10 @@ vi.mock("three", async () => { TextureLoader: class { setCrossOrigin = vi.fn(); - load = vi.fn((url: string, onLoad?: () => void) => { + load = vi.fn((url: string, onLoad?: (texture: any) => void) => { 
const texture = new actual.Texture(); texture.userData = { url }; - onLoad?.(); + onLoad?.(texture); return texture; }); }, @@ -137,6 +167,12 @@ class TestResizeObserver { beforeEach(() => { forceGraphRenders.length = 0; + forceCalls.length = 0; + chargeForce.strength.mockClear(); + chargeForce.distanceMax.mockClear(); + linkForce.strength.mockClear(); + linkForce.distance.mockClear(); + d3ReheatSimulation.mockClear(); vi.stubGlobal("ResizeObserver", TestResizeObserver); }); @@ -146,6 +182,12 @@ afterEach(() => { }); describe("GraphVisualization", () => { + test("estimates larger render cooldowns for bigger or denser graphs", () => { + expect(estimateGraphRenderCooldownTicks(2, 1)).toBe(120); + expect(estimateGraphRenderCooldownTicks(100, 500)).toBeGreaterThan(120); + expect(estimateGraphRenderCooldownTicks(10000, 100000)).toBeGreaterThan(720); + }); + test("passes cleaned node-link data into the 3D force graph", () => { render(<GraphVisualization data={forceGraphData} />); @@ -162,15 +204,21 @@ describe("GraphVisualization", () => { id: "1", blogId: 1, label: "Alpha Blog", - iconUrl: "https://icons.duckduckgo.com/ip3/alpha.example.com.ico", + iconUrl: "/api/icons/proxy?url=https%3A%2F%2Falpha.example.com%2Ffavicon.ico", val: 1, + x: expect.any(Number), + y: expect.any(Number), + z: expect.any(Number), }), expect.objectContaining({ id: "2", blogId: 2, label: "Beta Blog", - iconUrl: "https://icons.duckduckgo.com/ip3/beta.example.com.ico", + iconUrl: undefined, val: 1, + x: expect.any(Number), + y: expect.any(Number), + z: expect.any(Number), }), ]), links: [ @@ -185,6 +233,66 @@ describe("GraphVisualization", () => { ); }); + test("seeds disconnected graph regions into separated initial positions", () => { + const twoRegionGraph: GraphData = { + nodes: [ + { + id: 1, + url: "https://alpha.example.com/", + domain: "alpha.example.com", + title: "Alpha Blog", + iconUrl: null, + }, + { + id: 2, + url: "https://beta.example.com/", + domain: "beta.example.com", + title: 
"Beta Blog", + iconUrl: null, + }, + { + id: 3, + url: "https://gamma.example.com/", + domain: "gamma.example.com", + title: "Gamma Blog", + iconUrl: null, + }, + { + id: 4, + url: "https://delta.example.com/", + domain: "delta.example.com", + title: "Delta Blog", + iconUrl: null, + }, + ], + edges: [ + { + id: "1-2", + source: 1, + target: 2, + linkText: null, + linkUrlRaw: "https://alpha.example.com/link", + }, + { + id: "3-4", + source: 3, + target: 4, + linkText: null, + linkUrlRaw: "https://gamma.example.com/link", + }, + ], + }; + + render(<GraphVisualization data={twoRegionGraph} />); + + const graphProps = forceGraphRenders.at(-1)!; + const [firstRegionNode] = graphProps.graphData.nodes; + const thirdNode = graphProps.graphData.nodes[2]; + const distance = Math.hypot(firstRegionNode.x - thirdNode.x, firstRegionNode.y - thirdNode.y, firstRegionNode.z - thirdNode.z); + + expect(distance).toBeGreaterThan(500); + }); + test("uses the original graph node for click callbacks", () => { const handleNodeClick = vi.fn(); render(<GraphVisualization data={forceGraphData} onNodeClick={handleNodeClick} />); @@ -230,10 +338,54 @@ describe("GraphVisualization", () => { const graphProps = forceGraphRenders.at(-1); const [selectedLink, unrelatedLink] = graphProps!.graphData.links; - expect(graphProps!.linkWidth(selectedLink)).toBe(2); - expect(graphProps!.linkColor(selectedLink)).toBe("rgba(125, 211, 252, 0.78)"); - expect(graphProps!.linkWidth(unrelatedLink)).toBe(0.35); - expect(graphProps!.linkColor(unrelatedLink)).toBe("rgba(71, 85, 105, 0.16)"); + expect(graphProps!.linkWidth(selectedLink)).toBe(3.2); + expect(graphProps!.linkColor(selectedLink)).toBe("rgba(240, 249, 255, 1)"); + expect(graphProps!.linkWidth(unrelatedLink)).toBe(0.9); + expect(graphProps!.linkColor(unrelatedLink)).toBe("rgba(186, 230, 253, 0.55)"); + }); + + test("uses brighter default link color on the dark graph background", () => { + render(<GraphVisualization data={forceGraphData} />); + + const 
graphProps = forceGraphRenders.at(-1); + const [defaultLink] = graphProps!.graphData.links; + + expect(graphProps!.linkWidth(defaultLink)).toBe(1.6); + expect(graphProps!.linkColor(defaultLink)).toBe("rgba(224, 242, 254, 0.78)"); + }); + + test("uses dynamic cooldown ticks and completes early after stable movement", () => { + const handleProgress = vi.fn(); + const handleComplete = vi.fn(); + const graphWithPositions: GraphData = { + nodes: forceGraphData.nodes.map((node, index) => ({ + ...node, + x: index * 10, + y: 0, + z: 0, + })), + edges: forceGraphData.edges, + }; + + render( + <GraphVisualization data={graphWithPositions} onRenderProgress={handleProgress} onRenderComplete={handleComplete} />, + ); + + const initialProps = forceGraphRenders.at(-1)!; + expect(initialProps.cooldownTicks).toBe(estimateGraphRenderCooldownTicks(2, 1)); + + act(() => { + for (let index = 0; index < 100; index += 1) { + initialProps.onEngineTick(); + } + }); + + const stableProps = forceGraphRenders.at(-1)!; + expect(stableProps.cooldownTicks).toBe(100); + stableProps.onEngineStop(); + + expect(handleProgress).toHaveBeenLastCalledWith(1); + expect(handleComplete).toHaveBeenCalled(); }); test("exposes icon-only zoom and reset controls", () => { @@ -252,6 +404,45 @@ describe("GraphVisualization", () => { const nodeObject = graphProps!.nodeThreeObject(iconNode); expect(nodeObject.children).toHaveLength(3); - expect(nodeObject.userData.iconUrl).toBe("https://icons.duckduckgo.com/ip3/alpha.example.com.ico"); + expect(nodeObject.userData.iconUrl).toBe("/api/icons/proxy?url=https%3A%2F%2Falpha.example.com%2Ffavicon.ico"); + }); + + test("renders iconless nodes as neutral gray spheres", () => { + render(<GraphVisualization data={forceGraphData} />); + + const graphProps = forceGraphRenders.at(-1); + const iconlessNode = graphProps!.graphData.nodes[1]; + const nodeObject = graphProps!.nodeThreeObject(iconlessNode); + const core = nodeObject.children[1] as any; + + 
expect(iconlessNode.iconUrl).toBeUndefined(); + expect(nodeObject.children).toHaveLength(2); + expect(nodeObject.userData.iconUrl).toBeUndefined(); + expect(core.material.color.getHexString()).toBe("94a3b8"); + }); + + test("tunes forces for natural clusters instead of a centered sphere", () => { + const graph = { + d3Force: vi.fn((name: string, force?: unknown) => { + forceCalls.push([name, force]); + if (name === "charge") { + return chargeForce; + } + if (name === "link") { + return linkForce; + } + return undefined; + }), + d3ReheatSimulation, + }; + + tuneNaturalClusterForces(graph as never); + + expect(forceCalls).toContainEqual(["center", null]); + expect(chargeForce.strength).toHaveBeenCalledWith(-280); + expect(chargeForce.distanceMax).toHaveBeenCalledWith(1400); + expect(linkForce.distance).toHaveBeenCalledWith(96); + expect(linkForce.strength).toHaveBeenCalledWith(0.24); + expect(d3ReheatSimulation).toHaveBeenCalled(); }); }); diff --git a/frontend/src/components/GraphVisualization.tsx b/frontend/src/components/GraphVisualization.tsx index c1b04c7..bfcc4c3 100644 --- a/frontend/src/components/GraphVisualization.tsx +++ b/frontend/src/components/GraphVisualization.tsx @@ -2,13 +2,29 @@ import { RotateCcw, ZoomIn, ZoomOut } from "lucide-react"; import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import ForceGraph3D, { type ForceGraphMethods } from "react-force-graph-3d"; import * as THREE from "three"; -import { resolveBlogIconUrl } from "../lib/icon"; +import { resolveIconProxyUrl } from "../lib/icon"; import type { GraphData, GraphEdge, GraphNode } from "../types/graph"; +export const GRAPH_RENDER_COOLDOWN_TICKS = 120; +const GRAPH_RENDER_MIN_STABILITY_TICKS = 80; +const GRAPH_RENDER_STABLE_SAMPLE_TICKS = 20; +const GRAPH_RENDER_AVERAGE_MOVEMENT_THRESHOLD = 0.15; +const GRAPH_RENDER_MAX_MOVEMENT_THRESHOLD = 1; +const GRAPH_LINK_DISTANCE = 96; +const GRAPH_LINK_STRENGTH = 0.24; +const GRAPH_CHARGE_STRENGTH = -280; +const 
GRAPH_CHARGE_DISTANCE_MAX = 1400; +const GRAPH_SEEDED_GROUP_SIZE = 18; +const GRAPH_SEEDED_LAYOUT_SPACING = 360; + interface GraphVisualizationProps { data: GraphData; onNodeClick?: (node: GraphNode) => void; highlightNodeId?: number; + onRenderProgress?: (progress: number) => void; + onRenderComplete?: () => void; + onRenderTickEstimate?: (ticks: number) => void; + useNodeIcons?: boolean; } interface RenderNode extends Omit<GraphNode, "id" | "iconUrl"> { @@ -18,6 +34,7 @@ interface RenderNode extends Omit<GraphNode, "id" | "iconUrl"> { label: string; val: number; iconUrl?: string; + iconUrls: string[]; } interface RenderLink extends Omit<GraphEdge, "source" | "target"> { @@ -30,10 +47,103 @@ interface RenderGraphData { links: RenderLink[]; } +interface NodePosition { + x: number; + y: number; + z: number; +} + +interface MovementSample { + averageMovement: number; + maxMovement: number; + measuredNodes: number; +} + function nodeTitle(node: GraphNode): string { return node.title?.trim() || node.domain || node.url || `Blog ${node.id}`; } +/** + * Keep one numeric value above an inclusive minimum. + * + * @param value Candidate value. + * @param min Inclusive minimum. + * @returns Value constrained to at least min. + */ +function clampMin(value: number, min: number): number { + return Math.max(min, value); +} + +/** + * Estimate the maximum force-layout duration from graph size. + * + * @param nodeCount Number of renderable graph nodes. + * @param edgeCount Number of renderable graph links. + * @returns Cooldown tick upper bound used by the force graph engine. 
+ */ +export function estimateGraphRenderCooldownTicks(nodeCount: number, edgeCount: number): number { + const safeNodeCount = Math.max(0, nodeCount); + const safeEdgeCount = Math.max(0, edgeCount); + const edgeDensity = safeEdgeCount / Math.max(1, safeNodeCount); + const estimatedTicks = Math.round( + 80 + 12 * Math.sqrt(safeNodeCount) + 4 * Math.sqrt(safeEdgeCount) + Math.min(180, edgeDensity * 18), + ); + + return clampMin(estimatedTicks, GRAPH_RENDER_COOLDOWN_TICKS); +} + +/** + * Capture the current 3D positions for nodes that have been placed by d3. + * + * @param nodes Render nodes from the active graph payload. + * @returns Map keyed by render node id with current coordinates. + */ +function snapshotNodePositions(nodes: RenderNode[]): Map<string, NodePosition> { + const positions = new Map<string, NodePosition>(); + for (const node of nodes) { + if (node.x === undefined || node.y === undefined || node.z === undefined) { + continue; + } + positions.set(node.id, { x: node.x, y: node.y, z: node.z }); + } + return positions; +} + +/** + * Measure node displacement since the previous force tick. + * + * @param nodes Render nodes from the active graph payload. + * @param previousPositions Position snapshot from the previous tick. + * @returns Average and maximum displacement, or undefined when no positions are available. 
+ */ +function measureNodeMovement(nodes: RenderNode[], previousPositions: Map<string, NodePosition>): MovementSample | undefined { + let totalMovement = 0; + let maxMovement = 0; + let measuredNodes = 0; + + for (const node of nodes) { + const previous = previousPositions.get(node.id); + if (!previous || node.x === undefined || node.y === undefined || node.z === undefined) { + continue; + } + + const movement = Math.hypot(node.x - previous.x, node.y - previous.y, node.z - previous.z); + totalMovement += movement; + maxMovement = Math.max(maxMovement, movement); + measuredNodes += 1; + } + + if (measuredNodes === 0) { + return undefined; + } + + return { + averageMovement: totalMovement / measuredNodes, + maxMovement, + measuredNodes, + }; +} + function sourceIdOf(link: RenderLink): string { return typeof link.source === "object" ? link.source.id : String(link.source); } @@ -42,7 +152,178 @@ function targetIdOf(link: RenderLink): string { return typeof link.target === "object" ? link.target.id : String(link.target); } -function buildGraphData(data: GraphData): RenderGraphData { +/** + * Build an undirected adjacency map from renderable links. + * + * @param nodes Nodes that can be displayed in the graph. + * @param links Links whose endpoints both exist in the graph. + * @returns Map keyed by node id with neighboring node ids. + */ +function buildAdjacency(nodes: RenderNode[], links: RenderLink[]): Map<string, Set<string>> { + const adjacency = new Map<string, Set<string>>(); + for (const node of nodes) { + adjacency.set(node.id, new Set()); + } + + for (const link of links) { + const source = sourceIdOf(link); + const target = targetIdOf(link); + if (source === target || !adjacency.has(source) || !adjacency.has(target)) { + continue; + } + adjacency.get(source)?.add(target); + adjacency.get(target)?.add(source); + } + + return adjacency; +} + +/** + * Find deterministic weakly connected components for initial graph placement. 
+ * + * @param nodes Nodes that can be displayed in the graph. + * @param adjacency Undirected adjacency map. + * @returns Components sorted by size and id for stable layout. + */ +function findConnectedComponents(nodes: RenderNode[], adjacency: Map<string, Set<string>>): string[][] { + const visited = new Set<string>(); + const nodeIds = nodes.map((node) => node.id).sort((left, right) => Number(left) - Number(right)); + const components: string[][] = []; + + for (const nodeId of nodeIds) { + if (visited.has(nodeId)) { + continue; + } + + const component: string[] = []; + const queue = [nodeId]; + visited.add(nodeId); + + for (let index = 0; index < queue.length; index += 1) { + const current = queue[index]; + component.push(current); + const neighbors = Array.from(adjacency.get(current) ?? []).sort((left, right) => Number(left) - Number(right)); + for (const neighbor of neighbors) { + if (visited.has(neighbor)) { + continue; + } + visited.add(neighbor); + queue.push(neighbor); + } + } + + components.push(component); + } + + return components.sort((left, right) => right.length - left.length || Number(left[0]) - Number(right[0])); +} + +/** + * Split a large connected component into deterministic layout groups. + * + * @param component Node ids in one connected component. + * @param adjacency Undirected adjacency map. + * @returns Layout groups used only for initial spatial seeding. + */ +function splitComponentIntoLayoutGroups(component: string[], adjacency: Map<string, Set<string>>): string[][] { + if (component.length <= GRAPH_SEEDED_GROUP_SIZE) { + return [component]; + } + + const seedCount = Math.max(2, Math.ceil(component.length / GRAPH_SEEDED_GROUP_SIZE)); + const componentIds = new Set(component); + const seeds = component + .slice() + .sort((left, right) => { + const degreeDelta = (adjacency.get(right)?.size ?? 0) - (adjacency.get(left)?.size ?? 
0); + return degreeDelta || Number(left) - Number(right); + }) + .slice(0, seedCount); + + const groupByNodeId = new Map<string, number>(); + const queues = seeds.map((seed, index) => { + groupByNodeId.set(seed, index); + return [seed]; + }); + + for (let queueIndex = 0; queues.some((queue) => queue.length > 0); queueIndex = (queueIndex + 1) % queues.length) { + const current = queues[queueIndex].shift(); + if (!current) { + continue; + } + + const neighbors = Array.from(adjacency.get(current) ?? []).sort((left, right) => Number(left) - Number(right)); + for (const neighbor of neighbors) { + if (!componentIds.has(neighbor) || groupByNodeId.has(neighbor)) { + continue; + } + groupByNodeId.set(neighbor, queueIndex); + queues[queueIndex].push(neighbor); + } + } + + const groups = seeds.map((): string[] => []); + for (const nodeId of component) { + const groupIndex = groupByNodeId.get(nodeId) ?? 0; + groups[groupIndex].push(nodeId); + } + + return groups.filter((group) => group.length > 0); +} + +/** + * Seed deterministic 3D positions so force layout starts from separated regions. + * + * @param nodes Nodes to position. + * @param links Links used to infer components and layout groups. + * @returns Nodes with initial x/y/z coordinates. 
+ */ +export function seedGraphInitialPositions(nodes: RenderNode[], links: RenderLink[]): RenderNode[] { + const adjacency = buildAdjacency(nodes, links); + const layoutGroups = findConnectedComponents(nodes, adjacency).flatMap((component) => + splitComponentIntoLayoutGroups(component, adjacency), + ); + const groupIndexByNodeId = new Map<string, number>(); + for (const [groupIndex, group] of layoutGroups.entries()) { + for (const nodeId of group) { + groupIndexByNodeId.set(nodeId, groupIndex); + } + } + + const nodeIndexInGroup = new Map<string, number>(); + for (const group of layoutGroups) { + const sortedGroup = group.slice().sort((left, right) => Number(left) - Number(right)); + sortedGroup.forEach((nodeId, index) => nodeIndexInGroup.set(nodeId, index)); + } + + const groupCount = Math.max(1, layoutGroups.length); + return nodes.map((node) => { + const groupIndex = groupIndexByNodeId.get(node.id) ?? 0; + const indexInGroup = nodeIndexInGroup.get(node.id) ?? 0; + const groupSize = Math.max(1, layoutGroups[groupIndex]?.length ?? 
1); + const groupAngle = (Math.PI * 2 * groupIndex) / groupCount; + const groupRing = GRAPH_SEEDED_LAYOUT_SPACING * (1 + Math.floor(groupIndex / Math.max(1, Math.ceil(Math.sqrt(groupCount))))); + const localAngle = (Math.PI * 2 * indexInGroup) / groupSize; + const localRadius = 34 + 8 * Math.sqrt(groupSize) + 5 * (indexInGroup % 5); + + return { + ...node, + x: Math.cos(groupAngle) * groupRing + Math.cos(localAngle) * localRadius, + y: Math.sin(groupAngle) * groupRing + Math.sin(localAngle) * localRadius, + z: ((indexInGroup % 7) - 3) * 24 + (groupIndex % 3) * 60, + }; + }); +} + +function buildExplicitIconUrls(node: GraphNode, useNodeIcons: boolean): string[] { + const iconUrl = node.iconUrl?.trim(); + if (!useNodeIcons || !iconUrl) { + return []; + } + return [resolveIconProxyUrl(iconUrl)]; +} + +function buildGraphData(data: GraphData, useNodeIcons: boolean): RenderGraphData { const nodesById = new Map<string, RenderNode>(); for (const node of data.nodes) { @@ -50,6 +331,7 @@ function buildGraphData(data: GraphData): RenderGraphData { if (!id) { continue; } + const iconUrls = buildExplicitIconUrls(node, useNodeIcons); nodesById.set(id, { ...node, id, @@ -57,7 +339,8 @@ function buildGraphData(data: GraphData): RenderGraphData { original: node, label: nodeTitle(node), val: 1, - iconUrl: resolveBlogIconUrl(node), + iconUrls, + iconUrl: iconUrls[0], }); } @@ -85,7 +368,7 @@ function buildGraphData(data: GraphData): RenderGraphData { val: Math.max(1, degreeById.get(node.id) ?? node.degree ?? 1), })); - return { nodes, links }; + return { nodes: seedGraphInitialPositions(nodes, links), links }; } function buildNeighborIds(graphData: RenderGraphData, highlightNodeId?: number): Set<string> { @@ -120,12 +403,6 @@ function colorForNode(node: RenderNode, highlightNodeId?: number, neighborIds?: if (highlightNodeId !== undefined) { return "#334155"; } - if ((node.incomingCount ?? 0) > (node.outgoingCount ?? 0)) { - return "#fbbf24"; - } - if ((node.outgoingCount ?? 
0) > 0) { - return "#818cf8"; - } return "#94a3b8"; } @@ -157,20 +434,13 @@ function createNodeObject(node: RenderNode, color: string, size: number): THREE. group.add(glow); group.add(core); + group.userData = { blogId: node.blogId, iconUrl: node.iconUrl }; - if (node.iconUrl) { + const iconUrls = node.iconUrls.length > 0 ? node.iconUrls : node.iconUrl ? [node.iconUrl] : []; + if (iconUrls.length > 0) { const loader = new THREE.TextureLoader(); loader.setCrossOrigin("anonymous"); - const texture = loader.load( - node.iconUrl, - () => { - core.visible = false; - }, - undefined, - () => { - core.visible = true; - }, - ); + const texture = new THREE.Texture(); texture.colorSpace = THREE.SRGBColorSpace; const icon = new THREE.Sprite( new THREE.SpriteMaterial({ @@ -182,12 +452,62 @@ function createNodeObject(node: RenderNode, color: string, size: number): THREE. icon.scale.set(size * 2.1, size * 2.1, 1); icon.position.set(0, 0, size * 0.08); group.add(icon); + + const loadIcon = (index: number) => { + const candidate = iconUrls[index]; + if (!candidate) { + core.visible = true; + icon.visible = false; + return; + } + loader.load( + candidate, + (loadedTexture) => { + loadedTexture.colorSpace = THREE.SRGBColorSpace; + icon.material.map = loadedTexture; + icon.material.needsUpdate = true; + core.visible = false; + icon.visible = true; + group.userData.iconUrl = candidate; + }, + undefined, + () => loadIcon(index + 1), + ); + }; + loadIcon(0); } - group.userData = { blogId: node.blogId, iconUrl: node.iconUrl }; return group; } +/** + * Tune the d3 force engine so related blogs cluster without collapsing into a global sphere. + * + * @param graph Force graph instance exposed by react-force-graph-3d. 
+ */ +export function tuneNaturalClusterForces(graph: ForceGraphMethods<RenderNode, RenderLink>): void { + graph.d3Force("center", null); + + const chargeForce = graph.d3Force("charge") as + | { + strength?: (value: number) => unknown; + distanceMax?: (value: number) => unknown; + } + | undefined; + chargeForce?.strength?.(GRAPH_CHARGE_STRENGTH); + chargeForce?.distanceMax?.(GRAPH_CHARGE_DISTANCE_MAX); + + const linkForce = graph.d3Force("link") as + | { + distance?: (value: number) => unknown; + strength?: (value: number) => unknown; + } + | undefined; + linkForce?.distance?.(GRAPH_LINK_DISTANCE); + linkForce?.strength?.(GRAPH_LINK_STRENGTH); + graph.d3ReheatSimulation(); +} + /** * Render an interactive 3D force graph for blog relationship exploration. * @@ -196,12 +516,29 @@ function createNodeObject(node: RenderNode, color: string, size: number): THREE. * @param highlightNodeId Selected node id to emphasize. * @returns Graph container with 3D canvas and controls. */ -export function GraphVisualization({ data, onNodeClick, highlightNodeId }: GraphVisualizationProps) { +export function GraphVisualization({ + data, + onNodeClick, + highlightNodeId, + onRenderProgress, + onRenderComplete, + onRenderTickEstimate, + useNodeIcons = true, +}: GraphVisualizationProps) { const graphRef = useRef<ForceGraphMethods<RenderNode, RenderLink> | undefined>(undefined); const containerRef = useRef<HTMLDivElement | null>(null); + const renderTickRef = useRef(0); + const stableTickRef = useRef(0); + const earlyStopRequestedRef = useRef(false); + const previousPositionsRef = useRef<Map<string, NodePosition>>(new Map()); const [size, setSize] = useState({ width: 960, height: 720 }); const [isMeasured, setIsMeasured] = useState(false); - const graphData = useMemo(() => buildGraphData(data), [data]); + const graphData = useMemo(() => buildGraphData(data, useNodeIcons), [data, useNodeIcons]); + const estimatedCooldownTicks = useMemo( + () => 
estimateGraphRenderCooldownTicks(graphData.nodes.length, graphData.links.length), + [graphData.links.length, graphData.nodes.length], + ); + const [cooldownTicks, setCooldownTicks] = useState(estimatedCooldownTicks); const neighborIds = useMemo(() => buildNeighborIds(graphData, highlightNodeId), [graphData, highlightNodeId]); const selectedGraphId = highlightNodeId === undefined ? undefined : String(highlightNodeId); @@ -221,6 +558,23 @@ export function GraphVisualization({ data, onNodeClick, highlightNodeId }: Graph return () => observer.disconnect(); }, []); + useEffect(() => { + renderTickRef.current = 0; + stableTickRef.current = 0; + earlyStopRequestedRef.current = false; + previousPositionsRef.current = snapshotNodePositions(graphData.nodes); + setCooldownTicks(estimatedCooldownTicks); + onRenderTickEstimate?.(estimatedCooldownTicks); + }, [estimatedCooldownTicks, graphData, onRenderTickEstimate]); + + useEffect(() => { + const graph = graphRef.current; + if (!graph || graphData.nodes.length === 0) { + return; + } + tuneNaturalClusterForces(graph); + }, [graphData]); + useEffect(() => { const graph = graphRef.current; if (!graph || !selectedGraphId) { @@ -259,6 +613,35 @@ export function GraphVisualization({ data, onNodeClick, highlightNodeId }: Graph graphRef.current?.zoomToFit(650, 80); }, []); + const handleEngineTick = useCallback(() => { + renderTickRef.current += 1; + const movement = measureNodeMovement(graphData.nodes, previousPositionsRef.current); + previousPositionsRef.current = snapshotNodePositions(graphData.nodes); + + if ( + movement && + renderTickRef.current >= GRAPH_RENDER_MIN_STABILITY_TICKS && + movement.averageMovement < GRAPH_RENDER_AVERAGE_MOVEMENT_THRESHOLD && + movement.maxMovement < GRAPH_RENDER_MAX_MOVEMENT_THRESHOLD + ) { + stableTickRef.current += 1; + } else { + stableTickRef.current = 0; + } + + if (!earlyStopRequestedRef.current && stableTickRef.current >= GRAPH_RENDER_STABLE_SAMPLE_TICKS) { + earlyStopRequestedRef.current = 
true; + setCooldownTicks((current) => Math.min(current, renderTickRef.current)); + } + + onRenderProgress?.(Math.min(renderTickRef.current / cooldownTicks, 0.98)); + }, [cooldownTicks, graphData.nodes, onRenderProgress]); + + const handleEngineStop = useCallback(() => { + onRenderProgress?.(1); + onRenderComplete?.(); + }, [onRenderComplete, onRenderProgress]); + return ( <div ref={containerRef} className="relative h-full w-full overflow-hidden bg-slate-950"> <div className="absolute inset-x-0 top-0 z-10 h-24 bg-gradient-to-b from-slate-950 via-slate-950/70 to-transparent" /> @@ -281,17 +664,17 @@ export function GraphVisualization({ data, onNodeClick, highlightNodeId }: Graph linkTarget="target" linkColor={(link: RenderLink) => { if (!selectedGraphId) { - return "rgba(148, 163, 184, 0.28)"; + return "rgba(224, 242, 254, 0.78)"; } return sourceIdOf(link) === selectedGraphId || targetIdOf(link) === selectedGraphId - ? "rgba(125, 211, 252, 0.78)" - : "rgba(71, 85, 105, 0.16)"; + ? "rgba(240, 249, 255, 1)" + : "rgba(186, 230, 253, 0.55)"; }} linkWidth={(link: RenderLink) => { if (!selectedGraphId) { - return 0.8; + return 1.6; } - return sourceIdOf(link) === selectedGraphId || targetIdOf(link) === selectedGraphId ? 2 : 0.35; + return sourceIdOf(link) === selectedGraphId || targetIdOf(link) === selectedGraphId ? 
3.2 : 0.9; }} linkDirectionalArrowLength={3.5} linkDirectionalArrowRelPos={1} @@ -305,9 +688,11 @@ export function GraphVisualization({ data, onNodeClick, highlightNodeId }: Graph node.fy = node.y; node.fz = node.z; }} - d3VelocityDecay={0.38} + d3VelocityDecay={0.44} d3AlphaDecay={0.025} - cooldownTicks={120} + cooldownTicks={cooldownTicks} + onEngineTick={handleEngineTick} + onEngineStop={handleEngineStop} controlType="orbit" /> ) : null} diff --git a/frontend/src/components/MissingBlogConfirmDialog.tsx b/frontend/src/components/MissingBlogConfirmDialog.tsx new file mode 100644 index 0000000..c7c4950 --- /dev/null +++ b/frontend/src/components/MissingBlogConfirmDialog.tsx @@ -0,0 +1,111 @@ +import { Loader2 } from "lucide-react"; +import { useState } from "react"; + +interface MissingBlogConfirmDialogProps { + url: string; + onCancel: () => void; + onSubmit: (url: string) => Promise<void>; +} + +/** + * Render a confirmation dialog when a searched blog URL is not recorded. + * + * @param url Searched blog URL that was not found. + * @param onCancel Callback for dismissing the dialog without action. + * @param onSubmit Callback used to submit the confirmed complete blog URL. + * @returns Modal confirmation UI. + */ +export function MissingBlogConfirmDialog({ url, onCancel, onSubmit }: MissingBlogConfirmDialogProps) { + const [isConfirming, setIsConfirming] = useState(false); + const [seedUrl, setSeedUrl] = useState(url); + const [isSubmitting, setIsSubmitting] = useState(false); + + /** + * Submit the user-provided complete URL to the seed ingestion flow. + * + * @param event Form submit event. 
+ */ + async function handleSubmit(event: React.FormEvent<HTMLFormElement>) { + event.preventDefault(); + if (!seedUrl.trim()) { + return; + } + setIsSubmitting(true); + try { + await onSubmit(seedUrl); + } finally { + setIsSubmitting(false); + } + } + + return ( + <div className="fixed inset-0 z-50 flex items-center justify-center bg-slate-950/45 p-4"> + <div + role="dialog" + aria-modal="true" + aria-labelledby="missing-blog-confirm-title" + className="w-full max-w-md rounded-lg bg-white p-6 shadow-2xl" + > + <h2 id="missing-blog-confirm-title" className="text-xl text-slate-950"> + 当前未找到该博客,是否将该博客加入博客网络? + </h2> + {isConfirming ? ( + <form onSubmit={handleSubmit} className="mt-5 space-y-4"> + <div> + <label htmlFor="missing-blog-seed-url" className="mb-2 block text-sm text-slate-700"> + 请输入完整博客链接 + </label> + <input + id="missing-blog-seed-url" + type="url" + value={seedUrl} + onChange={(event) => setSeedUrl(event.target.value)} + placeholder="https://blog.example.com" + disabled={isSubmitting} + className="w-full rounded-md border border-slate-300 px-3 py-2 text-sm text-slate-950 outline-none transition-colors placeholder:text-slate-400 focus:border-sky-500 focus:ring-2 focus:ring-sky-100 disabled:cursor-not-allowed disabled:bg-slate-50" + /> + </div> + <div className="flex justify-end gap-3"> + <button + type="button" + onClick={onCancel} + disabled={isSubmitting} + className="rounded-md border border-slate-300 px-4 py-2 text-sm text-slate-700 transition-colors hover:bg-slate-50 disabled:cursor-not-allowed disabled:opacity-60" + > + 不是 + </button> + <button + type="submit" + disabled={isSubmitting || !seedUrl.trim()} + className="inline-flex items-center gap-2 rounded-md bg-sky-500 px-4 py-2 text-sm text-white transition-colors hover:bg-sky-600 disabled:cursor-not-allowed disabled:bg-slate-300" + > + {isSubmitting ? 
<Loader2 className="h-4 w-4 animate-spin" /> : null} + 是 + </button> + </div> + </form> + ) : ( + <> + <div className="mt-3 break-all text-sm text-slate-500">{url}</div> + <div className="mt-6 flex justify-end gap-3"> + <button + type="button" + onClick={onCancel} + className="rounded-md border border-slate-300 px-4 py-2 text-sm text-slate-700 transition-colors hover:bg-slate-50" + > + 不是 + </button> + <button + type="button" + onClick={() => setIsConfirming(true)} + className="rounded-md bg-sky-500 px-4 py-2 text-sm text-white transition-colors hover:bg-sky-600" + > + 是 + </button> + </div> + </> + )} + </div> + </div> + ); +} diff --git a/frontend/src/components/Navigation.tsx b/frontend/src/components/Navigation.tsx index 390b60b..7a159fd 100644 --- a/frontend/src/components/Navigation.tsx +++ b/frontend/src/components/Navigation.tsx @@ -1,5 +1,6 @@ -import { Dices, Filter, Home, Info, Network, UserCircle } from "lucide-react"; +import { Dices, Filter, Home, Info, Network, Shield, UserCircle } from "lucide-react"; import { NavLink } from "react-router-dom"; +import { hasStoredAdminSession } from "../lib/auth"; const navigationItems = [ { to: "/", label: "首页", icon: Home }, @@ -16,10 +17,13 @@ const navigationItems = [ * @returns Floating route navigation bar. */ export function Navigation() { + const visibleItems = hasStoredAdminSession() + ? 
[...navigationItems, { to: "/admin", label: "管理", icon: Shield }] + : navigationItems; return ( <nav className="fixed right-6 top-6 z-40"> <div className="flex items-center gap-2 rounded-2xl border border-white/70 bg-white/92 p-1.5 shadow-[0_20px_60px_rgba(15,23,42,0.14)] backdrop-blur-md"> - {navigationItems.map((item) => { + {visibleItems.map((item) => { const Icon = item.icon; return ( <NavLink diff --git a/frontend/src/components/SubmitBlogDialog.tsx b/frontend/src/components/SubmitBlogDialog.tsx index 64e31e0..de07025 100644 --- a/frontend/src/components/SubmitBlogDialog.tsx +++ b/frontend/src/components/SubmitBlogDialog.tsx @@ -1,7 +1,7 @@ import { useState } from "react"; import { AlertCircle, X } from "lucide-react"; import { toast } from "sonner"; -import { submitBlogInfo } from "../lib/api"; +import { submitUserSeed } from "../lib/api"; interface SubmitBlogDialogProps { url: string; @@ -18,7 +18,6 @@ interface SubmitBlogDialogProps { * @returns Modal dialog UI. */ export function SubmitBlogDialog({ url, onClose, onSuccess }: SubmitBlogDialogProps) { - const [email, setEmail] = useState(""); const [isSubmitting, setIsSubmitting] = useState(false); /** @@ -31,9 +30,9 @@ export function SubmitBlogDialog({ url, onClose, onSuccess }: SubmitBlogDialogPr try { setIsSubmitting(true); - await submitBlogInfo({ url, email }); + await submitUserSeed({ url }); - toast.success("提交成功,系统已记录该博客请求。"); + toast.success("提交成功,系统已将该博客加入抓取队列。"); onSuccess?.(); onClose(); } catch { @@ -51,7 +50,7 @@ export function SubmitBlogDialog({ url, onClose, onSuccess }: SubmitBlogDialogPr <h2 className="mb-2 text-2xl text-gray-900">博客未找到</h2> <div className="flex items-start gap-2 rounded-md bg-amber-50 p-3 text-amber-700"> <AlertCircle className="mt-0.5 h-5 w-5 flex-shrink-0" /> - <div className="text-sm">该 URL 当前未收录。你可以留下邮箱,系统会创建抓取请求。</div> + <div className="text-sm">该 URL 当前未收录。你可以将它加入抓取队列。</div> </div> </div> <button onClick={onClose} className="flex-shrink-0 rounded-md p-1 
transition-colors hover:bg-gray-100"> @@ -72,19 +71,6 @@ export function SubmitBlogDialog({ url, onClose, onSuccess }: SubmitBlogDialogPr /> </div> - <div> - <label className="mb-2 block text-sm text-gray-700"> - 联系邮箱 <span className="text-red-500">*</span> - </label> - <input - type="email" - value={email} - onChange={(event) => setEmail(event.target.value)} - placeholder="you@example.com" - className="w-full rounded-md border border-gray-300 px-4 py-2 focus:border-blue-500 focus:outline-none" - /> - </div> - <div className="flex gap-3 pt-4"> <button type="button" @@ -95,10 +81,10 @@ export function SubmitBlogDialog({ url, onClose, onSuccess }: SubmitBlogDialogPr </button> <button type="submit" - disabled={isSubmitting || !email.trim()} + disabled={isSubmitting} className="flex-1 rounded-md bg-blue-500 px-6 py-3 text-white transition-colors hover:bg-blue-600 disabled:cursor-not-allowed disabled:bg-gray-300" > - {isSubmitting ? "提交中..." : "创建抓取请求"} + {isSubmitting ? "提交中..." : "加入抓取队列"} </button> </div> </form> diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 830a535..842b7e1 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -1,11 +1,12 @@ import type { - AdminDedupSummary, AdminBlogLabelCounts, AdminBlogLabelingCandidate, AdminBlogLabelingPage, AdminBlogLabelParquetStatus, AdminRequeueFailedBlogsResult, AdminBlogLabelTag, + AdminHourlyStats, + AdminHourlyStatsRow, AdminRuntimeCurrent, AdminRuntimeStatus, BlogCatalogItem, @@ -16,7 +17,10 @@ import type { GraphEdge, GraphMeta, GraphNode, + AuthLifecycleToken, LookupResult, + RandomRecommendationBatch, + RecommendationEventInput, RecommendedBlog, StatsData, StatusData, @@ -25,6 +29,24 @@ import type { UserProfile, } from "../types/graph"; +export class ApiError extends Error { + status: number; + detail: unknown; + + /** + * Capture one failed API response with the backend detail payload intact. + * + * @param status HTTP response status. 
+ * @param detail Backend error detail payload, when available. + */ + constructor(status: number, detail: unknown) { + super(typeof detail === "string" && detail ? detail : `api_error_${status}`); + this.name = "ApiError"; + this.status = status; + this.detail = detail; + } +} + interface BackendGraphNode { id: number; blog_id?: number; @@ -39,6 +61,7 @@ interface BackendGraphNode { icon_url: string | null; status_code?: number | null; crawl_status?: string; + crawl_error_kind?: string | null; friend_links_count?: number; last_crawled_at?: string | null; created_at?: string; @@ -48,6 +71,9 @@ interface BackendGraphNode { outgoing_count?: number; activity_at?: string | null; identity_complete?: boolean; + request_uuid?: string; + impression_id?: number; + position?: number; x?: number; y?: number; degree?: number; @@ -114,10 +140,45 @@ interface BackendRecommendedBlog extends BackendGraphNode { via_blogs?: BackendNeighborSummary[]; } +interface BackendBlogDiscoveryStep { + blog: BackendNeighborSummary | null; + blog_id: number; + url: string; + domain: string; + accepted_by: string | null; + accepted_label: string | null; + raw_id: number | null; + raw_source_blog_id: number | null; + raw_accepted_by: string | null; + discovered_at: string | null; +} + +interface BackendBlogDiscoveryPath { + mode: "manual" | "crawled"; + origin_source: string | null; + origin_label: string; + target_source: string | null; + truncated: boolean; + steps: BackendBlogDiscoveryStep[]; +} + +interface BackendBlogRelationGraph { + direction: "incoming" | "outgoing"; + focus_blog_id: number; + depth: number; + nodes: BackendGraphNode[]; + edges: BackendGraphEdge[]; +} + interface BackendBlogDetail extends BackendGraphNode { incoming_edges: BackendBlogRelation[]; outgoing_edges: BackendBlogRelation[]; recommended_blogs: BackendRecommendedBlog[]; + discovery_path?: BackendBlogDiscoveryPath | null; + relation_graphs?: { + incoming: BackendBlogRelationGraph; + outgoing: 
BackendBlogRelationGraph; + }; } interface BackendStatsPayload { @@ -159,16 +220,27 @@ interface BackendCatalogPayload { sort: string; } -interface CreateIngestionRequestPayload { - request_id: number; - request_token: string; - status: string; +interface BackendRandomRecommendationBatchPayload { + request_uuid: string; + surface: string; + strategy: string; + strategy_version: string; + visitor_id: string; + session_id: string; + requested_count: number; + served_count: number; + created_at: string | null; + items: BackendGraphNode[]; } interface BackendUserProfile { id: number; email: string; display_name: string; + role: "admin" | "user"; + is_active: boolean; + email_verified: boolean; + email_verified_at: string | null; created_at: string | null; updated_at: string | null; } @@ -177,6 +249,17 @@ interface BackendAuthSession { token: string; expires_at: string | null; user: BackendUserProfile; + email_verification?: BackendAuthLifecycleToken; +} + +interface BackendAuthLifecycleToken { + sent: boolean; + verification_token?: string; + verification_url?: string; + reset_token?: string; + reset_url?: string; + expires_at?: string | null; + already_verified?: boolean; } interface BackendUserLabelSelection { @@ -206,15 +289,25 @@ interface BackendRuntimePayload { maintenance_in_progress?: boolean; } -interface BackendDedupSummary { +interface BackendAdminHourlyStatsRow { id: number; - status: string; - total_count: number; - scanned_count: number; - removed_count: number; - kept_count: number; - created_at: string; - updated_at: string; + hour_start: string | null; + user_count: number; + random_request_count: number; + random_impression_count: number; + detail_open_count: number; + external_open_count: number; + detail_ctr: number; + external_ctr: number; + total_click_ctr: number; + refreshed_at: string | null; + created_at: string | null; +} + +interface BackendAdminHourlyStats { + current_hour: BackendAdminHourlyStatsRow; + latest: BackendAdminHourlyStatsRow; + 
items: BackendAdminHourlyStatsRow[]; } interface BackendBlogLabelTag { @@ -327,6 +420,9 @@ function toBlogCatalogItem(node: BackendGraphNode): BlogCatalogItem { return { ...toGraphNode(node), normalizedUrl: node.normalized_url ?? node.url, + requestUuid: node.request_uuid, + impressionId: node.impression_id, + position: node.position, identityKey: node.identity_key ?? "", identityReasonCodes: node.identity_reason_codes ?? [], identityRulesetVersion: node.identity_ruleset_version ?? "", @@ -345,6 +441,86 @@ function toBlogCatalogItem(node: BackendGraphNode): BlogCatalogItem { }; } +/** + * Convert one backend recommendation batch into the frontend random-page model. + * + * @param payload Backend recommendation batch payload. + * @returns Normalized batch with catalog items. + */ +function toRandomRecommendationBatch(payload: BackendRandomRecommendationBatchPayload): RandomRecommendationBatch { + return { + requestUuid: payload.request_uuid, + surface: payload.surface, + strategy: payload.strategy, + strategyVersion: payload.strategy_version, + visitorId: payload.visitor_id, + sessionId: payload.session_id, + requestedCount: payload.requested_count, + servedCount: payload.served_count, + createdAt: payload.created_at, + items: payload.items.map(toBlogCatalogItem), + }; +} + +/** + * Convert one backend hourly admin stats row into frontend camelCase shape. + * + * @param row Raw backend admin stats row. + * @returns Normalized admin stats row. 
+ */ +function toAdminHourlyStatsRow(row: BackendAdminHourlyStatsRow): AdminHourlyStatsRow { + return { + id: row.id, + hourStart: row.hour_start, + userCount: row.user_count, + randomRequestCount: row.random_request_count, + randomImpressionCount: row.random_impression_count, + detailOpenCount: row.detail_open_count, + externalOpenCount: row.external_open_count, + detailCtr: row.detail_ctr, + externalCtr: row.external_ctr, + totalClickCtr: row.total_click_ctr, + refreshedAt: row.refreshed_at, + createdAt: row.created_at, + }; +} + +/** + * Convert backend admin hourly stats payload into frontend shape. + * + * @param payload Raw backend admin stats payload. + * @returns Normalized hourly stats collection. + */ +function toAdminHourlyStats(payload: BackendAdminHourlyStats): AdminHourlyStats { + return { + currentHour: toAdminHourlyStatsRow(payload.current_hour), + latest: toAdminHourlyStatsRow(payload.latest), + items: payload.items.map(toAdminHourlyStatsRow), + }; +} + +/** + * Convert one backend relation graph into frontend graph coordinates. + * + * @param graph Backend directional relation graph. + * @returns Normalized relation graph. + */ +function toBlogRelationGraph(graph: BackendBlogRelationGraph) { + return { + direction: graph.direction, + focusBlogId: graph.focus_blog_id, + depth: graph.depth, + nodes: graph.nodes.map(toGraphNode), + edges: graph.edges.map((edge) => ({ + id: String(edge.id ?? `${edge.from_blog_id}-${edge.to_blog_id}`), + source: edge.from_blog_id, + target: edge.to_blog_id, + linkText: edge.link_text, + linkUrlRaw: edge.link_url_raw, + })), + }; +} + /** * Convert one backend blog label tag into the frontend admin tag shape. * @@ -499,7 +675,14 @@ async function apiJson<T>(path: string, init?: RequestInit): Promise<T> { }, }); if (!response.ok) { - throw new Error(`api_error_${response.status}`); + let detail: unknown = null; + try { + const payload = await response.json(); + detail = (payload as { detail?: unknown }).detail ?? 
payload; + } catch { + detail = await response.text().catch(() => null); + } + throw new ApiError(response.status, detail); } return (await response.json()) as T; } @@ -527,16 +710,36 @@ function toUserProfile(user: BackendUserProfile): UserProfile { id: user.id, email: user.email, displayName: user.display_name, + role: user.role, + isActive: user.is_active, + emailVerified: user.email_verified, + emailVerifiedAt: user.email_verified_at, createdAt: user.created_at, updatedAt: user.updated_at, }; } +function toAuthLifecycleToken(payload: BackendAuthLifecycleToken | undefined): AuthLifecycleToken | undefined { + if (!payload) { + return undefined; + } + return { + sent: payload.sent, + verificationToken: payload.verification_token, + verificationUrl: payload.verification_url, + resetToken: payload.reset_token, + resetUrl: payload.reset_url, + expiresAt: payload.expires_at, + alreadyVerified: payload.already_verified, + }; +} + function toAuthSession(session: BackendAuthSession): AuthSession { return { token: session.token, expiresAt: session.expires_at, user: toUserProfile(session.user), + emailVerification: toAuthLifecycleToken(session.email_verification), }; } @@ -544,22 +747,13 @@ function toAuthSession(session: BackendAuthSession): AuthSession { * Fetch the default core graph view. * * @param limit Maximum node count requested for the core graph. - * @param options Optional deterministic sampling settings for graph selection. * @returns Normalized graph data. 
*/ -export async function fetchGraphData( - limit = 200, - options: { sampleMode?: "off" | "count" | "percent"; sampleSeed?: number } = {}, -): Promise<GraphData> { +export async function fetchGraphData(limit = 200): Promise<GraphData> { const params = new URLSearchParams({ - strategy: "degree", + strategy: "seed", limit: String(limit), }); - if (options.sampleMode && options.sampleMode !== "off") { - params.set("sample_mode", options.sampleMode); - params.set("sample_value", String(limit)); - params.set("sample_seed", String(options.sampleSeed ?? 42)); - } const payload = await apiJson<BackendGraphPayload>(`/api/graph/views/core?${params.toString()}`); return toGraphData(payload); } @@ -614,9 +808,16 @@ export async function fetchBlogDetail(blogId: number): Promise<BlogDetail> { .filter((neighbor): neighbor is BackendNeighborSummary => neighbor !== null) .map(toGraphNode); const outgoingNeighbors = payload.outgoing_edges - .map((edge) => edge.neighbor_blog) - .filter((neighbor): neighbor is BackendNeighborSummary => neighbor !== null) - .map(toGraphNode); + .map((edge) => { + if (!edge.neighbor_blog) { + return null; + } + return { + ...toGraphNode(edge.neighbor_blog), + url: edge.link_url_raw, + }; + }) + .filter((neighbor): neighbor is GraphNode => neighbor !== null); const relatedNodesById = new Map<number, GraphNode>(); [...incomingNeighbors, ...outgoingNeighbors].forEach((node) => { relatedNodesById.set(node.id, node); @@ -625,12 +826,77 @@ export async function fetchBlogDetail(blogId: number): Promise<BlogDetail> { ...toGraphNode(blog), viaBlogs: (blog.via_blogs ?? []).map(toGraphNode), })); + const discoveryPath = payload.discovery_path + ? { + mode: payload.discovery_path.mode, + originSource: payload.discovery_path.origin_source, + originLabel: payload.discovery_path.origin_label, + targetSource: payload.discovery_path.target_source, + truncated: payload.discovery_path.truncated, + steps: payload.discovery_path.steps.map((step) => ({ + blog: step.blog + ? 
{ + id: step.blog.blog_id ?? step.blog.id, + domain: step.blog.domain, + title: step.blog.title, + iconUrl: step.blog.icon_url, + } + : null, + blogId: step.blog_id, + url: step.url, + domain: step.domain, + acceptedBy: step.accepted_by, + acceptedLabel: step.accepted_label, + rawId: step.raw_id, + rawSourceBlogId: step.raw_source_blog_id, + rawAcceptedBy: step.raw_accepted_by, + discoveredAt: step.discovered_at, + })), + } + : null; return { ...toGraphNode(payload), + crawlStatus: payload.crawl_status ?? "WAITING", + crawlErrorKind: payload.crawl_error_kind ?? null, incomingLinks: payload.incoming_edges.length, outgoingLinks: payload.outgoing_edges.length, relatedNodes: Array.from(relatedNodesById.values()), + outgoingNodes: outgoingNeighbors, recommendedBlogs, + discoveryPath, + relationGraphs: payload.relation_graphs + ? { + incoming: toBlogRelationGraph(payload.relation_graphs.incoming), + outgoing: toBlogRelationGraph(payload.relation_graphs.outgoing), + } + : { + incoming: { + direction: "incoming", + focusBlogId: payload.blog_id ?? payload.id, + depth: 2, + nodes: [toGraphNode(payload), ...incomingNeighbors], + edges: payload.incoming_edges.map((edge) => ({ + id: String(edge.id), + source: edge.from_blog_id, + target: edge.to_blog_id, + linkText: edge.link_text, + linkUrlRaw: edge.link_url_raw, + })), + }, + outgoing: { + direction: "outgoing", + focusBlogId: payload.blog_id ?? payload.id, + depth: 2, + nodes: [toGraphNode(payload), ...outgoingNeighbors], + edges: payload.outgoing_edges.map((edge) => ({ + id: String(edge.id), + source: edge.from_blog_id, + target: edge.to_blog_id, + linkText: edge.link_text, + linkUrlRaw: edge.link_url_raw, + })), + }, + }, }; } @@ -763,39 +1029,136 @@ export async function fetchBlogsCatalog(query: BlogCatalogQuery = {}): Promise<B } /** - * Submit one ingestion request when a searched blog is missing. + * Fetch and persist one random-blog recommendation batch. * - * @param data User-provided URL and email pair. 
- * @returns Created ingestion request summary. + * @param input Random batch request metadata. + * @returns Persisted batch with ordered impression attribution. */ -export async function submitBlogInfo(data: { - url: string; - email: string; -}): Promise<CreateIngestionRequestPayload> { - if (!data.url.trim()) { - throw new Error("url_required"); - } - if (!data.email.trim()) { - throw new Error("email_required"); - } - return apiJson<CreateIngestionRequestPayload>("/api/ingestion-requests", { +export async function fetchRandomBlogBatch(input: { + count: number; + visitorId: string; + sessionId: string; + source?: string; + pageUrl?: string; + context?: Record<string, unknown>; + token?: string | null; +}): Promise<RandomRecommendationBatch> { + const payload = await apiJson<BackendRandomRecommendationBatchPayload>("/api/recommendations/random-blog-batches", { method: "POST", + headers: input.token ? authHeaders(input.token) : undefined, body: JSON.stringify({ - homepage_url: data.url.trim(), - email: data.email.trim(), + count: input.count, + visitor_id: input.visitorId, + session_id: input.sessionId, + source: input.source, + page_url: input.pageUrl, + context: input.context, }), }); + return toRandomRecommendationBatch(payload); } -export async function registerUser(data: { email: string; password: string }): Promise<AuthSession> { - const payload = await apiJson<BackendAuthSession>("/api/auth/register", { +/** + * Record one recommendation event without changing page state. + * + * @param input Event attribution and metadata. + * @param token Optional auth token used for registered-user attribution. + * @returns Promise resolved after the backend accepts the event. + */ +export async function postRecommendationEvent( + input: RecommendationEventInput, + token?: string | null, +): Promise<void> { + await apiJson("/api/recommendation-events", { + method: "POST", + headers: token ? 
authHeaders(token) : undefined, + body: JSON.stringify({ + event_uuid: input.eventUuid, + event_type: input.eventType, + blog_id: input.blogId, + visitor_id: input.visitorId, + session_id: input.sessionId, + entrance_kind: input.entranceKind, + entrance_url: input.entranceUrl, + request_uuid: input.requestUuid, + impression_id: input.impressionId, + position: input.position, + interaction_order: input.interactionOrder, + client_event_at: input.clientEventAt, + attributes: input.attributes, + }), + }); +} + +/** + * Submit one user seed URL so it can be accepted and queued for crawling. + * + * @param data User-provided complete blog URL. + * @returns Accepted blog seed summary. + */ +export async function submitUserSeed(data: { url: string }): Promise<{ status: string; blogId: number }> { + if (!data.url.trim()) { + throw new Error("url_required"); + } + let payload: { status: string; blog_id: number }; + try { + payload = await apiJson<{ status: string; blog_id: number }>("/api/blogs/user-seeds", { + method: "POST", + body: JSON.stringify({ + homepage_url: data.url.trim(), + }), + }); + } catch (error) { + throw new Error(describeUserSeedError(error)); + } + return { + status: payload.status, + blogId: payload.blog_id, + }; +} + +function describeUserSeedError(error: unknown): string { + if (!(error instanceof ApiError)) { + return error instanceof Error ? error.message : "提交失败:未知错误"; + } + const detail = typeof error.detail === "string" ? 
error.detail : ""; + const ruleReason = USER_SEED_RULE_REASON_MESSAGES[detail]; + if (ruleReason) { + return `规则过滤未通过:${ruleReason}(${detail})`; + } + if (detail === "Unsupported homepage URL") { + return "URL 无法识别:请输入完整的 http 或 https 博客首页链接。"; + } + if (detail) { + return `提交失败:${detail}`; + } + return `提交失败:接口返回 ${error.status}`; +} + +const USER_SEED_RULE_REASON_MESSAGES: Record<string, string> = { + "rule:duplicate_url": "该 URL 已经存在于发现记录中", + "rule:non_http_scheme": "链接不是 http 或 https 协议", + "rule:same_domain": "链接与来源域名相同", + "rule:exact_url_blocked": "链接命中精确 URL 黑名单", + "rule:prefix_blocked": "链接命中 URL 前缀黑名单", + "rule:platform_blocked": "域名属于已屏蔽的平台站点", + "rule:domain_blocked": "域名命中自定义屏蔽列表", + "rule:blocked_tld": "域名后缀被屏蔽", + "rule:non_root_path": "链接不是博客首页根路径", + "rule:non_root_location": "链接包含查询参数或锚点", + "rule:asset_suffix": "链接指向静态资源文件", + "rule:blocked_path": "链接路径属于登录、搜索、RSS、管理页等非博客首页", +}; + +export async function registerUser(data: { email: string; password: string }): Promise<AuthLifecycleToken> { + const payload = await apiJson<BackendAuthLifecycleToken>("/api/auth/register", { method: "POST", body: JSON.stringify({ email: data.email.trim(), password: data.password, }), }); - return toAuthSession(payload); + return toAuthLifecycleToken(payload) ?? { sent: true }; } export async function loginUser(data: { email: string; password: string }): Promise<AuthSession> { @@ -823,6 +1186,38 @@ export async function logoutUser(token: string): Promise<void> { }); } +export async function requestEmailVerification(email: string): Promise<AuthLifecycleToken> { + const payload = await apiJson<BackendAuthLifecycleToken>("/api/auth/email/verify/request", { + method: "POST", + body: JSON.stringify({ email: email.trim() }), + }); + return toAuthLifecycleToken(payload) ?? 
{ sent: true }; +} + +export async function confirmEmailVerification(token: string): Promise<UserProfile> { + const payload = await apiJson<BackendUserProfile>("/api/auth/email/verify/confirm", { + method: "POST", + body: JSON.stringify({ token: token.trim() }), + }); + return toUserProfile(payload); +} + +export async function requestPasswordReset(email: string): Promise<AuthLifecycleToken> { + const payload = await apiJson<BackendAuthLifecycleToken>("/api/auth/password/forgot", { + method: "POST", + body: JSON.stringify({ email: email.trim() }), + }); + return toAuthLifecycleToken(payload) ?? { sent: true }; +} + +export async function resetPassword(data: { token: string; password: string }): Promise<UserProfile> { + const payload = await apiJson<BackendUserProfile>("/api/auth/password/reset", { + method: "POST", + body: JSON.stringify({ token: data.token.trim(), password: data.password }), + }); + return toUserProfile(payload); +} + export async function fetchMyLabelSelections(token: string, limit = 50): Promise<UserLabelSelection[]> { const params = new URLSearchParams({ limit: String(limit) }); const payload = await apiJson<BackendUserLabelSelection[]>(`/api/me/label-selections?${params.toString()}`, { @@ -884,29 +1279,18 @@ export async function fetchAdminRuntimeCurrent(adminToken: string): Promise<Admi } /** - * Fetch the latest dedup scan summary when available. + * Fetch protected hourly admin dashboard statistics. * * @param adminToken Bearer token used for the protected endpoint. - * @returns Normalized dedup summary or null when no run exists. + * @param limit Maximum number of hourly rows to return. + * @returns Normalized hourly statistics payload. 
*/ -export async function fetchAdminDedupLatest(adminToken: string): Promise<AdminDedupSummary | null> { - try { - const payload = await apiJson<BackendDedupSummary>("/api/admin/blog-dedup-scans/latest", { - headers: adminHeaders(adminToken), - }); - return { - id: payload.id, - status: payload.status, - totalCount: payload.total_count, - scannedCount: payload.scanned_count, - removedCount: payload.removed_count, - keptCount: payload.kept_count, - createdAt: payload.created_at, - updatedAt: payload.updated_at, - }; - } catch { - return null; - } +export async function fetchAdminHourlyStats(adminToken: string, limit = 24): Promise<AdminHourlyStats> { + const params = new URLSearchParams({ limit: String(limit) }); + const payload = await apiJson<BackendAdminHourlyStats>(`/api/admin/hourly-stats?${params.toString()}`, { + headers: adminHeaders(adminToken), + }); + return toAdminHourlyStats(payload); } /** diff --git a/frontend/src/lib/auth.ts b/frontend/src/lib/auth.ts index 921cd50..33688de 100644 --- a/frontend/src/lib/auth.ts +++ b/frontend/src/lib/auth.ts @@ -26,6 +26,22 @@ export function readStoredAuthSession(): AuthSession | null { } } +/** + * Return whether the browser-local user session belongs to an active, + * email-verified admin user. + * + * @returns True when the stored session can be used for admin navigation. + */ +export function hasStoredAdminSession(): boolean { + const session = readStoredAuthSession(); + return Boolean( + session?.token && + session.user?.role === "admin" && + session.user.isActive && + session.user.emailVerified, + ); +} + /** * Persist the current user session in localStorage. 
* diff --git a/frontend/src/lib/benchmarkGraph.ts b/frontend/src/lib/benchmarkGraph.ts new file mode 100644 index 0000000..75c9daf --- /dev/null +++ b/frontend/src/lib/benchmarkGraph.ts @@ -0,0 +1,128 @@ +import type { GraphData, GraphEdge, GraphMeta, GraphNode } from "../types/graph"; + +interface BenchmarkGraphNodePayload { + id: number; + url: string; + domain: string; + title: string | null; + icon_url: string | null; + incoming_count?: number; + outgoing_count?: number; + degree?: number; + component_id?: string; + x?: number; + y?: number; + z?: number; +} + +interface BenchmarkGraphEdgePayload { + id?: number | string; + from_blog_id: number; + to_blog_id: number; + link_text: string | null; + link_url_raw: string; +} + +interface BenchmarkGraphPayload { + nodes: BenchmarkGraphNodePayload[]; + edges: BenchmarkGraphEdgePayload[]; + meta?: { + strategy: string; + limit: number; + generated_at?: string; + source?: string; + total_nodes?: number; + total_edges?: number; + available_nodes?: number; + available_edges?: number; + selected_nodes?: number; + selected_edges?: number; + }; +} + +/** + * Convert a static benchmark node payload to the frontend graph node model. + * + * @param node Raw benchmark node using backend field names. + * @returns Normalized graph node. + */ +function toBenchmarkNode(node: BenchmarkGraphNodePayload): GraphNode { + return { + id: Number(node.id), + url: node.url, + domain: node.domain, + title: node.title, + iconUrl: node.icon_url, + incomingCount: node.incoming_count, + outgoingCount: node.outgoing_count, + degree: node.degree, + componentId: node.component_id, + x: node.x, + y: node.y, + z: node.z, + }; +} + +/** + * Convert a static benchmark edge payload to the frontend graph edge model. + * + * @param edge Raw benchmark edge using backend field names. + * @param index Fallback edge index. + * @returns Normalized graph edge. 
+ */ +function toBenchmarkEdge(edge: BenchmarkGraphEdgePayload, index: number): GraphEdge { + return { + id: edge.id ? String(edge.id) : `benchmark-edge-${index}`, + source: Number(edge.from_blog_id), + target: Number(edge.to_blog_id), + linkText: edge.link_text, + linkUrlRaw: edge.link_url_raw, + }; +} + +/** + * Convert static benchmark metadata to the frontend graph meta model. + * + * @param meta Raw benchmark metadata. + * @returns Normalized graph metadata. + */ +function toBenchmarkMeta(meta: BenchmarkGraphPayload["meta"]): GraphMeta | undefined { + if (!meta) { + return undefined; + } + return { + strategy: meta.strategy, + limit: meta.limit, + generatedAt: meta.generated_at, + source: meta.source, + totalNodes: meta.total_nodes, + totalEdges: meta.total_edges, + availableNodes: meta.available_nodes, + availableEdges: meta.available_edges, + selectedNodes: meta.selected_nodes, + selectedEdges: meta.selected_edges, + }; +} + +/** + * Fetch the static visualization benchmark graph. + * + * @returns Normalized graph data for the shared 3D visualization component. 
+ */ +export async function fetchBenchmarkGraphData(): Promise<GraphData> { + const response = await fetch("/benchmarks/blog-community-graph.json", { + headers: { + accept: "application/json", + }, + }); + if (!response.ok) { + throw new Error(`benchmark_graph_error_${response.status}`); + } + + const payload = (await response.json()) as BenchmarkGraphPayload; + return { + nodes: payload.nodes.map(toBenchmarkNode), + edges: payload.edges.map(toBenchmarkEdge), + meta: toBenchmarkMeta(payload.meta), + }; +} diff --git a/frontend/src/lib/blogInteractions.ts b/frontend/src/lib/blogInteractions.ts new file mode 100644 index 0000000..4a07bf6 --- /dev/null +++ b/frontend/src/lib/blogInteractions.ts @@ -0,0 +1,140 @@ +import { readStoredAuthSession } from "./auth"; +import { postRecommendationEvent } from "./api"; +import type { NavigateFunction } from "react-router-dom"; +import type { BlogCatalogItem, GraphNode } from "../types/graph"; + +const BLOG_VISITOR_STORAGE_KEY = "heyblog.blog_interactions.visitor_id"; +const BLOG_SESSION_STORAGE_KEY = "heyblog.blog_interactions.session_id"; + +let interactionOrder = 0; + +export interface BlogInteractionEntrance { + entranceKind: string; + entranceUrl: string; +} + +export interface BlogInteractionTarget { + id: number; + requestUuid?: string; + impressionId?: number; + position?: number; +} + +/** + * Convert a graph node into the minimal interaction target shape. + * + * @param blog Blog-like frontend model. + * @returns Target fields required by the interaction event API. + */ +export function blogInteractionTarget(blog: BlogCatalogItem | GraphNode): BlogInteractionTarget { + return { + id: blog.id, + requestUuid: "requestUuid" in blog ? blog.requestUuid : undefined, + impressionId: "impressionId" in blog ? blog.impressionId : undefined, + position: "position" in blog ? blog.position : undefined, + }; +} + +/** + * Create one browser-local random identifier without requiring crypto support. 
+ * + * @param prefix Stable prefix that identifies the ID family. + * @returns URL-safe identifier string. + */ +export function createBlogInteractionId(prefix: string) { + const random = Math.random().toString(36).slice(2); + return `${prefix}_${Date.now().toString(36)}_${random}`; +} + +/** + * Read or create the browser-stable visitor ID used for blog interactions. + * + * @returns Stable local visitor ID. + */ +export function getBlogInteractionVisitorId() { + const existing = localStorage.getItem(BLOG_VISITOR_STORAGE_KEY); + if (existing) { + return existing; + } + const created = createBlogInteractionId("visitor"); + localStorage.setItem(BLOG_VISITOR_STORAGE_KEY, created); + return created; +} + +/** + * Read or create the tab-session ID used for blog interactions. + * + * @returns Stable session ID for the current tab session. + */ +export function getBlogInteractionSessionId() { + const existing = sessionStorage.getItem(BLOG_SESSION_STORAGE_KEY); + if (existing) { + return existing; + } + const created = createBlogInteractionId("session"); + sessionStorage.setItem(BLOG_SESSION_STORAGE_KEY, created); + return created; +} + +/** + * Record one non-blocking blog interaction with required entrance metadata. + * + * @param target Blog interaction target. + * @param eventType Event type recognized by the backend. + * @param entrance Required entry-point metadata for later aggregation. + * @param attributes Optional event metadata. 
+ */ +export function recordBlogInteraction( + target: BlogInteractionTarget, + eventType: string, + entrance: BlogInteractionEntrance, + attributes?: Record<string, unknown>, +) { + interactionOrder += 1; + const session = readStoredAuthSession(); + void postRecommendationEvent( + { + eventUuid: createBlogInteractionId("event"), + eventType, + blogId: target.id, + visitorId: getBlogInteractionVisitorId(), + sessionId: getBlogInteractionSessionId(), + entranceKind: entrance.entranceKind, + entranceUrl: entrance.entranceUrl, + requestUuid: target.requestUuid, + impressionId: target.impressionId, + position: target.position, + interactionOrder, + clientEventAt: new Date().toISOString(), + attributes, + }, + session?.token, + ).catch((error: unknown) => { + console.warn("Failed to record blog interaction", error); + }); +} + +/** + * Record and open the canonical blog detail route. + * + * @param navigate React Router navigation function. + * @param blog Blog target whose detail route should open. + * @param entrance Required entry-point metadata for later aggregation. + * @param attributes Optional event metadata. + * @param options Optional browser navigation behavior. 
+ */ +export function openTrackedBlogDetail( + navigate: NavigateFunction, + blog: BlogCatalogItem | GraphNode, + entrance: BlogInteractionEntrance, + attributes?: Record<string, unknown>, + options?: { newTab?: boolean }, +) { + recordBlogInteraction(blogInteractionTarget(blog), "detail_open", entrance, attributes); + const detailPath = `/blogs/${blog.id}`; + if (options?.newTab) { + window.open(detailPath, "_blank", "noopener,noreferrer"); + return; + } + navigate(detailPath); +} diff --git a/frontend/src/lib/icon.test.ts b/frontend/src/lib/icon.test.ts new file mode 100644 index 0000000..9754a2e --- /dev/null +++ b/frontend/src/lib/icon.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, test } from "vitest"; +import { resolveBlogIconUrls, resolveProxiedBlogIconUrls } from "./icon"; + +describe("icon helpers", () => { + test("keeps direct display candidates unchanged", () => { + const urls = resolveBlogIconUrls({ + url: "https://blog.example.com/posts/1", + domain: "blog.example.com", + iconUrl: "https://cdn.example.com/icon.png", + }); + + expect(urls[0]).toBe("https://cdn.example.com/icon.png"); + }); + + test("wraps graph texture candidates with the same-origin icon proxy", () => { + const urls = resolveProxiedBlogIconUrls({ + url: "https://blog.example.com/posts/1", + domain: "blog.example.com", + iconUrl: "https://cdn.example.com/icon.png", + }); + + expect(urls[0]).toBe("/api/icons/proxy?url=https%3A%2F%2Fcdn.example.com%2Ficon.png"); + expect(urls.every((url) => url.startsWith("/api/icons/proxy?url="))).toBe(true); + }); +}); diff --git a/frontend/src/lib/icon.ts b/frontend/src/lib/icon.ts index 99628c5..b1ef3ac 100644 --- a/frontend/src/lib/icon.ts +++ b/frontend/src/lib/icon.ts @@ -32,20 +32,69 @@ export function resolveDuckDuckGoIconUrl(node: Pick<GraphNode, "domain">): strin } /** - * Resolve the best displayable icon URL for a blog node. + * Build Google's public favicon service URL for one blog domain. 
+ * + * @param node Blog-like frontend node that may include a domain. + * @returns Google favicon URL, or undefined when the domain is missing. + */ +export function resolveGoogleIconUrl(node: Pick<GraphNode, "domain">): string | undefined { + const hostname = node.domain?.trim(); + if (!hostname) { + return undefined; + } + return `https://t2.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL&url=https://${encodeURIComponent(hostname)}&size=64`; +} + +/** + * Resolve displayable icon candidates for a blog node. * * @param node Blog-like frontend node with optional crawled icon metadata. - * @returns Preferred icon URL with deterministic fallbacks, or undefined. + * @returns Ordered icon candidates for UI display fallback. */ -export function resolveBlogIconUrl( +export function resolveBlogIconUrls( node: Pick<GraphNode, "domain" | "iconUrl" | "url">, -): string | undefined { +): string[] { const originFaviconUrl = resolveOriginFaviconUrl(node); const normalizedIconUrl = node.iconUrl?.trim() || undefined; + const candidates = [ + normalizedIconUrl, + resolveGoogleIconUrl(node), + resolveDuckDuckGoIconUrl(node), + originFaviconUrl, + ]; + return Array.from(new Set(candidates.filter((candidate): candidate is string => Boolean(candidate)))); +} - if (normalizedIconUrl && normalizedIconUrl !== originFaviconUrl) { - return normalizedIconUrl; - } +/** + * Wrap one remote icon URL with the same-origin backend icon proxy. + * + * @param iconUrl Absolute remote icon URL. + * @returns Same-origin proxy URL suitable for CORS-sensitive WebGL textures. + */ +export function resolveIconProxyUrl(iconUrl: string): string { + return `/api/icons/proxy?url=${encodeURIComponent(iconUrl)}`; +} + +/** + * Resolve proxied icon candidates for WebGL texture loading. + * + * @param node Blog-like frontend node with optional crawled icon metadata. + * @returns Ordered same-origin icon proxy URLs. 
+ */ +export function resolveProxiedBlogIconUrls( + node: Pick<GraphNode, "domain" | "iconUrl" | "url">, +): string[] { + return resolveBlogIconUrls(node).map(resolveIconProxyUrl); +} - return resolveDuckDuckGoIconUrl(node) ?? normalizedIconUrl ?? originFaviconUrl ?? undefined; +/** + * Resolve the best displayable icon URL for a blog node. + * + * @param node Blog-like frontend node with optional crawled icon metadata. + * @returns Preferred icon URL with deterministic fallbacks, or undefined. + */ +export function resolveBlogIconUrl( + node: Pick<GraphNode, "domain" | "iconUrl" | "url">, +): string | undefined { + return resolveBlogIconUrls(node)[0]; } diff --git a/frontend/src/pages/AboutPage.tsx b/frontend/src/pages/AboutPage.tsx index 4c0b7d2..eccfd7d 100644 --- a/frontend/src/pages/AboutPage.tsx +++ b/frontend/src/pages/AboutPage.tsx @@ -8,7 +8,7 @@ import avatarImage from "../assets/images/avatar.png"; */ export function AboutPage() { return ( - <div className="h-screen overflow-hidden bg-[radial-gradient(circle_at_top_left,_rgba(125,211,252,0.22),_transparent_28%),radial-gradient(circle_at_bottom_right,_rgba(244,114,182,0.14),_transparent_30%),linear-gradient(180deg,_#f8fbff_0%,_#ffffff_52%,_#f7fbff_100%)]"> + <div className="h-screen overflow-hidden bg-slate-50"> <Navigation /> <main className="mx-auto flex h-full max-w-6xl items-center px-5 pb-5 pt-20 sm:px-8 sm:pt-24"> @@ -57,8 +57,8 @@ export function AboutPage() { </div> <div className="relative mx-auto flex min-h-0 w-full max-w-[280px] items-stretch sm:max-w-md lg:max-w-none"> - <div className="absolute inset-x-10 bottom-8 h-24 rounded-full bg-sky-200/30 blur-3xl" /> - <div className="relative flex w-full items-end justify-center overflow-hidden rounded-[32px] border border-slate-200 bg-gradient-to-b from-white/95 via-sky-50/75 to-rose-50/70 shadow-[0_24px_60px_rgba(15,23,42,0.12)]"> + <div className="absolute inset-x-10 bottom-8 h-24 rounded-full bg-slate-200/45 blur-3xl" /> + <div 
className="relative flex w-full items-end justify-center overflow-hidden rounded-[32px] border border-slate-200 bg-white shadow-[0_24px_60px_rgba(15,23,42,0.12)]"> <img src={avatarImage} alt="HeyBlog avatar" diff --git a/frontend/src/pages/AdminPage.tsx b/frontend/src/pages/AdminPage.tsx index 416d32d..2d137a8 100644 --- a/frontend/src/pages/AdminPage.tsx +++ b/frontend/src/pages/AdminPage.tsx @@ -18,13 +18,14 @@ import { import { type FormEvent, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { Navigation } from "../components/Navigation"; +import { readStoredAuthSession } from "../lib/auth"; import { downloadAdminBlogLabelParquet, fetchAdminBlogLabelCounts, fetchAdminBlogLabelingCandidates, fetchAdminBlogLabelParquetStatus, fetchAdminBlogLabelTitlePreview, - fetchAdminDedupLatest, + fetchAdminHourlyStats, fetchAdminRuntimeCurrent, fetchAdminRuntimeStatus, fetchStats, @@ -42,12 +43,12 @@ import { import type { AdminBlogLabelingCandidate, AdminBlogLabelTag, - AdminDedupSummary, AdminRuntimeCurrent, AdminRuntimeStatus, StatsData, AdminBlogLabelCounts, AdminBlogLabelParquetStatus, + AdminHourlyStats, } from "../types/graph"; const ADMIN_TOKEN_STORAGE_KEY = "heyblog_admin_token"; @@ -67,6 +68,19 @@ function readStoredAdminToken(): string { return window.localStorage.getItem(ADMIN_TOKEN_STORAGE_KEY) ?? ""; } +function readDefaultAdminToken(): string { + const session = readStoredAuthSession(); + if ( + session?.token && + session.user.role === "admin" && + session.user.isActive && + session.user.emailVerified + ) { + return session.token; + } + return readStoredAdminToken(); +} + /** * Persist the admin token for future visits. * @@ -99,6 +113,34 @@ function clearStoredAdminToken() { window.localStorage.removeItem(ADMIN_TOKEN_STORAGE_KEY); } +/** + * Format one ratio as a percentage for compact admin stats. + * + * @param value Ratio where `1` means 100%. + * @returns Human-readable percentage string. 
+ */ +function formatPercent(value: number | null | undefined): string { + return `${((value ?? 0) * 100).toFixed(2)}%`; +} + +/** + * Format an ISO timestamp for the current operator locale. + * + * @param value ISO timestamp or null. + * @returns Short local date-time string. + */ +function formatAdminTime(value: string | null | undefined): string { + if (!value) { + return "--"; + } + return new Intl.DateTimeFormat(undefined, { + month: "2-digit", + day: "2-digit", + hour: "2-digit", + minute: "2-digit", + }).format(new Date(value)); +} + /** * Resolve an icon URL for a labeling card. * @@ -118,16 +160,16 @@ function resolveLabelingIconUrl(candidate: AdminBlogLabelingCandidate): string { * @returns Admin page UI. */ export function AdminPage() { - const [adminTokenInput, setAdminTokenInput] = useState(readStoredAdminToken()); - const [activeAdminToken, setActiveAdminToken] = useState(readStoredAdminToken()); + const [adminTokenInput, setAdminTokenInput] = useState(readDefaultAdminToken()); + const [activeAdminToken, setActiveAdminToken] = useState(readDefaultAdminToken()); const [stats, setStats] = useState<StatsData>({ totalNodes: 0, totalEdges: 0 }); const [runtimeStatus, setRuntimeStatus] = useState<AdminRuntimeStatus | null>(null); const [runtimeCurrent, setRuntimeCurrent] = useState<AdminRuntimeCurrent | null>(null); - const [latestDedup, setLatestDedup] = useState<AdminDedupSummary | null>(null); const [labelingCandidates, setLabelingCandidates] = useState<AdminBlogLabelingCandidate[]>([]); const [labelTags, setLabelTags] = useState<AdminBlogLabelTag[]>([]); const [labelCounts, setLabelCounts] = useState<AdminBlogLabelCounts>({ totalLabeled: 0, byLabel: {} }); const [labelParquetStatus, setLabelParquetStatus] = useState<AdminBlogLabelParquetStatus | null>(null); + const [adminHourlyStats, setAdminHourlyStats] = useState<AdminHourlyStats | null>(null); const [labelingTotalItems, setLabelingTotalItems] = useState(0); const [labelingTotalPages, 
setLabelingTotalPages] = useState(1); const [labelingPage, setLabelingPage] = useState(1); @@ -180,36 +222,36 @@ export function AdminPage() { if (!adminToken.trim()) { setRuntimeStatus(null); setRuntimeCurrent(null); - setLatestDedup(null); setLabelingCandidates([]); setLabelTags([]); setLabelCounts({ totalLabeled: 0, byLabel: {} }); setLabelParquetStatus(null); + setAdminHourlyStats(null); setLabelingTotalItems(0); setLabelingTotalPages(1); - setAdminError("请输入管理员 Token 以加载受保护接口。"); + setAdminError("请输入管理员 Token 或 Admin 账号登录 Token 以加载受保护接口。"); return; } const [ runtimeStatusResponse, runtimeCurrentResponse, - latestDedupResponse, labelCountResponse, labelParquetResponse, + hourlyStatsResponse, ] = await Promise.all([ fetchAdminRuntimeStatus(adminToken), fetchAdminRuntimeCurrent(adminToken), - fetchAdminDedupLatest(adminToken), fetchAdminBlogLabelCounts(adminToken), fetchAdminBlogLabelParquetStatus(adminToken), + fetchAdminHourlyStats(adminToken), ]); setRuntimeStatus(runtimeStatusResponse); setRuntimeCurrent(runtimeCurrentResponse); - setLatestDedup(latestDedupResponse); setLabelCounts(labelCountResponse); setLabelParquetStatus(labelParquetResponse); + setAdminHourlyStats(hourlyStatsResponse); setAdminError(null); try { @@ -233,14 +275,14 @@ export function AdminPage() { console.error(error); setRuntimeStatus(null); setRuntimeCurrent(null); - setLatestDedup(null); setLabelingCandidates([]); setLabelTags([]); setLabelCounts({ totalLabeled: 0, byLabel: {} }); setLabelParquetStatus(null); + setAdminHourlyStats(null); setLabelingTotalItems(0); setLabelingTotalPages(1); - setAdminError("管理员接口加载失败,请确认 Token 是否正确。"); + setAdminError("管理员接口加载失败,请确认 Token 是否正确且账号具备 Admin 身份。"); } finally { if (!options?.silent) { setIsLoading(false); @@ -318,7 +360,7 @@ export function AdminPage() { */ async function refreshLabelingWorkbench(options: { page?: number; query?: string } = {}) { if (!activeAdminToken.trim()) { - toast.error("请先输入管理员 Token。"); + toast.error("请先输入管理员 Token 或 
Admin 账号登录 Token。"); return; } try { @@ -337,7 +379,7 @@ export function AdminPage() { setLabelParquetStatus(parquetStatus); } catch (error) { console.error(error); - toast.error("标注台加载失败,请检查 token 或服务状态。"); + toast.error("标注台加载失败,请检查 Token、Admin 身份或服务状态。"); } finally { setIsLabelingLoading(false); } @@ -352,7 +394,7 @@ export function AdminPage() { */ async function handleApplyCandidateLabel(candidate: AdminBlogLabelingCandidate, tag: AdminBlogLabelTag) { if (!activeAdminToken.trim()) { - toast.error("请先输入管理员 Token。"); + toast.error("请先输入管理员 Token 或 Admin 账号登录 Token。"); return; } try { @@ -476,6 +518,8 @@ export function AdminPage() { } const avgConnections = stats.totalNodes > 0 ? (stats.totalEdges / stats.totalNodes).toFixed(2) : "0.00"; + const currentAdminStats = adminHourlyStats?.currentHour ?? null; + const recentHourlyRows = adminHourlyStats?.items.slice(0, 6) ?? []; const visibleLabelCounts = labelTags.map((tag) => ({ ...tag, count: labelCounts.byLabel[tag.slug] ?? 0, @@ -558,6 +602,80 @@ export function AdminPage() { </div> </section> + <section className="mb-8 rounded-[32px] border border-slate-200 bg-white/95 p-6 shadow-[0_18px_40px_rgba(15,23,42,0.08)]"> + <div className="mb-5 flex flex-col gap-3 lg:flex-row lg:items-end lg:justify-between"> + <div> + <h2 className="text-2xl text-slate-950">后台统计</h2> + <p className="mt-2 text-sm text-slate-500"> + 当前自然小时:{formatAdminTime(currentAdminStats?.hourStart)},刷新于{" "} + {formatAdminTime(currentAdminStats?.refreshedAt)}。 + </p> + </div> + <div className="text-sm text-slate-500">统计按推荐曝光计算平均点击率。</div> + </div> + + <div className="grid grid-cols-1 gap-4 md:grid-cols-2 xl:grid-cols-5"> + <div className="rounded-2xl border border-slate-200 bg-slate-50 p-4"> + <div className="text-sm text-slate-500">当前用户数</div> + <div className="mt-2 text-3xl text-slate-950">{currentAdminStats?.userCount ?? 
0}</div> + </div> + <div className="rounded-2xl border border-slate-200 bg-slate-50 p-4"> + <div className="text-sm text-slate-500">随机请求 / 曝光</div> + <div className="mt-2 text-3xl text-slate-950"> + {currentAdminStats?.randomRequestCount ?? 0} / {currentAdminStats?.randomImpressionCount ?? 0} + </div> + </div> + <div className="rounded-2xl border border-slate-200 bg-slate-50 p-4"> + <div className="text-sm text-slate-500">详情点击率</div> + <div className="mt-2 text-3xl text-slate-950">{formatPercent(currentAdminStats?.detailCtr)}</div> + <div className="mt-1 text-xs text-slate-500">{currentAdminStats?.detailOpenCount ?? 0} 次详情打开</div> + </div> + <div className="rounded-2xl border border-slate-200 bg-slate-50 p-4"> + <div className="text-sm text-slate-500">外链点击率</div> + <div className="mt-2 text-3xl text-slate-950">{formatPercent(currentAdminStats?.externalCtr)}</div> + <div className="mt-1 text-xs text-slate-500">{currentAdminStats?.externalOpenCount ?? 0} 次外链打开</div> + </div> + <div className="rounded-2xl border border-slate-200 bg-slate-50 p-4"> + <div className="text-sm text-slate-500">总点击率</div> + <div className="mt-2 text-3xl text-slate-950">{formatPercent(currentAdminStats?.totalClickCtr)}</div> + </div> + </div> + + <div className="mt-5 overflow-x-auto"> + <table className="min-w-full text-left text-sm"> + <thead className="text-xs uppercase text-slate-500"> + <tr> + <th className="px-3 py-2 font-medium">小时</th> + <th className="px-3 py-2 font-medium">用户</th> + <th className="px-3 py-2 font-medium">请求</th> + <th className="px-3 py-2 font-medium">曝光</th> + <th className="px-3 py-2 font-medium">详情 CTR</th> + <th className="px-3 py-2 font-medium">外链 CTR</th> + </tr> + </thead> + <tbody className="divide-y divide-slate-100 text-slate-700"> + {recentHourlyRows.map((row) => ( + <tr key={row.id}> + <td className="px-3 py-2">{formatAdminTime(row.hourStart)}</td> + <td className="px-3 py-2">{row.userCount}</td> + <td className="px-3 py-2">{row.randomRequestCount}</td> + 
<td className="px-3 py-2">{row.randomImpressionCount}</td> + <td className="px-3 py-2">{formatPercent(row.detailCtr)}</td> + <td className="px-3 py-2">{formatPercent(row.externalCtr)}</td> + </tr> + ))} + {recentHourlyRows.length === 0 ? ( + <tr> + <td className="px-3 py-5 text-slate-500" colSpan={6}> + 输入管理员 Token 后加载小时统计。 + </td> + </tr> + ) : null} + </tbody> + </table> + </div> + </section> + <section className="mb-8 rounded-[32px] border border-slate-200 bg-white/95 p-6 shadow-[0_18px_40px_rgba(15,23,42,0.08)]"> <div className="mb-6 flex flex-col gap-4 lg:flex-row lg:items-end lg:justify-between"> <div> @@ -915,17 +1033,6 @@ export function AdminPage() { active run: {runtimeCurrent?.activeRunId ?? "-"} </div> </div> - <div className="rounded-3xl bg-slate-50 p-4"> - <div className="text-sm text-slate-500">latest dedup scan</div> - <div className="mt-1 text-xl text-slate-950">{latestDedup?.status ?? "暂无记录"}</div> - <div className="mt-3 text-sm leading-7 text-slate-600"> - run id: {latestDedup?.id ?? "-"} - <br /> - scanned / total: {latestDedup ? `${latestDedup.scannedCount} / ${latestDedup.totalCount}` : "-"} - <br /> - removed: {latestDedup?.removedCount ?? 
"-"} - </div> - </div> </div> )} </div> diff --git a/frontend/src/pages/BlogDetailPage.tsx b/frontend/src/pages/BlogDetailPage.tsx new file mode 100644 index 0000000..9e0ba71 --- /dev/null +++ b/frontend/src/pages/BlogDetailPage.tsx @@ -0,0 +1,703 @@ +import { + ArrowLeft, + ArrowRight, + AlertTriangle, + CheckCircle2, + Clock3, + Loader2, + Network, + Route, + RotateCcw, +} from "lucide-react"; +import { useCallback, useEffect, useMemo, useRef, useState } from "react"; +import ForceGraph2D, { type ForceGraphMethods } from "react-force-graph-2d"; +import { useNavigate, useParams } from "react-router-dom"; +import { toast } from "sonner"; +import { BlogDetailLink } from "../components/BlogDetailLink"; +import { BlogExternalLink } from "../components/BlogExternalLink"; +import { Navigation } from "../components/Navigation"; +import { fetchBlogDetail } from "../lib/api"; +import { openTrackedBlogDetail } from "../lib/blogInteractions"; +import { resolveBlogIconUrls, resolveIconProxyUrl } from "../lib/icon"; +import type { BlogDetail, BlogDiscoveryPath, BlogDiscoveryStep, BlogRelationGraph, GraphNode } from "../types/graph"; + +const RELATION_GRAPH_LINK_DISTANCE = 78; +const RELATION_GRAPH_CHARGE_STRENGTH = -260; +const DETAIL_PAGE_EXTERNAL_ENTRANCE_KIND = "blog_detail_hero_external"; +const DETAIL_DISCOVERY_PATH_ENTRANCE_KIND = "blog_detail_discovery_path"; +const DETAIL_RELATION_GRAPH_ENTRANCE_KIND = "blog_detail_relation_graph"; + +/** + * Format a numeric count for compact detail cards. + * + * @param value Count value to display. + * @returns Localized count string. + */ +function formatCount(value: number) { + return new Intl.NumberFormat("zh-CN").format(value); +} + +/** + * Return a compact Chinese label for a crawl status value. + * + * @param crawlStatus Raw crawl status returned by the backend. + * @returns User-facing crawl status label. 
+ */ +function formatCrawlStatus(crawlStatus: string) { + const labels: Record<string, string> = { + WAITING: "等待抓取", + PROCESSING: "正在抓取", + FINISHED: "抓取完成", + FAILED: "抓取失败", + }; + return labels[crawlStatus] ?? crawlStatus; +} + +/** + * Return a readable failure reason for a crawl error kind. + * + * @param crawlErrorKind Stable backend failure category. + * @returns User-facing failure reason. + */ +function formatCrawlErrorKind(crawlErrorKind: string | null) { + if (!crawlErrorKind) { + return "未记录具体失败原因"; + } + const labels: Record<string, string> = { + timeout: "请求超时", + http_status: "目标站点返回异常 HTTP 状态", + invalid_url: "URL 无效", + page_too_large: "页面体积超过抓取限制", + request_error: "网络请求失败", + worker_error: "抓取任务执行异常", + }; + return labels[crawlErrorKind] ?? crawlErrorKind.replaceAll("_", " "); +} + +/** + * Render the crawl execution status for the current detail blog. + * + * @param props Blog detail payload with crawl status fields. + * @returns Compact status block, including failure reason when failed. 
+ */ +function BlogCrawlStatus({ detail }: { detail: BlogDetail }) { + const isFailed = detail.crawlStatus === "FAILED"; + const statusMeta = (() => { + switch (detail.crawlStatus) { + case "FINISHED": + return { + Icon: CheckCircle2, + className: "border-emerald-200 bg-emerald-50 text-emerald-700", + }; + case "PROCESSING": + return { + Icon: RotateCcw, + className: "border-sky-200 bg-sky-50 text-sky-700", + }; + case "FAILED": + return { + Icon: AlertTriangle, + className: "border-rose-200 bg-rose-50 text-rose-700", + }; + default: + return { + Icon: Clock3, + className: "border-slate-200 bg-slate-50 text-slate-600", + }; + } + })(); + const { Icon } = statusMeta; + + return ( + <div className={`mt-5 inline-flex max-w-full flex-col gap-1 rounded-lg border px-3 py-2 text-sm ${statusMeta.className}`}> + <div className="flex items-center gap-2 font-medium"> + <Icon className="h-4 w-4 flex-shrink-0" /> + <span>抓取状态:{formatCrawlStatus(detail.crawlStatus)}</span> + </div> + {isFailed ? ( + <div className="break-words pl-6 text-xs opacity-90"> + 失败原因:{formatCrawlErrorKind(detail.crawlErrorKind)} + </div> + ) : null} + </div> + ); +} + +/** + * Render a detail page hero icon with favicon fallbacks. + * + * @param props Blog detail node used to resolve icon candidates. + * @returns Blog icon image or a text fallback. + */ +function BlogHeroIcon({ detail }: { detail: BlogDetail }) { + const iconUrls = resolveBlogIconUrls(detail); + const [iconIndex, setIconIndex] = useState(0); + const iconUrl = iconUrls[iconIndex]; + + useEffect(() => { + setIconIndex(0); + }, [detail.id, detail.iconUrl, detail.url, detail.domain]); + + return ( + <div className="mb-4 flex h-16 w-16 items-center justify-center overflow-hidden rounded-lg bg-sky-100 text-2xl font-semibold text-sky-700 ring-1 ring-sky-200"> + {iconUrl ? 
(
+        <img
+          src={iconUrl}
+          alt={`${detail.domain} icon`}
+          className="h-full w-full object-cover"
+          loading="lazy"
+          referrerPolicy="no-referrer"
+          onError={() => setIconIndex((currentIndex) => currentIndex + 1)}
+        />
+      ) : (
+        <span>{(detail.domain || "?").slice(0, 1).toUpperCase()}</span>
+      )}
+    </div>
+  );
+}
+
+/**
+ * Show a single step of the historical discovery path as a small linked card.
+ *
+ * @param props Discovery step to display plus the entrance URL forwarded to the detail link.
+ * @returns Blog detail link containing the step icon (or domain initial), title, and URL.
+ */
+function DiscoveryPathCard({ step, entranceUrl }: { step: BlogDiscoveryStep; entranceUrl: string }) {
+  const { blogId, url, domain } = step;
+  const linkedBlog = {
+    id: blogId,
+    url,
+    domain,
+    title: step.blog?.title ?? null,
+    iconUrl: step.blog?.iconUrl ?? null,
+  };
+  const candidateIconUrls = resolveBlogIconUrls(linkedBlog);
+  const [iconCursor, setIconCursor] = useState(0);
+  const activeIconUrl = candidateIconUrls[iconCursor];
+
+  // Start again from the first icon candidate whenever the displayed step changes.
+  useEffect(() => {
+    setIconCursor(0);
+  }, [blogId, url, domain, step.blog?.iconUrl]);
+
+  // A failed image load advances to the next candidate; the domain initial is the last resort.
+  const iconContent = activeIconUrl ? (
+    <img
+      src={activeIconUrl}
+      alt={`${domain} icon`}
+      className="h-full w-full object-cover"
+      loading="lazy"
+      referrerPolicy="no-referrer"
+      onError={() => setIconCursor((cursor) => cursor + 1)}
+    />
+  ) : (
+    <span>{(domain || "?").slice(0, 1).toUpperCase()}</span>
+  );
+
+  return (
+    <BlogDetailLink
+      blog={linkedBlog}
+      entranceKind={DETAIL_DISCOVERY_PATH_ENTRANCE_KIND}
+      entranceUrl={entranceUrl}
+      className="flex w-56 items-center gap-3 rounded-lg border border-slate-200 bg-slate-50 px-3 py-3 transition-colors hover:border-sky-300 hover:bg-sky-50"
+    >
+      <div className="flex h-10 w-10 flex-shrink-0 items-center justify-center overflow-hidden rounded-lg bg-white text-sm font-semibold text-slate-500 ring-1 ring-slate-200">
+        {iconContent}
+      </div>
+      <div className="min-w-0">
+        <div className="truncate text-sm font-medium text-slate-950">{step.blog?.title || domain}</div>
+        <div className="mt-1 truncate text-xs text-slate-500">{url}</div>
+      </div>
+    </BlogDetailLink>
+  );
+}
+
+/**
+ * Lay out the historical discovery path as a horizontally scrollable row of cards.
+ *
+ * @param props Discovery path payload (may be null) and the entrance URL passed to each card.
+ * @returns The path section, or null when there is no path or it has no steps.
+ */
+function DiscoveryPathSection({ path, entranceUrl }: { path: BlogDiscoveryPath | null; entranceUrl: string }) {
+  if (!(path && path.steps.length > 0)) {
+    return null;
+  }
+
+  // Every card except the final one is followed by a connector arrow.
+  const lastStepIndex = path.steps.length - 1;
+
+  return (
+    <section className="rounded-lg border border-slate-200 bg-white p-6 shadow-sm">
+      <div className="mb-4 flex items-center gap-2">
+        <Route className="h-5 w-5 text-sky-600" />
+        <h2 className="text-xl text-slate-950">发现路径</h2>
+      </div>
+      <div className="overflow-x-auto">
+        <div className="flex min-w-max items-center gap-3">
+          {path.steps.map((step, index) => (
+            <div key={`${step.blogId}-${index}`} className="flex items-center gap-3">
+              <DiscoveryPathCard step={step} entranceUrl={entranceUrl} />
+              {index < lastStepIndex ? (
+                <div className="flex items-center gap-2 text-slate-300">
+                  <div className="h-px w-6 bg-slate-300" />
+                  <ArrowRight className="h-4 w-4" />
+                </div>
+              ) : null}
+            </div>
+          ))}
+        </div>
+      </div>
+    </section>
+  );
+}
+
+/** Graph node prepared for canvas rendering: string id, source node, label, icon candidates, radius. */
+interface RelationRenderNode extends Omit<GraphNode, "id" | "iconUrl"> {
+  id: string;
+  blogId: number;
+  original: GraphNode;
+  label: string;
+  iconUrls: string[];
+  radius: number;
+}
+
+/** Graph link whose endpoints are node ids, or node objects once the graph library has resolved them. */
+interface RelationRenderLink {
+  id: string;
+  source: string | RelationRenderNode;
+  target: string | RelationRenderNode;
+}
+
+/** Node and link collections handed to the force graph as its data. */
+interface RelationRenderGraph {
+  nodes: RelationRenderNode[];
+  links: RelationRenderLink[];
+}
+
+/**
+ * Convert a relation graph payload into the node/link shape consumed by the force graph.
+ *
+ * Node ids are stringified, and edges whose source or target is not among the
+ * payload nodes are dropped.
+ *
+ * @param graph Directional relation graph payload.
+ * @returns Render nodes and the links that connect two known nodes.
+ */
+function buildRelationRenderGraph(graph: BlogRelationGraph): RelationRenderGraph {
+  const renderNodes = graph.nodes.map((sourceNode) => {
+    const proxiedIconUrls = resolveBlogIconUrls(sourceNode).map(resolveIconProxyUrl);
+    return {
+      ...sourceNode,
+      id: String(sourceNode.id),
+      blogId: sourceNode.id,
+      original: sourceNode,
+      label: sourceNode.title?.trim() || sourceNode.domain || sourceNode.url,
+      iconUrls: proxiedIconUrls,
+      // The focus blog is drawn larger than its neighbours.
+      radius: sourceNode.id === graph.focusBlogId ? 18 : 13,
+    };
+  });
+  const knownNodeIds = new Set(renderNodes.map((renderNode) => renderNode.id));
+  const candidateLinks = graph.edges.map((edge) => ({
+    id: edge.id,
+    source: String(edge.source),
+    target: String(edge.target),
+  }));
+  const connectedLinks = candidateLinks.filter(
+    (link) => knownNodeIds.has(link.source) && knownNodeIds.has(link.target),
+  );
+  return { nodes: renderNodes, links: connectedLinks };
+}
+
+/**
+ * Read the node id from a link endpoint, whether it is still an id or already a node object.
+ *
+ * @param endpoint Link source or target value.
+ * @returns Render node id as a string.
+ */
+function relationEndpointId(endpoint: string | RelationRenderNode): string {
+  if (typeof endpoint === "object") {
+    return endpoint.id;
+  }
+  return String(endpoint);
+}
+
+/**
+ * Draw a relation graph node on a 2D force-graph canvas.
+ *
+ * @param node Render node to draw. 
+ * @param context Canvas context from react-force-graph-2d.
+ * @param imageCache Loaded icon cache keyed by proxied icon URL.
+ * @param focusBlogId Current detail blog id.
+ * @param hoveredBlogId Hovered blog id, if any.
+ */
+function paintRelationNode(
+  node: RelationRenderNode,
+  context: CanvasRenderingContext2D,
+  imageCache: Map<string, HTMLImageElement>,
+  focusBlogId: number,
+  hoveredBlogId: number | null,
+) {
+  const x = node.x ?? 0;
+  const y = node.y ?? 0;
+  const isFocus = node.blogId === focusBlogId;
+  const isHovered = node.blogId === hoveredBlogId;
+  const radius = node.radius + (isHovered ? 3 : 0);
+  // Use the first icon candidate that has finished loading with real pixel data.
+  const icon = node.iconUrls.map((url) => imageCache.get(url)).find((image) => image?.complete && image.naturalWidth > 0);
+
+  // Translucent halo behind the node; wider and tinted blue for the focus blog.
+  context.save();
+  context.beginPath();
+  context.arc(x, y, radius + (isFocus ? 5 : 3), 0, Math.PI * 2);
+  context.fillStyle = isFocus ? "rgba(14, 165, 233, 0.2)" : "rgba(148, 163, 184, 0.18)";
+  context.fill();
+
+  // Node disc and outline; white fill when an icon will be drawn on top.
+  context.beginPath();
+  context.arc(x, y, radius, 0, Math.PI * 2);
+  context.fillStyle = icon ? "#ffffff" : isFocus ? "#bae6fd" : "#cbd5e1";
+  context.fill();
+  context.lineWidth = isFocus ? 3 : 1.5;
+  context.strokeStyle = isFocus ? "#0284c7" : "#ffffff";
+  context.stroke();
+
+  if (icon) {
+    // Clip to a circle one pixel inside the outline so the square icon stays within the disc.
+    context.save();
+    context.beginPath();
+    context.arc(x, y, radius - 1, 0, Math.PI * 2);
+    context.clip();
+    context.drawImage(icon, x - radius, y - radius, radius * 2, radius * 2);
+    context.restore();
+  }
+  context.restore();
+}
+
+/**
+ * Paint the clickable pointer area for one relation graph node.
+ *
+ * The area is a filled circle 5 units wider than the node radius.
+ *
+ * @param node Render node to cover.
+ * @param paintColor Hidden pointer-picking color supplied by force graph.
+ * @param context Canvas context from react-force-graph-2d.
+ */
+function paintRelationPointerArea(node: RelationRenderNode, paintColor: string, context: CanvasRenderingContext2D) {
+  const radius = node.radius + 5;
+  context.fillStyle = paintColor;
+  context.beginPath();
+  context.arc(node.x ?? 0, node.y ?? 0, radius, 0, Math.PI * 2);
+  context.fill();
+}
+
+/**
+ * Render one paged blog relation graph as an interactive 2D force graph.
+ *
+ * The graph is mounted only after the container has been measured. Node icons
+ * are preloaded into a per-component cache, and a counter is bumped on each
+ * successful load so the node painter gets a new identity.
+ *
+ * @param props Directional relation graph payload and the entrance URL recorded on node clicks.
+ * @returns 2D force-graph relation view with a hover tooltip.
+ */
+function RelationGraphView({ graph, entranceUrl }: { graph: BlogRelationGraph; entranceUrl: string }) {
+  const navigate = useNavigate();
+  const graphRef = useRef<ForceGraphMethods<RelationRenderNode, RelationRenderLink> | undefined>(undefined);
+  const containerRef = useRef<HTMLDivElement | null>(null);
+  const imageCacheRef = useRef(new Map<string, HTMLImageElement>());
+  // Mount state shared by every image handler. A load started in one effect run can finish
+  // after that run was cleaned up, so the handlers must not depend on a per-run flag.
+  const isMountedRef = useRef(true);
+  const [size, setSize] = useState({ width: 960, height: 360 });
+  const [isMeasured, setIsMeasured] = useState(false);
+  const [hoveredBlog, setHoveredBlog] = useState<GraphNode | null>(null);
+  const [iconPaintVersion, setIconPaintVersion] = useState(0);
+  const renderGraph = useMemo(() => buildRelationRenderGraph(graph), [graph]);
+  const hoveredBlogId = hoveredBlog?.id ?? null;
+  const fitGraphToView = useCallback((durationMs = 500) => {
+    graphRef.current?.zoomToFit(durationMs, 44);
+  }, []);
+
+  useEffect(() => {
+    // Re-arm on every effect run so a cleanup followed by a re-run leaves the flag set.
+    isMountedRef.current = true;
+    return () => {
+      isMountedRef.current = false;
+    };
+  }, []);
+
+  useEffect(() => {
+    if (!containerRef.current) {
+      return undefined;
+    }
+    // Track the container size, never going below 320 in either dimension.
+    const observer = new ResizeObserver(([entry]) => {
+      setSize({
+        width: Math.max(320, Math.floor(entry.contentRect.width)),
+        height: Math.max(320, Math.floor(entry.contentRect.height)),
+      });
+      setIsMeasured(true);
+    });
+    observer.observe(containerRef.current);
+    return () => observer.disconnect();
+  }, []);
+
+  useEffect(() => {
+    const graphInstance = graphRef.current;
+    if (!isMeasured || !graphInstance) {
+      return undefined;
+    }
+    // Drop the centering force, tune charge and link distance, then restart the layout.
+    graphInstance.d3Force("center", null);
+    const chargeForce = graphInstance.d3Force("charge") as { strength?: (value: number) => unknown } | undefined;
+    chargeForce?.strength?.(RELATION_GRAPH_CHARGE_STRENGTH);
+    const linkForce = graphInstance.d3Force("link") as { distance?: (value: number) => unknown } | undefined;
+    linkForce?.distance?.(RELATION_GRAPH_LINK_DISTANCE);
+    graphInstance.d3ReheatSimulation();
+    // Fit twice: once shortly after the restart and once more a little later.
+    const firstFitTimer = window.setTimeout(() => fitGraphToView(450), 120);
+    const settledFitTimer = window.setTimeout(() => fitGraphToView(450), 620);
+    return () => {
+      window.clearTimeout(firstFitTimer);
+      window.clearTimeout(settledFitTimer);
+    };
+  }, [fitGraphToView, isMeasured, renderGraph, size.height, size.width]);
+
+  useEffect(() => {
+    const urls = Array.from(new Set(renderGraph.nodes.flatMap((node) => node.iconUrls)));
+    urls.forEach((url) => {
+      // URLs already cached (loaded or still loading) are not requested again.
+      if (imageCacheRef.current.has(url)) {
+        return;
+      }
+      const image = new Image();
+      image.crossOrigin = "anonymous";
+      image.onload = () => {
+        // Gate on mount state, not on the effect run that created the image: a later run
+        // skips this URL because it is cached, so this handler is the only one that can
+        // request the repaint when the load finishes.
+        if (isMountedRef.current) {
+          imageCacheRef.current.set(url, image);
+          setIconPaintVersion((version) => version + 1);
+        }
+      };
+      image.onerror = () => {
+        // Forget failed URLs so a later run of this effect can request them again.
+        imageCacheRef.current.delete(url);
+      };
+      image.src = url;
+      imageCacheRef.current.set(url, image);
+    });
+  }, [renderGraph.nodes]);
+
+  // iconPaintVersion is listed only to give the painter a new identity after an icon loads.
+  const nodeCanvasObject = useCallback(
+    (node: RelationRenderNode, context: CanvasRenderingContext2D) => {
+      paintRelationNode(node, context, imageCacheRef.current, graph.focusBlogId, hoveredBlogId);
+    },
+    [graph.focusBlogId, hoveredBlogId, iconPaintVersion],
+  );
+
+  return (
+    <div ref={containerRef} className="relative h-[380px] overflow-hidden rounded-lg bg-slate-50">
+      {isMeasured ? (
+        <ForceGraph2D<RelationRenderNode, RelationRenderLink>
+          ref={graphRef}
+          graphData={renderGraph}
+          nodeId="id"
+          width={size.width}
+          height={size.height}
+          backgroundColor="#f8fafc"
+          nodeLabel={(node) => `${node.label}\n${node.url}`}
+          nodeVal={(node) => node.radius}
+          nodeCanvasObjectMode={() => "replace"}
+          nodeCanvasObject={nodeCanvasObject}
+          nodePointerAreaPaint={paintRelationPointerArea}
+          linkSource="source"
+          linkTarget="target"
+          linkColor={() => (graph.direction === "incoming" ? "rgba(2, 132, 199, 0.58)" : "rgba(5, 150, 105, 0.58)")}
+          linkWidth={() => 1.7}
+          linkDirectionalArrowLength={5}
+          linkDirectionalArrowRelPos={1}
+          linkDirectionalArrowColor={() => (graph.direction === "incoming" ? "#0284c7" : "#059669")}
+          enableNodeDrag={false}
+          enablePointerInteraction
+          cooldownTicks={90}
+          d3VelocityDecay={0.34}
+          d3AlphaDecay={0.04}
+          onNodeHover={(node) => setHoveredBlog(node?.original ?? null)}
+          onNodeClick={(node) => {
+            openTrackedBlogDetail(
+              navigate,
+              node.original,
+              {
+                entranceKind: DETAIL_RELATION_GRAPH_ENTRANCE_KIND,
+                entranceUrl,
+              },
+              { relation_direction: graph.direction, focus_blog_id: graph.focusBlogId },
+            );
+          }}
+          showPointerCursor={(item) => Boolean(item && "blogId" in item)}
+        />
+      ) : null}
+      <div className="sr-only" aria-live="polite">
+        {renderGraph.nodes.map((node) => (
+          <span key={node.id}>{`${node.label} ${node.url}`}</span>
+        ))}
+      </div>
+      {hoveredBlog ? (
+        <div
+          role="tooltip"
+          className="pointer-events-none absolute left-4 top-4 z-30 max-w-[min(360px,calc(100%-2rem))] rounded-lg border border-slate-200 bg-white px-3 py-2 text-sm shadow-lg"
+        >
+          <div className="truncate font-medium text-slate-950">{hoveredBlog.title || hoveredBlog.domain}</div>
+          <div className="mt-1 break-all text-xs text-slate-500">{hoveredBlog.url || hoveredBlog.domain}</div>
+        </div>
+      ) : null}
+    </div>
+  );
+}
+
+/**
+ * Render the paged blog association module.
+ *
+ * @param props Incoming and outgoing relation graphs.
+ * @returns Blog association section with two graph pages. 
+ */
+function BlogAssociationSection({ detail, entranceUrl }: { detail: BlogDetail; entranceUrl: string }) {
+  const [activeGraph, setActiveGraph] = useState<"incoming" | "outgoing">("incoming");
+  const graph = detail.relationGraphs[activeGraph];
+
+  return (
+    <section className="rounded-lg border border-slate-200 bg-white p-6 shadow-sm">
+      <div className="mb-4 flex items-center gap-2">
+        <Network className="h-5 w-5 text-sky-600" />
+        <h2 className="text-xl text-slate-950">博客关联</h2>
+      </div>
+      <div className="mb-4 inline-flex rounded-lg border border-slate-200 bg-slate-50 p-1">
+        <button
+          type="button"
+          onClick={() => setActiveGraph("incoming")}
+          className={[
+            "rounded-md px-4 py-2 text-sm transition-colors",
+            activeGraph === "incoming" ? "bg-white text-slate-950 shadow-sm" : "text-slate-500 hover:text-slate-900",
+          ].join(" ")}
+        >
+          入链关系
+        </button>
+        <button
+          type="button"
+          onClick={() => setActiveGraph("outgoing")}
+          className={[
+            "rounded-md px-4 py-2 text-sm transition-colors",
+            activeGraph === "outgoing" ? "bg-white text-slate-950 shadow-sm" : "text-slate-500 hover:text-slate-900",
+          ].join(" ")}
+        >
+          出链关系
+        </button>
+      </div>
+      {graph.nodes.length > 1 ? (
+        <RelationGraphView graph={graph} entranceUrl={entranceUrl} />
+      ) : (
+        <div className="flex h-[260px] items-center justify-center rounded-lg bg-slate-50 text-sm text-slate-500">
+          暂无{activeGraph === "incoming" ? "入链" : "出链"}关联。
+        </div>
+      )}
+    </section>
+  );
+}
+
+/**
+ * Render the public blog detail page.
+ *
+ * Reads the blog id from the route, loads the detail payload, and shows a
+ * loading state, an error state, or the detail sections. The three states are
+ * mutually exclusive: any path that sets an error also clears the loaded detail.
+ *
+ * @returns Blog detail route UI.
+ */
+export function BlogDetailPage() {
+  const { blogId } = useParams();
+  const navigate = useNavigate();
+  const [detail, setDetail] = useState<BlogDetail | null>(null);
+  const [isLoading, setIsLoading] = useState(true);
+  const [errorMessage, setErrorMessage] = useState<string | null>(null);
+  const numericBlogId = Number(blogId);
+
+  useEffect(() => {
+    // Set by the cleanup so a response for a previous route id cannot overwrite newer state.
+    let isDisposed = false;
+
+    /**
+     * Load the route blog detail payload.
+     *
+     * @returns Promise resolved when detail state settles.
+     */
+    async function loadDetail() {
+      if (!Number.isInteger(numericBlogId) || numericBlogId <= 0) {
+        // Clear any detail left from a previously viewed blog; otherwise the error
+        // banner and the stale blog would be rendered together.
+        setDetail(null);
+        setErrorMessage("博客 ID 无效。");
+        setIsLoading(false);
+        return;
+      }
+
+      setIsLoading(true);
+      setErrorMessage(null);
+      try {
+        const payload = await fetchBlogDetail(numericBlogId);
+        if (!isDisposed) {
+          setDetail(payload);
+        }
+      } catch {
+        if (!isDisposed) {
+          setDetail(null);
+          setErrorMessage("博客详情加载失败。");
+          toast.error("博客详情加载失败,请稍后重试。");
+        }
+      } finally {
+        if (!isDisposed) {
+          setIsLoading(false);
+        }
+      }
+    }
+
+    void loadDetail();
+    return () => {
+      isDisposed = true;
+    };
+  }, [numericBlogId]);
+
+  return (
+    <div className="min-h-screen overflow-x-hidden bg-white">
+      <Navigation />
+      <main className="mx-auto max-w-6xl px-6 pb-16 pt-24 sm:px-8">
+        <button
+          type="button"
+          onClick={() => navigate(-1)}
+          className="mb-8 inline-flex items-center gap-2 rounded-lg border border-slate-200 bg-white px-4 py-2 text-sm text-slate-600 transition-colors hover:border-sky-300 hover:text-sky-700"
+        >
+          <ArrowLeft className="h-4 w-4" />
+          返回
+        </button>
+
+        {isLoading ? (
+          <section className="flex min-h-[360px] items-center justify-center">
+            <div className="flex flex-col items-center gap-4 text-slate-500">
+              <Loader2 className="h-10 w-10 animate-spin text-sky-500" />
+              <div>正在加载博客详情...</div>
+            </div>
+          </section>
+        ) : null}
+
+        {!isLoading && errorMessage ? (
+          <section className="rounded-lg border border-rose-200 bg-rose-50 px-6 py-8 text-rose-700">
+            {errorMessage}
+          </section>
+        ) : null}
+
+        {!isLoading && detail ? (
+          <div className="space-y-8">
+            <section className="rounded-lg border border-slate-200 bg-white p-6 shadow-sm">
+              <div className="flex flex-col gap-5 md:flex-row md:items-start md:justify-between">
+                <div className="min-w-0">
+                  <BlogHeroIcon detail={detail} />
+                  <h1 className="break-words text-4xl leading-tight text-slate-950">{detail.title || detail.domain}</h1>
+                  <div className="mt-2 text-base text-slate-500">{detail.domain}</div>
+                  <BlogExternalLink
+                    blog={detail}
+                    entranceKind={DETAIL_PAGE_EXTERNAL_ENTRANCE_KIND}
+                    entranceUrl={window.location.href}
+                    className="mt-4 inline-flex max-w-full items-center gap-2 break-all text-sm text-sky-700 hover:underline"
+                  >
+                    {detail.url}
+                  </BlogExternalLink>
+                  <BlogCrawlStatus detail={detail} />
+                </div>
+              </div>
+            </section>
+
+            <section className="grid grid-cols-1 gap-4 md:grid-cols-3">
+              <div className="rounded-lg border border-slate-200 bg-white p-5 shadow-sm">
+                <div className="mb-3 flex h-10 w-10 items-center justify-center rounded-lg bg-sky-100 text-sky-700">
+                  <ArrowLeft className="h-5 w-5" />
+                </div>
+                <div className="text-sm text-slate-500">入链</div>
+                <div className="mt-2 text-3xl text-slate-950">{formatCount(detail.incomingLinks)}</div>
+              </div>
+              <div className="rounded-lg border border-slate-200 bg-white p-5 shadow-sm">
+                <div className="mb-3 flex h-10 w-10 items-center justify-center rounded-lg bg-emerald-100 text-emerald-700">
+                  <ArrowRight className="h-5 w-5" />
+                </div>
+                <div className="text-sm text-slate-500">出链</div>
+                <div className="mt-2 text-3xl text-slate-950">{formatCount(detail.outgoingLinks)}</div>
+              </div>
+              <div className="rounded-lg border border-slate-200 bg-white p-5 shadow-sm">
+                <div className="mb-3 flex h-10 w-10 items-center justify-center rounded-lg bg-violet-100 text-violet-700">
+                  <Network className="h-5 w-5" />
+                </div>
+                <div className="text-sm text-slate-500">直接相关博客</div>
+                <div className="mt-2 text-3xl text-slate-950">{formatCount(detail.relatedNodes.length)}</div>
+              </div>
+ 
</section> + + <DiscoveryPathSection path={detail.discoveryPath} entranceUrl={window.location.href} /> + + <BlogAssociationSection detail={detail} entranceUrl={window.location.href} /> + </div> + ) : null} + </main> + </div> + ); +} diff --git a/frontend/src/pages/HomePage.tsx b/frontend/src/pages/HomePage.tsx index c38fcc7..51bfd46 100644 --- a/frontend/src/pages/HomePage.tsx +++ b/frontend/src/pages/HomePage.tsx @@ -1,86 +1,66 @@ -import { Loader2, Network, GitBranch, Radar, TimerReset } from "lucide-react"; +import { GitBranch, Loader2, Network, Search } from "lucide-react"; import { useEffect, useRef, useState } from "react"; import { toast } from "sonner"; -import { BlogCard } from "../components/BlogCard"; +import { BlogDetailLink } from "../components/BlogDetailLink"; +import { MissingBlogConfirmDialog } from "../components/MissingBlogConfirmDialog"; import { Navigation } from "../components/Navigation"; -import { SearchBar } from "../components/SearchBar"; -import { fetchBlogsCatalog, fetchStats, fetchStatus } from "../lib/api"; -import type { BlogCatalogPage, StatsData, StatusData } from "../types/graph"; +import { fetchBlogsCatalog, fetchStats, submitUserSeed } from "../lib/api"; +import { resolveBlogIconUrls } from "../lib/icon"; +import type { BlogCatalogItem, StatsData } from "../types/graph"; -const DEFAULT_PAGE_SIZE = 30; const HOME_REFRESH_INTERVAL_MS = 5000; -const HOME_STATUS_ORDER = ["PROCESSING", "WAITING", "FINISHED", "FAILED"] as const; -const HOME_STATUS_FILTERS = ["ALL", ...HOME_STATUS_ORDER] as const; - -type HomeStatusFilter = (typeof HOME_STATUS_FILTERS)[number]; +const HOME_SEARCH_PAGE_SIZE = 30; +const HOME_SEARCH_ENTRANCE_KIND = "home_search_result"; /** - * Load one synthetic "ALL" page by concatenating status buckets in priority order. - * - * Each bucket is read directly from the catalog API and keeps ascending blog-id - * ordering inside the bucket. + * Render the icon used in one homepage search result row. 
* - * @param page Current homepage page number. - * @param pageSize Maximum number of cards per page. - * @param searchQuery Optional fuzzy-search keyword applied to the catalog query. - * @returns One combined catalog page. + * @param props Blog catalog item used for icon resolution. + * @returns Blog icon image or text fallback. */ -async function fetchAllStatusCatalogPage( - page: number, - pageSize: number, - searchQuery: string, -): Promise<BlogCatalogPage> { - const takeCount = page * pageSize; - const responses = await Promise.allSettled( - HOME_STATUS_ORDER.map((status) => - fetchBlogsCatalog({ - page: 1, - pageSize: takeCount, - q: searchQuery || undefined, - sort: "id_asc", - status, - }), - ), - ); - const fulfilledResponses = responses - .filter((response): response is PromiseFulfilledResult<BlogCatalogPage> => response.status === "fulfilled") - .map((response) => response.value); - if (fulfilledResponses.length === 0) { - throw new Error("all_catalog_buckets_failed"); - } +function SearchResultIcon({ blog }: { blog: BlogCatalogItem }) { + const iconUrls = resolveBlogIconUrls(blog); + const [iconIndex, setIconIndex] = useState(0); + const iconUrl = iconUrls[iconIndex]; - const mergedItems = fulfilledResponses.flatMap((response) => response.items); - const offset = (page - 1) * pageSize; - const totalItems = fulfilledResponses.reduce((sum, response) => sum + response.totalItems, 0); - const totalPages = totalItems > 0 ? Math.ceil(totalItems / pageSize) : 0; + useEffect(() => { + setIconIndex(0); + }, [blog.id, blog.iconUrl, blog.url, blog.domain]); - return { - items: mergedItems.slice(offset, offset + pageSize), - page, - pageSize, - totalItems, - totalPages, - hasNext: page < totalPages, - hasPrev: page > 1, - sort: "home_status_priority_asc", - }; + return ( + <div className="flex h-11 w-11 flex-shrink-0 items-center justify-center overflow-hidden rounded-lg bg-slate-100 text-base font-semibold text-slate-500 ring-1 ring-slate-200"> + {iconUrl ? 
( + <img + src={iconUrl} + alt={`${blog.domain} icon`} + className="h-full w-full object-cover" + loading="lazy" + referrerPolicy="no-referrer" + onError={() => setIconIndex((currentIndex) => currentIndex + 1)} + /> + ) : ( + <span>{(blog.domain || "?").slice(0, 1).toUpperCase()}</span> + )} + </div> + ); } /** - * Render the public home page with stats, search, and card-based blog discovery. + * Render the public home page summary without the status-filtered blog catalog. * * @returns Home route UI. */ export function HomePage() { - const [catalog, setCatalog] = useState<BlogCatalogPage | null>(null); const [stats, setStats] = useState<StatsData>({ totalNodes: 0, totalEdges: 0 }); - const [status, setStatus] = useState<StatusData | null>(null); const [isInitialLoading, setIsInitialLoading] = useState(true); const [isRefreshing, setIsRefreshing] = useState(false); + const [searchInput, setSearchInput] = useState(""); + const [searchResults, setSearchResults] = useState<BlogCatalogItem[]>([]); + const [searchTotalItems, setSearchTotalItems] = useState(0); + const [lastSearchQuery, setLastSearchQuery] = useState(""); + const [hasSearched, setHasSearched] = useState(false); const [isSearching, setIsSearching] = useState(false); - const [statusFilter, setStatusFilter] = useState<HomeStatusFilter>("ALL"); - const [currentPage, setCurrentPage] = useState(1); - const [searchQuery, setSearchQuery] = useState(""); + const [missingBlogUrl, setMissingBlogUrl] = useState<string | null>(null); const refreshInFlightRef = useRef(false); const hasLoadedOnceRef = useRef(false); @@ -90,13 +70,10 @@ export function HomePage() { hasLoadedOnceRef.current = true; } void loadHomePage({ - page: currentPage, - searchQuery, - statusFilter, showInitialLoading: isFirstLoad, showRefreshState: !isFirstLoad, }); - }, [currentPage, searchQuery, statusFilter]); + }, []); useEffect(() => { let isDisposed = false; @@ -106,9 +83,6 @@ export function HomePage() { return; } await loadHomePage({ - page: 
currentPage, - searchQuery, - statusFilter, showInitialLoading: false, showRefreshState: true, showErrorToast: false, @@ -122,9 +96,6 @@ export function HomePage() { function handleVisibilityChange() { if (document.visibilityState === "visible") { void loadHomePage({ - page: currentPage, - searchQuery, - statusFilter, showInitialLoading: false, showRefreshState: true, showErrorToast: false, @@ -138,18 +109,15 @@ export function HomePage() { window.clearInterval(intervalId); document.removeEventListener("visibilitychange", handleVisibilityChange); }; - }, [currentPage, searchQuery, statusFilter]); + }, []); /** - * Load the home page summary and one catalog page. + * Load the home page summary metrics. * * @param options Loading behavior flags. * @returns Promise resolved when the homepage state finishes updating. */ async function loadHomePage(options?: { - page?: number; - searchQuery?: string; - statusFilter?: HomeStatusFilter; showInitialLoading?: boolean; showRefreshState?: boolean; showErrorToast?: boolean; @@ -161,9 +129,6 @@ export function HomePage() { const showInitialLoading = options?.showInitialLoading ?? false; const showRefreshState = options?.showRefreshState ?? false; const showErrorToast = options?.showErrorToast ?? true; - const page = options?.page ?? currentPage; - const currentSearchQuery = options?.searchQuery ?? searchQuery; - const selectedStatusFilter = options?.statusFilter ?? statusFilter; refreshInFlightRef.current = true; try { @@ -173,22 +138,8 @@ export function HomePage() { if (showRefreshState) { setIsRefreshing(true); } - const [catalogResponse, statsResponse, statusResponse] = await Promise.all([ - selectedStatusFilter === "ALL" - ? fetchAllStatusCatalogPage(page, DEFAULT_PAGE_SIZE, currentSearchQuery) - : fetchBlogsCatalog({ - page, - pageSize: DEFAULT_PAGE_SIZE, - q: currentSearchQuery || undefined, - sort: selectedStatusFilter === "WAITING" ? 
"id_asc" : "id_desc", - status: selectedStatusFilter, - }), - fetchStats(), - fetchStatus(), - ]); - setCatalog(catalogResponse); + const statsResponse = await fetchStats(); setStats(statsResponse); - setStatus(statusResponse); } catch { if (showErrorToast) { toast.error("首页数据加载失败,请刷新页面重试。"); @@ -197,32 +148,65 @@ export function HomePage() { refreshInFlightRef.current = false; setIsInitialLoading(false); setIsRefreshing(false); - setIsSearching(false); } } /** - * Update the selected homepage status filter and reset pagination to the oldest page. + * Search accepted blogs by URL using the server-side normalized URL fuzzy filter. * - * @param filter Next status filter selected by the user. + * @param event Search form submit event. + * @returns Promise resolved after results are rendered. */ - function handleStatusFilterChange(filter: HomeStatusFilter) { - setStatusFilter(filter); - setCurrentPage(1); + async function handleSearchSubmit(event: React.FormEvent<HTMLFormElement>) { + event.preventDefault(); + const query = searchInput.trim(); + if (!query) { + setHasSearched(false); + setLastSearchQuery(""); + setSearchResults([]); + setSearchTotalItems(0); + setMissingBlogUrl(null); + return; + } + + setIsSearching(true); + try { + const page = await fetchBlogsCatalog({ + page: 1, + pageSize: HOME_SEARCH_PAGE_SIZE, + url: query, + sort: "id_desc", + }); + setSearchResults(page.items); + setSearchTotalItems(page.totalItems); + setLastSearchQuery(query); + setHasSearched(true); + setMissingBlogUrl(page.items.length === 0 ? query : null); + } catch { + toast.error("博客搜索失败,请稍后重试。"); + } finally { + setIsSearching(false); + } } /** - * Apply one fuzzy-search keyword to the homepage catalog. + * Submit a user-confirmed missing blog URL as an accepted crawler seed. * - * @param query Search keyword entered by the user. + * @param url Complete blog URL typed by the user. + * @returns Promise resolved after the submission is persisted. 
*/ - function handleSearch(query: string) { - setIsSearching(true); - setSearchQuery(query); - setCurrentPage(1); + async function handleSubmitMissingBlog(url: string) { + try { + await submitUserSeed({ url }); + toast.success("已加入博客网络,等待爬虫抓取友链。"); + setMissingBlogUrl(null); + } catch (error) { + const message = error instanceof Error ? error.message : "提交失败:未知错误"; + toast.error(message); + } } - if (isInitialLoading || !catalog) { + if (isInitialLoading) { return ( <div className="flex min-h-screen items-center justify-center bg-white"> <div className="flex flex-col items-center gap-4"> @@ -236,6 +220,13 @@ export function HomePage() { return ( <div className="min-h-screen overflow-x-hidden bg-white"> <Navigation /> + {missingBlogUrl ? ( + <MissingBlogConfirmDialog + url={missingBlogUrl} + onCancel={() => setMissingBlogUrl(null)} + onSubmit={handleSubmitMissingBlog} + /> + ) : null} <main className="mx-auto max-w-7xl px-6 pb-16 pt-24 sm:px-8"> <section className="mb-14"> @@ -245,12 +236,71 @@ export function HomePage() { <p className="mt-5 max-w-3xl text-lg leading-8 text-slate-600"> 基于友链爬取所有博客! </p> - <div className="mt-8"> - <SearchBar onSearch={handleSearch} isLoading={isSearching} /> - </div> </section> - <section className="mb-14 grid grid-cols-1 gap-5 md:grid-cols-2 xl:grid-cols-4"> + <section className="mx-auto mb-14 w-full max-w-4xl"> + <form onSubmit={handleSearchSubmit} className="relative"> + <label htmlFor="home-blog-url-search" className="sr-only"> + 搜索博客链接 + </label> + <input + id="home-blog-url-search" + type="text" + value={searchInput} + onChange={(event) => setSearchInput(event.target.value)} + placeholder="输入你的博客链接,看看你的博客有没有被找到吧!" 
+ disabled={isSearching} + className="w-full rounded-lg border border-slate-300 bg-white px-5 py-4 pr-14 text-base text-slate-950 shadow-sm outline-none transition-colors placeholder:text-slate-400 focus:border-sky-500 focus:ring-2 focus:ring-sky-100 disabled:cursor-not-allowed disabled:bg-slate-50" + /> + <button + type="submit" + aria-label="搜索博客" + disabled={isSearching || !searchInput.trim()} + className="absolute right-2 top-1/2 inline-flex h-10 w-10 -translate-y-1/2 items-center justify-center rounded-md bg-sky-500 text-white transition-colors hover:bg-sky-600 disabled:cursor-not-allowed disabled:bg-slate-300" + > + {isSearching ? <Loader2 className="h-5 w-5 animate-spin" /> : <Search className="h-5 w-5" />} + </button> + </form> + + {hasSearched ? ( + <div className="mt-4 rounded-lg border border-slate-200 bg-white shadow-sm"> + <div className="flex items-center justify-between border-b border-slate-100 px-4 py-3 text-sm text-slate-500"> + <span>搜索结果</span> + <span>{searchTotalItems} 个匹配</span> + </div> + {searchResults.length > 0 ? 
( + <div className="max-h-80 overflow-y-auto"> + {searchResults.map((blog) => ( + <BlogDetailLink + key={blog.id} + blog={blog} + entranceKind={HOME_SEARCH_ENTRANCE_KIND} + entranceUrl={window.location.href} + className="block w-full border-b border-slate-100 px-4 py-4 text-left transition-colors last:border-b-0 hover:bg-sky-50 focus:bg-sky-50 focus:outline-none" + > + <div className="flex items-start justify-between gap-4"> + <div className="flex min-w-0 items-center gap-3"> + <SearchResultIcon blog={blog} /> + <div className="min-w-0"> + <div className="truncate text-base text-slate-950">{blog.title || blog.domain}</div> + <div className="mt-1 truncate text-sm text-slate-500">{blog.normalizedUrl}</div> + </div> + </div> + <span className="flex-shrink-0 rounded-md bg-slate-100 px-2 py-1 text-xs text-slate-500"> + {blog.crawlStatus} + </span> + </div> + </BlogDetailLink> + ))} + </div> + ) : ( + <div className="px-4 py-6 text-sm text-slate-500">未找到与 {lastSearchQuery} 匹配的博客。</div> + )} + </div> + ) : null} + </section> + + <section className="mb-14 grid grid-cols-1 gap-5 md:grid-cols-2"> <div className="rounded-[28px] border border-sky-200 bg-white/95 p-6 shadow-[0_18px_40px_rgba(15,23,42,0.08)]"> <div className="mb-4 flex h-12 w-12 items-center justify-center rounded-2xl bg-sky-500 text-white"> <Network className="h-6 w-6" /> @@ -265,84 +315,15 @@ export function HomePage() { <div className="text-sm text-slate-500">总连接数</div> <div className="mt-2 text-4xl text-slate-950">{stats.totalEdges}</div> </div> - <div className="rounded-[28px] border border-violet-200 bg-white/95 p-6 shadow-[0_18px_40px_rgba(15,23,42,0.08)]"> - <div className="mb-4 flex h-12 w-12 items-center justify-center rounded-2xl bg-violet-500 text-white"> - <Radar className="h-6 w-6" /> - </div> - <div className="text-sm text-slate-500">待处理队列</div> - <div className="mt-2 text-4xl text-slate-950">{status?.pendingTasks ?? 
0}</div> - </div> - <div className="rounded-[28px] border border-amber-200 bg-white/95 p-6 shadow-[0_18px_40px_rgba(15,23,42,0.08)]"> - <div className="mb-4 flex h-12 w-12 items-center justify-center rounded-2xl bg-amber-500 text-white"> - <TimerReset className="h-6 w-6" /> - </div> - <div className="text-sm text-slate-500">处理中 / 失败</div> - <div className="mt-2 text-4xl text-slate-950"> - {(status?.processingTasks ?? 0) + (status?.failedTasks ?? 0)} - </div> - </div> </section> - <section className="mb-8 flex flex-col gap-4 md:flex-row md:items-center md:justify-between"> - <div className="flex flex-wrap gap-3"> - {HOME_STATUS_FILTERS.map((filter) => { - const isActive = statusFilter === filter; - return ( - <button - key={filter} - type="button" - onClick={() => handleStatusFilterChange(filter)} - className={`rounded-full border px-4 py-2 text-sm transition-colors ${ - isActive - ? "border-sky-500 bg-sky-500 text-white" - : "border-slate-200 bg-white text-slate-600 hover:border-sky-300 hover:text-sky-600" - }`} - > - {filter} - </button> - ); - })} - </div> - <div className="flex items-center gap-3 text-sm text-slate-500"> - <span> - 当前显示第 {catalog.page} / {Math.max(catalog.totalPages, 1)} 页,本页 {catalog.items.length} 个,共 {catalog.totalItems} 个博客 + <section className="flex items-center justify-end text-sm text-slate-500"> + {isRefreshing ? ( + <span className="inline-flex items-center gap-2 text-sky-600"> + <Loader2 className="h-4 w-4 animate-spin" /> + 正在刷新 </span> - {searchQuery ? <span>搜索词: {searchQuery}</span> : null} - {isRefreshing ? 
( - <span className="inline-flex items-center gap-2 text-sky-600"> - <Loader2 className="h-4 w-4 animate-spin" /> - 正在刷新 - </span> - ) : null} - </div> - </section> - - <section className="grid grid-cols-1 gap-6 md:grid-cols-2 xl:grid-cols-3"> - {catalog.items.map((blog) => ( - <BlogCard key={blog.id} blog={blog} /> - ))} - </section> - - <section className="mt-10 flex items-center justify-between gap-4"> - <button - type="button" - onClick={() => setCurrentPage((page) => Math.max(1, page - 1))} - disabled={!catalog.hasPrev} - className="rounded-full border border-slate-200 px-5 py-2 text-sm text-slate-600 transition-colors hover:border-sky-300 hover:text-sky-600 disabled:cursor-not-allowed disabled:border-slate-100 disabled:text-slate-300" - > - 上一页 - </button> - <div className="text-sm text-slate-500"> - 每页最多 {DEFAULT_PAGE_SIZE} 个 - </div> - <button - type="button" - onClick={() => setCurrentPage((page) => page + 1)} - disabled={!catalog.hasNext} - className="rounded-full border border-slate-200 px-5 py-2 text-sm text-slate-600 transition-colors hover:border-sky-300 hover:text-sky-600 disabled:cursor-not-allowed disabled:border-slate-100 disabled:text-slate-300" - > - 下一页 - </button> + ) : null} </section> </main> </div> diff --git a/frontend/src/pages/ProfilePage.tsx b/frontend/src/pages/ProfilePage.tsx index 7fc6adc..d5468cd 100644 --- a/frontend/src/pages/ProfilePage.tsx +++ b/frontend/src/pages/ProfilePage.tsx @@ -1,13 +1,18 @@ import { Loader2, LogOut, UserCircle } from "lucide-react"; import { FormEvent, useEffect, useState } from "react"; +import { useSearchParams } from "react-router-dom"; import { toast } from "sonner"; import { Navigation } from "../components/Navigation"; import { + confirmEmailVerification, fetchCurrentUser, fetchMyLabelStats, loginUser, logoutUser, registerUser, + requestEmailVerification, + requestPasswordReset, + resetPassword, } from "../lib/api"; import { clearStoredAuthSession, @@ -17,7 +22,7 @@ import { } from "../lib/auth"; 
import type { AuthSession, UserProfile } from "../types/graph"; -type AuthMode = "login" | "register"; +type AuthMode = "login" | "register" | "forgot" | "reset"; /** * Render the user auth and profile page. @@ -26,22 +31,42 @@ type AuthMode = "login" | "register"; * user profile with a concise random-blog label total. */ export function ProfilePage() { + const [searchParams] = useSearchParams(); const [authMode, setAuthMode] = useState<AuthMode>("login"); const [email, setEmail] = useState(""); const [password, setPassword] = useState(""); + const [tokenInput, setTokenInput] = useState(""); + const [lastLifecycleToken, setLastLifecycleToken] = useState<string | null>(null); const [session, setSession] = useState<AuthSession | null>(() => readStoredAuthSession()); const [user, setUser] = useState<UserProfile | null>(() => session?.user ?? null); const [labelCount, setLabelCount] = useState(0); const [isSubmitting, setIsSubmitting] = useState(false); const [isLoadingProfile, setIsLoadingProfile] = useState(Boolean(session)); + useEffect(() => { + const verifyToken = searchParams.get("verify_token")?.trim(); + const resetToken = searchParams.get("reset_token")?.trim(); + if (verifyToken) { + void handleVerifyEmail(verifyToken); + return; + } + if (resetToken) { + setTokenInput(resetToken); + setAuthMode("reset"); + } + }, [searchParams]); + useEffect(() => { if (!session?.token) { setIsLoadingProfile(false); return; } + if (searchParams.get("verify_token")?.trim()) { + setIsLoadingProfile(false); + return; + } void loadProfile(session.token); - }, [session?.token]); + }, [searchParams, session?.token]); async function loadProfile(token: string) { try { @@ -65,23 +90,123 @@ export function ProfilePage() { async function handleSubmit(event: FormEvent<HTMLFormElement>) { event.preventDefault(); + if (authMode === "forgot") { + await handleForgotPassword(); + return; + } + if (authMode === "reset") { + await handleResetPassword(); + return; + } if (!email.trim() || 
!password) { toast.error("请输入邮箱和密码。"); return; } try { setIsSubmitting(true); - const nextSession = - authMode === "register" - ? await registerUser({ email, password }) - : await loginUser({ email, password }); + if (authMode === "register") { + const payload = await registerUser({ email, password }); + setLastLifecycleToken(payload.verificationToken ?? null); + setPassword(""); + setAuthMode("login"); + toast.success("验证邮件已发送,请验证邮箱后登录。"); + return; + } + const nextSession = await loginUser({ email, password }); storeAuthSession(nextSession); setSession(nextSession); setUser(nextSession.user); setPassword(""); - toast.success(authMode === "register" ? "注册成功,已登录。" : "登录成功。"); + toast.success("登录成功。"); + } catch { + toast.error(authMode === "register" ? "注册失败,请检查邮箱、密码或待验证状态。" : "登录失败,请检查账号密码。"); + } finally { + setIsSubmitting(false); + } + } + + async function handleVerifyEmail(tokenOverride?: string) { + const token = tokenOverride?.trim(); + if (!token) { + toast.error("验证链接缺少 Token。"); + return; + } + try { + setIsSubmitting(true); + const profile = await confirmEmailVerification(token); + if (session?.token) { + setUser(profile); + updateStoredUser(profile); + } else { + clearStoredAuthSession(); + setSession(null); + setUser(null); + } + setAuthMode("login"); + setTokenInput(""); + setLastLifecycleToken(null); + toast.success(session?.token ? "邮箱验证成功。" : "邮箱验证成功,请登录。"); + } catch { + toast.error("邮箱验证失败,请重新发送验证邮件。"); + } finally { + setIsSubmitting(false); + } + } + + async function handleForgotPassword() { + if (!email.trim()) { + toast.error("请输入邮箱。"); + return; + } + try { + setIsSubmitting(true); + const payload = await requestPasswordReset(email); + setLastLifecycleToken(payload.resetToken ?? 
null); + setAuthMode("reset"); + toast.success("重置请求已提交。"); + } catch { + toast.error("密码重置请求失败。"); + } finally { + setIsSubmitting(false); + } + } + + async function handleResetPassword() { + const token = tokenInput.trim() || lastLifecycleToken?.trim(); + if (!token || !password) { + toast.error("请输入重置 Token 和新密码。"); + return; + } + try { + setIsSubmitting(true); + await resetPassword({ token, password }); + clearStoredAuthSession(); + setSession(null); + setUser(null); + setPassword(""); + setTokenInput(""); + setLastLifecycleToken(null); + setAuthMode("login"); + toast.success("密码已重置,请重新登录。"); } catch { - toast.error(authMode === "register" ? "注册失败,请检查邮箱或密码。" : "登录失败,请检查账号密码。"); + toast.error("密码重置失败,请检查 Token 或密码长度。"); + } finally { + setIsSubmitting(false); + } + } + + async function handleResendVerification() { + const targetEmail = user?.email ?? email; + if (!targetEmail.trim()) { + toast.error("请输入邮箱。"); + return; + } + try { + setIsSubmitting(true); + const payload = await requestEmailVerification(targetEmail); + toast.success(payload.alreadyVerified ? "邮箱已经验证。" : "验证邮件已发送。"); + } catch { + toast.error("验证请求失败。"); } finally { setIsSubmitting(false); } @@ -112,40 +237,68 @@ export function ProfilePage() { <UserCircle className="h-8 w-8 text-sky-600" /> <div> <h1 className="text-2xl font-semibold text-slate-950"> - {authMode === "register" ? "注册账号" : "登录账号"} + {authMode === "register" + ? "注册账号" + : authMode === "forgot" + ? "找回密码" + : authMode === "reset" + ? 
"重置密码" + : "登录账号"} </h1> - <p className="mt-1 text-sm text-slate-500">邮箱和密码会用于保存你的博客标注记录。</p> + <p className="mt-1 text-sm text-slate-500">邮箱账号会用于保存你的博客标注记录。</p> </div> </div> <form className="space-y-4" onSubmit={(event) => void handleSubmit(event)}> - <label className="block text-sm font-medium text-slate-700"> - 邮箱 - <input - type="email" - value={email} - onChange={(event) => setEmail(event.target.value)} - className="mt-2 w-full rounded-md border border-slate-300 px-3 py-2 text-slate-900 outline-none transition focus:border-sky-500 focus:ring-2 focus:ring-sky-100" - autoComplete="email" - /> - </label> - <label className="block text-sm font-medium text-slate-700"> - 密码 - <input - type="password" - value={password} - onChange={(event) => setPassword(event.target.value)} - className="mt-2 w-full rounded-md border border-slate-300 px-3 py-2 text-slate-900 outline-none transition focus:border-sky-500 focus:ring-2 focus:ring-sky-100" - autoComplete={authMode === "register" ? "new-password" : "current-password"} - /> - </label> + {authMode !== "reset" ? ( + <label className="block text-sm font-medium text-slate-700"> + 邮箱 + <input + type="email" + value={email} + onChange={(event) => setEmail(event.target.value)} + className="mt-2 w-full rounded-md border border-slate-300 px-3 py-2 text-slate-900 outline-none transition focus:border-sky-500 focus:ring-2 focus:ring-sky-100" + autoComplete="email" + /> + </label> + ) : null} + {authMode === "reset" ? ( + <label className="block text-sm font-medium text-slate-700"> + Token + <input + type="text" + value={tokenInput || lastLifecycleToken || ""} + onChange={(event) => setTokenInput(event.target.value)} + className="mt-2 w-full rounded-md border border-slate-300 px-3 py-2 text-slate-900 outline-none transition focus:border-sky-500 focus:ring-2 focus:ring-sky-100" + autoComplete="one-time-code" + /> + </label> + ) : null} + {authMode !== "forgot" ? 
( + <label className="block text-sm font-medium text-slate-700"> + 密码 + <input + type="password" + value={password} + onChange={(event) => setPassword(event.target.value)} + className="mt-2 w-full rounded-md border border-slate-300 px-3 py-2 text-slate-900 outline-none transition focus:border-sky-500 focus:ring-2 focus:ring-sky-100" + autoComplete={authMode === "register" || authMode === "reset" ? "new-password" : "current-password"} + /> + </label> + ) : null} <button type="submit" disabled={isSubmitting} className="inline-flex w-full items-center justify-center gap-2 rounded-md bg-slate-950 px-4 py-2.5 text-sm font-medium text-white transition hover:bg-slate-800 disabled:cursor-not-allowed disabled:bg-slate-400" > {isSubmitting ? <Loader2 className="h-4 w-4 animate-spin" /> : null} - {authMode === "register" ? "注册并登录" : "登录"} + {authMode === "register" + ? "注册并发送验证邮件" + : authMode === "forgot" + ? "发送重置 Token" + : authMode === "reset" + ? "重置密码" + : "登录"} </button> </form> @@ -156,6 +309,11 @@ export function ProfilePage() { > {authMode === "register" ? "已有账号,去登录" : "没有账号,注册一个"} </button> + <div className="mt-3 flex justify-center gap-4 text-sm"> + <button type="button" onClick={() => setAuthMode("forgot")} className="text-slate-600 hover:text-slate-950"> + 忘记密码 + </button> + </div> </section> ) : ( <section className="space-y-6"> @@ -165,31 +323,55 @@ export function ProfilePage() { <p className="text-sm text-slate-500">当前账号</p> <h1 className="mt-1 text-2xl font-semibold text-slate-950">{user.email}</h1> <p className="mt-2 text-sm text-slate-500">显示名:{user.displayName || user.email}</p> + <p className="mt-1 text-sm text-slate-500"> + 身份:{user.role === "admin" ? "Admin" : "普通用户"} · 邮箱: + {user.emailVerified ? "已验证" : "未验证"} + </p> + </div> + <div className="flex flex-wrap gap-2"> + {!user.emailVerified ? 
( + <button + type="button" + onClick={() => void handleResendVerification()} + className="inline-flex items-center justify-center rounded-md border border-slate-300 bg-white px-4 py-2 text-sm text-slate-700 transition hover:border-sky-300 hover:text-sky-700" + > + 重新发送验证邮件 + </button> + ) : null} + <button + type="button" + onClick={() => void handleLogout()} + className="inline-flex items-center justify-center gap-2 rounded-md border border-slate-300 bg-white px-4 py-2 text-sm text-slate-700 transition hover:border-rose-300 hover:text-rose-700" + > + <LogOut className="h-4 w-4" /> + 退出登录 + </button> </div> - <button - type="button" - onClick={() => void handleLogout()} - className="inline-flex items-center justify-center gap-2 rounded-md border border-slate-300 bg-white px-4 py-2 text-sm text-slate-700 transition hover:border-rose-300 hover:text-rose-700" - > - <LogOut className="h-4 w-4" /> - 退出登录 - </button> </div> - </div> - - <div className="rounded-lg border border-slate-200 bg-white p-6 shadow-sm"> - <h2 className="text-lg font-semibold text-slate-950">数据标注</h2> - {isLoadingProfile ? ( - <div className="mt-6 flex items-center gap-2 text-sm text-slate-500"> - <Loader2 className="h-4 w-4 animate-spin" /> - 正在加载个人数据... + {!user.emailVerified ? ( + <div className="mt-6 border-t border-slate-100 pt-6"> + <p className="text-sm text-slate-600"> + 验证邮件已发送,请打开邮箱并点击邮件中的验证链接。 + </p> </div> - ) : ( - <p className="mt-4 text-sm text-slate-500"> - 当前总共标注了 <span className="font-semibold text-slate-950">{labelCount}</span> 次。 - </p> - )} + ) : null} </div> + + {user.emailVerified ? ( + <div className="rounded-lg border border-slate-200 bg-white p-6 shadow-sm"> + <h2 className="text-lg font-semibold text-slate-950">数据标注</h2> + {isLoadingProfile ? ( + <div className="mt-6 flex items-center gap-2 text-sm text-slate-500"> + <Loader2 className="h-4 w-4 animate-spin" /> + 正在加载个人数据... 
+ </div> + ) : ( + <p className="mt-4 text-sm text-slate-500"> + 当前总共标注了 <span className="font-semibold text-slate-950">{labelCount}</span> 次。 + </p> + )} + </div> + ) : null} </section> )} </main> diff --git a/frontend/src/pages/RandomBlogPage.tsx b/frontend/src/pages/RandomBlogPage.tsx index 0dd7a20..40a0567 100644 --- a/frontend/src/pages/RandomBlogPage.tsx +++ b/frontend/src/pages/RandomBlogPage.tsx @@ -1,13 +1,21 @@ -import { Loader2, RefreshCw } from "lucide-react"; +import { Eye, Loader2, RefreshCw } from "lucide-react"; import { useEffect, useState } from "react"; import { toast } from "sonner"; import { BlogCard } from "../components/BlogCard"; +import { BlogDetailLink } from "../components/BlogDetailLink"; import { Navigation } from "../components/Navigation"; import { readStoredAuthSession } from "../lib/auth"; -import { fetchBlogsCatalog, postBlogUserLabel } from "../lib/api"; +import { fetchRandomBlogBatch, postBlogUserLabel } from "../lib/api"; +import { + blogInteractionTarget, + getBlogInteractionSessionId, + getBlogInteractionVisitorId, + recordBlogInteraction, +} from "../lib/blogInteractions"; import type { BlogCatalogItem } from "../types/graph"; const RANDOM_BLOG_COUNT = 9; +const RANDOM_PAGE_ENTRANCE_KIND = "random_blog_page"; const RANDOM_LABELS = [ { slug: "blog", label: "博客" }, { slug: "company", label: "公司" }, @@ -50,11 +58,15 @@ export function RandomBlogPage() { } else { setIsRefreshing(true); } - const response = await fetchBlogsCatalog({ - page: 1, - pageSize: RANDOM_BLOG_COUNT, - status: "FINISHED", - sort: "random", + const session = readStoredAuthSession(); + const response = await fetchRandomBlogBatch({ + count: RANDOM_BLOG_COUNT, + visitorId: getBlogInteractionVisitorId(), + sessionId: getBlogInteractionSessionId(), + source: "random_page", + pageUrl: window.location.href, + context: { refresh_kind: showInitialLoading ? 
"initial" : "manual" }, + token: session?.token, }); setBlogs(response.items); } catch { @@ -88,6 +100,15 @@ export function RandomBlogPage() { ...current, [blog.normalizedUrl]: label, })); + recordBlogInteraction( + blogInteractionTarget(blog), + "label_select", + { + entranceKind: RANDOM_PAGE_ENTRANCE_KIND, + entranceUrl: window.location.href, + }, + { label, previous_label: selectedLabel ?? null }, + ); toast.success("已记录,谢谢标注。"); } catch { toast.error("标注保存失败,请稍后再试。"); @@ -135,7 +156,22 @@ export function RandomBlogPage() { <section className="mx-auto grid max-w-6xl grid-cols-1 gap-6 md:grid-cols-2 xl:grid-cols-3"> {blogs.map((blog) => ( - <BlogCard key={blog.id} blog={blog}> + <BlogCard + key={blog.id} + blog={blog} + externalEntranceKind={RANDOM_PAGE_ENTRANCE_KIND} + externalEntranceUrl={window.location.href} + > + <BlogDetailLink + blog={blog} + entranceKind={RANDOM_PAGE_ENTRANCE_KIND} + entranceUrl={window.location.href} + openInNewTab + className="mb-3 inline-flex h-10 w-full items-center justify-center gap-2 rounded-md border border-slate-200 bg-slate-950 px-3 text-sm text-white transition-colors hover:bg-slate-800" + > + <Eye className="h-4 w-4" /> + 查看详情 + </BlogDetailLink> <div className="grid grid-cols-4 gap-2"> {RANDOM_LABELS.map((label) => { const isSaving = savingLabelKey === `${blog.id}:${label.slug}`; diff --git a/frontend/src/pages/VisualizationPage.tsx b/frontend/src/pages/VisualizationPage.tsx index 167c0b4..c3b66d6 100644 --- a/frontend/src/pages/VisualizationPage.tsx +++ b/frontend/src/pages/VisualizationPage.tsx @@ -1,50 +1,60 @@ import { Loader2 } from "lucide-react"; -import { useEffect, useState } from "react"; -import { useSearchParams } from "react-router-dom"; +import { useEffect, useMemo, useState } from "react"; +import { useLocation, useSearchParams } from "react-router-dom"; import { toast } from "sonner"; import { BlogDetailPanel } from "../components/BlogDetailPanel"; import { GraphVisualization } from 
"../components/GraphVisualization"; import { Navigation } from "../components/Navigation"; -import { fetchBlogDetail, fetchGraphData, fetchSubgraph } from "../lib/api"; +import { fetchBenchmarkGraphData } from "../lib/benchmarkGraph"; +import { fetchBlogDetail, fetchGraphData, fetchStats, fetchSubgraph } from "../lib/api"; import type { BlogDetail, GraphData, GraphNode } from "../types/graph"; -const GRAPH_LIMIT_OPTIONS = [200, 500, 1000, 10000] as const; -const GRAPH_SAMPLE_SEED = 42; -const GRAPH_CACHE_VERSION = "3d-v1"; +const DEFAULT_GRAPH_LIMIT = 200; +const ESTIMATED_RENDER_TICKS_PER_SECOND = 60; +type GraphDisplayMode = "compact" | "full"; -type GraphLimit = (typeof GRAPH_LIMIT_OPTIONS)[number]; - -function graphCacheKey(limit: GraphLimit): string { - return `heyblog:visualization:${GRAPH_CACHE_VERSION}:seed-${GRAPH_SAMPLE_SEED}:limit-${limit}`; +/** + * Format a force-layout tick estimate as an approximate render duration. + * + * @param ticks Estimated force-layout tick count. + * @returns Human-readable duration label. + */ +function formatEstimatedRenderTime(ticks: number): string { + const seconds = Math.max(1, Math.ceil(ticks / ESTIMATED_RENDER_TICKS_PER_SECOND)); + return `约 ${seconds} 秒`; } -function readCachedGraph(limit: GraphLimit): GraphData | null { - try { - const raw = window.localStorage.getItem(graphCacheKey(limit)); - if (!raw) { - return null; - } - const parsed = JSON.parse(raw) as GraphData; - if (Array.isArray(parsed.nodes) && Array.isArray(parsed.edges)) { - return parsed; +/** + * Keep only graph nodes connected to at least two distinct other nodes. + * + * @param graph Raw graph returned by the backend. + * @returns Compact graph with filtered nodes and only edges between kept nodes. 
+ */ +export function compactGraphData(graph: GraphData): GraphData { + const neighborIdsByNodeId = new Map<number, Set<number>>(); + for (const node of graph.nodes) { + neighborIdsByNodeId.set(node.id, new Set()); + } + + for (const edge of graph.edges) { + if (!neighborIdsByNodeId.has(edge.source) || !neighborIdsByNodeId.has(edge.target) || edge.source === edge.target) { + continue; } - } catch { - window.localStorage.removeItem(graphCacheKey(limit)); + neighborIdsByNodeId.get(edge.source)?.add(edge.target); + neighborIdsByNodeId.get(edge.target)?.add(edge.source); } - return null; -} -function graphPayloadSizeMb(data: GraphData): string { - const bytes = new TextEncoder().encode(JSON.stringify(data)).length; - return (bytes / (1024 * 1024)).toFixed(2); -} + const keptNodeIds = new Set( + Array.from(neighborIdsByNodeId.entries()) + .filter(([, neighborIds]) => neighborIds.size >= 2) + .map(([nodeId]) => nodeId), + ); -function writeCachedGraph(limit: GraphLimit, data: GraphData): void { - try { - window.localStorage.setItem(graphCacheKey(limit), JSON.stringify(data)); - } catch { - // Browsers can reject large localStorage writes; graph rendering should still continue. - } + return { + ...graph, + nodes: graph.nodes.filter((node) => keptNodeIds.has(node.id)), + edges: graph.edges.filter((edge) => keptNodeIds.has(edge.source) && keptNodeIds.has(edge.target)), + }; } /** @@ -53,14 +63,42 @@ function writeCachedGraph(limit: GraphLimit, data: GraphData): void { * @returns Visualization page UI. 
*/ export function VisualizationPage() { + const location = useLocation(); const [searchParams] = useSearchParams(); + const isBenchmarkMode = location.pathname.endsWith("/benchmark") || searchParams.get("benchmark") === "community"; const [graphData, setGraphData] = useState<GraphData>({ nodes: [], edges: [] }); const [blogDetail, setBlogDetail] = useState<BlogDetail | null>(null); const [isLoading, setIsLoading] = useState(false); - const [selectedLimit, setSelectedLimit] = useState<GraphLimit | null>(null); - const [graphSizeMb, setGraphSizeMb] = useState<string | null>(null); - const [usedCachedGraph, setUsedCachedGraph] = useState(false); + const [isStatsLoading, setIsStatsLoading] = useState(true); + const [isRendering, setIsRendering] = useState(false); + const [renderProgress, setRenderProgress] = useState(0); + const [estimatedRenderTicks, setEstimatedRenderTicks] = useState<number | null>(null); + const [maxGraphLimit, setMaxGraphLimit] = useState(0); + const [pendingLimit, setPendingLimit] = useState(DEFAULT_GRAPH_LIMIT); + const [selectedLimit, setSelectedLimit] = useState<number | null>(null); + const [graphDisplayMode, setGraphDisplayMode] = useState<GraphDisplayMode>("compact"); const [highlightNodeId, setHighlightNodeId] = useState<number | undefined>(); + const visibleGraphData = useMemo( + () => (graphDisplayMode === "compact" ? compactGraphData(graphData) : graphData), + [graphData, graphDisplayMode], + ); + const shouldShowProgressOverlay = isLoading || isRendering; + const progressPercent = useMemo(() => { + const loadingFloor = isLoading ? 0.08 : 0; + return Math.round(Math.max(loadingFloor, renderProgress) * 100); + }, [isLoading, renderProgress]); + const estimatedRenderTime = useMemo( + () => (estimatedRenderTicks ? 
formatEstimatedRenderTime(estimatedRenderTicks) : null), + [estimatedRenderTicks], + ); + + useEffect(() => { + if (isBenchmarkMode) { + void loadBenchmarkGraph(); + return; + } + void loadGraphLimitBounds(); + }, [isBenchmarkMode]); useEffect(() => { const highlight = searchParams.get("highlight"); @@ -74,34 +112,86 @@ export function VisualizationPage() { void openBlog(blogId, { loadNeighborhood: true }); }, [searchParams]); + /** + * Load the current graph-size slider range from public stats. + * + * @returns Promise resolved after slider bounds update. + */ + async function loadGraphLimitBounds() { + try { + setIsStatsLoading(true); + const stats = await fetchStats(); + const totalBlogs = Math.max(0, stats.totalNodes); + setMaxGraphLimit(totalBlogs); + setPendingLimit(Math.min(DEFAULT_GRAPH_LIMIT, totalBlogs)); + } catch { + toast.error("图谱规模加载失败,请刷新页面重试。"); + setMaxGraphLimit(DEFAULT_GRAPH_LIMIT); + setPendingLimit(DEFAULT_GRAPH_LIMIT); + } finally { + setIsStatsLoading(false); + } + } + + /** + * Load the deterministic clustered graph benchmark from static frontend assets. + * + * @returns Promise resolved after benchmark graph state updates. + */ + async function loadBenchmarkGraph() { + setGraphDisplayMode("full"); + setSelectedLimit(100); + setPendingLimit(100); + setMaxGraphLimit(100); + setBlogDetail(null); + setHighlightNodeId(undefined); + setIsRendering(false); + setRenderProgress(0); + setEstimatedRenderTicks(null); + + try { + setIsStatsLoading(false); + setIsLoading(true); + const benchmarkGraph = await fetchBenchmarkGraphData(); + setRenderProgress(0.12); + setIsRendering(true); + setGraphData(benchmarkGraph); + } catch { + setSelectedLimit(null); + setIsRendering(false); + setRenderProgress(0); + setEstimatedRenderTicks(null); + toast.error("Benchmark 图谱加载失败,请先运行生成脚本。"); + } finally { + setIsLoading(false); + } + } + /** * Load the selected graph size using deterministic backend sampling. * * @param limit Requested node count. 
* @returns Promise resolved after graph state updates. */ - async function loadFullGraph(limit: GraphLimit) { + async function loadFullGraph(limit: number) { setSelectedLimit(limit); setBlogDetail(null); setHighlightNodeId(undefined); - - const cachedGraph = readCachedGraph(limit); - if (cachedGraph) { - setGraphData(cachedGraph); - setGraphSizeMb(graphPayloadSizeMb(cachedGraph)); - setUsedCachedGraph(true); - return; - } + setIsRendering(false); + setRenderProgress(0); + setEstimatedRenderTicks(null); try { - setUsedCachedGraph(false); setIsLoading(true); - const graphResponse = await fetchGraphData(limit, { sampleMode: "count", sampleSeed: GRAPH_SAMPLE_SEED }); + const graphResponse = await fetchGraphData(limit); + setRenderProgress(0.12); + setIsRendering(true); setGraphData(graphResponse); - setGraphSizeMb(graphPayloadSizeMb(graphResponse)); - writeCachedGraph(limit, graphResponse); } catch { setSelectedLimit(null); + setIsRendering(false); + setRenderProgress(0); + setEstimatedRenderTicks(null); toast.error("图谱加载失败,请刷新页面重试。"); } finally { setIsLoading(false); @@ -116,6 +206,42 @@ export function VisualizationPage() { * @returns Promise resolved after all requested data is loaded. */ async function openBlog(blogId: number, options: { loadNeighborhood: boolean }) { + if (isBenchmarkMode) { + const node = visibleGraphData.nodes.find((item) => item.id === blogId); + if (!node) { + return; + } + setBlogDetail({ + ...node, + crawlStatus: "FINISHED", + crawlErrorKind: null, + incomingLinks: node.incomingCount ?? 0, + outgoingLinks: node.outgoingCount ?? 
0, + relatedNodes: [], + outgoingNodes: [], + recommendedBlogs: [], + discoveryPath: null, + relationGraphs: { + incoming: { + direction: "incoming", + focusBlogId: node.id, + depth: 2, + nodes: [node], + edges: [], + }, + outgoing: { + direction: "outgoing", + focusBlogId: node.id, + depth: 2, + nodes: [node], + edges: [], + }, + }, + }); + setHighlightNodeId(blogId); + return; + } + try { const detail = await fetchBlogDetail(blogId); setBlogDetail(detail); @@ -156,11 +282,22 @@ export function VisualizationPage() { </div> <div className="relative min-h-0 flex-1"> - <GraphVisualization data={graphData} onNodeClick={handleNodeClick} highlightNodeId={highlightNodeId} /> + <GraphVisualization + data={visibleGraphData} + onNodeClick={handleNodeClick} + highlightNodeId={highlightNodeId} + useNodeIcons={!isBenchmarkMode} + onRenderProgress={(progress) => setRenderProgress((current) => Math.max(current, progress))} + onRenderTickEstimate={setEstimatedRenderTicks} + onRenderComplete={() => { + setRenderProgress(1); + setIsRendering(false); + }} + /> {blogDetail ? <BlogDetailPanel detail={blogDetail} onClose={handleCloseDetail} /> : null} </div> - {!selectedLimit || isLoading ? ( + {!selectedLimit || shouldShowProgressOverlay ? 
( <div className="fixed inset-0 z-50 flex items-center justify-center bg-slate-950/70 px-4 backdrop-blur-sm"> <div role="dialog" @@ -168,28 +305,99 @@ export function VisualizationPage() { aria-labelledby="visualization-limit-title" className="w-full max-w-md rounded-2xl bg-white p-6 shadow-[0_24px_80px_rgba(15,23,42,0.36)]" > - <h2 id="visualization-limit-title" className="text-2xl font-semibold tracking-normal text-slate-950"> - 选择图谱规模 - </h2> - <div className="mt-6 grid grid-cols-2 gap-3 sm:grid-cols-4"> - {GRAPH_LIMIT_OPTIONS.map((limit) => ( - <button - key={limit} - type="button" - onClick={() => void loadFullGraph(limit)} - disabled={isLoading} - className="rounded-xl border border-slate-200 px-4 py-3 text-sm font-medium text-slate-900 transition-colors hover:border-sky-300 hover:bg-sky-50 disabled:cursor-not-allowed disabled:opacity-60" + {!selectedLimit ? ( + <> + <h2 id="visualization-limit-title" className="text-2xl font-semibold tracking-normal text-slate-950"> + 选择图谱规模 + </h2> + {isStatsLoading ? ( + <div className="mt-5 flex items-center gap-3 text-sm text-slate-600"> + <Loader2 className="h-4 w-4 animate-spin text-sky-500" /> + 正在读取博客数量... + </div> + ) : ( + <div className="mt-6"> + <div className="flex items-end justify-between gap-4"> + <div className="text-sm text-slate-500">节点数量</div> + <div className="text-3xl font-semibold tabular-nums text-slate-950">{pendingLimit}</div> + </div> + <div className="mt-5 grid grid-cols-2 overflow-hidden rounded-lg border border-slate-200 bg-slate-50 p-1"> + <button + type="button" + onClick={() => setGraphDisplayMode("compact")} + className={`rounded-md px-3 py-2 text-sm font-medium transition-colors ${ + graphDisplayMode === "compact" ? 
"bg-slate-950 text-white shadow-sm" : "text-slate-600 hover:bg-white" + }`} + aria-pressed={graphDisplayMode === "compact"} + > + 精简 + </button> + <button + type="button" + onClick={() => setGraphDisplayMode("full")} + className={`rounded-md px-3 py-2 text-sm font-medium transition-colors ${ + graphDisplayMode === "full" ? "bg-slate-950 text-white shadow-sm" : "text-slate-600 hover:bg-white" + }`} + aria-pressed={graphDisplayMode === "full"} + > + 全 + </button> + </div> + <input + type="range" + min={0} + max={maxGraphLimit} + step={1} + value={pendingLimit} + onChange={(event) => setPendingLimit(Number(event.currentTarget.value))} + className="mt-5 w-full accent-sky-500" + aria-label="节点数量" + /> + <div className="mt-2 flex items-center justify-between text-sm tabular-nums text-slate-500"> + <span>0</span> + <span>{maxGraphLimit}</span> + </div> + <button + type="button" + onClick={() => void loadFullGraph(pendingLimit)} + disabled={isLoading || isStatsLoading} + className="mt-6 w-full rounded-xl bg-slate-950 px-4 py-3 text-sm font-medium text-white transition-colors hover:bg-sky-600 disabled:cursor-not-allowed disabled:opacity-60" + > + 确认 + </button> + </div> + )} + </> + ) : ( + <div> + <h2 id="visualization-limit-title" className="text-2xl font-semibold tracking-normal text-slate-950"> + 正在渲染图谱 + </h2> + <div className="mt-5 flex items-center gap-3 text-sm text-slate-600"> + <Loader2 className="h-4 w-4 animate-spin text-sky-500" /> + {isLoading ? "正在加载图谱数据..." : "正在计算 3D 力导布局..."} + </div> + {!isLoading && estimatedRenderTicks ? ( + <div className="mt-3 space-y-1 text-sm tabular-nums text-slate-500"> + <div>预计需要 {estimatedRenderTicks} ticks</div> + {estimatedRenderTime ? <div>预估所需渲染时间:{estimatedRenderTime}</div> : null} + </div> + ) : null} + <div + className="mt-5 h-2 overflow-hidden rounded-full bg-slate-100" + role="progressbar" + aria-valuenow={progressPercent} + aria-valuemin={0} + aria-valuemax={100} > - {limit} - </button> - ))} - </div> - {isLoading ? 
( - <div className="mt-5 flex items-center gap-3 text-sm text-slate-600"> - <Loader2 className="h-4 w-4 animate-spin text-sky-500" /> - 正在加载图谱数据... + <div + className="h-full rounded-full bg-sky-500 transition-all duration-150 ease-out" + style={{ width: `${progressPercent}%` }} + /> + </div> + <div className="mt-2 text-right text-sm tabular-nums text-slate-500">{progressPercent}%</div> </div> - ) : null} + )} </div> </div> ) : null} diff --git a/frontend/src/types/graph.ts b/frontend/src/types/graph.ts index 2ce2046..8a04cac 100644 --- a/frontend/src/types/graph.ts +++ b/frontend/src/types/graph.ts @@ -10,6 +10,7 @@ export interface GraphNode { description?: string | null; x?: number; y?: number; + z?: number; degree?: number; incomingCount?: number; outgoingCount?: number; @@ -61,11 +62,49 @@ export interface RecommendedBlog extends GraphNode { viaBlogs: GraphNode[]; } +export interface BlogDiscoveryStep { + blog: Pick<GraphNode, "id" | "domain" | "title" | "iconUrl"> | null; + blogId: number; + url: string; + domain: string; + acceptedBy: string | null; + acceptedLabel: string | null; + rawId: number | null; + rawSourceBlogId: number | null; + rawAcceptedBy: string | null; + discoveredAt: string | null; +} + +export interface BlogDiscoveryPath { + mode: "manual" | "crawled"; + originSource: string | null; + originLabel: string; + targetSource: string | null; + truncated: boolean; + steps: BlogDiscoveryStep[]; +} + +export interface BlogRelationGraph { + direction: "incoming" | "outgoing"; + focusBlogId: number; + depth: number; + nodes: GraphNode[]; + edges: GraphEdge[]; +} + export interface BlogDetail extends GraphNode { + crawlStatus: string; + crawlErrorKind: string | null; incomingLinks: number; outgoingLinks: number; relatedNodes: GraphNode[]; + outgoingNodes: GraphNode[]; recommendedBlogs: RecommendedBlog[]; + discoveryPath: BlogDiscoveryPath | null; + relationGraphs: { + incoming: BlogRelationGraph; + outgoing: BlogRelationGraph; + }; } export interface 
StatsData { @@ -85,6 +124,9 @@ export interface StatusData { export interface BlogCatalogItem extends GraphNode { normalizedUrl: string; + requestUuid?: string; + impressionId?: number; + position?: number; identityKey: string; identityReasonCodes: string[]; identityRulesetVersion: string; @@ -113,10 +155,43 @@ export interface BlogCatalogPage { sort: string; } +export interface RandomRecommendationBatch { + requestUuid: string; + surface: string; + strategy: string; + strategyVersion: string; + visitorId: string; + sessionId: string; + requestedCount: number; + servedCount: number; + createdAt: string | null; + items: BlogCatalogItem[]; +} + +export interface RecommendationEventInput { + eventUuid: string; + eventType: string; + blogId: number; + visitorId: string; + sessionId: string; + entranceKind: string; + entranceUrl: string; + requestUuid?: string; + impressionId?: number; + position?: number; + interactionOrder?: number; + clientEventAt?: string; + attributes?: Record<string, unknown>; +} + export interface UserProfile { id: number; email: string; displayName: string; + role: "admin" | "user"; + isActive: boolean; + emailVerified: boolean; + emailVerifiedAt: string | null; createdAt: string | null; updatedAt: string | null; } @@ -125,6 +200,17 @@ export interface AuthSession { token: string; expiresAt: string | null; user: UserProfile; + emailVerification?: AuthLifecycleToken; +} + +export interface AuthLifecycleToken { + sent: boolean; + verificationToken?: string; + verificationUrl?: string; + resetToken?: string; + resetUrl?: string; + expiresAt?: string | null; + alreadyVerified?: boolean; } export interface UserLabelSelection { @@ -161,19 +247,29 @@ export interface AdminRuntimeCurrent { elapsedSeconds: number | null; } -export interface AdminRequeueFailedBlogsResult { - requeued: number; +export interface AdminHourlyStatsRow { + id: number; + hourStart: string | null; + userCount: number; + randomRequestCount: number; + randomImpressionCount: number; 
+ detailOpenCount: number; + externalOpenCount: number; + detailCtr: number; + externalCtr: number; + totalClickCtr: number; + refreshedAt: string | null; + createdAt: string | null; } -export interface AdminDedupSummary { - id: number; - status: string; - totalCount: number; - scannedCount: number; - removedCount: number; - keptCount: number; - createdAt: string; - updatedAt: string; +export interface AdminHourlyStats { + currentHour: AdminHourlyStatsRow; + latest: AdminHourlyStatsRow; + items: AdminHourlyStatsRow[]; +} + +export interface AdminRequeueFailedBlogsResult { + requeued: number; } export interface AdminBlogLabelTag { diff --git a/memory/MEMORY.md b/memory/MEMORY.md deleted file mode 100644 index 81f0e75..0000000 --- a/memory/MEMORY.md +++ /dev/null @@ -1,3 +0,0 @@ -# Memory Index - -- [Filter chain two-phase architecture](filter-chain-two-phase-architecture.md) — rule AND-gate + success OR-group (RSS then model); URL-refilter feature deleted diff --git a/memory/filter-chain-two-phase-architecture.md b/memory/filter-chain-two-phase-architecture.md deleted file mode 100644 index 715f2fa..0000000 --- a/memory/filter-chain-two-phase-architecture.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: filter-chain-two-phase-architecture -description: How the crawler URL filter chain works after the 2026-05-30 RSS refactor (rule AND-gate + success OR-group) -metadata: - type: project ---- - -As of 2026-05-30 the crawler URL decision chain ([crawler/crawling/decisions/chain.py](crawler/crawling/decisions/chain.py)) is two-phase, not the old pure-AND chain: - -- **Phase 1 — `rule` filters (AND-gate):** every deterministic hard rule must accept; first rejection wins and returns verbatim. `decider_role == "rule"`. -- **Phase 2 — `success` deciders (ordered OR-group):** run after rules. First decider that *confirms* (returns `FilterDecision.confirmed=True`) keeps the candidate immediately, carrying any `feed_url`. 
A decider that *abstains* (accepts without confirming) defers to the next. If none confirm but at least one rejected, the last rejection wins. `decider_role == "success"`. - -Success deciders in default order: `rss_discovery` then `model_consensus`. - -- **RSS layer** ([crawler/crawling/decisions/rss.py](crawler/crawling/decisions/rss.py)): fetches the candidate homepage, parses `<link rel=alternate>` feed links, probes common feed paths, validates with `feedparser`. Confirms + records the feed URL. **Needs a live fetcher** threaded via `UrlCandidateContext.fetcher` (+ `fetch_deadline`). Offline callers (dedup scan, funnel stats) pass no fetcher, so RSS abstains and stays network-free. Flag: `HEYBLOG_RSS_DISCOVERY_ENABLED` (default on). -- RSS-absence is an **abstain**, never a rejection — many blogs lack feeds, so they fall through to model consensus. With RSS off, behavior is identical to the legacy chain. - -`feed_url` is persisted on `blogs.feed_url` via `upsert_blog(..., feed_url=...)` (only set on insert or when existing feed is empty; never overwritten with null). - -**The offline URL-refilter feature was deleted entirely** in the same change (per user request): all `url_refilter` repository methods/models/endpoints/HTTP-client methods/frontend UI, plus `_backup_sqlite_database`, `_handle_refilter_*`, `_filter_chain_version`. The blog **dedup scan** is a separate feature and was kept — it still uses `decision_chain.decide()`, `_decision_scan_settings`, `_decision_scan_ruleset_version`, `_delete_blog_graph`. Migration `20260530_02` drops the refilter tables; `20260530_01` adds `feed_url`. Related: [[heyblog-service-boundaries]]. 
diff --git a/persistence_api/email_delivery.py b/persistence_api/email_delivery.py new file mode 100644 index 0000000..d77d2f7 --- /dev/null +++ b/persistence_api/email_delivery.py @@ -0,0 +1,237 @@ +"""Email delivery adapters for user lifecycle messages.""" + +from __future__ import annotations + +from dataclasses import dataclass +from email.message import EmailMessage +import smtplib +import ssl +from typing import Protocol + +from shared.config import Settings + + +class EmailDeliveryError(Exception): + """Raised when a configured email provider cannot deliver a message. + + Args: + message: Stable error code suitable for API translation. + """ + + +class EmailDelivery(Protocol): + """Interface for sending user lifecycle email messages. + + Implementations send already-generated verification and reset URLs. They + must not persist raw lifecycle tokens or expose provider credentials. + """ + + def send_verification_email(self, *, to_email: str, verification_url: str) -> None: + """Send a verification-link email. + + Args: + to_email: Recipient account email address. + verification_url: Public one-time verification URL. + + Returns: + None after the message has been accepted by the provider. + """ + + def send_password_reset_email(self, *, to_email: str, reset_url: str) -> None: + """Send a password-reset-link email. + + Args: + to_email: Recipient account email address. + reset_url: Public one-time password reset URL. + + Returns: + None after the message has been accepted by the provider. + """ + + +@dataclass(slots=True) +class NoopEmailDelivery: + """Email adapter that intentionally performs no provider call. + + This keeps local development and tests independent from networked SMTP + credentials while still exercising token generation flows. + """ + + def send_verification_email(self, *, to_email: str, verification_url: str) -> None: + """Ignore one verification message. + + Args: + to_email: Recipient account email address. 
+ verification_url: Public one-time verification URL. + + Returns: + None. + """ + + del to_email, verification_url + + def send_password_reset_email(self, *, to_email: str, reset_url: str) -> None: + """Ignore one password reset message. + + Args: + to_email: Recipient account email address. + reset_url: Public one-time password reset URL. + + Returns: + None. + """ + + del to_email, reset_url + + +@dataclass(slots=True) +class SmtpEmailDelivery: + """SMTP-backed email adapter for verification and reset messages. + + Args: + host: SMTP server hostname. + port: SMTP server port. + from_email: Sender address used in lifecycle emails. + username: Optional SMTP username. + password: Optional SMTP password. + use_tls: Whether to upgrade the connection with STARTTLS. + use_ssl: Whether to connect with implicit SMTP-over-SSL. + timeout_seconds: Network timeout for SMTP operations. + """ + + host: str + port: int + from_email: str + username: str | None = None + password: str | None = None + use_tls: bool = True + use_ssl: bool = False + timeout_seconds: float = 10.0 + + def send_verification_email(self, *, to_email: str, verification_url: str) -> None: + """Send a verification-link email. + + Args: + to_email: Recipient account email address. + verification_url: Public one-time verification URL. + + Returns: + None after the SMTP server accepts the message. + """ + + self._send( + to_email=to_email, + subject="Verify your HeyBlog email", + text_body=( + "Verify your HeyBlog email address by opening this link:\n\n" + f"{verification_url}\n\n" + "If you did not request this, you can ignore this email." + ), + ) + + def send_password_reset_email(self, *, to_email: str, reset_url: str) -> None: + """Send a password-reset-link email. + + Args: + to_email: Recipient account email address. + reset_url: Public one-time password reset URL. + + Returns: + None after the SMTP server accepts the message. 
+ """ + + self._send( + to_email=to_email, + subject="Reset your HeyBlog password", + text_body=( + "Reset your HeyBlog password by opening this link:\n\n" + f"{reset_url}\n\n" + "If you did not request this, you can ignore this email." + ), + ) + + def _send(self, *, to_email: str, subject: str, text_body: str) -> None: + """Build and send one plain-text email over SMTP. + + Args: + to_email: Recipient email address. + subject: Message subject line. + text_body: Plain-text message body. + + Returns: + None after the provider accepts the message. + + Raises: + EmailDeliveryError: Raised when the SMTP call fails or is + misconfigured. + """ + + if not self.host or not self.from_email: + raise EmailDeliveryError("email_delivery_not_configured") + + message = EmailMessage() + message["From"] = self.from_email + message["To"] = to_email + message["Subject"] = subject + message.set_content(text_body) + + try: + if self.use_ssl: + context = ssl.create_default_context() + with smtplib.SMTP_SSL(self.host, self.port, timeout=self.timeout_seconds, context=context) as smtp: + self._authenticate_if_configured(smtp) + smtp.send_message(message) + return + + with smtplib.SMTP(self.host, self.port, timeout=self.timeout_seconds) as smtp: + if self.use_tls: + context = ssl.create_default_context() + smtp.starttls(context=context) + self._authenticate_if_configured(smtp) + smtp.send_message(message) + except (OSError, smtplib.SMTPException) as exc: + raise EmailDeliveryError("email_delivery_failed") from exc + + def _authenticate_if_configured(self, smtp: smtplib.SMTP) -> None: + """Authenticate with SMTP when username and password are configured. + + Args: + smtp: Open SMTP connection. + + Returns: + None. + """ + + if self.username and self.password: + smtp.login(self.username, self.password) + + +def build_email_delivery(settings: Settings) -> EmailDelivery: + """Create the configured email delivery adapter. 
+ + Args: + settings: Runtime settings loaded from environment variables. + + Returns: + SMTP adapter when `HEYBLOG_EMAIL_PROVIDER=smtp`; otherwise a no-op + adapter for development and tests. + + Raises: + ValueError: Raised when an unsupported email provider is configured. + """ + + provider = settings.email_provider.strip().lower() + if provider in {"", "disabled", "noop"}: + return NoopEmailDelivery() + if provider == "smtp": + return SmtpEmailDelivery( + host=settings.smtp_host, + port=settings.smtp_port, + from_email=settings.email_from, + username=settings.smtp_username, + password=settings.smtp_password, + use_tls=settings.smtp_use_tls, + use_ssl=settings.smtp_use_ssl, + timeout_seconds=settings.smtp_timeout_seconds, + ) + raise ValueError("unsupported_email_provider") diff --git a/persistence_api/graph_projection.py b/persistence_api/graph_projection.py index 74d521f..6caba12 100644 --- a/persistence_api/graph_projection.py +++ b/persistence_api/graph_projection.py @@ -139,12 +139,13 @@ def _available_graph( blogs: list[dict[str, Any]], edges: list[dict[str, Any]], ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - nodes = [dict(blog) for blog in blogs if str(blog.get("crawl_status")) == "FINISHED"] - finished_ids = {int(node["id"]) for node in nodes} + """Return graph nodes and edges whose endpoints still exist in blogs.""" + nodes = [dict(blog) for blog in blogs] + node_ids = {int(node["id"]) for node in nodes} filtered_edges = [ dict(edge) for edge in edges - if int(edge["from_blog_id"]) in finished_ids and int(edge["to_blog_id"]) in finished_ids + if int(edge["from_blog_id"]) in node_ids and int(edge["to_blog_id"]) in node_ids ] return nodes, filtered_edges @@ -418,7 +419,7 @@ def build_core_graph_view( """Return the default structured subgraph view.""" nodes = snapshot["nodes"] edges = snapshot["edges"] - limit = _clamp_int(limit, 24, MAX_CORE_LIMIT) + limit = _clamp_int(limit, 0, MAX_CORE_LIMIT) sampled_ids = _sample_node_ids( nodes, edges, @@ 
-443,7 +444,16 @@ def build_core_graph_view( adjacency, _, _ = _build_adjacency(filtered_nodes, edges) ordered_nodes = _sorted_nodes(filtered_nodes) if strategy == "seed": - seed_nodes = sorted(filtered_nodes, key=lambda node: int(node["id"]))[: min(len(filtered_nodes), 18)] + selected_ids = {int(node["id"]) for node in sorted(filtered_nodes, key=lambda node: int(node["id"]))[:limit]} + return _build_view_payload( + snapshot, + selected_ids, + strategy=strategy, + limit=limit, + sample_mode=sample_mode, + sample_value=sample_value, + sample_seed=sample_seed, + ) else: strategy = "degree" seed_nodes = ordered_nodes[: min(len(ordered_nodes), max(12, limit // 4))] diff --git a/persistence_api/main.py b/persistence_api/main.py index 84d5f5e..1f6c11b 100644 --- a/persistence_api/main.py +++ b/persistence_api/main.py @@ -12,6 +12,8 @@ from fastapi.responses import Response from pydantic import BaseModel +from persistence_api.email_delivery import EmailDeliveryError +from persistence_api.email_delivery import build_email_delivery from persistence_api.repository import BLOG_CATALOG_DEFAULT_PAGE_SIZE from persistence_api.repository import BLOG_LABELING_DEFAULT_PAGE_SIZE from persistence_api.age_graph import AgeGraphManager @@ -49,11 +51,13 @@ class UpsertBlogRequest(BaseModel): domain: str email: str | None = None feed_url: str | None = None + accepted_by: str | None = None + seed_source_path: str | None = None + seed_source_row: int | None = None -class CreateIngestionRequest(BaseModel): +class CreateUserSeedRequest(BaseModel): homepage_url: str - email: str class UserAuthRequest(BaseModel): @@ -61,6 +65,23 @@ class UserAuthRequest(BaseModel): password: str +class EmailRequest(BaseModel): + email: str + + +class TokenRequest(BaseModel): + token: str + + +class PasswordResetRequest(BaseModel): + token: str + password: str + + +class UpdateUserRoleRequest(BaseModel): + role: str + + class BlogResultRequest(BaseModel): crawl_status: str status_code: int | None @@ -68,6 +89,8 
@@ class BlogResultRequest(BaseModel): metadata_captured: bool = False title: str | None = None icon_url: str | None = None + crawl_error_kind: str | None = None + crawl_error_message: str | None = None class AddEdgeRequest(BaseModel): @@ -107,6 +130,33 @@ class IncrementBlogUserLabelRequest(BaseModel): user_id: int | None = None +class CreateRandomRecommendationBatchRequest(BaseModel): + count: int = 9 + visitor_id: str + session_id: str + user_id: int | None = None + source: str | None = None + page_url: str | None = None + context: dict[str, Any] | None = None + + +class RecordBlogInteractionRequest(BaseModel): + event_uuid: str + event_type: str + blog_id: int + visitor_id: str + session_id: str + entrance_kind: str + entrance_url: str + request_uuid: str | None = None + impression_id: int | None = None + position: int | None = None + interaction_order: int = 1 + user_id: int | None = None + client_event_at: str | None = None + attributes: dict[str, Any] | None = None + + class CreateBlogLabelTagRequest(BaseModel): name: str @@ -124,13 +174,6 @@ class BlogLabelParquetStatusResponse(BaseModel): updated_at: str | None -class FinalizeBlogDedupScanRunRequest(BaseModel): - crawler_restart_attempted: bool - crawler_restart_succeeded: bool - search_reindexed: bool - error_message: str | None = None - - _T = TypeVar("_T") _ExceptionTranslation = tuple[type[Exception], int, str | None] @@ -250,7 +293,12 @@ def build_persistence_state(settings: Settings | None = None) -> PersistenceStat resolved = settings or Settings.from_env() if resolved.db_dsn: run_postgres_migrations(resolved.db_dsn) - repository = build_repository(db_path=resolved.db_path, db_dsn=resolved.db_dsn, settings=resolved) + repository = build_repository( + db_path=resolved.db_path, + db_dsn=resolved.db_dsn, + settings=resolved, + email_delivery=build_email_delivery(resolved), + ) age_manager = AgeGraphManager( getattr(repository, "engine", None), enabled=resolved.age_enabled and resolved.age_shadow_reads, 
@@ -309,6 +357,7 @@ def list_blogs_catalog( has_title: str | None = None, has_icon: str | None = None, min_connections: str | None = None, + acceptance_status: str | None = "ACCEPTED", ) -> dict[str, Any]: return _call_with_value_error_http_translation( lambda: get_state().repository.list_blogs_catalog( @@ -323,6 +372,7 @@ def list_blogs_catalog( has_title=has_title, has_icon=has_icon, min_connections=min_connections, + acceptance_status=acceptance_status, ), status_code=422, ) @@ -334,9 +384,44 @@ def lookup_blog_candidates(url: str) -> dict[str, Any]: status_code=422, ) - @app.get("/internal/ingestion-requests") - def list_priority_ingestion_requests() -> list[dict[str, Any]]: - return get_state().repository.list_priority_ingestion_requests() + @app.post("/internal/recommendations/random-blog-batches") + def create_random_recommendation_batch(payload: CreateRandomRecommendationBatchRequest) -> dict[str, Any]: + return _call_with_http_exception_translation( + lambda: get_state().repository.create_random_recommendation_batch(**payload.model_dump()), + exception_translations=( + (ValueError, 422, None), + (UserAuthError, 401, None), + ), + ) + + @app.post("/internal/recommendation-events") + def record_blog_interaction(payload: RecordBlogInteractionRequest) -> dict[str, Any]: + return _call_with_http_exception_translation( + lambda: get_state().repository.record_blog_interaction(**payload.model_dump()), + exception_translations=( + (ValueError, 422, None), + (BlogLabelingNotFoundError, 404, None), + (UserAuthError, 401, None), + ), + ) + + @app.get("/internal/blogs/{blog_id}/recommendation-stats") + def get_blog_recommendation_stats(blog_id: int) -> dict[str, Any]: + return _require_payload( + get_state().repository.get_blog_recommendation_stats(blog_id), + detail="blog_not_found", + ) + + @app.get("/internal/recommendation-stats") + def get_recommendation_strategy_stats() -> dict[str, Any]: + return get_state().repository.get_recommendation_strategy_stats() + + 
@app.get("/internal/admin/hourly-stats") + def get_admin_hourly_stats(limit: int = 24) -> dict[str, Any]: + return _call_with_value_error_http_translation( + lambda: get_state().repository.get_admin_hourly_stats(limit=limit), + status_code=422, + ) @app.post("/internal/users/register") def register_user(payload: UserAuthRequest) -> dict[str, Any]: @@ -345,6 +430,7 @@ def register_user(payload: UserAuthRequest) -> dict[str, Any]: exception_translations=( (ValueError, 422, None), (UserAuthError, 409, None), + (EmailDeliveryError, 502, "email_delivery_failed"), ), ) @@ -369,6 +455,57 @@ def get_current_user(session_token: str) -> dict[str, Any]: def logout_user(session_token: str) -> dict[str, bool]: return {"ok": get_state().repository.revoke_user_session(token=session_token)} + @app.post("/internal/users/email-verification/request") + def request_email_verification(payload: EmailRequest) -> dict[str, Any]: + return _call_with_http_exception_translation( + lambda: get_state().repository.request_email_verification(email=payload.email), + exception_translations=( + (ValueError, 422, None), + (EmailDeliveryError, 502, "email_delivery_failed"), + ), + ) + + @app.post("/internal/users/email-verification/confirm") + def confirm_email_verification(payload: TokenRequest) -> dict[str, Any]: + return _call_with_http_exception_translation( + lambda: get_state().repository.confirm_email_verification(token=payload.token), + exception_translations=((UserAuthError, 401, None),), + ) + + @app.post("/internal/users/password-reset/request") + def request_password_reset(payload: EmailRequest) -> dict[str, Any]: + return _call_with_http_exception_translation( + lambda: get_state().repository.request_password_reset(email=payload.email), + exception_translations=( + (ValueError, 422, None), + (EmailDeliveryError, 502, "email_delivery_failed"), + ), + ) + + @app.post("/internal/users/password-reset/confirm") + def reset_user_password(payload: PasswordResetRequest) -> dict[str, Any]: + 
return _call_with_http_exception_translation( + lambda: get_state().repository.reset_user_password(token=payload.token, password=payload.password), + exception_translations=( + (ValueError, 422, None), + (UserAuthError, 401, None), + ), + ) + + @app.get("/internal/users") + def list_users(page: int = 1, page_size: int = 50) -> dict[str, Any]: + return get_state().repository.list_users(page=page, page_size=page_size) + + @app.patch("/internal/users/{user_id}/role") + def update_user_role(user_id: int, payload: UpdateUserRoleRequest) -> dict[str, Any]: + return _call_with_http_exception_translation( + lambda: get_state().repository.update_user_role(user_id=user_id, role=payload.role), + exception_translations=( + (ValueError, 422, None), + (UserAuthError, 404, None), + ), + ) + @app.get("/internal/users/{user_id}/label-selections") def list_user_label_selections(user_id: int, limit: int = 50) -> list[dict[str, Any]]: return get_state().repository.list_user_label_selections(user_id=user_id, limit=limit) @@ -484,15 +621,9 @@ def export_blog_label_training_parquet() -> Response: ) @app.get("/internal/queue/next") - def next_waiting(include_priority: bool = True) -> dict[str, Any] | None: + def next_waiting() -> dict[str, Any] | None: return _load_optional_row_as_dict( - lambda: get_state().repository.get_next_waiting_blog(include_priority=include_priority), - ) - - @app.get("/internal/queue/priority-next") - def next_priority_waiting() -> dict[str, Any] | None: - return _load_optional_row_as_dict( - lambda: get_state().repository.get_next_priority_blog(), + lambda: get_state().repository.get_next_waiting_blog(), ) @app.get("/internal/blogs/{blog_id}/detail") @@ -502,66 +633,28 @@ def get_blog_detail(blog_id: int) -> dict[str, Any]: detail="blog_not_found", ) - @app.post("/internal/ingestion-requests") - def create_ingestion_request(payload: CreateIngestionRequest) -> dict[str, Any]: + @app.post("/internal/user-seeds") + def create_user_seed(payload: 
CreateUserSeedRequest) -> dict[str, Any]: return _call_with_value_error_http_translation( - lambda: get_state().repository.create_ingestion_request(**payload.model_dump()), + lambda: get_state().repository.create_user_seed(**payload.model_dump()), status_code=422, ) - @app.get("/internal/ingestion-requests/{request_id}") - def get_ingestion_request(request_id: int, request_token: str) -> dict[str, Any]: - return _require_payload( - get_state().repository.get_ingestion_request( - request_id=request_id, - request_token=request_token, - ), - detail="ingestion_request_not_found", - ) - - @app.post("/internal/blog-dedup-scans/runs") - def create_blog_dedup_scan_run(crawler_was_running: bool = False) -> dict[str, Any]: - return get_state().repository.create_blog_dedup_scan_run(crawler_was_running=crawler_was_running) - - @app.post("/internal/blog-dedup-scans/{run_id}/execute") - def execute_blog_dedup_scan_run(run_id: int) -> dict[str, Any]: - return _call_with_value_error_http_translation( - lambda: get_state().repository.execute_blog_dedup_scan_run(run_id=run_id), - status_code=404, - ) - - @app.post("/internal/blog-dedup-scans/{run_id}/finalize") - def finalize_blog_dedup_scan_run(run_id: int, payload: FinalizeBlogDedupScanRunRequest) -> dict[str, Any]: - return _call_with_value_error_http_translation( - lambda: get_state().repository.finalize_blog_dedup_scan_run( - run_id=run_id, - **payload.model_dump(), - ), - status_code=404, - ) - - @app.get("/internal/blog-dedup-scans/latest") - def get_latest_blog_dedup_scan_run() -> dict[str, Any]: - return _require_payload( - get_state().repository.get_latest_blog_dedup_scan_run(), - detail="blog_dedup_scan_run_not_found", - ) - - @app.get("/internal/blog-dedup-scans/{run_id}/items") - def list_blog_dedup_scan_run_items(run_id: int) -> list[dict[str, Any]]: - return get_state().repository.list_blog_dedup_scan_run_items(run_id) - - @app.post("/internal/ingestion-requests/by-blog/{blog_id}/crawling") - def 
mark_ingestion_request_crawling(blog_id: int) -> dict[str, bool]: - return _run_action_and_return_ok( - lambda: get_state().repository.mark_ingestion_request_crawling(blog_id=blog_id), - ) - @app.post("/internal/blogs/upsert") def upsert_blog(payload: UpsertBlogRequest) -> dict[str, Any]: blog_id, inserted = get_state().repository.upsert_blog(**payload.model_dump()) return {"id": blog_id, "inserted": inserted} + @app.get("/internal/seeds") + def list_seeds() -> list[dict[str, Any]]: + """Return durable seed rows for crawler bootstrap replay.""" + return get_state().repository.list_seeds() + + @app.get("/internal/blogs/by-normalized-url") + def find_blog_by_normalized_url(normalized_url: str) -> dict[str, int | None]: + """Return the existing blog id for one normalized URL.""" + return {"id": get_state().repository.find_blog_id_by_normalized_url(normalized_url=normalized_url)} + @app.post("/internal/blogs/{blog_id}/result") def mark_blog_result(blog_id: int, payload: BlogResultRequest) -> dict[str, bool]: return _run_action_and_return_ok( diff --git a/persistence_api/models.py b/persistence_api/models.py index a9b2e6c..8316704 100644 --- a/persistence_api/models.py +++ b/persistence_api/models.py @@ -4,10 +4,11 @@ from datetime import datetime -from sqlalchemy import Boolean from sqlalchemy import DateTime from sqlalchemy import Enum +from sqlalchemy import Float from sqlalchemy import ForeignKey +from sqlalchemy import Boolean from sqlalchemy import Integer from sqlalchemy import JSON from sqlalchemy import Index @@ -51,6 +52,13 @@ class BlogModel(Base): title: Mapped[str | None] = mapped_column(Text, nullable=True) icon_url: Mapped[str | None] = mapped_column(Text, nullable=True) status_code: Mapped[int | None] = mapped_column(Integer, nullable=True) + acceptance_status: Mapped[str] = mapped_column(Text, nullable=False, default="UNKNOWN") + accepted_by: Mapped[str | None] = mapped_column(Text, nullable=True) + accepted_at: Mapped[datetime | None] = 
mapped_column(DateTime(timezone=True), nullable=True) + crawl_error_kind: Mapped[str | None] = mapped_column(Text, nullable=True) + crawl_error_message: Mapped[str | None] = mapped_column(Text, nullable=True) + last_crawl_attempt_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + successful_crawl_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) crawl_status: Mapped[CrawlStatus] = mapped_column( Enum(CrawlStatus, name="crawl_status"), nullable=False, @@ -62,26 +70,28 @@ class BlogModel(Base): updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) -class IngestionRequestModel(Base): - """User-triggered priority ingestion request.""" +class SeedModel(Base): + """Seed URL imported from a configured seed CSV file. - __tablename__ = "ingestion_requests" + Args: + None. SQLAlchemy constructs model instances from mapped keyword + arguments. + + Returns: + One durable seed record keyed by normalized URL and linked to the blog + row created or reused during CSV bootstrap. 
+ """ + + __tablename__ = "seeds" id: Mapped[int] = mapped_column(primary_key=True) - requested_url: Mapped[str] = mapped_column(Text, nullable=False) - normalized_url: Mapped[str] = mapped_column(Text, nullable=False) - identity_key: Mapped[str] = mapped_column(Text, nullable=False, index=True, default="") - identity_reason_codes: Mapped[str] = mapped_column(Text, nullable=False, default="[]") - identity_ruleset_version: Mapped[str] = mapped_column(Text, nullable=False, default="") - requester_email: Mapped[str] = mapped_column(Text, nullable=False) - status: Mapped[str] = mapped_column(Text, nullable=False) - priority: Mapped[int] = mapped_column(Integer, nullable=False, default=100) - seed_blog_id: Mapped[int | None] = mapped_column(ForeignKey("blogs.blog_id", ondelete="SET NULL"), nullable=True) - matched_blog_id: Mapped[int | None] = mapped_column(ForeignKey("blogs.blog_id", ondelete="SET NULL"), nullable=True) - request_token: Mapped[str] = mapped_column(Text, nullable=False) - expires_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) - error_message: Mapped[str | None] = mapped_column(Text, nullable=True) - created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) + url: Mapped[str] = mapped_column(Text, nullable=False) + normalized_url: Mapped[str] = mapped_column(Text, nullable=False, unique=True, index=True) + domain: Mapped[str] = mapped_column(Text, nullable=False) + source_path: Mapped[str | None] = mapped_column(Text, nullable=True) + source_row: Mapped[int | None] = mapped_column(Integer, nullable=True) + blog_id: Mapped[int | None] = mapped_column(ForeignKey("blogs.blog_id", ondelete="SET NULL"), nullable=True) + imported_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) @@ -103,6 +113,11 @@ class 
UserModel(Base): email: Mapped[str] = mapped_column(Text, nullable=False, unique=True, index=True) password_hash: Mapped[str] = mapped_column(Text, nullable=False) display_name: Mapped[str] = mapped_column(Text, nullable=False, default="") + role: Mapped[str] = mapped_column(Text, nullable=False, default="user") + is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True) + email_verified_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + password_changed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + last_login_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) @@ -128,6 +143,72 @@ class UserSessionModel(Base): revoked_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) +class PendingUserRegistrationModel(Base): + """Unverified registration intent stored until email ownership is proven. + + Args: + None. SQLAlchemy constructs model instances from mapped keyword + arguments. + + Returns: + One pending email/password registration. A row is promoted into + ``users`` only after its verification token is consumed. 
+ """ + + __tablename__ = "pending_user_registrations" + + id: Mapped[int] = mapped_column(primary_key=True) + email: Mapped[str] = mapped_column(Text, nullable=False, unique=True, index=True) + password_hash: Mapped[str] = mapped_column(Text, nullable=False) + token_hash: Mapped[str] = mapped_column(Text, nullable=False, unique=True, index=True) + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) + expires_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False) + consumed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + + +class UserVerificationTokenModel(Base): + """Single-use user lifecycle token stored as a hash. + + Args: + None. SQLAlchemy constructs model instances from mapped keyword + arguments. + + Returns: + Token row used for email verification and password reset flows. Raw + tokens are returned to callers once and are never stored. + """ + + __tablename__ = "user_verification_tokens" + + id: Mapped[int] = mapped_column(primary_key=True) + user_id: Mapped[int] = mapped_column(ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True) + token_hash: Mapped[str] = mapped_column(Text, nullable=False, unique=True, index=True) + purpose: Mapped[str] = mapped_column(Text, nullable=False, index=True) + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) + expires_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False) + consumed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + + +class UserAuditEventModel(Base): + """Security-relevant account event for audit screens. + + Args: + None. SQLAlchemy constructs model instances from mapped keyword + arguments. + + Returns: + Minimal append-only audit event. Details must not contain raw secrets. 
class RecommendationRequestModel(Base):
    """Single recommendation-serving request presented to a visitor.

    Args:
        None. SQLAlchemy builds instances from the mapped keyword arguments.

    Returns:
        Request row that groups one ordered set of impressions.
    """

    __tablename__ = "recommendation_requests"
    __table_args__ = (
        Index(
            "ix_recommendation_requests_surface_created",
            "surface",
            "created_at",
        ),
        Index(
            "ix_recommendation_requests_strategy_created",
            "strategy",
            "strategy_version",
            "created_at",
        ),
    )

    id: Mapped[int] = mapped_column(primary_key=True)
    request_uuid: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        unique=True,
        index=True,
    )
    surface: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    strategy: Mapped[str] = mapped_column(Text, nullable=False)
    strategy_version: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        default="v1",
    )
    visitor_id: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    # ON DELETE SET NULL: the request row outlives a deleted user account.
    user_id: Mapped[int | None] = mapped_column(
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    session_id: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    source: Mapped[str | None] = mapped_column(Text, nullable=True)
    page_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    requested_count: Mapped[int] = mapped_column(Integer, nullable=False)
    served_count: Mapped[int] = mapped_column(Integer, nullable=False)
    context_json: Mapped[dict[str, object]] = mapped_column(
        JSON,
        nullable=False,
        default=dict,
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
class BlogInteractionModel(Base):
    """Single idempotent visitor interaction with a blog or impression.

    Args:
        None. SQLAlchemy builds instances from the mapped keyword arguments.

    Returns:
        Raw immutable event row consumed by attribution and statistics.
    """

    __tablename__ = "blog_interactions"
    __table_args__ = (
        Index(
            "ix_blog_interactions_url_event_created",
            "normalized_url",
            "event_type",
            "created_at",
        ),
        Index(
            "ix_blog_interactions_request_event",
            "request_id",
            "event_type",
        ),
    )

    id: Mapped[int] = mapped_column(primary_key=True)
    event_uuid: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        unique=True,
        index=True,
    )
    # ON DELETE SET NULL on both links: the event row is kept when the
    # referenced request or impression is deleted.
    request_id: Mapped[int | None] = mapped_column(
        ForeignKey("recommendation_requests.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    impression_id: Mapped[int | None] = mapped_column(
        ForeignKey("recommendation_impressions.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    normalized_url: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    event_type: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    position: Mapped[int | None] = mapped_column(Integer, nullable=True)
    entrance_kind: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    entrance_url: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    interaction_order: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=1,
    )
    visitor_id: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    user_id: Mapped[int | None] = mapped_column(
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    session_id: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    client_event_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )
    attributes_json: Mapped[dict[str, object]] = mapped_column(
        JSON,
        nullable=False,
        default=dict,
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )


class AdminHourlyStatsModel(Base):
    """Snapshot of admin dashboard statistics for one hour.

    Args:
        None. SQLAlchemy builds instances from the mapped keyword arguments.

    Returns:
        One natural-hour aggregate row refreshed from source tables.
    """

    __tablename__ = "admin_hourly_stats"
    # At most one snapshot row per hour bucket.
    __table_args__ = (
        UniqueConstraint("hour_start", name="uq_admin_hourly_stats_hour_start"),
    )

    id: Mapped[int] = mapped_column(primary_key=True)
    hour_start: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        index=True,
    )
    user_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    random_request_count: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=0,
    )
    random_impression_count: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=0,
    )
    detail_open_count: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=0,
    )
    external_open_count: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=0,
    )
    detail_ctr: Mapped[float] = mapped_column(Float, nullable=False, default=0.0)
    external_ctr: Mapped[float] = mapped_column(Float, nullable=False, default=0.0)
    total_click_ctr: Mapped[float] = mapped_column(
        Float,
        nullable=False,
        default=0.0,
    )
    refreshed_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
a/persistence_api/repository.py +++ b/persistence_api/repository.py @@ -36,21 +36,29 @@ from persistence_api.db import create_persistence_engine from persistence_api.db import create_session_factory from persistence_api.db import session_scope +from persistence_api.email_delivery import EmailDelivery +from persistence_api.email_delivery import NoopEmailDelivery from persistence_api.models import Base +from persistence_api.models import AdminHourlyStatsModel from persistence_api.models import BlogLabelModel from persistence_api.models import BlogLabelTagModel +from persistence_api.models import BlogInteractionModel from persistence_api.models import BlogUserLabelModel from persistence_api.models import BlogUserLabelSelectionModel from persistence_api.models import BlogModel -from persistence_api.models import BlogDedupScanRunItemModel -from persistence_api.models import BlogDedupScanRunModel from persistence_api.models import EdgeModel -from persistence_api.models import IngestionRequestModel from persistence_api.models import RawDiscoveredUrlModel +from persistence_api.models import RecommendationImpressionModel +from persistence_api.models import RecommendationRequestModel +from persistence_api.models import SeedModel +from persistence_api.models import PendingUserRegistrationModel +from persistence_api.models import UserAuditEventModel from persistence_api.models import UserModel +from persistence_api.models import UserVerificationTokenModel from persistence_api.models import UserSessionModel from persistence_api.recommendations import collect_friends_of_friends_candidates from crawler.crawling.decisions.chain import build_url_decision_chain +from crawler.crawling.decisions.base import UrlCandidateContext from crawler.crawling.normalization import IDENTITY_RULESET_VERSION from crawler.crawling.normalization import BlogIdentityResolution from crawler.crawling.normalization import normalize_url @@ -59,6 +67,8 @@ from shared.config import Settings from 
shared.observability import get_logger +BLOG_ACCEPTANCE_ACCEPTED = "ACCEPTED" +BLOG_ACCEPTANCE_UNKNOWN = "UNKNOWN" BLOG_CATALOG_ALLOWED_STATUSES = frozenset({status.value for status in CrawlStatus}) BLOG_CATALOG_DEFAULT_PAGE_SIZE = 50 BLOG_CATALOG_MAX_PAGE_SIZE = 200 @@ -66,7 +76,9 @@ BLOG_CATALOG_ALLOWED_SORTS = frozenset( {"id_asc", "id_desc", "recent_activity", "connections", "recently_discovered", "random"} ) -INGESTION_PRIORITY_LIST_LIMIT = 20 +BLOG_CATALOG_ALLOWED_ACCEPTANCE_STATUSES = frozenset( + {BLOG_ACCEPTANCE_ACCEPTED, BLOG_ACCEPTANCE_UNKNOWN, "REJECTED"} +) BLOG_LABELING_DEFAULT_PAGE_SIZE = 50 BLOG_LABELING_MAX_PAGE_SIZE = 200 BLOG_LABELING_DEFAULT_SORT = "id_desc" @@ -88,26 +100,26 @@ BLOG_LABEL_BLOG_ID = BLOG_LABEL_NAME_TO_ID["blog"] RAW_DISCOVERED_URL_DUPLICATE_STATUS = "rule:duplicate_url" RAW_DISCOVERED_URL_SUCCESS_STATUS = "success" +RANDOM_RECOMMENDATION_SURFACE = "random_blog_page" +RANDOM_RECOMMENDATION_STRATEGY = "weighted_random" +RANDOM_RECOMMENDATION_STRATEGY_VERSION = "v1" +RECOMMENDATION_EVENT_TYPES = frozenset( + {"click", "detail_open", "external_open", "label_select", "refresh", "dismiss", "copy_url"} +) REPOSITORY_LOGGER_NAME = "heyblog.repository" LOGGER = get_logger(REPOSITORY_LOGGER_NAME) -INGESTION_REQUEST_STATUS_RECEIVED = "RECEIVED" -INGESTION_REQUEST_STATUS_DEDUPED_EXISTING = "DEDUPED_EXISTING" -INGESTION_REQUEST_STATUS_QUEUED = "QUEUED" -INGESTION_REQUEST_STATUS_CRAWLING_SEED = "CRAWLING_SEED" -INGESTION_REQUEST_STATUS_COMPLETED = "COMPLETED" -INGESTION_REQUEST_STATUS_FAILED = "FAILED" -INGESTION_REQUEST_STATUS_EXPIRED = "EXPIRED" -ACTIVE_INGESTION_REQUEST_STATUSES = frozenset( - { - INGESTION_REQUEST_STATUS_RECEIVED, - INGESTION_REQUEST_STATUS_QUEUED, - INGESTION_REQUEST_STATUS_CRAWLING_SEED, - } -) EMAIL_PATTERN = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$") PASSWORD_MIN_LENGTH = 8 USER_SESSION_TTL_DAYS = 30 PASSWORD_HASH_ITERATIONS = 210_000 +USER_ROLE_ADMIN = "admin" +USER_ROLE_USER = "user" +USER_ROLES = 
frozenset({USER_ROLE_ADMIN, USER_ROLE_USER}) +USER_TOKEN_EMAIL_VERIFICATION = "email_verification" +USER_TOKEN_PASSWORD_RESET = "password_reset" +USER_EMAIL_VERIFICATION_TTL_HOURS = 24 +USER_PASSWORD_RESET_TTL_HOURS = 2 +PENDING_REGISTRATION_TTL_HOURS = 24 class BlogLabelingError(Exception): @@ -238,6 +250,19 @@ def _hash_session_token(token: str) -> str: return hashlib.sha256(token.encode("utf-8")).hexdigest() +def _hash_user_lifecycle_token(token: str) -> str: + """Return the storage hash for an email verification or reset token. + + Args: + token: Raw lifecycle token returned once to a caller. + + Returns: + SHA-256 hex digest used for lookup without storing the raw token. + """ + + return hashlib.sha256(token.encode("utf-8")).hexdigest() + + def _user_payload(model: UserModel) -> dict[str, Any]: """Return the public user profile payload. @@ -252,11 +277,35 @@ def _user_payload(model: UserModel) -> dict[str, Any]: "id": int(model.id), "email": model.email, "display_name": model.display_name, + "role": model.role, + "is_active": bool(model.is_active), + "email_verified": model.email_verified_at is not None, + "email_verified_at": _iso(model.email_verified_at), "created_at": _iso(model.created_at), "updated_at": _iso(model.updated_at), } +def _user_admin_payload(model: UserModel) -> dict[str, Any]: + """Return an admin-safe user management payload. + + Args: + model: User database row. + + Returns: + JSON-serializable account summary for admin user lists. 
+ """ + + payload = _user_payload(model) + payload.update( + { + "last_login_at": _iso(model.last_login_at), + "password_changed_at": _iso(model.password_changed_at), + } + ) + return payload + + def _sortable_datetime(value: datetime | None) -> datetime: if value is None: return datetime.min.replace(tzinfo=UTC) @@ -269,6 +318,80 @@ def _iso(value: datetime | None) -> str | None: return value.isoformat() if value is not None else None +def _clean_event_text(value: str, *, field: str, max_length: int = 256) -> str: + """Return a non-empty event text field or raise a stable validation error. + + Args: + value: Raw event field value supplied by a caller. + field: Field name included in the validation error. + max_length: Maximum accepted character length. + + Returns: + Trimmed field value. + + Raises: + ValueError: Raised when the value is blank or too long. + """ + + cleaned = str(value or "").strip() + if not cleaned: + raise ValueError(f"{field}_required") + if len(cleaned) > max_length: + raise ValueError(f"{field}_too_long") + return cleaned + + +def _coerce_json_object(value: dict[str, Any] | None) -> dict[str, Any]: + """Return a JSON object payload with unsupported values normalized by JSON. + + Args: + value: Optional JSON-like mapping supplied by a caller. + + Returns: + A JSON-serializable dictionary. + + Raises: + ValueError: Raised when the mapping cannot be encoded as JSON. + """ + + if value is None: + return {} + try: + return json.loads(json.dumps(value, ensure_ascii=True, default=str)) + except (TypeError, ValueError) as exc: + raise ValueError("invalid_json_attributes") from exc + + +def _parse_event_datetime(value: str | datetime | None) -> datetime | None: + """Return an optional timezone-aware client event timestamp. + + Args: + value: ISO datetime string, datetime instance, or `None`. + + Returns: + Parsed datetime with UTC timezone when supplied. + + Raises: + ValueError: Raised when the value cannot be parsed. 
+ """ + + if value is None or isinstance(value, datetime): + parsed = value + else: + normalized = str(value).strip() + if not normalized: + return None + try: + parsed = datetime.fromisoformat(normalized.replace("Z", "+00:00")) + except ValueError as exc: + raise ValueError("invalid_client_event_at") from exc + if parsed is None: + return None + if parsed.tzinfo is None: + return parsed.replace(tzinfo=UTC) + return parsed.astimezone(UTC) + + def _business_blog_id(model: BlogModel | None) -> int | None: """Return the stable business blog identifier for one blog row.""" if model is None: @@ -449,6 +572,7 @@ def normalize_blog_catalog_query( has_title: bool | str | None = None, has_icon: bool | str | None = None, min_connections: int | str | None = None, + acceptance_status: str | None = BLOG_ACCEPTANCE_ACCEPTED, ) -> dict[str, Any]: """Normalize catalog query params into one shared spec.""" normalized_statuses: list[str] | None = None @@ -477,6 +601,11 @@ def normalize_blog_catalog_query( normalized_sort = _normalize_catalog_text(sort) or BLOG_CATALOG_DEFAULT_SORT if normalized_sort not in BLOG_CATALOG_ALLOWED_SORTS: raise ValueError(f"Unsupported blog catalog sort: {normalized_sort}") + normalized_acceptance_status = _normalize_catalog_text(acceptance_status) + if normalized_acceptance_status is not None: + normalized_acceptance_status = normalized_acceptance_status.upper() + if normalized_acceptance_status not in BLOG_CATALOG_ALLOWED_ACCEPTANCE_STATUSES: + raise ValueError(f"Unsupported blog acceptance status: {normalized_acceptance_status}") return { "page": max(page, 1), @@ -490,6 +619,7 @@ def normalize_blog_catalog_query( "has_title": _normalize_catalog_bool(has_title), "has_icon": _normalize_catalog_bool(has_icon), "min_connections": _normalize_catalog_int(min_connections), + "acceptance_status": normalized_acceptance_status, } @@ -580,10 +710,9 @@ def ensure_legacy_compat_schema(engine: Any) -> None: """Apply additive compatibility fixes needed by existing 
persistence databases.""" inspector = inspect(engine) existing_tables = set(inspector.get_table_names()) - if "blogs" not in existing_tables or "ingestion_requests" not in existing_tables: + if "blogs" not in existing_tables: return blog_columns = {column["name"] for column in inspector.get_columns("blogs")} - ingestion_columns = {column["name"] for column in inspector.get_columns("ingestion_requests")} with engine.begin() as connection: if "email" not in blog_columns: connection.execute(text("ALTER TABLE blogs ADD COLUMN email TEXT")) @@ -599,37 +728,40 @@ def ensure_legacy_compat_schema(engine: Any) -> None: connection.execute( text("ALTER TABLE blogs ADD COLUMN identity_ruleset_version TEXT DEFAULT '' NOT NULL") ) - if "identity_key" not in ingestion_columns: - connection.execute(text("ALTER TABLE ingestion_requests ADD COLUMN identity_key TEXT")) - if "identity_reason_codes" not in ingestion_columns: - connection.execute( - text( - "ALTER TABLE ingestion_requests ADD COLUMN identity_reason_codes TEXT DEFAULT '[]' NOT NULL" - ) - ) - if "identity_ruleset_version" not in ingestion_columns: - connection.execute( - text( - "ALTER TABLE ingestion_requests ADD COLUMN identity_ruleset_version TEXT DEFAULT '' NOT NULL" - ) - ) - if "blog_dedup_scan_runs" in existing_tables: - run_columns = {column["name"] for column in inspector.get_columns("blog_dedup_scan_runs")} - if "total_count" not in run_columns: - connection.execute( - text("ALTER TABLE blog_dedup_scan_runs ADD COLUMN total_count INTEGER DEFAULT 0 NOT NULL") - ) if "ix_blogs_identity_key" not in {index["name"] for index in inspector.get_indexes("blogs")}: connection.execute(text("CREATE INDEX IF NOT EXISTS ix_blogs_identity_key ON blogs (identity_key)")) - if "ix_ingestion_requests_identity_key" not in { - index["name"] for index in inspector.get_indexes("ingestion_requests") - }: - connection.execute( - text( - "CREATE INDEX IF NOT EXISTS ix_ingestion_requests_identity_key " - "ON ingestion_requests 
(identity_key)" + if "seeds" not in existing_tables: + if connection.dialect.name == "postgresql": + connection.execute( + text( + "CREATE TABLE seeds (" + "id SERIAL PRIMARY KEY, " + "url TEXT NOT NULL, " + "normalized_url TEXT NOT NULL UNIQUE, " + "domain TEXT NOT NULL, " + "source_path TEXT, " + "source_row INTEGER, " + "blog_id INTEGER REFERENCES blogs(blog_id) ON DELETE SET NULL, " + "imported_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP NOT NULL, " + "updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP NOT NULL)" + ) ) - ) + else: + connection.execute( + text( + "CREATE TABLE seeds (" + "id INTEGER PRIMARY KEY, " + "url TEXT NOT NULL, " + "normalized_url TEXT NOT NULL UNIQUE, " + "domain TEXT NOT NULL, " + "source_path TEXT, " + "source_row INTEGER, " + "blog_id INTEGER REFERENCES blogs(blog_id) ON DELETE SET NULL, " + "imported_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL, " + "updated_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL)" + ) + ) + connection.execute(text("CREATE INDEX IF NOT EXISTS ix_seeds_normalized_url ON seeds (normalized_url)")) if "blog_labels" not in existing_tables: if connection.dialect.name == "postgresql": connection.execute( @@ -704,6 +836,17 @@ def ensure_legacy_compat_schema(engine: Any) -> None: "email TEXT NOT NULL UNIQUE, " "password_hash TEXT NOT NULL, " "display_name TEXT DEFAULT '' NOT NULL, " + "role TEXT DEFAULT 'user' NOT NULL, " + "is_active BOOLEAN DEFAULT 1 NOT NULL, " + "email_verified_at " + + ("TIMESTAMP WITH TIME ZONE" if connection.dialect.name == "postgresql" else "DATETIME") + + ", " + "password_changed_at " + + ("TIMESTAMP WITH TIME ZONE" if connection.dialect.name == "postgresql" else "DATETIME") + + ", " + "last_login_at " + + ("TIMESTAMP WITH TIME ZONE" if connection.dialect.name == "postgresql" else "DATETIME") + + ", " "created_at " + ("TIMESTAMP WITH TIME ZONE" if connection.dialect.name == "postgresql" else "DATETIME") + " DEFAULT CURRENT_TIMESTAMP NOT NULL, " @@ -714,6 +857,18 @@ def 
ensure_legacy_compat_schema(engine: Any) -> None: ) connection.execute(text("CREATE INDEX IF NOT EXISTS ix_users_email ON users (email)")) existing_tables.add("users") + user_columns = {column["name"] for column in inspector.get_columns("users")} + user_timestamp_type = "TIMESTAMP WITH TIME ZONE" if connection.dialect.name == "postgresql" else "DATETIME" + if "role" not in user_columns: + connection.execute(text("ALTER TABLE users ADD COLUMN role TEXT DEFAULT 'user' NOT NULL")) + if "is_active" not in user_columns: + connection.execute(text("ALTER TABLE users ADD COLUMN is_active BOOLEAN DEFAULT 1 NOT NULL")) + if "email_verified_at" not in user_columns: + connection.execute(text(f"ALTER TABLE users ADD COLUMN email_verified_at {user_timestamp_type}")) + if "password_changed_at" not in user_columns: + connection.execute(text(f"ALTER TABLE users ADD COLUMN password_changed_at {user_timestamp_type}")) + if "last_login_at" not in user_columns: + connection.execute(text(f"ALTER TABLE users ADD COLUMN last_login_at {user_timestamp_type}")) if "user_sessions" not in existing_tables: connection.execute( text( @@ -735,6 +890,88 @@ def ensure_legacy_compat_schema(engine: Any) -> None: connection.execute(text("CREATE INDEX IF NOT EXISTS ix_user_sessions_user_id ON user_sessions (user_id)")) connection.execute(text("CREATE INDEX IF NOT EXISTS ix_user_sessions_token_hash ON user_sessions (token_hash)")) existing_tables.add("user_sessions") + if "user_verification_tokens" not in existing_tables: + connection.execute( + text( + "CREATE TABLE user_verification_tokens (" + "id INTEGER PRIMARY KEY, " + "user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE, " + "token_hash TEXT NOT NULL UNIQUE, " + "purpose TEXT NOT NULL, " + "created_at " + + user_timestamp_type + + " DEFAULT CURRENT_TIMESTAMP NOT NULL, " + "expires_at " + + user_timestamp_type + + " NOT NULL, " + "consumed_at " + + user_timestamp_type + + ")" + ) + ) + connection.execute( + text("CREATE INDEX IF NOT 
EXISTS ix_user_verification_tokens_user_id ON user_verification_tokens (user_id)") + ) + connection.execute( + text( + "CREATE INDEX IF NOT EXISTS ix_user_verification_tokens_token_hash " + "ON user_verification_tokens (token_hash)" + ) + ) + connection.execute( + text("CREATE INDEX IF NOT EXISTS ix_user_verification_tokens_purpose ON user_verification_tokens (purpose)") + ) + existing_tables.add("user_verification_tokens") + if "pending_user_registrations" not in existing_tables: + pending_id_type = "SERIAL PRIMARY KEY" if connection.dialect.name == "postgresql" else "INTEGER PRIMARY KEY" + connection.execute( + text( + "CREATE TABLE pending_user_registrations (" + f"id {pending_id_type}, " + "email TEXT NOT NULL UNIQUE, " + "password_hash TEXT NOT NULL, " + "token_hash TEXT NOT NULL UNIQUE, " + "created_at " + + user_timestamp_type + + " DEFAULT CURRENT_TIMESTAMP NOT NULL, " + "expires_at " + + user_timestamp_type + + " NOT NULL, " + "consumed_at " + + user_timestamp_type + + ")" + ) + ) + connection.execute( + text("CREATE INDEX IF NOT EXISTS ix_pending_user_registrations_email ON pending_user_registrations (email)") + ) + connection.execute( + text( + "CREATE INDEX IF NOT EXISTS ix_pending_user_registrations_token_hash " + "ON pending_user_registrations (token_hash)" + ) + ) + existing_tables.add("pending_user_registrations") + if "user_audit_events" not in existing_tables: + json_type = "JSONB" if connection.dialect.name == "postgresql" else "JSON" + json_default = "'{}'::jsonb" if connection.dialect.name == "postgresql" else "'{}'" + connection.execute( + text( + "CREATE TABLE user_audit_events (" + "id INTEGER PRIMARY KEY, " + "user_id INTEGER REFERENCES users(id) ON DELETE SET NULL, " + "event_type TEXT NOT NULL, " + f"details {json_type} DEFAULT {json_default} NOT NULL, " + "created_at " + + user_timestamp_type + + " DEFAULT CURRENT_TIMESTAMP NOT NULL)" + ) + ) + connection.execute(text("CREATE INDEX IF NOT EXISTS ix_user_audit_events_user_id ON 
user_audit_events (user_id)")) + connection.execute( + text("CREATE INDEX IF NOT EXISTS ix_user_audit_events_event_type ON user_audit_events (event_type)") + ) + existing_tables.add("user_audit_events") if "blog_user_label_selections" not in existing_tables: connection.execute( text( @@ -762,6 +999,50 @@ def ensure_legacy_compat_schema(engine: Any) -> None: ) ) existing_tables.add("blog_user_label_selections") + if "blog_interactions" in existing_tables: + interaction_columns = {column["name"] for column in inspector.get_columns("blog_interactions")} + if "entrance_kind" not in interaction_columns: + connection.execute( + text("ALTER TABLE blog_interactions ADD COLUMN entrance_kind TEXT NOT NULL DEFAULT 'legacy_unknown'") + ) + if connection.dialect.name == "postgresql": + connection.execute(text("ALTER TABLE blog_interactions ALTER COLUMN entrance_kind DROP DEFAULT")) + if "entrance_url" not in interaction_columns: + connection.execute( + text("ALTER TABLE blog_interactions ADD COLUMN entrance_url TEXT NOT NULL DEFAULT 'legacy_unknown'") + ) + if connection.dialect.name == "postgresql": + connection.execute(text("ALTER TABLE blog_interactions ALTER COLUMN entrance_url DROP DEFAULT")) + connection.execute( + text("CREATE INDEX IF NOT EXISTS ix_blog_interactions_entrance_kind ON blog_interactions (entrance_kind)") + ) + connection.execute( + text("CREATE INDEX IF NOT EXISTS ix_blog_interactions_entrance_url ON blog_interactions (entrance_url)") + ) + if "admin_hourly_stats" not in existing_tables: + stats_id_type = "SERIAL PRIMARY KEY" if connection.dialect.name == "postgresql" else "INTEGER PRIMARY KEY" + stats_timestamp_type = "TIMESTAMP WITH TIME ZONE" if connection.dialect.name == "postgresql" else "DATETIME" + connection.execute( + text( + "CREATE TABLE admin_hourly_stats (" + f"id {stats_id_type}, " + f"hour_start {stats_timestamp_type} NOT NULL UNIQUE, " + "user_count INTEGER DEFAULT 0 NOT NULL, " + "random_request_count INTEGER DEFAULT 0 NOT NULL, " + 
"random_impression_count INTEGER DEFAULT 0 NOT NULL, " + "detail_open_count INTEGER DEFAULT 0 NOT NULL, " + "external_open_count INTEGER DEFAULT 0 NOT NULL, " + "detail_ctr FLOAT DEFAULT 0 NOT NULL, " + "external_ctr FLOAT DEFAULT 0 NOT NULL, " + "total_click_ctr FLOAT DEFAULT 0 NOT NULL, " + f"refreshed_at {stats_timestamp_type} DEFAULT CURRENT_TIMESTAMP NOT NULL, " + f"created_at {stats_timestamp_type} DEFAULT CURRENT_TIMESTAMP NOT NULL)" + ) + ) + connection.execute( + text("CREATE INDEX IF NOT EXISTS ix_admin_hourly_stats_hour_start ON admin_hourly_stats (hour_start)") + ) + existing_tables.add("admin_hourly_stats") if "blog_label_tags" not in existing_tables: connection.execute( text( @@ -934,6 +1215,37 @@ def ensure_legacy_compat_schema(engine: Any) -> None: connection.execute( text(f'ALTER TABLE raw_discovered_urls DROP CONSTRAINT IF EXISTS "{constraint_name}"') ) + if "blogs" in existing_tables: + blog_columns = {column["name"] for column in inspector.get_columns("blogs")} + for column_name, ddl in ( + ("acceptance_status", "ALTER TABLE blogs ADD COLUMN acceptance_status TEXT NOT NULL DEFAULT 'UNKNOWN'"), + ("accepted_by", "ALTER TABLE blogs ADD COLUMN accepted_by TEXT"), + ("accepted_at", "ALTER TABLE blogs ADD COLUMN accepted_at TIMESTAMP"), + ("crawl_error_kind", "ALTER TABLE blogs ADD COLUMN crawl_error_kind TEXT"), + ("crawl_error_message", "ALTER TABLE blogs ADD COLUMN crawl_error_message TEXT"), + ("last_crawl_attempt_at", "ALTER TABLE blogs ADD COLUMN last_crawl_attempt_at TIMESTAMP"), + ("successful_crawl_at", "ALTER TABLE blogs ADD COLUMN successful_crawl_at TIMESTAMP"), + ): + if column_name not in blog_columns: + connection.execute(text(ddl)) + connection.execute( + text( + "UPDATE blogs SET acceptance_status = 'ACCEPTED', " + "accepted_by = COALESCE(accepted_by, 'seed'), " + "accepted_at = COALESCE(accepted_at, created_at) " + "WHERE acceptance_status = 'UNKNOWN' " + "AND blog_id NOT IN (SELECT to_blog_id FROM edges)" + ) + ) + 
connection.execute( + text( + "UPDATE blogs SET acceptance_status = 'ACCEPTED', " + "accepted_by = COALESCE(accepted_by, 'graph'), " + "accepted_at = COALESCE(accepted_at, created_at) " + "WHERE acceptance_status = 'UNKNOWN' " + "AND blog_id IN (SELECT from_blog_id FROM edges UNION SELECT to_blog_id FROM edges)" + ) + ) blog_rows = connection.execute( text( "SELECT id, blog_id, url, normalized_url, domain, identity_key, identity_ruleset_version " @@ -962,40 +1274,6 @@ def ensure_legacy_compat_schema(engine: Any) -> None: "domain": str(row["domain"] or identity.domain), }, ) - ingestion_rows = connection.execute( - text( - "SELECT id, requested_url, normalized_url, identity_key, identity_ruleset_version " - "FROM ingestion_requests" - ) - ).mappings().all() - for row in ingestion_rows: - needs_refresh = ( - not row["identity_key"] - or str(row["identity_ruleset_version"] or "") != IDENTITY_RULESET_VERSION - ) - if not needs_refresh: - continue - identity = resolve_blog_identity(str(row["requested_url"]) or str(row["normalized_url"])) - storage_url = ( - identity.canonical_url - if _uses_tenant_root_canonicalization(identity.reason_codes) - else normalize_url(str(row["requested_url"]) or str(row["normalized_url"])).normalized_url - ) - connection.execute( - text( - "UPDATE ingestion_requests SET identity_key = :identity_key, " - "identity_reason_codes = :reason_codes, identity_ruleset_version = :ruleset_version, " - "normalized_url = :normalized_url " - "WHERE id = :request_id" - ), - { - "request_id": row["id"], - "identity_key": identity.identity_key, - "reason_codes": _dump_reason_codes(identity.reason_codes), - "ruleset_version": identity.ruleset_version, - "normalized_url": storage_url, - }, - ) def _resolved_blog_title(model: BlogModel) -> str: title = (model.title or "").strip() @@ -1008,11 +1286,7 @@ def _resolved_blog_icon_url(model: BlogModel) -> str | None: icon_url = (model.icon_url or "").strip() if icon_url: return icon_url - - parsed = 
urlparse(model.url) - if parsed.scheme not in {"http", "https"} or not parsed.netloc: - return None - return f"{parsed.scheme}://{parsed.netloc}/favicon.ico" + return None @dataclass(frozen=True, slots=True) @@ -1061,6 +1335,13 @@ def as_blog_payload( "title": self.title, "icon_url": self.icon_url, "status_code": self.model.status_code, + "acceptance_status": self.model.acceptance_status, + "accepted_by": self.model.accepted_by, + "accepted_at": _iso(self.model.accepted_at), + "crawl_error_kind": self.model.crawl_error_kind, + "crawl_error_message": self.model.crawl_error_message, + "last_crawl_attempt_at": _iso(self.model.last_crawl_attempt_at), + "successful_crawl_at": _iso(self.model.successful_crawl_at), "crawl_status": self.model.crawl_status.value, "friend_links_count": int(self.model.friend_links_count), "last_crawled_at": _iso(self.model.last_crawled_at), @@ -1101,87 +1382,6 @@ def as_public_summary_payload(self) -> dict[str, Any]: } -@dataclass(frozen=True, slots=True) -class _IngestionRequestPayloadView: - """Hold one ingestion request plus its related blogs and expose output slices.""" - - model: IngestionRequestModel - seed_blog_view: _BlogPayloadView | None - matched_blog_view: _BlogPayloadView | None - - @classmethod - def from_model( - cls, - model: IngestionRequestModel, - *, - seed_blog: BlogModel | None = None, - matched_blog: BlogModel | None = None, - ) -> _IngestionRequestPayloadView: - """Return the resolved request view for one ingestion request row.""" - return cls( - model=model, - seed_blog_view=_BlogPayloadView.from_model(seed_blog), - matched_blog_view=_BlogPayloadView.from_model(matched_blog), - ) - - def _resolved_blog_view(self) -> _BlogPayloadView | None: - """Return the matched blog when present, otherwise the seed blog.""" - return self.matched_blog_view or self.seed_blog_view - - def _resolved_blog_id(self) -> int | None: - """Return the business id of the resolved blog used by public payloads.""" - resolved_blog_view = 
self._resolved_blog_view() - return resolved_blog_view.blog_id if resolved_blog_view is not None else None - - def as_full_payload(self) -> dict[str, Any]: - """Return the full ingestion request payload used by private flows.""" - resolved_blog_view = self._resolved_blog_view() - return { - "id": int(self.model.id), - "request_id": int(self.model.id), - "requested_url": self.model.requested_url, - "normalized_url": self.model.normalized_url, - "identity_key": self.model.identity_key, - "identity_reason_codes": _load_reason_codes(self.model.identity_reason_codes), - "identity_ruleset_version": self.model.identity_ruleset_version, - "email": self.model.requester_email, - "status": self.model.status, - "priority": int(self.model.priority), - "seed_blog_id": int(self.model.seed_blog_id) if self.model.seed_blog_id is not None else None, - "matched_blog_id": int(self.model.matched_blog_id) if self.model.matched_blog_id is not None else None, - "blog_id": self._resolved_blog_id(), - "request_token": self.model.request_token, - "expires_at": _iso(self.model.expires_at), - "error_message": self.model.error_message, - "created_at": _iso(self.model.created_at), - "updated_at": _iso(self.model.updated_at), - "seed_blog": self.seed_blog_view.as_blog_payload() if self.seed_blog_view is not None else None, - "matched_blog": self.matched_blog_view.as_blog_payload() if self.matched_blog_view is not None else None, - "blog": resolved_blog_view.as_blog_payload() if resolved_blog_view is not None else None, - } - - def as_priority_payload(self) -> dict[str, Any]: - """Return the public priority-list payload with private fields removed.""" - resolved_blog_view = self._resolved_blog_view() - return { - "request_id": int(self.model.id), - "requested_url": self.model.requested_url, - "normalized_url": self.model.normalized_url, - "status": self.model.status, - "seed_blog_id": int(self.model.seed_blog_id) if self.model.seed_blog_id is not None else None, - "matched_blog_id": 
int(self.model.matched_blog_id) if self.model.matched_blog_id is not None else None, - "blog_id": self._resolved_blog_id(), - "error_message": self.model.error_message, - "created_at": _iso(self.model.created_at), - "updated_at": _iso(self.model.updated_at), - "blog": ( - resolved_blog_view.as_public_summary_payload() - if resolved_blog_view is not None - else None - ), - } - - def _edge_payload(model: EdgeModel) -> dict[str, Any]: return { "id": int(model.id), @@ -1193,30 +1393,27 @@ def _edge_payload(model: EdgeModel) -> dict[str, Any]: } -def _ingestion_request_payload( - model: IngestionRequestModel, - *, - seed_blog: BlogModel | None = None, - matched_blog: BlogModel | None = None, -) -> dict[str, Any]: - return _IngestionRequestPayloadView.from_model( - model, - seed_blog=seed_blog, - matched_blog=matched_blog, - ).as_full_payload() +def _seed_payload(model: SeedModel) -> dict[str, Any]: + """Serialize one durable seed row for crawler bootstrap. + Args: + model: Seed ORM row to serialize. -def _priority_ingestion_request_payload( - model: IngestionRequestModel, - *, - seed_blog: BlogModel | None = None, - matched_blog: BlogModel | None = None, -) -> dict[str, Any]: - return _IngestionRequestPayloadView.from_model( - model, - seed_blog=seed_blog, - matched_blog=matched_blog, - ).as_priority_payload() + Returns: + Plain JSON-compatible seed payload. 
+ """ + + return { + "id": int(model.id), + "url": str(model.url), + "normalized_url": str(model.normalized_url), + "domain": str(model.domain), + "source_path": model.source_path, + "source_row": model.source_row, + "blog_id": model.blog_id, + "imported_at": _iso(model.imported_at), + "updated_at": _iso(model.updated_at), + } def _blog_lookup_payload( @@ -1434,82 +1631,6 @@ def as_payload(self) -> dict[str, Any]: } -@dataclass(frozen=True, slots=True) -class _MaintenanceRunPayloadView: - """Hold the shared lifecycle facts exposed by maintenance run summaries.""" - - run_id: int - status: str - crawler_was_running: bool - started_at: datetime | None - completed_at: datetime | None - error_message: str | None - created_at: datetime | None - updated_at: datetime | None - - @classmethod - def from_model( - cls, - model: BlogDedupScanRunModel, - ) -> _MaintenanceRunPayloadView: - """Return the shared lifecycle view for one maintenance run row.""" - return cls( - run_id=int(model.id), - status=str(model.status), - crawler_was_running=bool(model.crawler_was_running), - started_at=model.started_at, - completed_at=model.completed_at, - error_message=model.error_message, - created_at=model.created_at, - updated_at=model.updated_at, - ) - - def as_payload(self) -> dict[str, Any]: - """Return the shared lifecycle payload used by maintenance run summaries.""" - return { - "id": self.run_id, - "status": self.status, - "crawler_was_running": self.crawler_was_running, - "started_at": _iso(self.started_at), - "completed_at": _iso(self.completed_at), - "error_message": self.error_message, - "created_at": _iso(self.created_at), - "updated_at": _iso(self.updated_at), - } - - -def _blog_dedup_scan_run_payload(model: BlogDedupScanRunModel) -> dict[str, Any]: - run_view = _MaintenanceRunPayloadView.from_model(model) - return run_view.as_payload() | { - "ruleset_version": model.ruleset_version, - "duration_ms": int(model.duration_ms), - "total_count": int(model.total_count), - 
"scanned_count": int(model.scanned_count), - "removed_count": int(model.removed_count), - "kept_count": int(model.kept_count), - "crawler_restart_attempted": bool(model.crawler_restart_attempted), - "crawler_restart_succeeded": bool(model.crawler_restart_succeeded), - "search_reindexed": bool(model.search_reindexed), - } - - -def _blog_dedup_scan_run_item_payload(model: BlogDedupScanRunItemModel) -> dict[str, Any]: - return { - "id": int(model.id), - "run_id": int(model.run_id), - "survivor_blog_id": int(model.survivor_blog_id) if model.survivor_blog_id is not None else None, - "removed_blog_id": int(model.removed_blog_id) if model.removed_blog_id is not None else None, - "survivor_identity_key": model.survivor_identity_key, - "removed_url": model.removed_url, - "removed_normalized_url": model.removed_normalized_url, - "removed_domain": model.removed_domain, - "reason_code": model.reason_code, - "reason_codes": _load_reason_codes(model.reason_codes), - "survivor_selection_basis": model.survivor_selection_basis, - "created_at": _iso(model.created_at), - } - - def _decision_scan_ruleset_version(settings: Settings) -> str: """Describe the current URL decision-chain configuration in one string. @@ -1620,6 +1741,14 @@ def _recommended_blog_payload( } +DISCOVERY_SOURCE_LABELS = { + "seed": "种子导入", + "user": "用户手动添加", + "rss": "RSS 判定", + "model": "模型判定", +} + + class RepositoryProtocol(Protocol): """Protocol shared by in-process and HTTP-backed repositories.""" @@ -1635,13 +1764,16 @@ def upsert_blog( domain: str, email: str | None = None, feed_url: str | None = None, + accepted_by: str | None = None, + seed_source_path: str | None = None, + seed_source_row: int | None = None, ) -> tuple[int, bool]: ... - def get_next_waiting_blog(self, *, include_priority: bool = True) -> dict[str, Any] | None: ... + def list_seeds(self) -> list[dict[str, Any]]: ... - def get_next_priority_blog(self) -> dict[str, Any] | None: ... 
+ def get_next_waiting_blog(self) -> dict[str, Any] | None: ... - def create_ingestion_request(self, *, homepage_url: str, email: str) -> dict[str, Any]: ... + def create_user_seed(self, *, homepage_url: str) -> dict[str, Any]: ... def register_user(self, *, email: str, password: str) -> dict[str, Any]: ... @@ -1651,22 +1783,25 @@ def get_user_by_session_token(self, *, token: str) -> dict[str, Any] | None: ... def revoke_user_session(self, *, token: str) -> bool: ... - def list_user_label_selections(self, *, user_id: int, limit: int = 50) -> list[dict[str, Any]]: ... + def request_email_verification(self, *, email: str) -> dict[str, Any]: ... - def count_user_label_selections(self, *, user_id: int) -> int: ... + def confirm_email_verification(self, *, token: str) -> dict[str, Any]: ... - def get_ingestion_request( - self, - *, - request_id: int, - request_token: str, - ) -> dict[str, Any] | None: ... + def request_password_reset(self, *, email: str) -> dict[str, Any]: ... + + def reset_user_password(self, *, token: str, password: str) -> dict[str, Any]: ... + + def list_users(self, *, page: int = 1, page_size: int = 50) -> dict[str, Any]: ... - def list_priority_ingestion_requests(self, *, limit: int = INGESTION_PRIORITY_LIST_LIMIT) -> list[dict[str, Any]]: ... + def update_user_role(self, *, user_id: int, role: str) -> dict[str, Any]: ... + + def list_user_label_selections(self, *, user_id: int, limit: int = 50) -> list[dict[str, Any]]: ... + + def count_user_label_selections(self, *, user_id: int) -> int: ... def lookup_blog_candidates(self, *, url: str) -> dict[str, Any]: ... - def mark_ingestion_request_crawling(self, *, blog_id: int) -> None: ... + def find_blog_id_by_normalized_url(self, *, normalized_url: str) -> int | None: ... 
def mark_blog_result( self, @@ -1678,6 +1813,8 @@ def mark_blog_result( metadata_captured: bool = False, title: str | None = None, icon_url: str | None = None, + crawl_error_kind: str | None = None, + crawl_error_message: str | None = None, ) -> None: ... def add_edge( @@ -1731,8 +1868,46 @@ def list_blogs_catalog( has_title: bool | str | None = None, has_icon: bool | str | None = None, min_connections: int | None = None, + acceptance_status: str | None = BLOG_ACCEPTANCE_ACCEPTED, ) -> dict[str, Any]: ... + def create_random_recommendation_batch( + self, + *, + count: int = 9, + visitor_id: str, + session_id: str, + user_id: int | None = None, + source: str | None = None, + page_url: str | None = None, + context: dict[str, Any] | None = None, + ) -> dict[str, Any]: ... + + def record_blog_interaction( + self, + *, + event_uuid: str, + event_type: str, + blog_id: int, + visitor_id: str, + session_id: str, + entrance_kind: str, + entrance_url: str, + request_uuid: str | None = None, + impression_id: int | None = None, + position: int | None = None, + interaction_order: int = 1, + user_id: int | None = None, + client_event_at: str | datetime | None = None, + attributes: dict[str, Any] | None = None, + ) -> dict[str, Any]: ... + + def get_blog_recommendation_stats(self, blog_id: int) -> dict[str, Any] | None: ... + + def get_recommendation_strategy_stats(self) -> dict[str, Any]: ... + + def get_admin_hourly_stats(self, *, limit: int = 24) -> dict[str, Any]: ... + def list_blog_labeling_candidates( self, *, @@ -1789,24 +1964,6 @@ def stats(self) -> dict[str, Any]: ... def get_filter_stats_by_chain_order(self) -> dict[str, Any]: ... - def create_blog_dedup_scan_run(self, *, crawler_was_running: bool = False) -> dict[str, Any]: ... - - def execute_blog_dedup_scan_run(self, *, run_id: int) -> dict[str, Any]: ... 
- - def finalize_blog_dedup_scan_run( - self, - *, - run_id: int, - crawler_restart_attempted: bool, - crawler_restart_succeeded: bool, - search_reindexed: bool, - error_message: str | None = None, - ) -> dict[str, Any]: ... - - def get_latest_blog_dedup_scan_run(self) -> dict[str, Any] | None: ... - - def list_blog_dedup_scan_run_items(self, run_id: int) -> list[dict[str, Any]]: ... - def reset(self) -> dict[str, Any]: ... @@ -1817,17 +1974,22 @@ class SQLAlchemyRepository: database_url: str decision_settings: Settings | None = None startup_schema_sync: bool = True + public_base_url: str = "http://127.0.0.1:3000" + email_delivery: EmailDelivery = field(default_factory=NoopEmailDelivery) + email_dev_expose_tokens: bool = False engine: Any = field(init=False, repr=False) session_factory: Any = field(init=False, repr=False) def __post_init__(self) -> None: + if self.decision_settings is not None: + self.public_base_url = self.decision_settings.public_base_url + self.email_dev_expose_tokens = self.decision_settings.email_dev_expose_tokens self.engine = create_persistence_engine(self.database_url) self.session_factory = create_session_factory(self.engine) if self.startup_schema_sync: Base.metadata.create_all(self.engine) ensure_legacy_compat_schema(self.engine) with session_scope(self.session_factory) as session: - self._fail_orphaned_dedup_scan_runs(session) self._requeue_processing(session) @property @@ -1841,29 +2003,6 @@ def _requeue_processing(self, session: Session) -> None: BlogModel.updated_at: now_utc(), } ) - session.query(IngestionRequestModel).filter( - IngestionRequestModel.status == INGESTION_REQUEST_STATUS_CRAWLING_SEED - ).update( - { - IngestionRequestModel.status: INGESTION_REQUEST_STATUS_QUEUED, - IngestionRequestModel.updated_at: now_utc(), - } - ) - - def _fail_orphaned_dedup_scan_runs(self, session: Session) -> None: - orphaned_runs = session.scalars( - select(BlogDedupScanRunModel).where(BlogDedupScanRunModel.status == "RUNNING") - ).all() - if 
not orphaned_runs: - return - failed_at = now_utc() - for run in orphaned_runs: - started_at = _sortable_datetime(run.started_at) - run.status = "FAILED" - run.completed_at = failed_at - run.duration_ms = max(int((failed_at - started_at).total_seconds() * 1000), 0) - run.error_message = "orphaned_dedup_scan_run_cleaned_on_startup" - run.updated_at = failed_at def _get_blog_by_business_id(self, session: Session, blog_id: int) -> BlogModel | None: """Return one blog row by business ``blog_id``.""" @@ -1958,52 +2097,6 @@ def _row_blog_payload(self, row: Any) -> dict[str, Any]: identity_complete=bool(row.identity_complete), ) - def _serialize_ingestion_request_payload( - self, - session: Session, - request: IngestionRequestModel, - *, - serializer: Callable[..., dict[str, Any]], - ) -> dict[str, Any]: - """Resolve request blogs once and pass them to the chosen serializer.""" - seed_blog, matched_blog = self._resolve_ingestion_request_blogs(session, request) - return serializer(request, seed_blog=seed_blog, matched_blog=matched_blog) - - def _serialize_ingestion_request_payloads( - self, - session: Session, - requests: list[IngestionRequestModel], - *, - serializer: Callable[..., dict[str, Any]], - ) -> list[dict[str, Any]]: - """Resolve and serialize multiple ingestion requests using the shared serializer handoff.""" - return [ - self._serialize_ingestion_request_payload( - session, - request, - serializer=serializer, - ) - for request in requests - ] - - def _resolve_ingestion_request_blogs( - self, - session: Session, - request: IngestionRequestModel, - ) -> tuple[BlogModel | None, BlogModel | None]: - """Resolve the seed and matched blogs referenced by one ingestion request.""" - seed_blog = ( - self._get_blog_by_business_id(session, request.seed_blog_id) - if request.seed_blog_id is not None - else None - ) - matched_blog = ( - self._get_blog_by_business_id(session, request.matched_blog_id) - if request.matched_blog_id is not None - else None - ) - return 
seed_blog, matched_blog - def _latest_row_payload( self, session: Session, @@ -2136,10 +2229,8 @@ def _random_blog_catalog_statement(self, statement: Any) -> Any: """ admin_non_blog = _non_blog_label_count_expr(BlogLabelModel.label_id) - user_blog_count = _json_label_count_expr(BlogUserLabelModel.label_id, BLOG_LABEL_BLOG_ID) user_non_blog_count = _non_blog_label_count_expr(BlogUserLabelModel.label_id) - raw_weight = cast(10 + user_blog_count, Float) / cast(1 + user_non_blog_count, Float) - random_weight = case((raw_weight > 10, 10.0), else_=raw_weight) + random_weight = cast(10, Float) / cast(1 + user_non_blog_count, Float) return ( statement.outerjoin(BlogLabelModel, BlogLabelModel.normalized_url == BlogModel.normalized_url) .outerjoin(BlogUserLabelModel, BlogUserLabelModel.normalized_url == BlogModel.normalized_url) @@ -2186,6 +2277,212 @@ def _blog_detail_relation_payloads( for edge in edges ] + def _blog_discovery_path_payload(self, session: Session, blog: BlogModel) -> dict[str, Any]: + """Return a compact discovery-path payload for one blog. + + Args: + session: Active database session used for tracing raw discoveries. + blog: Blog row being displayed on the detail page. + + Returns: + Payload with discovery mode and ordered path steps from origin to + target. Manual seed/user blogs return a single-step path. 
+ """ + + path_reversed: list[dict[str, Any]] = [] + visited_blog_ids: set[int] = set() + current_blog: BlogModel | None = blog + while current_blog is not None: + if current_blog is None: + break + current_blog_id = int(_business_blog_id(current_blog)) + if current_blog_id in visited_blog_ids: + break + visited_blog_ids.add(current_blog_id) + accepted_by = str(current_blog.accepted_by or "").strip().lower() + if accepted_by in {"seed", "user"}: + path_reversed.append(self._discovery_path_step(current_blog, raw=None, edge=None)) + break + incoming_edge = self._earliest_incoming_discovery_edge(session, current_blog_id) + raw = ( + self._success_raw_for_edge(session, incoming_edge) + if incoming_edge is not None + else self._earliest_success_raw_for_blog(session, current_blog) + ) + path_reversed.append(self._discovery_path_step(current_blog, raw=raw, edge=incoming_edge)) + source_blog_id = int(incoming_edge.from_blog_id) if incoming_edge is not None else ( + int(raw.source_blog_id) if raw is not None else None + ) + if source_blog_id is None: + break + current_blog = self._get_blog_by_business_id(session, source_blog_id) + + path = list(reversed(path_reversed)) + origin_source = str(path[0].get("accepted_by") or path[0].get("raw_accepted_by") or "").strip().lower() if path else "" + target_source = str(path[-1].get("accepted_by") or path[-1].get("raw_accepted_by") or "").strip().lower() if path else "" + mode = "manual" if target_source in {"seed", "user"} and len(path) == 1 else "crawled" + if origin_source in {"seed", "user"} and mode == "crawled": + origin_label = DISCOVERY_SOURCE_LABELS.get(origin_source, origin_source) + else: + origin_label = "发现链路" + return { + "mode": mode, + "origin_source": origin_source or None, + "origin_label": origin_label, + "target_source": target_source or None, + "truncated": False, + "steps": path, + } + + def _earliest_incoming_discovery_edge(self, session: Session, blog_id: int) -> EdgeModel | None: + """Return the earliest 
non-self incoming edge that discovered a blog.""" + + return session.scalar( + select(EdgeModel) + .where( + EdgeModel.to_blog_id == blog_id, + EdgeModel.from_blog_id != blog_id, + ) + .order_by(EdgeModel.discovered_at.asc(), EdgeModel.id.asc()) + .limit(1) + ) + + def _success_raw_for_edge(self, session: Session, edge: EdgeModel) -> RawDiscoveredUrlModel | None: + """Return the successful raw discovery row that produced one edge when available.""" + + candidate_urls = { + str(edge.link_url_raw or ""), + normalize_url(str(edge.link_url_raw or "")).normalized_url, + resolve_blog_identity(str(edge.link_url_raw or "")).canonical_url, + } + return session.scalar( + select(RawDiscoveredUrlModel) + .where( + RawDiscoveredUrlModel.source_blog_id == int(edge.from_blog_id), + RawDiscoveredUrlModel.normalized_url.in_([url for url in candidate_urls if url]), + RawDiscoveredUrlModel.status == RAW_DISCOVERED_URL_SUCCESS_STATUS, + ) + .order_by(RawDiscoveredUrlModel.id.asc()) + .limit(1) + ) + + def _earliest_success_raw_for_blog(self, session: Session, blog: BlogModel) -> RawDiscoveredUrlModel | None: + """Return the earliest successful raw discovery row for one blog.""" + + candidate_urls = {str(blog.normalized_url or ""), str(blog.url or "")} + identity = resolve_blog_identity(str(blog.url or blog.normalized_url or "")) + candidate_urls.add(identity.canonical_url) + normalized = normalize_url(str(blog.url or blog.normalized_url or "")) + candidate_urls.add(normalized.normalized_url) + return session.scalar( + select(RawDiscoveredUrlModel) + .where( + RawDiscoveredUrlModel.normalized_url.in_([url for url in candidate_urls if url]), + RawDiscoveredUrlModel.status == RAW_DISCOVERED_URL_SUCCESS_STATUS, + ) + .order_by(RawDiscoveredUrlModel.id.asc()) + .limit(1) + ) + + def _discovery_path_step( + self, + blog: BlogModel, + *, + raw: RawDiscoveredUrlModel | None, + edge: EdgeModel | None, + ) -> dict[str, Any]: + """Serialize one blog as a discovery path step.""" + + blog_view = 
_BlogPayloadView.from_model(blog) + accepted_by = str(blog.accepted_by or "").strip().lower() or None + raw_accepted_by = str(raw.accepted_by or "").strip().lower() if raw is not None else None + source = accepted_by or raw_accepted_by + raw_source_blog_id = int(raw.source_blog_id) if raw is not None else ( + int(edge.from_blog_id) if edge is not None else None + ) + discovered_at = _iso(raw.discovered_at) if raw is not None else ( + _iso(edge.discovered_at) if edge is not None else None + ) + return { + "blog": blog_view.as_neighbor_payload() if blog_view is not None else None, + "blog_id": int(_business_blog_id(blog)), + "url": str(blog.url or ""), + "domain": str(blog.domain or ""), + "accepted_by": accepted_by, + "accepted_label": DISCOVERY_SOURCE_LABELS.get(str(source or ""), source), + "raw_id": int(raw.id) if raw is not None else None, + "raw_source_blog_id": raw_source_blog_id, + "raw_accepted_by": raw_accepted_by or None, + "discovered_at": discovered_at, + } + + def _blog_relation_graph_payload( + self, + session: Session, + *, + blog: BlogModel, + direction: str, + depth: int = 2, + ) -> dict[str, Any]: + """Return a small directional relation graph around one blog. + + Args: + session: Active database session. + blog: Focus blog for the graph. + direction: Either ``incoming`` for upstream sources or + ``outgoing`` for downstream targets. + depth: Number of graph layers to traverse. + + Returns: + Payload with normalized graph nodes, directed edges, focus blog id, + direction, and depth metadata. 
+ """ + + focus_id = int(_business_blog_id(blog)) + node_ids: set[int] = {focus_id} + edges_by_id: dict[int, EdgeModel] = {} + frontier = {focus_id} + for layer_index in range(depth): + next_frontier: set[int] = set() + for current_id in sorted(frontier): + if direction == "incoming": + statement = ( + select(EdgeModel) + .where(EdgeModel.to_blog_id == current_id, EdgeModel.from_blog_id != current_id) + .order_by(EdgeModel.discovered_at.asc(), EdgeModel.id.asc()) + ) + else: + statement = ( + select(EdgeModel) + .where(EdgeModel.from_blog_id == current_id, EdgeModel.to_blog_id != current_id) + .order_by(EdgeModel.discovered_at.asc(), EdgeModel.id.asc()) + ) + layer_edges = session.scalars(statement).all() + for edge in layer_edges: + edges_by_id[int(edge.id)] = edge + related_id = int(edge.from_blog_id) if direction == "incoming" else int(edge.to_blog_id) + if related_id not in node_ids: + next_frontier.add(related_id) + node_ids.add(related_id) + frontier = next_frontier + if not frontier: + break + + blog_rows = session.execute( + self._blog_select()[0].where(BlogModel.blog_id.in_(sorted(node_ids))) + ).all() + nodes_by_id: dict[int, dict[str, Any]] = {} + for row in blog_rows: + payload = self._row_blog_payload(row) + nodes_by_id[int(payload["blog_id"])] = payload + return { + "direction": direction, + "focus_blog_id": focus_id, + "depth": depth, + "nodes": [nodes_by_id[node_id] for node_id in sorted(node_ids) if node_id in nodes_by_id], + "edges": [_edge_payload(edge) for edge in sorted(edges_by_id.values(), key=lambda item: int(item.id))], + } + def _recommended_blog_rows( self, session: Session, @@ -2270,6 +2567,7 @@ def _upsert_blog_in_session( domain: str, email: str | None = None, feed_url: str | None = None, + accepted_by: str | None = None, preferred_blog_id: int | None = None, ) -> tuple[BlogModel, bool]: """Create or update one blog row and initialize its business id. 
@@ -2282,6 +2580,8 @@ def _upsert_blog_in_session( email: Optional contact email to fill when the row is missing one. feed_url: Optional RSS/Atom feed URL discovered for the blog. Stored when present; an existing feed is never overwritten with ``None``. + accepted_by: Optional acceptance source such as ``seed``, ``rss``, + or ``model``. When present, the blog is durably accepted. preferred_blog_id: Preferred externally meaningful ``blogs.blog_id``. Returns: @@ -2313,6 +2613,12 @@ def _upsert_blog_in_session( existing.email = email if feed_url is not None and not (existing.feed_url or "").strip(): existing.feed_url = feed_url + if existing.acceptance_status != BLOG_ACCEPTANCE_ACCEPTED: + existing.acceptance_status = BLOG_ACCEPTANCE_ACCEPTED + existing.accepted_at = existing.accepted_at or now_utc() + if accepted_by is not None: + existing.accepted_by = accepted_by + existing.accepted_at = existing.accepted_at or now_utc() existing.identity_key = identity.identity_key existing.identity_reason_codes = _dump_reason_codes(identity.reason_codes) existing.identity_ruleset_version = identity.ruleset_version @@ -2335,6 +2641,9 @@ def _upsert_blog_in_session( domain=stored_domain, email=email, feed_url=feed_url, + acceptance_status=BLOG_ACCEPTANCE_ACCEPTED, + accepted_by=accepted_by, + accepted_at=now_utc(), crawl_status=CrawlStatus.WAITING, friend_links_count=0, created_at=now_utc(), @@ -2429,9 +2738,9 @@ def _delete_blog_graph(self, session: Session, *, blog_id: int) -> None: blog_id: Blog identifier that should be removed from persistence. Returns: - ``None``. The blog, its edges, and dangling ingestion references - are removed or cleared in place. URL-keyed label assignments are - intentionally preserved across graph cleanup. + ``None``. The blog and its edges are removed in place. + URL-keyed label assignments are intentionally preserved across + graph cleanup. 
""" edge_ids = session.scalars( select(EdgeModel.id).where( @@ -2443,12 +2752,6 @@ def _delete_blog_graph(self, session: Session, *, blog_id: int) -> None: ).all() if edge_ids: session.query(EdgeModel).filter(EdgeModel.id.in_(edge_ids)).delete(synchronize_session=False) - session.query(IngestionRequestModel).filter( - IngestionRequestModel.seed_blog_id == blog_id - ).update({IngestionRequestModel.seed_blog_id: None}) - session.query(IngestionRequestModel).filter( - IngestionRequestModel.matched_blog_id == blog_id - ).update({IngestionRequestModel.matched_blog_id: None}) blog = self._get_blog_by_business_id(session, blog_id) if blog is not None: session.delete(blog) @@ -2467,6 +2770,9 @@ def upsert_blog( domain: str, email: str | None = None, feed_url: str | None = None, + accepted_by: str | None = None, + seed_source_path: str | None = None, + seed_source_row: int | None = None, ) -> tuple[int, bool]: with session_scope(self.session_factory) as session: blog, inserted = self._upsert_blog_in_session( @@ -2476,113 +2782,150 @@ def upsert_blog( domain=domain, email=email, feed_url=feed_url, + accepted_by=accepted_by, ) + if accepted_by == "seed": + self._upsert_seed_in_session( + session, + url=url, + normalized_url=str(blog.normalized_url), + domain=str(blog.domain), + blog_id=int(_business_blog_id(blog)), + source_path=seed_source_path, + source_row=seed_source_row, + ) return int(_business_blog_id(blog)), inserted - def create_ingestion_request(self, *, homepage_url: str, email: str) -> dict[str, Any]: - requested_url, normalized_url, domain, identity_key, reason_codes, ruleset_version = normalize_homepage_url( - homepage_url - ) - normalized_email = normalize_ingestion_email(email) - with session_scope(self.session_factory) as session: - existing_blog = session.scalar( - select(BlogModel).where(BlogModel.identity_key == identity_key) + def _upsert_seed_in_session( + self, + session: Session, + *, + url: str, + normalized_url: str, + domain: str, + blog_id: int, + 
source_path: str | None = None, + source_row: int | None = None, + ) -> SeedModel: + """Create or refresh the durable seed row for one imported URL. + + Args: + session: Active SQLAlchemy session that already contains the blog + upsert for the same import. + url: Original URL from the seed CSV row. + normalized_url: Stored blog normalized URL after identity + canonicalization. + domain: Stored blog domain. + blog_id: Business blog identifier linked from the seed row. + source_path: Optional CSV path used for traceability. + source_row: Optional one-based CSV data row number. + + Returns: + The created or updated seed row. + """ + + imported_at = now_utc() + seed = session.scalar(select(SeedModel).where(SeedModel.normalized_url == normalized_url)) + if seed is None: + seed = SeedModel( + url=url, + normalized_url=normalized_url, + domain=domain, + source_path=source_path, + source_row=source_row, + blog_id=blog_id, + imported_at=imported_at, + updated_at=imported_at, ) - if existing_blog is not None and not (existing_blog.email or "").strip(): - existing_blog.email = normalized_email - if existing_blog is not None: - if _uses_tenant_root_canonicalization(reason_codes): - existing_blog.url = normalized_url - existing_blog.normalized_url = normalized_url - existing_blog.domain = domain - existing_blog.identity_key = identity_key - existing_blog.identity_reason_codes = _dump_reason_codes(reason_codes) - existing_blog.identity_ruleset_version = ruleset_version - existing_blog.updated_at = now_utc() - - if existing_blog is not None and existing_blog.crawl_status == CrawlStatus.FINISHED: - existing_blog_view = _BlogPayloadView.from_model(existing_blog) - return { - "status": INGESTION_REQUEST_STATUS_DEDUPED_EXISTING, - "blog_id": int(_business_blog_id(existing_blog)), - "matched_blog_id": int(_business_blog_id(existing_blog)), - "request_id": None, - "request_token": None, - "blog": existing_blog_view.as_blog_payload() if existing_blog_view is not None else None, - } - - 
existing_request = self._oldest_ingestion_request( + session.add(seed) + session.flush() + return seed + + seed.url = url + seed.domain = domain + seed.blog_id = blog_id + seed.source_path = source_path + seed.source_row = source_row + seed.updated_at = imported_at + return seed + + def list_seeds(self) -> list[dict[str, Any]]: + """Return all durable seed rows in deterministic insertion order. + + Args: + None. + + Returns: + Seed payloads ordered by primary key for bootstrap replay. + """ + + with session_scope(self.session_factory) as session: + return self._ordered_row_payloads( session, - filters=(IngestionRequestModel.identity_key == identity_key,), - statuses=tuple(ACTIVE_INGESTION_REQUEST_STATUSES), + statement=select(SeedModel).order_by(SeedModel.id.asc()), + serializer=_seed_payload, ) - if existing_request is not None: - if not (existing_request.requester_email or "").strip(): - existing_request.requester_email = normalized_email - if _uses_tenant_root_canonicalization(reason_codes): - existing_request.normalized_url = normalized_url - existing_request.identity_key = identity_key - existing_request.identity_reason_codes = _dump_reason_codes(reason_codes) - existing_request.identity_ruleset_version = ruleset_version - existing_request.updated_at = now_utc() - return self._serialize_ingestion_request_payload( - session, - existing_request, - serializer=_ingestion_request_payload, - ) - if existing_blog is None: - existing_blog = BlogModel( - blog_id=None, - url=normalized_url, - normalized_url=normalized_url, - identity_key=identity_key, - identity_reason_codes=_dump_reason_codes(reason_codes), - identity_ruleset_version=ruleset_version, - domain=domain, - email=normalized_email, - crawl_status=CrawlStatus.WAITING, - friend_links_count=0, - created_at=now_utc(), - updated_at=now_utc(), - ) - session.add(existing_blog) - session.flush() - existing_blog.blog_id = int(existing_blog.id) - session.flush() - elif existing_blog.crawl_status == CrawlStatus.FAILED: 
- existing_blog.crawl_status = CrawlStatus.WAITING - existing_blog.updated_at = now_utc() - - request_status = ( - INGESTION_REQUEST_STATUS_CRAWLING_SEED - if existing_blog.crawl_status == CrawlStatus.PROCESSING - else INGESTION_REQUEST_STATUS_QUEUED - ) - request = IngestionRequestModel( - requested_url=requested_url, + def create_user_seed(self, *, homepage_url: str) -> dict[str, Any]: + """Accept one user-submitted URL as a crawler seed after rule checks. + + Args: + homepage_url: Complete blog homepage URL provided by a public user. + + Returns: + Payload describing the accepted blog and whether a row was inserted. + + Raises: + ValueError: Raised when URL normalization fails or deterministic + rule filters reject the URL. + """ + + requested_url, normalized_url, domain, _identity_key, _reason_codes, _ruleset_version = normalize_homepage_url( + homepage_url + ) + settings = self._decision_scan_settings() + decision_chain = build_url_decision_chain(settings) + candidate = UrlCandidateContext( + source_blog_id=0, + source_domain="", + normalized_url=normalized_url, + link_text="user", + context_text="user-submitted seed", + ) + for rule_filter in decision_chain.rule_filters: + decision = rule_filter.apply(candidate) + if not decision.accepted: + raise ValueError(str(decision.status or "rule_filter_rejected")) + + with session_scope(self.session_factory) as session: + blog, inserted = self._upsert_blog_in_session( + session, + url=requested_url, normalized_url=normalized_url, - identity_key=identity_key, - identity_reason_codes=_dump_reason_codes(reason_codes), - identity_ruleset_version=ruleset_version, - requester_email=normalized_email, - status=request_status, - priority=100, - seed_blog_id=int(_business_blog_id(existing_blog)), - matched_blog_id=None, - request_token=token_urlsafe(18), - expires_at=None, - error_message=None, - created_at=now_utc(), - updated_at=now_utc(), + domain=domain, + accepted_by="user", ) - session.add(request) - session.flush() - 
return self._serialize_ingestion_request_payload( + if blog.crawl_status == CrawlStatus.FAILED: + blog.crawl_status = CrawlStatus.WAITING + blog.crawl_error_kind = None + blog.crawl_error_message = None + blog.updated_at = now_utc() + self._upsert_seed_in_session( session, - request, - serializer=_ingestion_request_payload, + url=requested_url, + normalized_url=str(blog.normalized_url), + domain=str(blog.domain), + blog_id=int(_business_blog_id(blog)), + source_path="user", + source_row=None, ) + blog_view = _BlogPayloadView.from_model(blog) + return { + "status": "QUEUED" if blog.crawl_status == CrawlStatus.WAITING else "EXISTING", + "blog_id": int(_business_blog_id(blog)), + "inserted": inserted, + "blog": blog_view.as_blog_payload() if blog_view is not None else None, + } def _create_user_session_payload(self, session: Session, user: UserModel) -> dict[str, Any]: """Create one session row and return the auth response payload. @@ -2606,6 +2949,7 @@ def _create_user_session_payload(self, session: Session, user: UserModel) -> dic revoked_at=None, ) session.add(session_row) + user.last_login_at = timestamp user.updated_at = timestamp session.flush() return { @@ -2614,19 +2958,201 @@ def _create_user_session_payload(self, session: Session, user: UserModel) -> dic "user": _user_payload(user), } + def _record_user_audit_event( + self, + session: Session, + *, + user_id: int | None, + event_type: str, + details: dict[str, Any] | None = None, + ) -> None: + """Append one user audit event without storing raw secrets. + + Args: + session: Active SQLAlchemy session. + user_id: Optional user ID attached to the event. + event_type: Stable event name. + details: Optional JSON-safe metadata. Raw tokens and passwords must + not be supplied. + + Returns: + None. 
+ """ + + session.add( + UserAuditEventModel( + user_id=user_id, + event_type=event_type, + details=_coerce_json_object(details), + created_at=now_utc(), + ) + ) + + def _create_lifecycle_token( + self, + session: Session, + *, + user: UserModel, + purpose: str, + ttl_hours: int, + ) -> dict[str, Any]: + """Create one hashed lifecycle token and return its raw one-time value. + + Args: + session: Active SQLAlchemy session. + user: User row that owns the token. + purpose: Token purpose such as email verification or password reset. + ttl_hours: Token lifetime in hours. + + Returns: + JSON payload containing the raw token once and expiry metadata. + """ + + timestamp = now_utc() + token = token_urlsafe(32) + row = UserVerificationTokenModel( + user_id=int(user.id), + token_hash=_hash_user_lifecycle_token(token), + purpose=purpose, + created_at=timestamp, + expires_at=timestamp + timedelta(hours=ttl_hours), + consumed_at=None, + ) + session.add(row) + session.flush() + return { + "token": token, + "expires_at": _iso(row.expires_at), + } + + def _consume_lifecycle_token( + self, + session: Session, + *, + token: str, + purpose: str, + ) -> tuple[UserVerificationTokenModel, UserModel]: + """Consume one valid lifecycle token and return its row plus user. + + Args: + session: Active SQLAlchemy session. + token: Raw token supplied by the caller. + purpose: Required token purpose. + + Returns: + Tuple of token row and owning user row. + + Raises: + UserAuthError: Raised when the token is invalid, expired, consumed, + or points to a missing/inactive user. 
+ """ + + clean_token = token.strip() + if not clean_token: + raise UserAuthError("invalid_token") + timestamp = now_utc() + row = session.scalar( + select(UserVerificationTokenModel).where( + UserVerificationTokenModel.token_hash == _hash_user_lifecycle_token(clean_token), + UserVerificationTokenModel.purpose == purpose, + UserVerificationTokenModel.consumed_at.is_(None), + UserVerificationTokenModel.expires_at > timestamp, + ).limit(1) + ) + if row is None: + raise UserAuthError("invalid_token") + user = session.scalar(select(UserModel).where(UserModel.id == row.user_id).limit(1)) + if user is None or not user.is_active: + raise UserAuthError("invalid_token") + row.consumed_at = timestamp + return row, user + + def _email_verification_payload(self, token_payload: dict[str, Any]) -> dict[str, Any]: + """Return an email verification response payload. + + Args: + token_payload: Raw lifecycle token payload returned by + `_create_lifecycle_token`. + + Returns: + Payload containing delivery status and expiry. Development mode also + includes the raw token and verification URL for local manual flows. + """ + + token = str(token_payload["token"]) + verification_url = f"{self.public_base_url}/profile?verify_token={token}" + payload = { + "sent": True, + "expires_at": token_payload["expires_at"], + } + if self.email_dev_expose_tokens: + payload["verification_token"] = token + payload["verification_url"] = verification_url + return payload + + def _verification_url(self, token_payload: dict[str, Any]) -> str: + """Build the public email verification URL for one raw token payload. + + Args: + token_payload: Raw lifecycle token payload returned by + `_create_lifecycle_token`. + + Returns: + Public frontend URL that consumes the one-time verification token. + """ + + return f"{self.public_base_url}/profile?verify_token={token_payload['token']}" + + def _password_reset_payload(self, token_payload: dict[str, Any]) -> dict[str, Any]: + """Return a password reset response payload. 
+ + Args: + token_payload: Raw lifecycle token payload returned by + `_create_lifecycle_token`. + + Returns: + Payload containing delivery status and expiry. Development mode also + includes the raw token and reset URL for local manual flows. + """ + + token = str(token_payload["token"]) + reset_url = f"{self.public_base_url}/profile?reset_token={token}" + payload = { + "sent": True, + "expires_at": token_payload["expires_at"], + } + if self.email_dev_expose_tokens: + payload["reset_token"] = token + payload["reset_url"] = reset_url + return payload + + def _password_reset_url(self, token_payload: dict[str, Any]) -> str: + """Build the public password reset URL for one raw token payload. + + Args: + token_payload: Raw lifecycle token payload returned by + `_create_lifecycle_token`. + + Returns: + Public frontend URL that consumes the one-time reset token. + """ + + return f"{self.public_base_url}/profile?reset_token={token_payload['token']}" + def register_user(self, *, email: str, password: str) -> dict[str, Any]: - """Create a user account and first login session. + """Create a pending registration and send a verification email. Args: email: User email address used as the login identifier. - password: Plaintext password to hash and store. + password: Plaintext password to hash and hold until verification. Returns: - Auth payload with bearer token and user profile. + Email verification delivery payload. Raises: ValueError: Raised for invalid email or weak password. - UserAuthError: Raised when the email is already registered. + UserAuthError: Raised when the email is already registered or has a + still-valid pending registration. 
""" normalized_email = _normalize_user_email(email) @@ -2636,16 +3162,45 @@ def register_user(self, *, email: str, password: str) -> dict[str, Any]: existing = session.scalar(select(UserModel).where(UserModel.email == normalized_email).limit(1)) if existing is not None: raise UserAuthError("email_already_registered") - user = UserModel( + pending = session.scalar( + select(PendingUserRegistrationModel) + .where( + PendingUserRegistrationModel.email == normalized_email, + PendingUserRegistrationModel.consumed_at.is_(None), + PendingUserRegistrationModel.expires_at > timestamp, + ) + .limit(1) + ) + if pending is not None: + raise UserAuthError("email_registration_pending") + session.query(PendingUserRegistrationModel).filter( + PendingUserRegistrationModel.email == normalized_email + ).delete(synchronize_session=False) + token_payload = { + "token": token_urlsafe(32), + "expires_at": _iso(timestamp + timedelta(hours=PENDING_REGISTRATION_TTL_HOURS)), + } + pending = PendingUserRegistrationModel( email=normalized_email, password_hash=_hash_password(validated_password), - display_name=normalized_email.split("@", 1)[0], + token_hash=_hash_user_lifecycle_token(str(token_payload["token"])), created_at=timestamp, - updated_at=timestamp, + expires_at=timestamp + timedelta(hours=PENDING_REGISTRATION_TTL_HOURS), + consumed_at=None, + ) + session.add(pending) + self.email_delivery.send_verification_email( + to_email=normalized_email, + verification_url=self._verification_url(token_payload), + ) + self._record_user_audit_event( + session, + user_id=None, + event_type="user.registration_verification_sent", + details={"email": normalized_email}, ) - session.add(user) session.flush() - return self._create_user_session_payload(session, user) + return self._email_verification_payload(token_payload) def login_user(self, *, email: str, password: str) -> dict[str, Any]: """Authenticate an existing user and create a fresh session. 
@@ -2665,8 +3220,9 @@ def login_user(self, *, email: str, password: str) -> dict[str, Any]: normalized_email = _normalize_user_email(email) with session_scope(self.session_factory) as session: user = session.scalar(select(UserModel).where(UserModel.email == normalized_email).limit(1)) - if user is None or not _verify_password(password, user.password_hash): + if user is None or not user.is_active or not _verify_password(password, user.password_hash): raise UserAuthError("invalid_credentials") + self._record_user_audit_event(session, user_id=int(user.id), event_type="user.login") return self._create_user_session_payload(session, user) def _active_user_by_session_token(self, session: Session, *, token: str) -> UserModel | None: @@ -2694,7 +3250,8 @@ def _active_user_by_session_token(self, session: Session, *, token: str) -> User ) if row is None: return None - return session.scalar(select(UserModel).where(UserModel.id == row.user_id).limit(1)) + user = session.scalar(select(UserModel).where(UserModel.id == row.user_id, UserModel.is_active.is_(True)).limit(1)) + return user def get_user_by_session_token(self, *, token: str) -> dict[str, Any] | None: """Load the current user for one bearer token. @@ -2736,42 +3293,217 @@ def revoke_user_session(self, *, token: str) -> bool: session.flush() return True - def get_ingestion_request( - self, - *, - request_id: int, - request_token: str, - ) -> dict[str, Any] | None: + def request_email_verification(self, *, email: str) -> dict[str, Any]: + """Create a fresh email verification token for one account. + + Args: + email: Account email address. + + Returns: + Dev-friendly verification payload. Unknown emails receive the same + neutral `sent` shape without token fields. 
+ """ + + normalized_email = _normalize_user_email(email) with session_scope(self.session_factory) as session: - request = session.scalar( - select(IngestionRequestModel).where(IngestionRequestModel.id == request_id) + user = session.scalar(select(UserModel).where(UserModel.email == normalized_email).limit(1)) + if user is None or not user.is_active: + return {"sent": True} + if user.email_verified_at is not None: + return {"sent": True, "already_verified": True} + token_payload = self._create_lifecycle_token( + session, + user=user, + purpose=USER_TOKEN_EMAIL_VERIFICATION, + ttl_hours=USER_EMAIL_VERIFICATION_TTL_HOURS, ) - if request is None or request.request_token != request_token: - return None - return self._serialize_ingestion_request_payload( + self.email_delivery.send_verification_email( + to_email=normalized_email, + verification_url=self._verification_url(token_payload), + ) + self._record_user_audit_event(session, user_id=int(user.id), event_type="user.email_verification_requested") + return self._email_verification_payload(token_payload) + + def confirm_email_verification(self, *, token: str) -> dict[str, Any]: + """Consume an email verification token and activate the account. + + Args: + token: Raw verification token supplied by the user. + + Returns: + Created or updated user profile payload. 
+ """ + + clean_token = token.strip() + if not clean_token: + raise UserAuthError("invalid_token") + with session_scope(self.session_factory) as session: + timestamp = now_utc() + pending = session.scalar( + select(PendingUserRegistrationModel) + .where( + PendingUserRegistrationModel.token_hash == _hash_user_lifecycle_token(clean_token), + PendingUserRegistrationModel.consumed_at.is_(None), + PendingUserRegistrationModel.expires_at > timestamp, + ) + .limit(1) + ) + if pending is not None: + existing = session.scalar(select(UserModel).where(UserModel.email == pending.email).limit(1)) + if existing is not None: + pending.consumed_at = timestamp + raise UserAuthError("email_already_registered") + user = UserModel( + email=str(pending.email), + password_hash=str(pending.password_hash), + display_name=str(pending.email).split("@", 1)[0], + role=USER_ROLE_USER, + is_active=True, + email_verified_at=timestamp, + password_changed_at=None, + last_login_at=None, + created_at=timestamp, + updated_at=timestamp, + ) + session.add(user) + pending.consumed_at = timestamp + session.flush() + self._record_user_audit_event(session, user_id=int(user.id), event_type="user.registered") + self._record_user_audit_event(session, user_id=int(user.id), event_type="user.email_verified") + session.flush() + return _user_payload(user) + + _, user = self._consume_lifecycle_token( session, - request, - serializer=_ingestion_request_payload, + token=clean_token, + purpose=USER_TOKEN_EMAIL_VERIFICATION, ) + user.email_verified_at = timestamp + user.updated_at = timestamp + self._record_user_audit_event(session, user_id=int(user.id), event_type="user.email_verified") + session.flush() + return _user_payload(user) - def list_priority_ingestion_requests(self, *, limit: int = INGESTION_PRIORITY_LIST_LIMIT) -> list[dict[str, Any]]: - resolved_limit = max(1, min(int(limit), INGESTION_PRIORITY_LIST_LIMIT)) - active_sort = case( - 
(IngestionRequestModel.status.in_(tuple(ACTIVE_INGESTION_REQUEST_STATUSES)), 0), - else_=1, - ) + def request_password_reset(self, *, email: str) -> dict[str, Any]: + """Create a fresh password reset token for one account. + + Args: + email: Account email address. + + Returns: + Neutral reset payload. Known active users include a dev token so + local tests and manual flows can complete without SMTP. + """ + + normalized_email = _normalize_user_email(email) with session_scope(self.session_factory) as session: - requests = session.scalars( - select(IngestionRequestModel) - .where(IngestionRequestModel.priority >= 100) - .order_by(active_sort.asc(), IngestionRequestModel.created_at.desc(), IngestionRequestModel.id.desc()) - .limit(resolved_limit) + user = session.scalar(select(UserModel).where(UserModel.email == normalized_email).limit(1)) + if user is None or not user.is_active: + return {"sent": True} + token_payload = self._create_lifecycle_token( + session, + user=user, + purpose=USER_TOKEN_PASSWORD_RESET, + ttl_hours=USER_PASSWORD_RESET_TTL_HOURS, + ) + self.email_delivery.send_password_reset_email( + to_email=normalized_email, + reset_url=self._password_reset_url(token_payload), + ) + self._record_user_audit_event(session, user_id=int(user.id), event_type="user.password_reset_requested") + return self._password_reset_payload(token_payload) + + def reset_user_password(self, *, token: str, password: str) -> dict[str, Any]: + """Consume a reset token, update password, and revoke active sessions. + + Args: + token: Raw password reset token supplied by the user. + password: New plaintext password. + + Returns: + Updated user profile payload. 
+ """ + + validated_password = _validate_password(password) + with session_scope(self.session_factory) as session: + _, user = self._consume_lifecycle_token( + session, + token=token, + purpose=USER_TOKEN_PASSWORD_RESET, + ) + timestamp = now_utc() + user.password_hash = _hash_password(validated_password) + user.password_changed_at = timestamp + user.updated_at = timestamp + session.query(UserSessionModel).filter( + UserSessionModel.user_id == int(user.id), + UserSessionModel.revoked_at.is_(None), + ).update({UserSessionModel.revoked_at: timestamp}) + self._record_user_audit_event(session, user_id=int(user.id), event_type="user.password_reset_completed") + session.flush() + return _user_payload(user) + + def list_users(self, *, page: int = 1, page_size: int = 50) -> dict[str, Any]: + """List registered users for the admin user table. + + Args: + page: One-based page number. + page_size: Number of users per page. + + Returns: + Paginated user management payload. + """ + + safe_page = max(1, page) + safe_page_size = min(max(1, page_size), 200) + offset = (safe_page - 1) * safe_page_size + with session_scope(self.session_factory) as session: + total_items = int(session.scalar(select(func.count(UserModel.id))) or 0) + users = session.scalars( + select(UserModel) + .order_by(UserModel.id.asc()) + .limit(safe_page_size) + .offset(offset) ).all() - return self._serialize_ingestion_request_payloads( + total_pages = max(1, ceil(total_items / safe_page_size)) if total_items else 1 + return { + "items": [_user_admin_payload(user) for user in users], + "page": safe_page, + "page_size": safe_page_size, + "total_items": total_items, + "total_pages": total_pages, + "has_next": safe_page < total_pages, + "has_prev": safe_page > 1, + } + + def update_user_role(self, *, user_id: int, role: str) -> dict[str, Any]: + """Update one user's role in the simplified admin/user role model. + + Args: + user_id: Target user ID. + role: New role. Supported values are `admin` and `user`. 
+ + Returns: + Updated admin user payload. + """ + + clean_role = role.strip().lower() + if clean_role not in USER_ROLES: + raise ValueError("invalid_user_role") + with session_scope(self.session_factory) as session: + user = session.scalar(select(UserModel).where(UserModel.id == user_id).limit(1)) + if user is None: + raise UserAuthError("user_not_found") + user.role = clean_role + user.updated_at = now_utc() + self._record_user_audit_event( session, - requests, - serializer=_priority_ingestion_request_payload, + user_id=int(user.id), + event_type="user.role_updated", + details={"role": clean_role}, ) + session.flush() + return _user_admin_payload(user) def lookup_blog_candidates(self, *, url: str) -> dict[str, Any]: normalized = normalize_url(url) @@ -2818,18 +3550,6 @@ def lookup_blog_candidates(self, *, url: str) -> dict[str, Any]: match_reason=None, ) - def mark_ingestion_request_crawling(self, *, blog_id: int) -> None: - with session_scope(self.session_factory) as session: - request = self._oldest_seed_ingestion_request( - session, - blog_id=blog_id, - statuses=(INGESTION_REQUEST_STATUS_QUEUED,), - ) - if request is None: - return - request.status = INGESTION_REQUEST_STATUS_CRAWLING_SEED - request.updated_at = now_utc() - def _claim_blog_for_statement(self, session: Session, statement: Any) -> dict[str, Any] | None: blog = session.scalar(statement) if blog is None: @@ -2846,43 +3566,26 @@ def _claim_first_matching_blog(self, session: Session, statement: Any) -> dict[s statement = statement.with_for_update(skip_locked=True) return self._claim_blog_for_statement(session, statement) - def _active_ingestion_seed_ids_statement(self) -> Any: - """Return the active ingestion seed ids used to exclude priority-backed blogs.""" - return select(IngestionRequestModel.seed_blog_id).where( - IngestionRequestModel.seed_blog_id.is_not(None), - IngestionRequestModel.status.in_(tuple(ACTIVE_INGESTION_REQUEST_STATUSES)), - ) + def get_next_waiting_blog(self) -> dict[str, Any] | 
None: + """Claim the next ordinary waiting blog for crawler processing. - def _oldest_ingestion_request( - self, - session: Session, - *, - filters: tuple[ColumnElement[bool], ...], - statuses: tuple[str, ...], - ) -> IngestionRequestModel | None: - """Return the oldest ingestion request matching the given filters and statuses.""" - return session.scalar( - select(IngestionRequestModel) - .where( - *filters, - IngestionRequestModel.status.in_(statuses), - ) - .order_by(IngestionRequestModel.created_at.asc(), IngestionRequestModel.id.asc()) - ) + Args: + None. - def _oldest_seed_ingestion_request( - self, - session: Session, - *, - blog_id: int, - statuses: tuple[str, ...], - ) -> IngestionRequestModel | None: - """Return the oldest ingestion request for one seed blog within the allowed statuses.""" - return self._oldest_ingestion_request( - session, - filters=(IngestionRequestModel.seed_blog_id == blog_id,), - statuses=statuses, - ) + Returns: + Serialized blog payload for the claimed row, or ``None`` when no + `WAITING` blog is available. The claimed row is immediately moved + to `PROCESSING`. 
+ """ + + with session_scope(self.session_factory) as session: + statement = ( + select(BlogModel) + .where(BlogModel.crawl_status == CrawlStatus.WAITING) + .order_by(BlogModel.id.asc()) + .limit(1) + ) + return self._claim_first_matching_blog(session, statement) def _lookup_blog_matches( self, @@ -2908,47 +3611,6 @@ def _lookup_blog_matches( match_reason=match_reason, ) - def _priority_blog_claim_statement(self) -> Any: - """Build the priority queue statement without changing claim semantics.""" - return ( - select(BlogModel) - .join( - IngestionRequestModel, - IngestionRequestModel.seed_blog_id == BlogModel.blog_id, - ) - .where( - BlogModel.crawl_status == CrawlStatus.WAITING, - IngestionRequestModel.status == INGESTION_REQUEST_STATUS_QUEUED, - ) - .order_by( - IngestionRequestModel.priority.desc(), - IngestionRequestModel.created_at.asc(), - BlogModel.blog_id.asc(), - BlogModel.id.asc(), - ) - .limit(1) - ) - - def _waiting_blog_claim_statement(self, *, include_priority: bool) -> Any: - """Build the waiting queue statement while preserving priority exclusion semantics.""" - statement = select(BlogModel).where(BlogModel.crawl_status == CrawlStatus.WAITING) - if not include_priority: - statement = statement.where( - BlogModel.blog_id.not_in(self._active_ingestion_seed_ids_statement()) - ) - return statement.order_by(BlogModel.blog_id.asc(), BlogModel.id.asc()).limit(1) - - def get_next_priority_blog(self) -> dict[str, Any] | None: - with session_scope(self.session_factory) as session: - return self._claim_first_matching_blog(session, self._priority_blog_claim_statement()) - - def get_next_waiting_blog(self, *, include_priority: bool = True) -> dict[str, Any] | None: - with session_scope(self.session_factory) as session: - return self._claim_first_matching_blog( - session, - self._waiting_blog_claim_statement(include_priority=include_priority), - ) - def requeue_failed_blogs(self) -> dict[str, Any]: """Move every failed blog back into the waiting crawl queue. 
@@ -2963,19 +3625,10 @@ def requeue_failed_blogs(self) -> dict[str, Any]: for blog in blogs: blog.crawl_status = CrawlStatus.WAITING blog.status_code = None + blog.crawl_error_kind = None + blog.crawl_error_message = None blog.updated_at = timestamp - requests = session.scalars( - select(IngestionRequestModel).where( - IngestionRequestModel.seed_blog_id.in_([int(_business_blog_id(blog)) for blog in blogs]), - IngestionRequestModel.status == INGESTION_REQUEST_STATUS_FAILED, - ) - ).all() - for request in requests: - request.status = INGESTION_REQUEST_STATUS_QUEUED - request.error_message = None - request.updated_at = timestamp - return {"requeued": requeued_count} def mark_blog_result( @@ -2988,33 +3641,31 @@ def mark_blog_result( metadata_captured: bool = False, title: str | None = None, icon_url: str | None = None, + crawl_error_kind: str | None = None, + crawl_error_message: str | None = None, ) -> None: with session_scope(self.session_factory) as session: blog = self._get_blog_by_business_id(session, blog_id) if blog is None: return - blog.crawl_status = CrawlStatus(crawl_status) + resolved_status = CrawlStatus(crawl_status) + timestamp = now_utc() + blog.crawl_status = resolved_status blog.status_code = status_code blog.friend_links_count = friend_links_count - blog.last_crawled_at = now_utc() - blog.updated_at = now_utc() + blog.last_crawled_at = timestamp + blog.last_crawl_attempt_at = timestamp + blog.updated_at = timestamp + if resolved_status == CrawlStatus.FINISHED: + blog.successful_crawl_at = timestamp + blog.crawl_error_kind = None + blog.crawl_error_message = None + elif resolved_status == CrawlStatus.FAILED: + blog.crawl_error_kind = crawl_error_kind + blog.crawl_error_message = crawl_error_message if metadata_captured: blog.title = title blog.icon_url = icon_url - request = self._oldest_seed_ingestion_request( - session, - blog_id=blog_id, - statuses=tuple(ACTIVE_INGESTION_REQUEST_STATUSES), - ) - if request is not None: - if blog.crawl_status == 
CrawlStatus.FINISHED: - request.status = INGESTION_REQUEST_STATUS_COMPLETED - request.matched_blog_id = int(_business_blog_id(blog)) - request.error_message = None - elif blog.crawl_status == CrawlStatus.FAILED: - request.status = INGESTION_REQUEST_STATUS_FAILED - request.error_message = "seed crawl failed" - request.updated_at = now_utc() def add_edge( self, @@ -3115,6 +3766,36 @@ def create_raw_discovered_url_record( session.flush() return {"id": int(record.id), "status": str(record.status)} + def find_blog_id_by_normalized_url(self, *, normalized_url: str) -> int | None: + """Return the persisted blog id for one normalized URL when it exists. + + Args: + normalized_url: Canonical URL value used by crawler discovery and + blog upsert identity checks. + + Returns: + Business ``blog_id`` for the matching blog, or ``None`` when the + URL has not yet been accepted as a blog. + """ + + identity = resolve_blog_identity(normalized_url) + with session_scope(self.session_factory) as session: + blog_id = session.scalar( + select(BlogModel.blog_id) + .where(BlogModel.normalized_url == normalized_url) + .order_by(BlogModel.blog_id.asc(), BlogModel.id.asc()) + .limit(1) + ) + if blog_id is not None: + return int(blog_id) + blog_id = session.scalar( + select(BlogModel.blog_id) + .where(BlogModel.identity_key == identity.identity_key) + .order_by(BlogModel.blog_id.asc(), BlogModel.id.asc()) + .limit(1) + ) + return int(blog_id) if blog_id is not None else None + def update_raw_discovered_url_status( self, *, @@ -3308,6 +3989,7 @@ def list_blogs_catalog( has_title: bool | str | None = None, has_icon: bool | str | None = None, min_connections: int | None = None, + acceptance_status: str | None = BLOG_ACCEPTANCE_ACCEPTED, ) -> dict[str, Any]: query = normalize_blog_catalog_query( page=page, @@ -3321,9 +4003,12 @@ def list_blogs_catalog( has_title=has_title, has_icon=has_icon, min_connections=min_connections, + acceptance_status=acceptance_status, ) with 
session_scope(self.session_factory) as session: statement, metrics = self._blog_select() + if query["acceptance_status"] is not None: + statement = statement.where(BlogModel.acceptance_status == query["acceptance_status"]) if query["site"] is not None: pattern = f"%{query['site']}%" statement = statement.where( @@ -3357,11 +4042,7 @@ def list_blogs_catalog( ) if query["has_icon"] is True: statement = statement.where( - or_( - and_(BlogModel.icon_url.is_not(None), BlogModel.icon_url != ""), - BlogModel.url.like("http://%"), - BlogModel.url.like("https://%"), - ) + and_(BlogModel.icon_url.is_not(None), BlogModel.icon_url != "") ) if query["min_connections"] > 0: statement = statement.where(metrics["connection_count"] >= query["min_connections"]) @@ -3410,9 +4091,508 @@ def list_blogs_catalog( "has_title": query["has_title"], "has_icon": query["has_icon"], "min_connections": query["min_connections"], + "acceptance_status": query["acceptance_status"], }, ) + def create_random_recommendation_batch( + self, + *, + count: int = 9, + visitor_id: str, + session_id: str, + user_id: int | None = None, + source: str | None = None, + page_url: str | None = None, + context: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """Persist one random-blog recommendation request and its impressions. + + Args: + count: Number of cards requested by the frontend. + visitor_id: Stable anonymous visitor identifier. + session_id: Stable browser-session identifier. + user_id: Optional authenticated user ID for attribution. + source: Optional caller surface detail. + page_url: Optional page URL where the batch was shown. + context: Optional JSON metadata associated with the serving event. + + Returns: + Recommendation batch payload containing request metadata and ordered + catalog items with impression attribution fields. 
+ """ + + if count < 1 or count > 50: + raise ValueError("count_out_of_range") + clean_visitor_id = _clean_event_text(visitor_id, field="visitor_id") + clean_session_id = _clean_event_text(session_id, field="session_id") + request_uuid = token_urlsafe(24) + with session_scope(self.session_factory) as session: + if user_id is not None and session.scalar(select(UserModel.id).where(UserModel.id == user_id)) is None: + raise UserAuthError("user_not_found") + statement, _ = self._blog_select() + statement = statement.where( + BlogModel.crawl_status == CrawlStatus.FINISHED, + BlogModel.acceptance_status == BLOG_ACCEPTANCE_ACCEPTED, + ) + rows = session.execute(self._random_blog_catalog_statement(statement).limit(count)).all() + recommendation = RecommendationRequestModel( + request_uuid=request_uuid, + surface=RANDOM_RECOMMENDATION_SURFACE, + strategy=RANDOM_RECOMMENDATION_STRATEGY, + strategy_version=RANDOM_RECOMMENDATION_STRATEGY_VERSION, + visitor_id=clean_visitor_id, + user_id=user_id, + session_id=clean_session_id, + source=(source or "").strip() or None, + page_url=(page_url or "").strip() or None, + requested_count=count, + served_count=len(rows), + context_json=_coerce_json_object(context), + ) + session.add(recommendation) + session.flush() + items: list[dict[str, Any]] = [] + for position, row in enumerate(rows, start=1): + blog = row[0] + impression = RecommendationImpressionModel( + request_id=recommendation.id, + normalized_url=str(blog.normalized_url), + position=position, + score=None, + reason_json={"strategy": RANDOM_RECOMMENDATION_STRATEGY}, + ) + session.add(impression) + session.flush() + items.append( + self._row_blog_payload(row) + | { + "request_uuid": request_uuid, + "impression_id": impression.id, + "position": position, + } + ) + session.flush() + return { + "request_uuid": request_uuid, + "surface": RANDOM_RECOMMENDATION_SURFACE, + "strategy": RANDOM_RECOMMENDATION_STRATEGY, + "strategy_version": RANDOM_RECOMMENDATION_STRATEGY_VERSION, + 
def record_blog_interaction(
    self,
    *,
    event_uuid: str,
    event_type: str,
    blog_id: int,
    visitor_id: str,
    session_id: str,
    entrance_kind: str,
    entrance_url: str,
    request_uuid: str | None = None,
    impression_id: int | None = None,
    position: int | None = None,
    interaction_order: int = 1,
    user_id: int | None = None,
    client_event_at: str | datetime | None = None,
    attributes: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Persist one idempotent recommendation interaction event.

    Args:
        event_uuid: Client-generated idempotency key.
        event_type: Interaction type such as ``detail_open``.
        blog_id: Public/business blog ID receiving the event.
        visitor_id: Stable anonymous visitor identifier.
        session_id: Stable browser-session identifier.
        entrance_kind: Stable entrance category such as ``random_blog_card``.
        entrance_url: Raw URL for the entrance context.
        request_uuid: Optional serving request UUID for attribution.
        impression_id: Optional impression row ID for attribution. When it
            is given without ``request_uuid``, the request attribution is
            taken from the impression row itself.
        position: Optional card position displayed to the visitor. Defaults
            to the impression's position when an impression is supplied.
        interaction_order: Monotonic client-side order within the session.
        user_id: Optional authenticated user ID.
        client_event_at: Optional client timestamp.
        attributes: Optional JSON metadata for the event.

    Returns:
        Serialized event payload plus a ``duplicate`` flag. ``duplicate`` is
        ``True`` when a row with the same ``event_uuid`` already existed, in
        which case the stored row is returned unchanged.

    Raises:
        ValueError: When the event type is unsupported, ``interaction_order``
            is below 1, the referenced request or impression does not exist,
            or the impression does not match the blog or the request.
        BlogLabelingNotFoundError: When no blog matches ``blog_id``.
        UserAuthError: When ``user_id`` is given but no such user exists.
    """

    # NOTE(review): _clean_event_text is defined elsewhere; presumably it
    # raises on empty/over-long values - confirm before relying on it.
    clean_event_uuid = _clean_event_text(event_uuid, field="event_uuid")
    clean_event_type = _clean_event_text(event_type, field="event_type", max_length=64)
    if clean_event_type not in RECOMMENDATION_EVENT_TYPES:
        raise ValueError("unsupported_recommendation_event_type")
    clean_visitor_id = _clean_event_text(visitor_id, field="visitor_id")
    clean_session_id = _clean_event_text(session_id, field="session_id")
    clean_entrance_kind = _clean_event_text(entrance_kind, field="entrance_kind", max_length=128)
    clean_entrance_url = _clean_event_text(entrance_url, field="entrance_url", max_length=2048)
    if interaction_order < 1:
        raise ValueError("interaction_order_out_of_range")

    with session_scope(self.session_factory) as session:
        existing = session.scalar(
            select(BlogInteractionModel).where(BlogInteractionModel.event_uuid == clean_event_uuid)
        )
        if existing is not None:
            # Replayed event: hand back the stored row instead of inserting again.
            return self._blog_interaction_payload(existing) | {"duplicate": True}

        blog = self._get_blog_by_business_id(session, blog_id)
        if blog is None:
            raise BlogLabelingNotFoundError("blog_not_found")
        if user_id is not None and session.scalar(select(UserModel.id).where(UserModel.id == user_id)) is None:
            raise UserAuthError("user_not_found")

        request_id: int | None = None
        if request_uuid is not None:
            recommendation = session.scalar(
                select(RecommendationRequestModel).where(
                    RecommendationRequestModel.request_uuid == request_uuid
                )
            )
            if recommendation is None:
                raise ValueError("recommendation_request_not_found")
            request_id = recommendation.id

        impression: RecommendationImpressionModel | None = None
        if impression_id is not None:
            impression = session.get(RecommendationImpressionModel, impression_id)
            if impression is None:
                raise ValueError("recommendation_impression_not_found")
            if str(impression.normalized_url) != str(blog.normalized_url):
                raise ValueError("recommendation_impression_blog_mismatch")
            if request_id is None:
                # No request UUID was sent, but the impression already knows
                # its serving request. Keep that link so aggregates that join
                # interactions to requests do not silently drop this event.
                request_id = impression.request_id
            elif int(impression.request_id) != int(request_id):
                raise ValueError("recommendation_impression_request_mismatch")
            if position is None:
                position = int(impression.position)

        interaction = BlogInteractionModel(
            event_uuid=clean_event_uuid,
            request_id=request_id,
            impression_id=impression.id if impression is not None else None,
            normalized_url=str(blog.normalized_url),
            event_type=clean_event_type,
            position=position,
            entrance_kind=clean_entrance_kind,
            entrance_url=clean_entrance_url,
            interaction_order=interaction_order,
            visitor_id=clean_visitor_id,
            user_id=user_id,
            session_id=clean_session_id,
            client_event_at=_parse_event_datetime(client_event_at),
            attributes_json=_coerce_json_object(attributes),
        )
        session.add(interaction)
        # Flush so the generated primary key and timestamps are populated
        # before the row is serialized.
        session.flush()
        return self._blog_interaction_payload(interaction) | {"duplicate": False}
+ """ + + with session_scope(self.session_factory) as session: + blog = self._get_blog_by_business_id(session, blog_id) + if blog is None: + return None + impressions = int( + session.scalar( + select(func.count(RecommendationImpressionModel.id)).where( + RecommendationImpressionModel.normalized_url == blog.normalized_url + ) + ) + or 0 + ) + event_counts = { + str(event_type): int(count or 0) + for event_type, count in session.execute( + select(BlogInteractionModel.event_type, func.count(BlogInteractionModel.id)) + .where(BlogInteractionModel.normalized_url == blog.normalized_url) + .group_by(BlogInteractionModel.event_type) + ).all() + } + unique_visitors = int( + session.scalar( + select(func.count(func.distinct(BlogInteractionModel.visitor_id))).where( + BlogInteractionModel.normalized_url == blog.normalized_url + ) + ) + or 0 + ) + last_interaction_at = session.scalar( + select(func.max(BlogInteractionModel.created_at)).where( + BlogInteractionModel.normalized_url == blog.normalized_url + ) + ) + clicks = int(event_counts.get("click", 0)) + detail_opens = int(event_counts.get("detail_open", 0)) + external_opens = int(event_counts.get("external_open", 0)) + label_selects = int(event_counts.get("label_select", 0)) + return { + "blog_id": blog_id, + "normalized_url": blog.normalized_url, + "impressions": impressions, + "clicks": clicks, + "detail_opens": detail_opens, + "external_opens": external_opens, + "label_selects": label_selects, + "unique_visitors": unique_visitors, + "ctr": (clicks + detail_opens + external_opens) / impressions if impressions else 0.0, + "last_interaction_at": _iso(last_interaction_at), + "by_event_type": event_counts, + } + + def _hour_start(self, value: datetime | None = None) -> datetime: + """Return the UTC natural-hour boundary for one timestamp. + + Args: + value: Optional timestamp to normalize. Current UTC time is used + when omitted. + + Returns: + Timezone-aware UTC datetime truncated to the hour. 
+ """ + + current = value or datetime.now(UTC) + if current.tzinfo is None: + current = current.replace(tzinfo=UTC) + return current.astimezone(UTC).replace(minute=0, second=0, microsecond=0) + + def _admin_hourly_stats_payload(self, row: AdminHourlyStatsModel) -> dict[str, Any]: + """Serialize one hourly admin statistics row. + + Args: + row: Persisted hourly statistics snapshot. + + Returns: + JSON-ready snapshot payload. + """ + + return { + "id": row.id, + "hour_start": _iso(row.hour_start), + "user_count": row.user_count, + "random_request_count": row.random_request_count, + "random_impression_count": row.random_impression_count, + "detail_open_count": row.detail_open_count, + "external_open_count": row.external_open_count, + "detail_ctr": row.detail_ctr, + "external_ctr": row.external_ctr, + "total_click_ctr": row.total_click_ctr, + "refreshed_at": _iso(row.refreshed_at), + "created_at": _iso(row.created_at), + } + + def _refresh_admin_hourly_stats(self, session: Session, hour_start: datetime) -> AdminHourlyStatsModel: + """Refresh or create one admin statistics snapshot for a natural hour. + + Args: + session: Active SQLAlchemy session. + hour_start: UTC natural-hour boundary to aggregate. + + Returns: + Persisted snapshot row after source-table aggregation. 
+ """ + + hour_end = hour_start + timedelta(hours=1) + user_count = int( + session.scalar(select(func.count(UserModel.id)).where(UserModel.is_active.is_(True))) or 0 + ) + random_request_count = int( + session.scalar( + select(func.count(RecommendationRequestModel.id)).where( + RecommendationRequestModel.surface == RANDOM_RECOMMENDATION_SURFACE, + RecommendationRequestModel.created_at >= hour_start, + RecommendationRequestModel.created_at < hour_end, + ) + ) + or 0 + ) + random_impression_count = int( + session.scalar( + select(func.count(RecommendationImpressionModel.id)) + .join(RecommendationRequestModel, RecommendationImpressionModel.request_id == RecommendationRequestModel.id) + .where( + RecommendationRequestModel.surface == RANDOM_RECOMMENDATION_SURFACE, + RecommendationImpressionModel.created_at >= hour_start, + RecommendationImpressionModel.created_at < hour_end, + ) + ) + or 0 + ) + interaction_counts = { + str(event_type): int(count or 0) + for event_type, count in session.execute( + select(BlogInteractionModel.event_type, func.count(BlogInteractionModel.id)) + .join(RecommendationRequestModel, BlogInteractionModel.request_id == RecommendationRequestModel.id) + .where( + RecommendationRequestModel.surface == RANDOM_RECOMMENDATION_SURFACE, + BlogInteractionModel.created_at >= hour_start, + BlogInteractionModel.created_at < hour_end, + BlogInteractionModel.event_type.in_(("detail_open", "external_open")), + ) + .group_by(BlogInteractionModel.event_type) + ).all() + } + detail_open_count = int(interaction_counts.get("detail_open", 0)) + external_open_count = int(interaction_counts.get("external_open", 0)) + denominator = random_impression_count or 0 + detail_ctr = detail_open_count / denominator if denominator else 0.0 + external_ctr = external_open_count / denominator if denominator else 0.0 + total_click_ctr = (detail_open_count + external_open_count) / denominator if denominator else 0.0 + row = session.scalar( + 
def get_recommendation_strategy_stats(self) -> dict[str, Any]:
    """Return aggregate recommendation request, impression, and event stats.

    Clicks are counted per (surface, strategy, strategy_version) with one
    grouped join, so the number of queries does not grow with the number of
    strategy groups.

    Args:
        None.

    Returns:
        Payload with overall totals plus a ``by_strategy`` list. Each entry
        carries request count, impressions (sum of ``served_count``), clicks
        (``click``, ``detail_open`` and ``external_open`` events attributed
        to a request), distinct visitors, and ``ctr`` (clicks / impressions,
        ``0.0`` when there are no impressions).
    """

    click_event_types = ("click", "detail_open", "external_open")
    strategy_columns = (
        RecommendationRequestModel.surface,
        RecommendationRequestModel.strategy,
        RecommendationRequestModel.strategy_version,
    )
    with session_scope(self.session_factory) as session:
        total_requests = int(session.scalar(select(func.count(RecommendationRequestModel.id))) or 0)
        total_impressions = int(session.scalar(select(func.count(RecommendationImpressionModel.id))) or 0)
        total_interactions = int(session.scalar(select(func.count(BlogInteractionModel.id))) or 0)

        # The inner join drops interactions without a request, which matches
        # the previous explicit ``request_id IS NOT NULL`` filter.
        clicks_by_strategy = {
            (surface, strategy, strategy_version): int(count or 0)
            for surface, strategy, strategy_version, count in session.execute(
                select(*strategy_columns, func.count(BlogInteractionModel.id))
                .select_from(BlogInteractionModel)
                .join(
                    RecommendationRequestModel,
                    BlogInteractionModel.request_id == RecommendationRequestModel.id,
                )
                .where(BlogInteractionModel.event_type.in_(click_event_types))
                .group_by(*strategy_columns)
            ).all()
        }

        grouped_rows = session.execute(
            select(
                *strategy_columns,
                func.count(RecommendationRequestModel.id),
                func.coalesce(func.sum(RecommendationRequestModel.served_count), 0),
                func.count(func.distinct(RecommendationRequestModel.visitor_id)),
            ).group_by(*strategy_columns)
        ).all()

        by_strategy: list[dict[str, Any]] = []
        for surface, strategy, strategy_version, request_count, served_count, visitor_count in grouped_rows:
            clicks = clicks_by_strategy.get((surface, strategy, strategy_version), 0)
            impressions = int(served_count or 0)
            by_strategy.append(
                {
                    "surface": surface,
                    "strategy": strategy,
                    "strategy_version": strategy_version,
                    "requests": int(request_count or 0),
                    "impressions": impressions,
                    "clicks": clicks,
                    "unique_visitors": int(visitor_count or 0),
                    "ctr": clicks / impressions if impressions else 0.0,
                }
            )
        return {
            "total_requests": total_requests,
            "total_impressions": total_impressions,
            "total_interactions": total_interactions,
            "by_strategy": by_strategy,
        }
session_scope(self.session_factory) as session: - run = self._require_model( - session, - BlogDedupScanRunModel, - run_id, - not_found_error="blog_dedup_scan_run_not_found", - ) - run.status = "RUNNING" - run.started_at = run.started_at or started_at - run.completed_at = None - run.duration_ms = 0 - run.scanned_count = 0 - run.removed_count = 0 - run.kept_count = 0 - run.error_message = None - run.updated_at = started_at - blog_rows = session.execute( - select( - BlogModel.blog_id, - BlogModel.url, - BlogModel.domain, - BlogModel.identity_key, - ) - .order_by(BlogModel.blog_id.asc(), BlogModel.id.asc()) - ).all() - run.total_count = len(blog_rows) - - scanned_count = 0 - rejected_blog_count = 0 - for blog_row in blog_rows: - with session_scope(self.session_factory) as session: - run = self._require_model( - session, - BlogDedupScanRunModel, - run_id, - not_found_error="blog_dedup_scan_run_not_found", - ) - blog = self._get_blog_by_business_id(session, int(blog_row.blog_id)) - if blog is None: - continue - decision = decision_chain.decide( - str(blog.url or ""), - "", - link_text=str(blog.domain or ""), - context_text="", - ) - if not decision.accepted: - session.add( - BlogDedupScanRunItemModel( - run_id=int(run.id), - survivor_blog_id=None, - removed_blog_id=int(_business_blog_id(blog)), - survivor_identity_key=str(blog.identity_key or ""), - removed_url=str(blog.url or ""), - removed_normalized_url=str(blog.normalized_url or blog.url or ""), - removed_domain=str(blog.domain or ""), - reason_code=decision.reasons[0] if decision.reasons else "decision_rejected", - reason_codes=_dump_reason_codes(list(decision.reasons)), - survivor_selection_basis=( - f"scanned_blog_id={int(_business_blog_id(blog))}, " - f"decision_score={decision.score:.6f}" - ), - created_at=now_utc(), - ) - ) - self._delete_blog_graph(session, blog_id=int(_business_blog_id(blog))) - rejected_blog_count += 1 - - scanned_count += 1 - completed_so_far = now_utc() - run.scanned_count = scanned_count 
- run.removed_count = rejected_blog_count - run.kept_count = max(run.total_count - rejected_blog_count, 0) - run.duration_ms = max(int((completed_so_far - started_at).total_seconds() * 1000), 0) - run.updated_at = completed_so_far - - with session_scope(self.session_factory) as session: - run = self._require_model( - session, - BlogDedupScanRunModel, - run_id, - not_found_error="blog_dedup_scan_run_not_found", - ) - completed_at = now_utc() - final_blog_count = _count_selectable_rows(session, BlogModel) - run.status = "SUCCEEDED" - run.completed_at = completed_at - run.duration_ms = max(int((completed_at - started_at).total_seconds() * 1000), 0) - run.scanned_count = scanned_count - run.removed_count = max(run.total_count - final_blog_count, 0) - run.kept_count = final_blog_count - run.updated_at = completed_at - session.flush() - return _blog_dedup_scan_run_payload(run) - except Exception as exc: - with session_scope(self.session_factory) as session: - run = session.get(BlogDedupScanRunModel, run_id) - if run is not None: - completed_at = now_utc() - run.status = "FAILED" - run.completed_at = completed_at - run.duration_ms = max(int((completed_at - started_at).total_seconds() * 1000), 0) - run.error_message = str(exc) - run.updated_at = completed_at - raise - - def finalize_blog_dedup_scan_run( - self, - *, - run_id: int, - crawler_restart_attempted: bool, - crawler_restart_succeeded: bool, - search_reindexed: bool, - error_message: str | None = None, - ) -> dict[str, Any]: - with session_scope(self.session_factory) as session: - run = self._require_model( - session, - BlogDedupScanRunModel, - run_id, - not_found_error="blog_dedup_scan_run_not_found", - ) - run.crawler_restart_attempted = crawler_restart_attempted - run.crawler_restart_succeeded = crawler_restart_succeeded - run.search_reindexed = search_reindexed - if error_message: - run.error_message = error_message - run.updated_at = now_utc() - session.flush() - return _blog_dedup_scan_run_payload(run) - - 
def get_latest_blog_dedup_scan_run(self) -> dict[str, Any] | None: - with session_scope(self.session_factory) as session: - return self._latest_row_payload( - session, - statement=select(BlogDedupScanRunModel).order_by(BlogDedupScanRunModel.id.desc()).limit(1), - serializer=_blog_dedup_scan_run_payload, - ) - - def list_blog_dedup_scan_run_items(self, run_id: int) -> list[dict[str, Any]]: - with session_scope(self.session_factory) as session: - return self._ordered_row_payloads( - session, - statement=( - select(BlogDedupScanRunItemModel) - .where(BlogDedupScanRunItemModel.run_id == run_id) - .order_by(BlogDedupScanRunItemModel.id.asc()) - ), - serializer=_blog_dedup_scan_run_item_payload, - ) - def reset(self) -> dict[str, Any]: with session_scope(self.session_factory) as session: blogs_deleted = _count_selectable_rows(session, BlogModel) edges_deleted = _count_selectable_rows(session, EdgeModel) - requests_deleted = _count_selectable_rows(session, IngestionRequestModel) - users_preserved = _count_selectable_rows(session, UserModel) - user_sessions_preserved = _count_selectable_rows(session, UserSessionModel) - labels_preserved = _count_selectable_rows(session, BlogLabelModel) - user_labels_preserved = _count_selectable_rows(session, BlogUserLabelModel) - user_label_selections_preserved = _count_selectable_rows(session, BlogUserLabelSelectionModel) - label_tags_preserved = _count_selectable_rows(session, BlogLabelTagModel) raw_urls_deleted = _count_selectable_rows(session, RawDiscoveredUrlModel) - scan_items_deleted = _count_selectable_rows(session, BlogDedupScanRunItemModel) - scan_runs_deleted = _count_selectable_rows(session, BlogDedupScanRunModel) - if self.dialect_name == "postgresql": - session.execute( - text( - "TRUNCATE TABLE blog_dedup_scan_run_items, blog_dedup_scan_runs, " - "raw_discovered_urls, ingestion_requests, edges, blogs " - "RESTART IDENTITY CASCADE" - ) - ) - else: - session.query(BlogDedupScanRunItemModel).delete() - 
session.query(BlogDedupScanRunModel).delete() - session.query(RawDiscoveredUrlModel).delete() - session.query(IngestionRequestModel).delete() - session.query(EdgeModel).delete() - session.query(BlogModel).delete() + # Seeds are durable configuration, but their nullable blog pointer + # must be cleared before deleting the referenced blog rows. + session.query(SeedModel).update({SeedModel.blog_id: None}) + session.query(RawDiscoveredUrlModel).delete() + session.query(EdgeModel).delete() + session.query(BlogModel).delete() return { "ok": True, "blogs_deleted": blogs_deleted, "edges_deleted": edges_deleted, - "logs_deleted": 0, - "ingestion_requests_deleted": requests_deleted, - "users_preserved": users_preserved, - "user_sessions_preserved": user_sessions_preserved, - "blog_link_labels_deleted": 0, - "blog_label_tags_deleted": 0, - "blog_labels_preserved": labels_preserved, - "blog_labels_userlabel_preserved": user_labels_preserved, - "blog_user_label_selections_preserved": user_label_selections_preserved, - "blog_label_subjects_preserved": 0, - "blog_link_labels_preserved": labels_preserved, - "blog_label_tags_preserved": label_tags_preserved, "raw_discovered_urls_deleted": raw_urls_deleted, - "blog_dedup_scan_items_deleted": scan_items_deleted, - "blog_dedup_scan_runs_deleted": scan_runs_deleted, + "logs_deleted": 0, } class Repository(SQLAlchemyRepository): """Compatibility wrapper for test call sites that still pass a db path.""" - def __init__(self, db_path: Path, *, decision_settings: Settings | None = None) -> None: + def __init__( + self, + db_path: Path, + *, + decision_settings: Settings | None = None, + email_delivery: EmailDelivery | None = None, + ) -> None: super().__init__( f"sqlite+pysqlite:///{db_path}", decision_settings=decision_settings, + email_delivery=email_delivery or NoopEmailDelivery(), startup_schema_sync=True, ) @@ -4504,12 +5490,19 @@ def build_repository( db_path: Path, db_dsn: str | None = None, settings: Settings | None = None, + 
email_delivery: EmailDelivery | None = None, ) -> RepositoryProtocol: """Build the configured repository implementation.""" if db_dsn is not None: try: - return SQLAlchemyRepository(db_dsn, decision_settings=settings, startup_schema_sync=True) + kwargs: dict[str, Any] = { + "decision_settings": settings, + "startup_schema_sync": True, + } + if email_delivery is not None: + kwargs["email_delivery"] = email_delivery + return SQLAlchemyRepository(db_dsn, **kwargs) except ModuleNotFoundError as exc: if exc.name != "psycopg": raise - return Repository(db_path, decision_settings=settings) + return Repository(db_path, decision_settings=settings, email_delivery=email_delivery) diff --git a/pyproject.toml b/pyproject.toml index 606d33a..f0d5918 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "beautifulsoup4>=4.12,<5", "fastapi>=0.115,<1", "feedparser>=6.0,<7", + "extract-favicon>=0.5.4,<1", "httpx>=0.28,<1", "pydantic>=2.7,<3", "pyarrow>=18.1,<25", @@ -29,6 +30,7 @@ runtime-models = [ ] dev = [ "pytest>=8.3,<9", + "ruff>=0.8,<1", ] [tool.pytest.ini_options] diff --git a/readme.md b/readme.md index 793d111..637f9a9 100644 --- a/readme.md +++ b/readme.md @@ -19,6 +19,12 @@ 2026年5月29日,有了第一个fork,也是人生中第一次fork,开心 +2026年6月1日,拜托oyyt为本项目画了一个虚拟形象,开心 + +2026年6月3日,一觉醒来从 9 star变成了11 star,突破两位数,开心 + +2026年6月6日,发现有13人注册了用户,有点想哭,开心 + ## 文档导航 @@ -33,6 +39,10 @@ ## Quick Start +### 0. 推荐 +启动codex、claude code或任意vibecoding工具,然后:请完整阅读该项目确保你了解该项目,然后配置合理的.env文件后docker本地部署 + + ### 1. 仅 API / 后端最小路径 当你只想调试 HTTP 协议、聚合层行为,或者暂时不需要浏览器界面时,走这条路径最合适。 diff --git a/scripts/generate_visualization_benchmark.py b/scripts/generate_visualization_benchmark.py new file mode 100644 index 0000000..a7a9323 --- /dev/null +++ b/scripts/generate_visualization_benchmark.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +"""Generate a deterministic clustered graph payload for visualization QA. 
+ +The benchmark uses a seeded stochastic-block-model style construction: +blogs are assigned to planted communities, intra-community edges are sampled +with a higher probability than inter-community bridges, and a few hub blogs are +given extra outgoing links. This mirrors the practical idea behind LFR-style +community-detection benchmarks: a known community assignment plus a controllable +mixing rate for cross-community edges. +""" + +from __future__ import annotations + +import argparse +import json +import math +import random +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +DEFAULT_OUTPUT = Path("frontend/public/benchmarks/blog-community-graph.json") +DEFAULT_SEED = 42 +COMMUNITIES = [ + ("indie-web", "Indie Web", 24), + ("engineering", "Engineering", 22), + ("design", "Design", 18), + ("data-ai", "Data & AI", 20), + ("culture", "Culture", 16), +] + + +@dataclass(frozen=True) +class BlogNode: + """Synthetic blog node emitted in backend-compatible graph JSON form. + + Attributes: + id: Stable numeric blog id. + slug: URL-safe blog slug. + title: Human-readable blog title. + community_id: Planted benchmark community id. + community_label: Human-readable community label. + """ + + id: int + slug: str + title: str + community_id: str + community_label: str + + +def parse_args() -> argparse.Namespace: + """Parse command-line options for benchmark graph generation. + + Returns: + Parsed argparse namespace containing output path, seed, and edge rates. 
def make_slug(label: str, index: int) -> str:
    """Derive a stable blog slug from a community label and an ordinal.

    Args:
        label: Human-readable community label.
        index: One-based blog index within that community.

    Returns:
        Lower-cased label with `` & `` and spaces turned into hyphens,
        followed by the index zero-padded to two digits.
    """

    stem = label.lower()
    # Replace the ampersand separator first so it collapses to one hyphen.
    for separator in (" & ", " "):
        stem = stem.replace(separator, "-")
    return f"{stem}-{index:02d}"
def build_edges(
    nodes: list[BlogNode],
    rng: random.Random,
    intra_probability: float,
    inter_probability: float,
    hub_links: int,
) -> list[dict[str, Any]]:
    """Sample the benchmark's directed edges around planted communities.

    Three passes run in order: a ring inside every community, random
    pairwise links (denser inside a community than across), and extra
    bridges from each community's first node to nodes outside it.

    Args:
        nodes: Synthetic blog nodes.
        rng: Seeded random number generator.
        intra_probability: Same-community edge probability.
        inter_probability: Cross-community edge probability.
        hub_links: Extra bridge count added from each community hub.

    Returns:
        Backend-compatible edge dictionaries sorted by source then target
        id, each given a sequential ``id``.
    """

    edges: dict[tuple[int, int], dict[str, Any]] = {}

    # Group nodes by community, keeping first-seen community order.
    communities: dict[str, list[BlogNode]] = {}
    for node in nodes:
        if node.community_id not in communities:
            communities[node.community_id] = []
        communities[node.community_id].append(node)

    # Pass 1: ring through every community so each one is connected.
    for members in communities.values():
        for source, target in zip(members, members[1:] + members[:1]):
            add_edge(edges, source, target, "blogroll")

    # Pass 2: one draw decides whether an unordered pair is linked; a second
    # draw (only when linked) decides the direction.
    for offset, first in enumerate(nodes, start=1):
        for second in nodes[offset:]:
            if first.community_id == second.community_id:
                threshold = intra_probability
            else:
                threshold = inter_probability
            if rng.random() < threshold:
                if rng.random() < 0.5:
                    add_edge(edges, first, second, "friend link")
                else:
                    add_edge(edges, second, first, "friend link")

    # Pass 3: the first node of each community acts as its hub.
    for members in communities.values():
        hub = members[0]
        outsiders = [node for node in nodes if node.community_id != hub.community_id]
        bridge_count = min(hub_links, len(outsiders))
        for target in rng.sample(outsiders, k=bridge_count):
            add_edge(edges, hub, target, "community bridge")

    ordered = list(edges.values())
    ordered.sort(key=lambda edge: (edge["from_blog_id"], edge["to_blog_id"]))
    for ordinal, edge in enumerate(ordered, start=1):
        edge["id"] = f"benchmark-edge-{ordinal:03d}"
    return ordered
def to_payload(
    nodes: list[BlogNode],
    edges: list[dict[str, Any]],
    seed: int,
    intra_probability: float,
    inter_probability: float,
) -> dict[str, Any]:
    """Build the backend-compatible benchmark graph payload.

    Args:
        nodes: Synthetic blog nodes.
        edges: Generated directed edge list.
        seed: Random seed used for reproducibility.
        intra_probability: Same-community edge probability.
        inter_probability: Cross-community edge probability.

    Returns:
        JSON-serializable graph payload consumed by the frontend, with
        ``nodes``, ``edges`` and ``meta`` keys. ``estimated_mixing_rate`` is
        ``inter / (intra + inter)`` rounded to three places, or ``0.0`` when
        both probabilities are zero.
    """

    counts = degree_counts(nodes, edges)
    # Layout jitter draws from its own generator, offset from the seed.
    position_rng = random.Random(seed + 1009)
    # NOTE: wall-clock value, so this field differs between otherwise
    # identical runs.
    generated_at = datetime.now(timezone.utc).isoformat()

    # Guard the ratio: both probabilities may legitimately be 0 (ring-only
    # graph), which previously raised ZeroDivisionError.
    probability_total = intra_probability + inter_probability
    if probability_total:
        estimated_mixing_rate = round(inter_probability / probability_total, 3)
    else:
        estimated_mixing_rate = 0.0

    graph_nodes = []
    for index, node in enumerate(nodes):
        node_counts = counts[node.id]
        graph_nodes.append(
            {
                "id": node.id,
                "url": f"https://benchmark.heyblog.local/{node.slug}/",
                "domain": f"{node.slug}.benchmark.heyblog.local",
                "title": node.title,
                "icon_url": None,
                "incoming_count": node_counts["incoming"],
                "outgoing_count": node_counts["outgoing"],
                "degree": node_counts["degree"],
                "component_id": node.community_id,
                "benchmark_community_label": node.community_label,
                **node_position(node, index, position_rng),
            }
        )

    return {
        "nodes": graph_nodes,
        "edges": edges,
        "meta": {
            "strategy": "synthetic-community-benchmark",
            "limit": len(nodes),
            "source": "scripts/generate_visualization_benchmark.py",
            "generated_at": generated_at,
            "total_nodes": len(nodes),
            "total_edges": len(edges),
            "selected_nodes": len(nodes),
            "selected_edges": len(edges),
            "available_nodes": len(nodes),
            "available_edges": len(edges),
            "benchmark": {
                "seed": seed,
                "model": "seeded stochastic block model inspired by LFR mixing-parameter benchmarks",
                "community_sizes": {community_id: size for community_id, _label, size in COMMUNITIES},
                "intra_probability": intra_probability,
                "inter_probability": inter_probability,
                "estimated_mixing_rate": estimated_mixing_rate,
                "layout": "fixed separated community centers with deterministic jitter",
            },
        },
    }
to_payload(nodes, edges, args.seed, args.intra_probability, args.inter_probability) + + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + print(f"Wrote {len(nodes)} nodes and {len(edges)} edges to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_visualization_benchmark.sh b/scripts/run_visualization_benchmark.sh new file mode 100755 index 0000000..e43b583 --- /dev/null +++ b/scripts/run_visualization_benchmark.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +PORT="${1:-3001}" +HOST="${HOST:-127.0.0.1}" + +echo "Generating visualization benchmark graph..." +python3 "$ROOT_DIR/scripts/generate_visualization_benchmark.py" + +echo +echo "Starting HeyBlog frontend benchmark server..." +echo "Benchmark URL: http://$HOST:$PORT/visualization/benchmark" +echo "Stop server: Ctrl+C" +echo + +cd "$ROOT_DIR/frontend" +npm run dev -- --host "$HOST" --port "$PORT" diff --git a/seed.csv b/seed.csv index 804edf2..8021f18 100644 --- a/seed.csv +++ b/seed.csv @@ -1,3 +1,8 @@ url -https://www.qladgk.com/ -https://baka.fun/ +https://blog.leonus.cn/ +https://moondvsted.space/ +https://blog.verynb.net/ +https://blog.sayori.org/ +https://liguang.wang/ +https://junsen.online/ +https://blog.rsjwy.com/ \ No newline at end of file diff --git a/shared/config.py b/shared/config.py index af51b2f..1ac7efa 100644 --- a/shared/config.py +++ b/shared/config.py @@ -14,7 +14,7 @@ DEFAULT_MAX_CANDIDATE_LINKS_PER_PAGE = 50 DEFAULT_CANDIDATE_PAGE_FETCH_CONCURRENCY = 4 DEFAULT_RUNTIME_WORKER_COUNT = 3 -DEFAULT_PRIORITY_SEED_NORMAL_QUEUE_SLOTS = 2 +DEFAULT_RUNTIME_AUTO_START_INTERVAL_SECONDS = 3600.0 DEFAULT_MAX_FETCHED_PAGE_BYTES = 2_000_000 DEFAULT_RAW_DISCOVERED_URL_LIMIT = 1_000_000 PROJECT_ROOT = Path(__file__).resolve().parent.parent @@ -106,7 +106,7 @@ class Settings: 
max_candidate_links_per_page: int = DEFAULT_MAX_CANDIDATE_LINKS_PER_PAGE candidate_page_fetch_concurrency: int = DEFAULT_CANDIDATE_PAGE_FETCH_CONCURRENCY runtime_worker_count: int = DEFAULT_RUNTIME_WORKER_COUNT - priority_seed_normal_queue_slots: int = DEFAULT_PRIORITY_SEED_NORMAL_QUEUE_SLOTS + runtime_auto_start_interval_seconds: float = DEFAULT_RUNTIME_AUTO_START_INTERVAL_SECONDS max_fetched_page_bytes: int = DEFAULT_MAX_FETCHED_PAGE_BYTES raw_discovered_url_limit: int = DEFAULT_RAW_DISCOVERED_URL_LIMIT friend_link_domain_blocklist: tuple[str, ...] = () @@ -115,6 +115,17 @@ class Settings: friend_link_prefix_blocklist: tuple[str, ...] = () admin_token: str | None = None admin_dev_bypass: bool = False + public_base_url: str = "http://127.0.0.1:3000" + email_provider: str = "disabled" + email_from: str = "" + email_dev_expose_tokens: bool = False + smtp_host: str = "" + smtp_port: int = 587 + smtp_username: str | None = None + smtp_password: str | None = None + smtp_use_tls: bool = True + smtp_use_ssl: bool = False + smtp_timeout_seconds: float = 10.0 decision_model_root: Path = DEFAULT_DECISION_MODEL_ROOT filter_chain_config_path: Path = DEFAULT_FILTER_CHAIN_CONFIG_PATH rss_discovery_enabled: bool = True @@ -202,12 +213,12 @@ def from_env(cls) -> "Settings": ) ), ), - priority_seed_normal_queue_slots=max( - 1, - int( + runtime_auto_start_interval_seconds=max( + 0.001, + float( os.getenv( - "HEYBLOG_PRIORITY_SEED_NORMAL_QUEUE_SLOTS", - str(DEFAULT_PRIORITY_SEED_NORMAL_QUEUE_SLOTS), + "HEYBLOG_RUNTIME_AUTO_START_INTERVAL_SECONDS", + str(DEFAULT_RUNTIME_AUTO_START_INTERVAL_SECONDS), ) ), ), @@ -232,6 +243,17 @@ def from_env(cls) -> "Settings": friend_link_prefix_blocklist=_parse_csv_env("HEYBLOG_FRIEND_LINK_PREFIX_BLOCKLIST"), admin_token=os.getenv("HEYBLOG_ADMIN_TOKEN"), admin_dev_bypass=_parse_bool_env("HEYBLOG_ADMIN_DEV_BYPASS"), + public_base_url=os.getenv("HEYBLOG_PUBLIC_BASE_URL", "http://127.0.0.1:3000").rstrip("/"), + 
email_provider=os.getenv("HEYBLOG_EMAIL_PROVIDER", "disabled").strip().lower() or "disabled", + email_from=os.getenv("HEYBLOG_EMAIL_FROM", "").strip(), + email_dev_expose_tokens=_parse_bool_env("HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS"), + smtp_host=os.getenv("HEYBLOG_SMTP_HOST", "").strip(), + smtp_port=max(1, int(os.getenv("HEYBLOG_SMTP_PORT", "587"))), + smtp_username=os.getenv("HEYBLOG_SMTP_USERNAME") or None, + smtp_password=os.getenv("HEYBLOG_SMTP_PASSWORD") or None, + smtp_use_tls=_parse_bool_env("HEYBLOG_SMTP_USE_TLS", default=True), + smtp_use_ssl=_parse_bool_env("HEYBLOG_SMTP_USE_SSL"), + smtp_timeout_seconds=max(0.001, float(os.getenv("HEYBLOG_SMTP_TIMEOUT_SECONDS", "10.0"))), decision_model_root=Path( os.getenv("HEYBLOG_DECISION_MODEL_ROOT", str(DEFAULT_DECISION_MODEL_ROOT)) ), diff --git a/shared/http_clients/persistence_http.py b/shared/http_clients/persistence_http.py index d25c943..64ca354 100644 --- a/shared/http_clients/persistence_http.py +++ b/shared/http_clients/persistence_http.py @@ -69,6 +69,11 @@ def _put(self, path: str, payload: dict[str, Any]) -> Any: response.raise_for_status() return response.json() + def _patch(self, path: str, payload: dict[str, Any]) -> Any: + response = self.client.patch(path, json=payload, **context_header_kwargs()) + response.raise_for_status() + return response.json() + def _get(self, path: str, params: dict[str, Any] | None = None) -> Any: response = self.client.get(path, params=params, **context_header_kwargs()) response.raise_for_status() @@ -170,6 +175,9 @@ def upsert_blog( domain: str, email: str | None = None, feed_url: str | None = None, + accepted_by: str | None = None, + seed_source_path: str | None = None, + seed_source_row: int | None = None, ) -> tuple[int, bool]: payload = self._post( "/internal/blogs/upsert", @@ -179,16 +187,174 @@ def upsert_blog( "domain": domain, "email": email, "feed_url": feed_url, + "accepted_by": accepted_by, + "seed_source_path": seed_source_path, + "seed_source_row": 
seed_source_row, }, ) return int(payload["id"]), bool(payload["inserted"]) - def create_ingestion_request(self, *, homepage_url: str, email: str) -> dict[str, Any]: + def list_seeds(self) -> list[dict[str, Any]]: + """Fetch durable seed rows from persistence in replay order. + + Args: + None. + + Returns: + Seed payloads ordered by insertion ID. + """ + + return self._get("/internal/seeds") + + def create_random_recommendation_batch( + self, + *, + count: int = 9, + visitor_id: str, + session_id: str, + user_id: int | None = None, + source: str | None = None, + page_url: str | None = None, + context: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """Create and persist one random-blog recommendation batch. + + Args: + count: Number of random cards requested. + visitor_id: Stable anonymous visitor identifier. + session_id: Stable browser-session identifier. + user_id: Optional authenticated user ID. + source: Optional caller/source label. + page_url: Optional frontend page URL. + context: Optional JSON metadata. + + Returns: + Recommendation batch payload returned by persistence. + """ + return self._post( - "/internal/ingestion-requests", + "/internal/recommendations/random-blog-batches", + { + "count": count, + "visitor_id": visitor_id, + "session_id": session_id, + "user_id": user_id, + "source": source, + "page_url": page_url, + "context": context, + }, + ) + + def record_blog_interaction( + self, + *, + event_uuid: str, + event_type: str, + blog_id: int, + visitor_id: str, + session_id: str, + entrance_kind: str, + entrance_url: str, + request_uuid: str | None = None, + impression_id: int | None = None, + position: int | None = None, + interaction_order: int = 1, + user_id: int | None = None, + client_event_at: str | None = None, + attributes: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """Persist one random-blog recommendation interaction event. + + Args: + event_uuid: Client idempotency key. + event_type: Interaction type. 
+ blog_id: Public/business blog ID. + visitor_id: Stable anonymous visitor identifier. + session_id: Stable browser-session identifier. + entrance_kind: Stable entrance category for the UI location. + entrance_url: Raw URL for the entrance context. + request_uuid: Optional recommendation request UUID. + impression_id: Optional impression ID. + position: Optional displayed card position. + interaction_order: Client-side event order. + user_id: Optional authenticated user ID. + client_event_at: Optional client timestamp. + attributes: Optional JSON metadata. + + Returns: + Interaction payload returned by persistence. + """ + + return self._post( + "/internal/recommendation-events", + { + "event_uuid": event_uuid, + "event_type": event_type, + "blog_id": blog_id, + "visitor_id": visitor_id, + "session_id": session_id, + "entrance_kind": entrance_kind, + "entrance_url": entrance_url, + "request_uuid": request_uuid, + "impression_id": impression_id, + "position": position, + "interaction_order": interaction_order, + "user_id": user_id, + "client_event_at": client_event_at, + "attributes": attributes, + }, + ) + + def get_blog_recommendation_stats(self, blog_id: int) -> dict[str, Any]: + """Load recommendation stats for one blog. + + Args: + blog_id: Public/business blog ID. + + Returns: + Stats payload returned by persistence. + """ + + return self._get(f"/internal/blogs/{blog_id}/recommendation-stats") + + def get_recommendation_strategy_stats(self) -> dict[str, Any]: + """Load aggregate recommendation strategy stats. + + Args: + None. + + Returns: + Aggregate stats payload returned by persistence. + """ + + return self._get("/internal/recommendation-stats") + + def get_admin_hourly_stats(self, *, limit: int = 24) -> dict[str, Any]: + """Load hourly admin dashboard statistics snapshots. + + Args: + limit: Maximum number of hourly snapshots to fetch. + + Returns: + Hourly admin statistics payload returned by persistence. 
+ """ + + return self._get("/internal/admin/hourly-stats", {"limit": limit}) + + def create_user_seed(self, *, homepage_url: str) -> dict[str, Any]: + """Create or refresh a user-submitted crawler seed. + + Args: + homepage_url: Complete user-submitted blog homepage URL. + + Returns: + Accepted seed payload returned by persistence. + """ + + return self._post( + "/internal/user-seeds", { "homepage_url": homepage_url, - "email": email, }, ) @@ -228,6 +394,36 @@ def revoke_user_session(self, *, token: str) -> dict[str, Any]: return self._post(f"/internal/users/logout?session_token={token}", {}) + def request_email_verification(self, *, email: str) -> dict[str, Any]: + """Create a fresh email verification token for one account.""" + + return self._post("/internal/users/email-verification/request", {"email": email}) + + def confirm_email_verification(self, *, token: str) -> dict[str, Any]: + """Confirm a user email verification token.""" + + return self._post("/internal/users/email-verification/confirm", {"token": token}) + + def request_password_reset(self, *, email: str) -> dict[str, Any]: + """Create a fresh password reset token for one account.""" + + return self._post("/internal/users/password-reset/request", {"email": email}) + + def reset_user_password(self, *, token: str, password: str) -> dict[str, Any]: + """Confirm a password reset token and set a new password.""" + + return self._post("/internal/users/password-reset/confirm", {"token": token, "password": password}) + + def list_users(self, *, page: int = 1, page_size: int = 50) -> dict[str, Any]: + """Fetch a paginated admin user list.""" + + return self._get("/internal/users", {"page": page, "page_size": page_size}) + + def update_user_role(self, *, user_id: int, role: str) -> dict[str, Any]: + """Update one user's role.""" + + return self._patch(f"/internal/users/{user_id}/role", {"role": role}) + def list_user_label_selections(self, *, user_id: int, limit: int = 50) -> list[dict[str, Any]]: """Fetch 
recent random-page selections for one user.""" @@ -238,73 +434,18 @@ def get_user_label_stats(self, *, user_id: int) -> dict[str, int]: return self._get(f"/internal/users/{user_id}/label-stats") - def get_ingestion_request( - self, - *, - request_id: int, - request_token: str, - ) -> dict[str, Any] | None: - return self._get( - f"/internal/ingestion-requests/{request_id}", - {"request_token": request_token}, - ) - - def list_priority_ingestion_requests(self) -> list[dict[str, Any]]: - return self._get("/internal/ingestion-requests") - def lookup_blog_candidates(self, *, url: str) -> dict[str, Any]: return self._get("/internal/blogs/lookup", {"url": url}) - def create_blog_dedup_scan_run(self, *, crawler_was_running: bool = False) -> dict[str, Any]: - return self._create_maintenance_run( - "/internal/blog-dedup-scans/runs", - crawler_was_running=crawler_was_running, - ) + def find_blog_id_by_normalized_url(self, *, normalized_url: str) -> int | None: + """Fetch the persisted blog id for one normalized URL.""" - def execute_blog_dedup_scan_run(self, *, run_id: int) -> dict[str, Any]: - return self._post_maintenance_run_action( - "/internal/blog-dedup-scans", - run_id=run_id, - action="execute", - ) + payload = self._get("/internal/blogs/by-normalized-url", {"normalized_url": normalized_url}) + blog_id = payload.get("id") + return int(blog_id) if blog_id is not None else None - def finalize_blog_dedup_scan_run( - self, - *, - run_id: int, - crawler_restart_attempted: bool, - crawler_restart_succeeded: bool, - search_reindexed: bool, - error_message: str | None = None, - ) -> dict[str, Any]: - return self._post( - f"/internal/blog-dedup-scans/{run_id}/finalize", - { - "crawler_restart_attempted": crawler_restart_attempted, - "crawler_restart_succeeded": crawler_restart_succeeded, - "search_reindexed": search_reindexed, - "error_message": error_message, - }, - ) - - def latest_blog_dedup_scan_run(self) -> dict[str, Any]: - return 
self._get_latest_maintenance_run("/internal/blog-dedup-scans") - - def list_blog_dedup_scan_run_items(self, run_id: int) -> list[dict[str, Any]]: - return self._list_maintenance_run_children( - "/internal/blog-dedup-scans", - run_id=run_id, - child_resource="items", - ) - - def get_next_priority_blog(self) -> dict[str, Any] | None: - return self._get("/internal/queue/priority-next") - - def get_next_waiting_blog(self, *, include_priority: bool = True) -> dict[str, Any] | None: - return self._get("/internal/queue/next", {"include_priority": self._bool_query_value(include_priority)}) - - def mark_ingestion_request_crawling(self, *, blog_id: int) -> None: - self._post(f"/internal/ingestion-requests/by-blog/{blog_id}/crawling", {}) + def get_next_waiting_blog(self) -> dict[str, Any] | None: + return self._get("/internal/queue/next") def mark_blog_result( self, @@ -316,6 +457,8 @@ def mark_blog_result( metadata_captured: bool = False, title: str | None = None, icon_url: str | None = None, + crawl_error_kind: str | None = None, + crawl_error_message: str | None = None, ) -> None: self._post( f"/internal/blogs/{blog_id}/result", @@ -326,6 +469,8 @@ def mark_blog_result( "metadata_captured": metadata_captured, "title": title, "icon_url": icon_url, + "crawl_error_kind": crawl_error_kind, + "crawl_error_message": crawl_error_message, }, ) @@ -408,6 +553,7 @@ def list_blogs_catalog( has_title: bool | None = None, has_icon: bool | None = None, min_connections: int | None = None, + acceptance_status: str | None = "ACCEPTED", ) -> dict[str, Any]: return self._get( "/internal/blogs/catalog", @@ -423,6 +569,7 @@ def list_blogs_catalog( "has_title": has_title, "has_icon": has_icon, "min_connections": min_connections, + "acceptance_status": acceptance_status, }, ) @@ -596,6 +743,24 @@ def graph_snapshot(self, version: str) -> dict[str, Any]: def search_snapshot(self) -> dict[str, list[dict[str, Any]]]: return self._get("/internal/search-snapshot") + def list_blogs(self) -> 
list[dict[str, Any]]: + """Fetch all blog rows for graph export compatibility. + + Returns: + Blog payloads from the persistence service search snapshot. + """ + + return self.search_snapshot()["blogs"] + + def list_edges(self) -> list[dict[str, Any]]: + """Fetch all edge rows for graph export compatibility. + + Returns: + Edge payloads from the persistence service search snapshot. + """ + + return self.search_snapshot()["edges"] + def reset(self) -> dict[str, Any]: response = self.client.post("/internal/database/reset") response.raise_for_status() diff --git a/tests/test_crawler_service.py b/tests/test_crawler_service.py index eecdc7b..4bf2b22 100644 --- a/tests/test_crawler_service.py +++ b/tests/test_crawler_service.py @@ -46,6 +46,10 @@ def run_once(self, max_nodes: int | None = None) -> dict[str, object]: class StubRuntime: """Return fixed payloads for runtime endpoints.""" + def __init__(self) -> None: + self.scheduler_starts = 0 + self.scheduler_stops = 0 + def status(self) -> dict[str, object]: return { "runner_status": "idle", @@ -101,6 +105,14 @@ def current(self) -> dict[str, object]: ], } + def start_auto_scheduler(self) -> dict[str, object]: + self.scheduler_starts += 1 + return {"accepted": True, "interval_seconds": 3600.0} + + def stop_auto_scheduler(self) -> dict[str, object]: + self.scheduler_stops += 1 + return {"accepted": True} + def start(self) -> dict[str, object]: payload = self.status() payload["runner_status"] = "running" @@ -122,33 +134,35 @@ def run_batch(self, max_nodes: int) -> dict[str, object]: def test_crawler_service_routes_preserve_payload_shapes() -> None: """Crawler HTTP service should keep its public internal route contract stable.""" - app = create_app(CrawlerState(pipeline=StubPipeline(), runtime=StubRuntime())) - client = TestClient(app) - - assert client.get("/internal/health").json() == {"status": "ok"} - assert client.post("/internal/crawl/bootstrap").json() == {"seed_path": "seed.csv", "imported": 2} - assert 
client.post("/internal/crawl/run?max_nodes=5").json() == { - "processed": 5, - "discovered": 3, - "failed": 0, - "exports": {"graph_json": "graph.json"}, - } - status = client.get("/internal/runtime/status").json() - assert status["runner_status"] == "idle" - assert status["worker_count"] == 3 - current = client.get("/internal/runtime/current").json() - assert current["current_blog_id"] == 10 - assert current["current_worker_id"] == "worker-1" - assert current["active_workers"] == 1 - assert current["workers"][0]["worker_id"] == "worker-1" - assert client.post("/internal/runtime/start").json()["runner_status"] == "running" - assert client.post("/internal/runtime/stop").json()["runner_status"] == "stopping" - assert client.post("/internal/runtime/run-batch", json={"max_nodes": 4}).json()["result"] == { - "processed": 4, - "discovered": 1, - "failed": 0, - "exports": {}, - } + runtime = StubRuntime() + app = create_app(CrawlerState(pipeline=StubPipeline(), runtime=runtime)) + with TestClient(app) as client: + assert runtime.scheduler_starts == 1 + assert client.get("/internal/health").json() == {"status": "ok"} + assert client.post("/internal/crawl/bootstrap").json() == {"seed_path": "seed.csv", "imported": 2} + assert client.post("/internal/crawl/run?max_nodes=5").json() == { + "processed": 5, + "discovered": 3, + "failed": 0, + "exports": {"graph_json": "graph.json"}, + } + status = client.get("/internal/runtime/status").json() + assert status["runner_status"] == "idle" + assert status["worker_count"] == 3 + current = client.get("/internal/runtime/current").json() + assert current["current_blog_id"] == 10 + assert current["current_worker_id"] == "worker-1" + assert current["active_workers"] == 1 + assert current["workers"][0]["worker_id"] == "worker-1" + assert client.post("/internal/runtime/start").json()["runner_status"] == "running" + assert client.post("/internal/runtime/stop").json()["runner_status"] == "stopping" + assert 
client.post("/internal/runtime/run-batch", json={"max_nodes": 4}).json()["result"] == { + "processed": 4, + "discovered": 1, + "failed": 0, + "exports": {}, + } + assert runtime.scheduler_stops == 1 def test_services_crawler_main_remains_a_compatibility_shim() -> None: diff --git a/tests/test_filters.py b/tests/test_filters.py index f5d9547..46bcc26 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -17,8 +17,8 @@ def test_filter_rejects_known_platform_domains() -> None: assert not is_blog_candidate("https://t.co/share", "blog.example.com") -def test_filter_rejects_blocked_tlds_like_gov_and_org() -> None: - """Reject blocked TLD categories such as government/organization domains.""" +def test_filter_rejects_government_tlds_but_allows_org_domains() -> None: + """Reject government/education TLDs while allowing organization domains.""" decision = decide_blog_candidate("https://agency.gov/", "blog.example.com") gov_cn_decision = decide_blog_candidate("https://beian.miit.gov.cn/", "blog.example.com") org_decision = decide_blog_candidate("https://foundation.org/", "blog.example.com") @@ -29,9 +29,9 @@ def test_filter_rejects_blocked_tlds_like_gov_and_org() -> None: assert not gov_cn_decision.accepted assert gov_cn_decision.hard_blocked assert "blocked_tld" in gov_cn_decision.reasons - assert not org_decision.accepted - assert org_decision.hard_blocked - assert "blocked_tld" in org_decision.reasons + assert org_decision.accepted + assert not org_decision.hard_blocked + assert "blocked_tld" not in org_decision.reasons def test_filter_rejects_exact_url_and_prefix_blocklist_entries() -> None: diff --git a/tests/test_graph_projection.py b/tests/test_graph_projection.py index 96bcd5e..acb6d89 100644 --- a/tests/test_graph_projection.py +++ b/tests/test_graph_projection.py @@ -178,14 +178,58 @@ def test_core_view_count_sampling_expands_from_random_seed_by_bfs() -> None: assert {edge["id"] for edge in payload["edges"]} == {13, 14} -def 
test_core_view_seed_strategy_prefers_oldest_nodes() -> None: +def test_core_view_seed_strategy_returns_first_n_nodes_by_id() -> None: blogs, edges = sample_graph() + for blog_id in range(4, 32): + blogs.append( + { + "id": blog_id, + "url": f"https://extra-{blog_id}.example", + "normalized_url": f"https://extra-{blog_id}.example", + "domain": f"extra-{blog_id}.example", + "title": f"Extra {blog_id}", + "icon_url": None, + "status_code": 200, + "crawl_status": "FINISHED", + "friend_links_count": 0, + "last_crawled_at": None, + "created_at": "2026-03-31T00:00:00Z", + "updated_at": "2026-03-31T00:00:00Z", + }, + ) snapshot = build_graph_snapshot_payload(blogs, edges, version="v1", generated_at="2026-03-31T00:00:00Z") + payload = build_core_graph_view(snapshot, strategy="seed", limit=24) + + assert payload["meta"]["strategy"] == "seed" + assert {node["id"] for node in payload["nodes"]} == set(range(1, 25)) + assert {edge["id"] for edge in payload["edges"]} == {11, 12} + + +def test_core_view_seed_strategy_keeps_failed_parent_discovery_edges() -> None: + blogs, edges = sample_graph() + blogs[0]["crawl_status"] = "FAILED" + + snapshot = build_graph_snapshot_payload(blogs, edges, version="v1", generated_at="2026-03-31T00:00:00Z") payload = build_core_graph_view(snapshot, strategy="seed", limit=2) + assert {node["id"] for node in payload["nodes"]} == {1, 2} + assert {edge["id"] for edge in payload["edges"]} == {11} + node_by_id = {node["id"]: node for node in payload["nodes"]} + assert node_by_id[1]["crawl_status"] == "FAILED" + assert node_by_id[2]["incoming_count"] == 1 + + +def test_core_view_seed_strategy_allows_zero_nodes() -> None: + blogs, edges = sample_graph() + snapshot = build_graph_snapshot_payload(blogs, edges, version="v1", generated_at="2026-03-31T00:00:00Z") + + payload = build_core_graph_view(snapshot, strategy="seed", limit=0) + assert payload["meta"]["strategy"] == "seed" - assert {node["id"] for node in payload["nodes"][:2]} == {1, 2} + assert 
payload["meta"]["limit"] == 0 + assert payload["nodes"] == [] + assert payload["edges"] == [] def test_core_view_allows_ten_thousand_node_limit() -> None: diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index c6b2762..1ea75a5 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -5,13 +5,16 @@ from typing import Callable import pytest +from sqlalchemy import func from sqlalchemy import select from crawler.crawling.fetching.base import FetchAttempt from crawler.crawling.fetching.base import FetchResult from crawler.crawling.pipeline import CrawlPipeline from persistence_api.db import session_scope +from persistence_api.models import BlogModel from persistence_api.models import RawDiscoveredUrlModel +from persistence_api.models import SeedModel from persistence_api.repository import Repository from shared.config import Settings @@ -26,6 +29,7 @@ def __init__( batch_results: dict[str, FetchAttempt] | None = None, on_fetch: Callable[[str, float | None], None] | None = None, on_fetch_many: Callable[[list[str], float | None], None] | None = None, + valid_icon_urls: dict[str, str | None] | None = None, ) -> None: self.responses = responses self.batch_results = batch_results or {} @@ -35,6 +39,8 @@ def __init__( self.fetch_timeouts: list[float | None] = [] self.fetch_many_calls: list[tuple[list[str], int, float | None]] = [] self.batch_completion_order: list[str] = [] + self.valid_icon_urls = valid_icon_urls or {} + self.icon_validation_calls: list[tuple[str, float | None]] = [] def fetch(self, url: str, *, timeout_seconds: float | None = None) -> FetchResult: self.calls.append(url) @@ -76,6 +82,10 @@ def fetch_many( for url in urls } + def validate_icon_url(self, url: str, *, timeout_seconds: float | None = None) -> str | None: + self.icon_validation_calls.append((url, timeout_seconds)) + return self.valid_icon_urls.get(url) + def build_pipeline(tmp_path: Path) -> tuple[CrawlPipeline, Repository]: """Construct a pipeline backed by a temporary 
repository.""" @@ -104,6 +114,83 @@ def seed_blog(repository: Repository) -> dict[str, Any]: return blog +def test_bootstrap_seeds_persists_seed_rows_with_blog_links(tmp_path: Path) -> None: + """Seed CSV bootstrap should maintain a durable seed table alongside blogs.""" + pipeline, repository = build_pipeline(tmp_path) + seed_path = tmp_path / "seed.csv" + seed_path.write_text( + "url\nhttps://one.example.com/\n\nhttps://two.example.com/\n", + encoding="utf-8", + ) + + result = pipeline.bootstrap_seeds(seed_path) + + assert result == {"seed_path": str(seed_path), "imported": 2} + with session_scope(repository.session_factory) as session: + seeds = session.scalars(select(SeedModel).order_by(SeedModel.id)).all() + assert [(seed.url, seed.normalized_url, seed.source_path, seed.source_row) for seed in seeds] == [ + ( + "https://one.example.com/", + "https://one.example.com/", + str(seed_path), + 2, + ), + ( + "https://two.example.com/", + "https://two.example.com/", + str(seed_path), + 3, + ), + ] + assert all(seed.blog_id is not None for seed in seeds) + + +def test_bootstrap_seeds_does_not_reread_csv_after_seed_table_exists(tmp_path: Path) -> None: + """Re-importing after seed table initialization should not read CSV again.""" + pipeline, repository = build_pipeline(tmp_path) + seed_path = tmp_path / "seed.csv" + seed_path.write_text("url\nhttps://one.example.com/\n", encoding="utf-8") + assert pipeline.bootstrap_seeds(seed_path)["imported"] == 1 + + seed_path.write_text("url\nhttps://one.example.com/?utm_source=ignored\n", encoding="utf-8") + result = pipeline.bootstrap_seeds(seed_path) + + assert result == {"seed_path": str(seed_path), "imported": 0} + with session_scope(repository.session_factory) as session: + seeds = session.scalars(select(SeedModel)).all() + assert len(seeds) == 1 + assert seeds[0].url == "https://one.example.com/" + assert seeds[0].normalized_url == "https://one.example.com/" + assert 
session.scalar(select(func.count()).select_from(BlogModel)) == 1 + + +def test_bootstrap_seeds_replays_seed_table_before_reading_csv(tmp_path: Path) -> None: + """When durable seeds exist, bootstrap should use them instead of the CSV file.""" + pipeline, repository = build_pipeline(tmp_path) + seed_path = tmp_path / "seed.csv" + seed_path.write_text("url\nhttps://csv-only.example.com/\n", encoding="utf-8") + blog_id, inserted = repository.upsert_blog( + url="https://table-seed.example.com/", + normalized_url="https://table-seed.example.com/", + domain="table-seed.example.com", + accepted_by="seed", + seed_source_path="seed.csv", + seed_source_row=2, + ) + assert inserted is True + repository.reset() + + result = pipeline.bootstrap_seeds(seed_path) + + assert result == {"seed_path": str(seed_path), "imported": 1} + with session_scope(repository.session_factory) as session: + blog_urls = session.scalars(select(BlogModel.normalized_url).order_by(BlogModel.id)).all() + assert blog_urls == ["https://table-seed.example.com/"] + seed = session.scalar(select(SeedModel)) + assert seed is not None + assert seed.blog_id == blog_id + + def test_pipeline_persists_only_valid_friend_links(tmp_path: Path) -> None: """Only validated friend links from extracted sections should become edges.""" pipeline, repository = build_pipeline(tmp_path) @@ -142,7 +229,10 @@ def test_pipeline_persists_only_valid_friend_links(tmp_path: Path) -> None: status_code=200, text=friend_page_html, ), - } + }, + valid_icon_urls={ + "https://blog.example.com/static/favicon.png": "https://cdn.example.com/favicon.png", + }, ) discovered = pipeline._crawl_blog(blog) @@ -176,6 +266,77 @@ def test_pipeline_persists_only_valid_friend_links(tmp_path: Path) -> None: assert "depth" not in child_blog +def test_pipeline_persists_edges_for_duplicate_target_urls(tmp_path: Path) -> None: + """Repeated target URL discoveries should still preserve new source edges.""" + pipeline, repository = build_pipeline(tmp_path) + 
alpha = seed_blog(repository) + beta_id, _ = repository.upsert_blog( + url="https://beta.example/", + normalized_url="https://beta.example/", + domain="beta.example", + ) + beta = repository.get_blog(beta_id) + assert beta is not None + + homepage_html = '<html><body><footer><a href="/friends">友情链接</a></footer></body></html>' + alpha_friend_page_html = """ + <html><body><section class="friend-links"> + <a href="https://common.example/">Common</a> + </section></body></html> + """ + beta_friend_page_html = """ + <html><body><section class="friend-links"> + <a href="https://common.example/">Common Again</a> + <a href="https://common.example/">Common Duplicate</a> + </section></body></html> + """ + pipeline.fetcher = FakeFetcher( + { + "https://blog.example.com/": FetchResult( + url="https://blog.example.com/", + status_code=200, + text=homepage_html, + ), + "https://blog.example.com/friends": FetchResult( + url="https://blog.example.com/friends", + status_code=200, + text=alpha_friend_page_html, + ), + "https://beta.example/": FetchResult( + url="https://beta.example/", + status_code=200, + text=homepage_html, + ), + "https://beta.example/friends": FetchResult( + url="https://beta.example/friends", + status_code=200, + text=beta_friend_page_html, + ), + } + ) + + assert pipeline._crawl_blog(alpha) == 1 + assert pipeline._crawl_blog(beta) == 1 + + common_blog = next(blog for blog in repository.list_blogs() if blog["domain"] == "common.example") + edges = repository.list_edges() + assert {(edge["from_blog_id"], edge["to_blog_id"]) for edge in edges} == { + (alpha["blog_id"], common_blog["id"]), + (beta["blog_id"], common_blog["id"]), + } + + with session_scope(repository.session_factory) as session: + raw_rows = [ + (row.source_blog_id, row.normalized_url, row.status) + for row in session.scalars(select(RawDiscoveredUrlModel).order_by(RawDiscoveredUrlModel.id.asc())) + ] + + assert raw_rows == [ + (alpha["blog_id"], "https://common.example/", "success"), + 
(beta["blog_id"], "https://common.example/", "rule:duplicate_url"), + ] + + def test_pipeline_stores_feed_url_when_friend_link_exposes_rss(tmp_path: Path) -> None: """A friend link whose homepage exposes a valid feed should persist its feed URL.""" pipeline, repository = build_pipeline(tmp_path) @@ -295,7 +456,10 @@ def test_pipeline_persists_site_title_and_icon_metadata(tmp_path: Path) -> None: status_code=200, text=friend_page_html, ), - } + }, + valid_icon_urls={ + "https://blog.example.com/static/favicon.png": "https://cdn.example.com/favicon.png", + }, ) pipeline._crawl_blog(blog) @@ -303,11 +467,12 @@ def test_pipeline_persists_site_title_and_icon_metadata(tmp_path: Path) -> None: refreshed = repository.get_blog(int(blog["id"])) assert refreshed is not None assert refreshed["title"] == "Alpha Blog" - assert refreshed["icon_url"] == "https://blog.example.com/static/favicon.png" + assert refreshed["icon_url"] == "https://cdn.example.com/favicon.png" + assert pipeline.fetcher.icon_validation_calls[0][0] == "https://blog.example.com/static/favicon.png" -def test_pipeline_falls_back_to_origin_favicon_when_page_has_no_icon_link(tmp_path: Path) -> None: - """Missing explicit icon markup should still produce an origin favicon candidate.""" +def test_pipeline_keeps_icon_null_when_page_has_no_icon_link(tmp_path: Path) -> None: + """Missing explicit icon markup should leave persisted icon metadata empty.""" pipeline, repository = build_pipeline(tmp_path) blog = seed_blog(repository) @@ -326,7 +491,43 @@ def test_pipeline_falls_back_to_origin_favicon_when_page_has_no_icon_link(tmp_pa refreshed = repository.get_blog(int(blog["id"])) assert refreshed is not None assert refreshed["title"] == "Plain Blog" - assert refreshed["icon_url"] == "https://blog.example.com/favicon.ico" + assert refreshed["icon_url"] is None + with session_scope(repository.session_factory) as session: + stored_icon_url = session.scalar(select(BlogModel.icon_url).where(BlogModel.blog_id == 
int(blog["id"]))) + assert stored_icon_url is None + assert pipeline.fetcher.icon_validation_calls == [] + + +def test_pipeline_keeps_icon_null_when_icon_validation_fails(tmp_path: Path) -> None: + """Unreachable extracted icon candidates should not be persisted.""" + pipeline, repository = build_pipeline(tmp_path) + blog = seed_blog(repository) + + pipeline.fetcher = FakeFetcher( + { + "https://blog.example.com/": FetchResult( + url="https://blog.example.com/", + status_code=200, + text=( + "<html><head><title>Plain Blog" + '' + "" + ), + ), + }, + valid_icon_urls={"https://blog.example.com/missing.ico": None}, + ) + + pipeline._crawl_blog(blog) + + refreshed = repository.get_blog(int(blog["id"])) + assert refreshed is not None + assert refreshed["title"] == "Plain Blog" + assert refreshed["icon_url"] is None + with session_scope(repository.session_factory) as session: + stored_icon_url = session.scalar(select(BlogModel.icon_url).where(BlogModel.blog_id == int(blog["id"]))) + assert stored_icon_url is None + assert pipeline.fetcher.icon_validation_calls[0][0] == "https://blog.example.com/missing.ico" def test_pipeline_enqueues_discovered_children_without_depth_gating(tmp_path: Path) -> None: diff --git a/tests/test_repository.py b/tests/test_repository.py index 3ecf520..c3ea775 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -16,13 +16,82 @@ from persistence_api.db import session_scope from persistence_api.models import BlogLabelModel from persistence_api.models import BlogLabelTagModel +from persistence_api.models import BlogInteractionModel from persistence_api.models import BlogModel -from persistence_api.models import IngestionRequestModel +from persistence_api.models import PendingUserRegistrationModel from persistence_api.models import RawDiscoveredUrlModel +from persistence_api.models import RecommendationImpressionModel +from persistence_api.models import RecommendationRequestModel +from persistence_api.models import 
AdminHourlyStatsModel +from persistence_api.models import SeedModel from shared.contracts.enums import CrawlStatus from shared.config import Settings +class CapturingEmailDelivery: + """Test email sender that records lifecycle messages. + + Attributes: + verification_urls: Verification messages captured as `(email, url)`. + reset_urls: Password reset messages captured as `(email, url)`. + """ + + def __init__(self) -> None: + self.verification_urls: list[tuple[str, str]] = [] + self.reset_urls: list[tuple[str, str]] = [] + + def send_verification_email(self, *, to_email: str, verification_url: str) -> None: + """Capture one verification email. + + Args: + to_email: Recipient email address. + verification_url: One-time verification URL. + + Returns: + None. + """ + + self.verification_urls.append((to_email, verification_url)) + + def send_password_reset_email(self, *, to_email: str, reset_url: str) -> None: + """Capture one password reset email. + + Args: + to_email: Recipient email address. + reset_url: One-time password reset URL. + + Returns: + None. 
+ """ + + self.reset_urls.append((to_email, reset_url)) + + +def build_dev_token_repository(tmp_path: Path) -> repository_module.SQLAlchemyRepository: + """Build a repository that exposes lifecycle tokens for local flow tests.""" + settings = Settings( + db_path=tmp_path / "db.sqlite", + seed_path=tmp_path / "seed.csv", + export_dir=tmp_path / "exports", + email_dev_expose_tokens=True, + ) + return repository_module.build_repository(db_path=settings.db_path, settings=settings) + + +def register_and_verify_user( + repository: repository_module.SQLAlchemyRepository, + *, + email: str, + password: str, +) -> dict[str, object]: + """Create a user through the verify-before-persist registration flow.""" + + pending = repository.register_user(email=email, password=password) + token = pending.get("verification_token") + assert isinstance(token, str) + return repository.confirm_email_verification(token=token) + + def test_build_repository_roundtrip_works_with_path_backed_repository(tmp_path: Path) -> None: """The compatibility wrapper should still support path-backed test repositories.""" repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") @@ -74,13 +143,16 @@ def fake_repository( } -def test_repository_reset_clears_data_and_restarts_ids(tmp_path: Path) -> None: - """Reset should wipe graph data and restart primary keys.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") +def test_repository_reset_preserves_seed_rows_and_restarts_ids(tmp_path: Path) -> None: + """Reset should wipe only graph queue tables while retaining other records.""" + repository = build_dev_token_repository(tmp_path) first_blog_id, inserted = repository.upsert_blog( url="https://blog.example.com/", normalized_url="https://blog.example.com/", domain="blog.example.com", + accepted_by="seed", + seed_source_path="seed.csv", + seed_source_row=2, ) assert inserted is True second_blog_id, inserted = repository.upsert_blog( @@ -89,35 +161,82 @@ def 
test_repository_reset_clears_data_and_restarts_ids(tmp_path: Path) -> None: domain="friend.example.com", ) assert inserted is True + repository.mark_blog_result( + blog_id=first_blog_id, + crawl_status="FINISHED", + status_code=200, + friend_links_count=1, + metadata_captured=True, + title="Blog Example", + ) repository.add_edge( from_blog_id=first_blog_id, to_blog_id=second_blog_id, link_url_raw="https://friend.example.com/", link_text="Friend Blog", ) + repository.create_raw_discovered_url( + source_blog_id=first_blog_id, + normalized_url="https://raw.example.com/", + status="success", + ) repository.add_log( blog_id=first_blog_id, stage="crawl", result="ok", message="This should not be persisted", ) + verified_user = register_and_verify_user(repository, email="reset-user@example.com", password="long enough") + user = repository.login_user(email=str(verified_user["email"]), password="long enough") + batch = repository.create_random_recommendation_batch( + count=1, + visitor_id="visitor-reset", + session_id="session-reset", + source="reset-test", + page_url="http://localhost/reset-test", + ) + recommendation_item = batch["items"][0] + repository.record_blog_interaction( + event_uuid="reset-event", + event_type="detail_open", + blog_id=recommendation_item["id"], + visitor_id="visitor-reset", + session_id="session-reset", + entrance_kind="reset_test", + entrance_url="http://localhost/reset-test", + request_uuid=recommendation_item["request_uuid"], + impression_id=recommendation_item["impression_id"], + interaction_order=1, + ) result = repository.reset() assert result["ok"] is True assert result["blogs_deleted"] == 2 assert result["edges_deleted"] == 1 + assert result["raw_discovered_urls_deleted"] == 1 assert result["logs_deleted"] == 0 - assert result["ingestion_requests_deleted"] == 0 - assert result["blog_link_labels_deleted"] == 0 - assert result["blog_label_tags_deleted"] == 0 - assert result["blog_dedup_scan_items_deleted"] == 0 - assert 
result["blog_dedup_scan_runs_deleted"] == 0 + assert set(result) == { + "ok", + "blogs_deleted", + "edges_deleted", + "raw_discovered_urls_deleted", + "logs_deleted", + } assert repository.list_blogs() == [] assert repository.list_edges() == [] assert repository.list_logs() == [] assert repository.stats()["total_blogs"] == 0 assert repository.stats()["total_edges"] == 0 + with session_scope(repository.session_factory) as session: + seed = session.scalar(select(SeedModel)) + assert seed is not None + assert seed.normalized_url == "https://blog.example.com/" + assert seed.blog_id is None + assert repository.get_user_by_session_token(token=user["token"]) is not None + assert session.scalar(select(RecommendationRequestModel).limit(1)) is not None + assert session.scalar(select(RecommendationImpressionModel).limit(1)) is not None + assert session.scalar(select(BlogInteractionModel).limit(1)) is not None new_blog_id, inserted = repository.upsert_blog( url="https://reset.example.com/", @@ -126,35 +245,137 @@ def test_repository_reset_clears_data_and_restarts_ids(tmp_path: Path) -> None: ) assert inserted is True assert new_blog_id == 1 + restored_blog_id, inserted = repository.upsert_blog( + url="https://blog.example.com/", + normalized_url="https://blog.example.com/", + domain="blog.example.com", + ) + assert inserted is True + restored_stats = repository.get_blog_recommendation_stats(restored_blog_id) + assert restored_stats is not None + assert restored_stats["impressions"] == 1 + assert restored_stats["detail_opens"] == 1 def test_repository_register_login_and_session_profile(tmp_path: Path) -> None: - """Users can register, log in, and resolve their bearer session profile.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + """Users persist only after email verification, then can log in.""" + repository = build_dev_token_repository(tmp_path) + + pending = repository.register_user(email="User@Example.com", password="correct horse") + 
assert pending["sent"] is True + assert pending["verification_token"] + with session_scope(repository.session_factory) as session: + assert session.scalar(select(PendingUserRegistrationModel).where(PendingUserRegistrationModel.email == "user@example.com")) is not None + assert session.scalar(select(repository_module.UserModel).where(repository_module.UserModel.email == "user@example.com")) is None + with pytest.raises(repository_module.UserAuthError, match="invalid_credentials"): + repository.login_user(email="user@example.com", password="correct horse") - created = repository.register_user(email="User@Example.com", password="correct horse") - assert created["user"]["email"] == "user@example.com" - assert created["token"] - resolved_user = repository.get_user_by_session_token(token=created["token"]) - assert resolved_user is not None - assert resolved_user["id"] == created["user"]["id"] - assert resolved_user["email"] == created["user"]["email"] + created = repository.confirm_email_verification(token=str(pending["verification_token"])) + assert created["email"] == "user@example.com" + assert created["role"] == "user" + assert created["email_verified"] is True logged_in = repository.login_user(email="user@example.com", password="correct horse") - assert logged_in["user"]["id"] == created["user"]["id"] - assert logged_in["token"] != created["token"] + assert logged_in["user"]["id"] == created["id"] + assert logged_in["token"] + + assert repository.revoke_user_session(token=logged_in["token"]) is True + assert repository.get_user_by_session_token(token=logged_in["token"]) is None + + +def test_repository_email_verification_and_password_reset_flow(tmp_path: Path) -> None: + """Email verification and password reset tokens should be single-use.""" + repository = build_dev_token_repository(tmp_path) + + created = repository.register_user(email="verify@example.com", password="correct horse") + verification_token = created["verification_token"] + verified = 
repository.confirm_email_verification(token=verification_token) + assert verified["email_verified"] is True + + with pytest.raises(repository_module.UserAuthError, match="invalid_token"): + repository.confirm_email_verification(token=verification_token) + + login = repository.login_user(email="verify@example.com", password="correct horse") + reset_request = repository.request_password_reset(email="verify@example.com") + reset_token = reset_request["reset_token"] + reset_user = repository.reset_user_password(token=reset_token, password="new correct horse") + assert reset_user["email"] == "verify@example.com" + assert repository.get_user_by_session_token(token=login["token"]) is None + + with pytest.raises(repository_module.UserAuthError, match="invalid_credentials"): + repository.login_user(email="verify@example.com", password="correct horse") + assert repository.login_user(email="verify@example.com", password="new correct horse")["token"] + + +def test_repository_sends_lifecycle_email_and_hides_tokens_when_configured(tmp_path: Path) -> None: + """Production email mode should send links without exposing raw tokens.""" + email_delivery = CapturingEmailDelivery() + settings = Settings( + db_path=tmp_path / "db.sqlite", + seed_path=tmp_path / "seed.csv", + export_dir=tmp_path / "exports", + public_base_url="https://heyblog.example.com", + email_dev_expose_tokens=False, + ) + repository = repository_module.build_repository( + db_path=tmp_path / "db.sqlite", + settings=settings, + email_delivery=email_delivery, + ) + + verification_payload = repository.register_user(email="Mail@Example.com", password="correct horse") + assert verification_payload == { + "sent": True, + "expires_at": verification_payload["expires_at"], + } + assert len(email_delivery.verification_urls) == 1 + sent_email, verification_url = email_delivery.verification_urls[0] + assert sent_email == "mail@example.com" + assert verification_url.startswith("https://heyblog.example.com/profile?verify_token=") 
+ verification_token = verification_url.rsplit("=", 1)[1] + assert repository.confirm_email_verification(token=verification_token)["email_verified"] is True + + reset_payload = repository.request_password_reset(email="mail@example.com") + assert reset_payload == { + "sent": True, + "expires_at": reset_payload["expires_at"], + } + assert len(email_delivery.reset_urls) == 1 + reset_email, reset_url = email_delivery.reset_urls[0] + assert reset_email == "mail@example.com" + assert reset_url.startswith("https://heyblog.example.com/profile?reset_token=") + reset_token = reset_url.rsplit("=", 1)[1] + assert repository.reset_user_password(token=reset_token, password="new correct horse")["email"] == "mail@example.com" + + +def test_repository_admin_role_updates_user_identity(tmp_path: Path) -> None: + """Users should be promotable between regular user and admin roles.""" + repository = build_dev_token_repository(tmp_path) + + created = register_and_verify_user(repository, email="admin@example.com", password="correct horse") + user_id = int(created["id"]) + promoted = repository.update_user_role(user_id=user_id, role="admin") + assert promoted["role"] == "admin" + listed = repository.list_users() + assert listed["items"][0]["role"] == "admin" - assert repository.revoke_user_session(token=created["token"]) is True - assert repository.get_user_by_session_token(token=created["token"]) is None + demoted = repository.update_user_role(user_id=user_id, role="user") + assert demoted["role"] == "user" + with pytest.raises(ValueError, match="invalid_user_role"): + repository.update_user_role(user_id=user_id, role="label_admin") def test_repository_rejects_duplicate_user_and_bad_credentials(tmp_path: Path) -> None: """Email uniqueness and password validation should produce stable errors.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") - repository.register_user(email="dupe@example.com", password="long enough") + repository = 
build_dev_token_repository(tmp_path) + register_and_verify_user(repository, email="dupe@example.com", password="long enough") with pytest.raises(repository_module.UserAuthError, match="email_already_registered"): repository.register_user(email="DUPE@example.com", password="long enough") + + repository.register_user(email="pending@example.com", password="long enough") + with pytest.raises(repository_module.UserAuthError, match="email_registration_pending"): + repository.register_user(email="PENDING@example.com", password="long enough") with pytest.raises(repository_module.UserAuthError, match="invalid_credentials"): repository.login_user(email="dupe@example.com", password="wrong password") with pytest.raises(ValueError, match="password_too_short"): @@ -187,64 +408,89 @@ def test_repository_mark_blog_result_persists_site_metadata(tmp_path: Path) -> N assert blog["icon_url"] == "https://blog.example.com/favicon.ico" -def test_repository_defaults_blog_email_to_none(tmp_path: Path) -> None: - """New blogs should keep a nullable email field until claimed by a user.""" +def test_repository_keeps_accepted_blog_visible_after_crawl_failure(tmp_path: Path) -> None: + """Crawl failures must not undo durable blog acceptance. + + Args: + tmp_path: Temporary directory used for the SQLite test database. + + Returns: + None. Assertions verify acceptance fields and catalog eligibility. 
+ """ repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") blog_id, inserted = repository.upsert_blog( - url="https://blog.example.com/", - normalized_url="https://blog.example.com/", - domain="blog.example.com", + url="https://friend.example.com/", + normalized_url="https://friend.example.com/", + domain="friend.example.com", + accepted_by="rss", ) assert inserted is True + repository.mark_blog_result( + blog_id=blog_id, + crawl_status="FAILED", + status_code=413, + friend_links_count=0, + crawl_error_kind="page_too_large", + crawl_error_message="homepage exceeded max page bytes", + ) + blog = repository.get_blog(blog_id) assert blog is not None - assert blog["email"] is None - - -def test_repository_creates_ingestion_request_and_persists_blog_email(tmp_path: Path) -> None: - """Self-serve ingestion should capture the requester email onto the seed blog.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + assert blog["acceptance_status"] == "ACCEPTED" + assert blog["accepted_by"] == "rss" + assert blog["crawl_status"] == "FAILED" + assert blog["crawl_error_kind"] == "page_too_large" + assert blog["successful_crawl_at"] is None - created = repository.create_ingestion_request( - homepage_url="https://blog.example.com/", - email="owner@example.com", - ) + catalog = repository.list_blogs_catalog() + assert [item["id"] for item in catalog["items"]] == [blog_id] + assert catalog["filters"]["acceptance_status"] == "ACCEPTED" - assert created["status"] == "QUEUED" - assert created["request_id"] == created["id"] - assert created["email"] == "owner@example.com" - assert created["blog"]["email"] == "owner@example.com" - fetched = repository.get_ingestion_request( - request_id=created["request_id"], - request_token=created["request_token"], - ) - assert fetched is not None - assert fetched["normalized_url"] == "https://blog.example.com/" - assert fetched["seed_blog_id"] == created["seed_blog_id"] - assert 
fetched["seed_blog"]["blog_id"] == created["seed_blog_id"] +def test_repository_successful_crawl_clears_previous_error(tmp_path: Path) -> None: + """A later successful crawl should clear stale failure details. + Args: + tmp_path: Temporary directory used for the SQLite test database. -def test_repository_dedupes_ingestion_request_by_normalized_url(tmp_path: Path) -> None: - """Repeated requests for the same blog should reuse one active ingestion request.""" + Returns: + None. Assertions verify failure details do not survive a success. + """ repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + blog_id, _ = repository.upsert_blog( + url="https://blog.example.com/", + normalized_url="https://blog.example.com/", + domain="blog.example.com", + accepted_by="model", + ) - first = repository.create_ingestion_request( - homepage_url="https://blog.example.com/?utm_source=test", - email="owner@example.com", + repository.mark_blog_result( + blog_id=blog_id, + crawl_status="FAILED", + status_code=None, + friend_links_count=0, + crawl_error_kind="timeout", + crawl_error_message="timed out", ) - second = repository.create_ingestion_request( - homepage_url="https://blog.example.com/", - email="owner@example.com", + repository.mark_blog_result( + blog_id=blog_id, + crawl_status="FINISHED", + status_code=200, + friend_links_count=3, ) - assert first["request_id"] == second["request_id"] - assert len(repository.list_blogs()) == 1 + blog = repository.get_blog(blog_id) + assert blog is not None + assert blog["acceptance_status"] == "ACCEPTED" + assert blog["accepted_by"] == "model" + assert blog["crawl_error_kind"] is None + assert blog["crawl_error_message"] is None + assert blog["successful_crawl_at"] is not None -def test_repository_dedupes_existing_finished_blog_before_creating_request(tmp_path: Path) -> None: - """Already-finished blogs should short-circuit to a DEDUPED_EXISTING response.""" +def test_repository_defaults_blog_email_to_none(tmp_path: 
Path) -> None: + """New blogs should keep a nullable email field until claimed by a user.""" repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") blog_id, inserted = repository.upsert_blog( url="https://blog.example.com/", @@ -252,21 +498,10 @@ def test_repository_dedupes_existing_finished_blog_before_creating_request(tmp_p domain="blog.example.com", ) assert inserted is True - repository.mark_blog_result( - blog_id=blog_id, - crawl_status="FINISHED", - status_code=200, - friend_links_count=0, - ) - - response = repository.create_ingestion_request( - homepage_url="https://blog.example.com/", - email="owner@example.com", - ) - assert response["status"] == "DEDUPED_EXISTING" - assert response["blog_id"] == blog_id - assert response["request_id"] is None + blog = repository.get_blog(blog_id) + assert blog is not None + assert blog["email"] is None def test_repository_filter_stats_follow_configured_chain_order(tmp_path: Path) -> None: @@ -432,6 +667,31 @@ def test_repository_marks_duplicate_raw_urls_before_filter_chain(tmp_path: Path) assert first["id"] < duplicate["id"] +def test_repository_finds_blog_id_by_normalized_url(tmp_path: Path) -> None: + """Duplicate discovery repair should resolve accepted target blogs by URL.""" + repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + blog_id, _ = repository.upsert_blog( + url="https://friend.example/", + normalized_url="https://friend.example/", + domain="friend.example", + ) + + assert repository.find_blog_id_by_normalized_url(normalized_url="https://friend.example/") == blog_id + assert repository.find_blog_id_by_normalized_url(normalized_url="https://missing.example/") is None + + +def test_repository_finds_blog_id_by_normalized_url_identity_fallback(tmp_path: Path) -> None: + """Duplicate edge repair should survive blog identity canonicalization.""" + repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + blog_id, _ = 
repository.upsert_blog( + url="https://zhuruilei.66law.cn/", + normalized_url="https://zhuruilei.66law.cn/", + domain="zhuruilei.66law.cn", + ) + + assert repository.find_blog_id_by_normalized_url(normalized_url="https://zhuruilei.66law.cn/") == blog_id + + def test_retired_label_assignment_migration_reports_single_table_rows(tmp_path: Path) -> None: """Retired label-assignment migration should leave the single label table intact.""" repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") @@ -457,165 +717,6 @@ def test_retired_label_assignment_migration_reports_single_table_rows(tmp_path: -def test_repository_dedupes_ingestion_request_by_identity_key_but_keeps_history(tmp_path: Path) -> None: - """Alias URLs should reuse one active request, but completed history must not block a new request.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") - - first = repository.create_ingestion_request( - homepage_url="https://langhai.cc/", - email="owner@example.com", - ) - second = repository.create_ingestion_request( - homepage_url="http://blog.langhai.cc/index.html", - email="owner@example.com", - ) - - assert first["request_id"] == second["request_id"] - assert first["identity_key"] == "site:langhai.cc/" - - repository.mark_blog_result( - blog_id=first["seed_blog_id"], - crawl_status="FINISHED", - status_code=200, - friend_links_count=0, - ) - - third = repository.create_ingestion_request( - homepage_url="http://www.langhai.cc/", - email="owner@example.com", - ) - - assert third["request_id"] is None - assert third["status"] == "DEDUPED_EXISTING" - assert len(repository.list_blogs()) == 1 - - -def test_repository_run_blog_dedup_scan_removes_rejected_links_and_orphaned_targets( - tmp_path: Path, -) -> None: - """Admin rescan should drop persisted blog URLs rejected by the current decision chain.""" - settings = Settings( - db_path=tmp_path / "db.sqlite", - seed_path=tmp_path / "seed.csv", - export_dir=tmp_path / 
"exports", - friend_link_exact_url_blocklist=("https://rejected.example/",), - decision_model_consensus_enabled=False, - ) - repository = repository_module.build_repository(db_path=settings.db_path, settings=settings) - source_id, inserted = repository.upsert_blog( - url="https://source.example/", - normalized_url="https://source.example/", - domain="source.example", - ) - assert inserted is True - target_id, inserted = repository.upsert_blog( - url="https://rejected.example/", - normalized_url="https://rejected.example/", - domain="rejected.example", - ) - assert inserted is True - - with session_scope(repository.session_factory) as session: - session.add( - BlogLabelModel( - normalized_url="https://rejected.example/", - label_id={"1": 1}, - created_time=repository_module.now_utc(), - updated_time=repository_module.now_utc(), - ) - ) - - repository.add_edge( - from_blog_id=source_id, - to_blog_id=target_id, - link_url_raw="https://rejected.example/", - link_text="Rejected", - ) - - run = repository.create_blog_dedup_scan_run(crawler_was_running=True) - summary = repository.execute_blog_dedup_scan_run(run_id=int(run["id"])) - items = repository.list_blog_dedup_scan_run_items(summary["id"]) - blogs = repository.list_blogs() - - assert summary["status"] == "SUCCEEDED" - assert summary["crawler_was_running"] is True - assert summary["total_count"] == 2 - assert summary["scanned_count"] == 2 - assert summary["removed_count"] == 1 - assert summary["kept_count"] == 1 - assert repository.list_edges() == [] - assert [blog["id"] for blog in blogs] == [source_id] - assert len(items) == 1 - assert items[0]["survivor_blog_id"] is None - assert items[0]["removed_blog_id"] == target_id - assert items[0]["removed_url"] == "https://rejected.example/" - assert items[0]["reason_code"] == "exact_url_blocked" - - -def test_repository_dedup_scan_keeps_valid_blog_urls(tmp_path: Path) -> None: - """Rescan should preserve persisted blogs whose own URLs still pass the chain.""" - settings 
= Settings( - db_path=tmp_path / "db.sqlite", - seed_path=tmp_path / "seed.csv", - export_dir=tmp_path / "exports", - friend_link_exact_url_blocklist=("https://blocked.example/",), - decision_model_consensus_enabled=False, - ) - repository = repository_module.build_repository(db_path=settings.db_path, settings=settings) - first_source_id, inserted = repository.upsert_blog( - url="https://source-a.example/", - normalized_url="https://source-a.example/", - domain="source-a.example", - ) - assert inserted is True - second_source_id, inserted = repository.upsert_blog( - url="https://source-b.example/", - normalized_url="https://source-b.example/", - domain="source-b.example", - ) - assert inserted is True - target_id, inserted = repository.upsert_blog( - url="https://blocked.example/", - normalized_url="https://blocked.example/", - domain="blocked.example", - ) - assert inserted is True - survivor_id, inserted = repository.upsert_blog( - url="https://friend.example/", - normalized_url="https://friend.example/", - domain="friend.example", - ) - assert inserted is True - - repository.add_edge( - from_blog_id=first_source_id, - to_blog_id=survivor_id, - link_url_raw="https://friend.example/", - link_text="Canonical", - ) - repository.add_edge( - from_blog_id=second_source_id, - to_blog_id=target_id, - link_url_raw="https://blocked.example/", - link_text="Blocked", - ) - - run = repository.create_blog_dedup_scan_run(crawler_was_running=False) - summary = repository.execute_blog_dedup_scan_run(run_id=int(run["id"])) - items = repository.list_blog_dedup_scan_run_items(summary["id"]) - blogs = repository.list_blogs() - edges = repository.list_edges() - - assert summary["total_count"] == 4 - assert summary["scanned_count"] == 4 - assert summary["removed_count"] == 1 - assert summary["kept_count"] == 3 - assert len(items) == 1 - assert items[0]["removed_url"] == "https://blocked.example/" - assert [edge["link_url_raw"] for edge in edges] == ["https://friend.example/"] - assert 
{blog["id"] for blog in blogs} == {first_source_id, second_source_id, survivor_id} - - def test_repository_upsert_blog_collapses_tenant_like_subdomains_to_root_url(tmp_path: Path) -> None: """Tenant-like homepage subdomains should persist as one canonical root blog URL.""" repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") @@ -644,145 +745,6 @@ def test_repository_upsert_blog_collapses_tenant_like_subdomains_to_root_url(tmp assert "tenant_subdomain_collapsed" in blog["identity_reason_codes"] -def test_repository_ingestion_request_reuses_tenant_like_root_identity(tmp_path: Path) -> None: - """Tenant-like subdomains should share one queued seed blog/request identity.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") - - first = repository.create_ingestion_request( - homepage_url="https://zhuruilei.66law.cn/", - email="first@example.com", - ) - second = repository.create_ingestion_request( - homepage_url="https://lichenlvs.66law.cn/", - email="second@example.com", - ) - - assert first["status"] == "QUEUED" - assert second["status"] == "QUEUED" - assert second["request_id"] == first["request_id"] - assert second["seed_blog_id"] == first["seed_blog_id"] - assert second["identity_key"] == "site:66law.cn/" - - blog = repository.get_blog(int(first["seed_blog_id"])) - assert blog is not None - assert blog["blog_id"] == first["seed_blog_id"] - assert blog["url"] == "https://66law.cn/" - assert blog["normalized_url"] == "https://66law.cn/" - assert blog["domain"] == "66law.cn" - - -def test_repository_reused_tenant_like_ingestion_request_is_canonicalized_to_root_url(tmp_path: Path) -> None: - """Reused active requests should rewrite legacy tenant normalized_url to the registrable root URL.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") - - with session_scope(repository.session_factory) as session: - seed = BlogModel( - url="https://66law.cn/", - 
normalized_url="https://66law.cn/", - identity_key="site:66law.cn/", - identity_reason_codes='["scheme_ignored"]', - identity_ruleset_version=repository_module.IDENTITY_RULESET_VERSION, - domain="66law.cn", - email=None, - title=None, - icon_url=None, - status_code=None, - crawl_status=CrawlStatus.WAITING, - friend_links_count=0, - created_at=repository_module.now_utc(), - updated_at=repository_module.now_utc(), - ) - session.add(seed) - session.flush() - request = IngestionRequestModel( - requested_url="https://zhuruilei.66law.cn/", - normalized_url="https://zhuruilei.66law.cn/", - identity_key="site:66law.cn/", - identity_reason_codes='["scheme_ignored"]', - identity_ruleset_version=repository_module.IDENTITY_RULESET_VERSION, - requester_email="existing@example.com", - status="QUEUED", - priority=100, - seed_blog_id=int(seed.id), - matched_blog_id=None, - request_token="legacy-token", - expires_at=None, - error_message=None, - created_at=repository_module.now_utc(), - updated_at=repository_module.now_utc(), - ) - session.add(request) - session.flush() - request_id = int(request.id) - - reused = repository.create_ingestion_request( - homepage_url="https://lichenlvs.66law.cn/", - email="next@example.com", - ) - - assert reused["request_id"] == request_id - assert reused["normalized_url"] == "https://66law.cn/" - assert reused["identity_key"] == "site:66law.cn/" - - -def test_repository_dedup_scan_uses_model_consensus_when_enabled(tmp_path: Path, monkeypatch) -> None: - """Rescan should share the same model-consensus decision layer as live crawler filtering.""" - settings = Settings( - db_path=tmp_path / "db.sqlite", - seed_path=tmp_path / "seed.csv", - export_dir=tmp_path / "exports", - decision_model_root=tmp_path / "models", - decision_model_consensus_enabled=True, - ) - repository = repository_module.build_repository(db_path=settings.db_path, settings=settings) - source_id, inserted = repository.upsert_blog( - url="https://source.example/", - 
normalized_url="https://source.example/", - domain="source.example", - ) - assert inserted is True - target_id, inserted = repository.upsert_blog( - url="https://maybe-blog.example/", - normalized_url="https://maybe-blog.example/", - domain="maybe-blog.example", - ) - assert inserted is True - - run_dir = settings.decision_model_root / "structured" / "2604120847" - run_dir.mkdir(parents=True) - (run_dir / "model.joblib").write_bytes(b"stub") - (run_dir / "config.json").write_text('{"model_config":{"threshold":0.5}}', encoding="utf-8") - - class StubPredictor: - threshold = 0.5 - - def predict_proba(self, samples: list[object]) -> list[float]: - probabilities: list[float] = [] - for sample in samples: - url = str(getattr(sample, "url", "")) - probabilities.append(0.9 if "source.example" in url else 0.1) - return probabilities - - monkeypatch.setattr("crawler.crawling.decisions.consensus.load_model", lambda path: StubPredictor()) - - repository.add_edge( - from_blog_id=source_id, - to_blog_id=target_id, - link_url_raw="https://maybe-blog.example/", - link_text="Maybe Blog", - ) - - run = repository.create_blog_dedup_scan_run(crawler_was_running=False) - summary = repository.execute_blog_dedup_scan_run(run_id=int(run["id"])) - items = repository.list_blog_dedup_scan_run_items(summary["id"]) - - assert summary["removed_count"] == 1 - assert summary["kept_count"] == 1 - assert repository.list_edges() == [] - assert [blog["id"] for blog in repository.list_blogs()] == [source_id] - assert items[0]["reason_code"] == "model_consensus_all_non_blog" - - def test_repository_ensure_edge_in_session_dedupes_pending_edges(tmp_path: Path) -> None: """Refilter edge creation should ignore already-pending same-direction edges.""" repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") @@ -887,27 +849,10 @@ def test_repository_startup_migrates_legacy_tenant_like_rows_and_merges_to_root_ migrated = repository_module.build_repository(db_path=db_path) blogs = 
migrated.list_blogs() - latest_run = migrated.get_latest_blog_dedup_scan_run() assert len(blogs) == 2 assert {blog["identity_key"] for blog in blogs} == {"site:66law.cn/"} assert all(blog["identity_ruleset_version"] == repository_module.IDENTITY_RULESET_VERSION for blog in blogs) - assert latest_run is None - - -def test_repository_startup_marks_orphaned_dedup_scan_run_failed(tmp_path: Path) -> None: - """Startup should not leave stale RUNNING dedup scan summaries hanging forever.""" - db_path = tmp_path / "db.sqlite" - repository = repository_module.build_repository(db_path=db_path) - run = repository.create_blog_dedup_scan_run(crawler_was_running=False) - - restarted = repository_module.build_repository(db_path=db_path) - latest_run = restarted.get_latest_blog_dedup_scan_run() - - assert latest_run is not None - assert latest_run["id"] == run["id"] - assert latest_run["status"] == "FAILED" - assert latest_run["error_message"] == "orphaned_dedup_scan_run_cleaned_on_startup" def test_repository_requeues_processing_blogs_on_restart(tmp_path: Path) -> None: @@ -1000,61 +945,50 @@ def test_repository_claims_waiting_blogs_in_id_order(tmp_path: Path) -> None: assert second_claim["id"] == second_blog_id -def test_repository_claims_priority_blogs_by_request_priority(tmp_path: Path) -> None: - """Priority queue claiming should follow ingestion priority before request age.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") - first = repository.create_ingestion_request( - homepage_url="https://first-priority.example/", - email="owner@example.com", - ) - second = repository.create_ingestion_request( - homepage_url="https://second-priority.example/", - email="owner@example.com", +def test_repository_creates_user_seed_as_accepted_waiting_blog(tmp_path: Path) -> None: + """User seeds should be accepted as blogs while remaining crawlable.""" + repository = repository_module.build_repository( + db_path=tmp_path / "db.sqlite", + settings=Settings( 
+ db_path=tmp_path / "db.sqlite", + seed_path=tmp_path / "seed.csv", + export_dir=tmp_path / "exports", + decision_model_consensus_enabled=False, + ), ) - with session_scope(repository.session_factory) as session: - first_request = session.scalar( - repository_module.select(repository_module.IngestionRequestModel).where( - repository_module.IngestionRequestModel.id == first["request_id"] - ) - ) - second_request = session.scalar( - repository_module.select(repository_module.IngestionRequestModel).where( - repository_module.IngestionRequestModel.id == second["request_id"] - ) - ) - assert first_request is not None - assert second_request is not None - first_request.priority = 100 - second_request.priority = 200 - first_request.updated_at = repository_module.now_utc() - second_request.updated_at = repository_module.now_utc() - - claimed = repository.get_next_priority_blog() + result = repository.create_user_seed(homepage_url="https://user-blog.example.com/") + assert result["status"] == "QUEUED" + blog = repository.get_blog(result["blog_id"]) + assert blog is not None + assert blog["acceptance_status"] == "ACCEPTED" + assert blog["accepted_by"] == "user" + assert blog["crawl_status"] == "WAITING" + seeds = repository.list_seeds() + assert len(seeds) == 1 + assert seeds[0]["normalized_url"] == "https://user-blog.example.com/" + assert seeds[0]["source_path"] == "user" + assert seeds[0]["blog_id"] == result["blog_id"] + claimed = repository.get_next_waiting_blog() assert claimed is not None - assert claimed["id"] == second["seed_blog_id"] + assert claimed["id"] == result["blog_id"] -def test_repository_waiting_queue_can_exclude_priority_seed_blogs(tmp_path: Path) -> None: - """Normal queue claiming should skip active ingestion seeds when requested.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") - priority_request = repository.create_ingestion_request( - homepage_url="https://priority-seed.example/", - email="owner@example.com", - ) 
- normal_blog_id, inserted = repository.upsert_blog( - url="https://normal.example/", - normalized_url="https://normal.example/", - domain="normal.example", +def test_repository_user_seed_runs_rule_filters_only(tmp_path: Path) -> None: + """User seed submission should reject deterministic rule failures.""" + repository = repository_module.build_repository( + db_path=tmp_path / "db.sqlite", + settings=Settings( + db_path=tmp_path / "db.sqlite", + seed_path=tmp_path / "seed.csv", + export_dir=tmp_path / "exports", + decision_model_consensus_enabled=False, + ), ) - assert inserted is True - - claimed = repository.get_next_waiting_blog(include_priority=False) - assert claimed is not None - assert claimed["id"] == normal_blog_id - assert repository.get_blog(priority_request["seed_blog_id"])["crawl_status"] == "WAITING" + with pytest.raises(ValueError, match="rule:non_root_path"): + repository.create_user_seed(homepage_url="https://user-blog.example.com/posts/1") def test_repository_blog_catalog_paginates_and_filters(tmp_path: Path) -> None: @@ -1140,6 +1074,7 @@ def test_repository_blog_catalog_normalizes_query_inputs(tmp_path: Path) -> None "has_title": None, "has_icon": None, "min_connections": 0, + "acceptance_status": "ACCEPTED", } last_page = repository.list_blogs_catalog(page=99, page_size=2) @@ -1200,7 +1135,13 @@ def test_repository_blog_catalog_supports_random_sort_for_finished_sampling(tmp_ def test_repository_random_catalog_filters_admin_non_blog_and_saves_user_labels(tmp_path: Path) -> None: """Random catalog should exclude admin non-blog URLs and store public votes separately.""" - repository = repository_module.build_repository(db_path=tmp_path / "heyblog.sqlite") + settings = Settings( + db_path=tmp_path / "heyblog.sqlite", + seed_path=tmp_path / "seed.csv", + export_dir=tmp_path / "exports", + email_dev_expose_tokens=True, + ) + repository = repository_module.build_repository(db_path=settings.db_path, settings=settings) blog_tag = 
repository.create_blog_label_tag(name="blog") company_tag = repository.create_blog_label_tag(name="company") other_tag = repository.create_blog_label_tag(name="other") @@ -1253,8 +1194,8 @@ def test_repository_random_catalog_filters_admin_non_blog_and_saves_user_labels( user_label = repository.increment_blog_user_label(blog_id=kept_id, label="blog") duplicate_blog = repository.increment_blog_user_label(blog_id=kept_id, label="blog", previous_label="blog") user_non_blog = repository.increment_blog_user_label(blog_id=kept_id, label="other", previous_label="blog") - account = repository.register_user(email="labeler@example.com", password="long enough") - user_id = int(account["user"]["id"]) + account = register_and_verify_user(repository, email="labeler@example.com", password="long enough") + user_id = int(account["id"]) account_blog = repository.increment_blog_user_label(blog_id=kept_id, label="blog", user_id=user_id) account_other = repository.increment_blog_user_label(blog_id=kept_id, label="other", user_id=user_id) @@ -1273,8 +1214,175 @@ def test_repository_random_catalog_filters_admin_non_blog_and_saves_user_labels( assert raw_kept not in [item["id"] for item in admin_labeled["items"]] +def test_repository_random_catalog_only_demotes_user_non_blog_feedback(tmp_path: Path) -> None: + """Random catalog weighting should ignore blog votes and demote non-blog votes.""" + repository = repository_module.build_repository(db_path=tmp_path / "heyblog.sqlite") + repository.create_blog_label_tag(name="blog") + repository.create_blog_label_tag(name="other") + + if repository.engine.dialect.name == "sqlite": + def fixed_random(dbapi_connection: object, _connection_record: object) -> None: + dbapi_connection.create_function("random", 0, lambda: 1) + + event.listen(repository.engine, "connect", fixed_random) + repository.engine.dispose() + + boosted_id, boosted_inserted = repository.upsert_blog( + url="https://boosted.example/", + normalized_url="https://boosted.example/", + 
domain="boosted.example", + ) + baseline_id, baseline_inserted = repository.upsert_blog( + url="https://baseline.example/", + normalized_url="https://baseline.example/", + domain="baseline.example", + ) + demoted_id, demoted_inserted = repository.upsert_blog( + url="https://demoted.example/", + normalized_url="https://demoted.example/", + domain="demoted.example", + ) + assert boosted_inserted is True + assert baseline_inserted is True + assert demoted_inserted is True + for blog_id, title in ( + (boosted_id, "Boosted"), + (baseline_id, "Baseline"), + (demoted_id, "Demoted"), + ): + repository.mark_blog_result( + blog_id=blog_id, + crawl_status="FINISHED", + status_code=200, + friend_links_count=1, + metadata_captured=True, + title=title, + icon_url=None, + ) + + repository.increment_blog_user_label(blog_id=boosted_id, label="blog") + repository.increment_blog_user_label(blog_id=demoted_id, label="other") + + random_page = repository.list_blogs_catalog(status="finished", sort="random", page_size=10) + + assert [item["url"] for item in random_page["items"]] == [ + "https://baseline.example/", + "https://boosted.example/", + "https://demoted.example/", + ] + + +def test_repository_persists_random_recommendation_batch_and_interaction_stats(tmp_path: Path) -> None: + """Random recommendation batches should persist request, impression, event, and stat rows.""" + repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + for index in range(3): + blog_id, inserted = repository.upsert_blog( + url=f"https://recommend-{index}.example/", + normalized_url=f"https://recommend-{index}.example/", + domain=f"recommend-{index}.example", + accepted_by="rss", + ) + assert inserted is True + repository.mark_blog_result( + blog_id=blog_id, + crawl_status="FINISHED", + status_code=200, + friend_links_count=index, + metadata_captured=True, + title=f"Recommend {index}", + icon_url=None, + ) + + batch = repository.create_random_recommendation_batch( + count=2, + 
visitor_id="visitor-1", + session_id="session-1", + source="test", + page_url="http://localhost/random", + ) + + assert batch["requested_count"] == 2 + assert batch["served_count"] == 2 + assert [item["position"] for item in batch["items"]] == [1, 2] + first = batch["items"][0] + event = repository.record_blog_interaction( + event_uuid="event-1", + event_type="detail_open", + blog_id=first["id"], + visitor_id="visitor-1", + session_id="session-1", + entrance_kind="test_detail", + entrance_url="http://localhost/random", + request_uuid=first["request_uuid"], + impression_id=first["impression_id"], + interaction_order=1, + client_event_at="2026-06-07T12:00:00Z", + attributes={"button": "detail"}, + ) + duplicate = repository.record_blog_interaction( + event_uuid="event-1", + event_type="detail_open", + blog_id=first["id"], + visitor_id="visitor-1", + session_id="session-1", + entrance_kind="test_detail", + entrance_url="http://localhost/random", + ) + repository.record_blog_interaction( + event_uuid="event-2", + event_type="external_open", + blog_id=first["id"], + visitor_id="visitor-1", + session_id="session-1", + entrance_kind="test_external", + entrance_url="http://localhost/random", + request_uuid=first["request_uuid"], + impression_id=first["impression_id"], + interaction_order=2, + ) + stats = repository.get_blog_recommendation_stats(first["id"]) + strategy_stats = repository.get_recommendation_strategy_stats() + hourly_stats = repository.get_admin_hourly_stats() + + assert event["duplicate"] is False + assert event["entrance_kind"] == "test_detail" + assert event["entrance_url"] == "http://localhost/random" + assert duplicate["duplicate"] is True + assert stats is not None + assert stats["impressions"] == 1 + assert stats["detail_opens"] == 1 + assert stats["external_opens"] == 1 + assert stats["unique_visitors"] == 1 + assert stats["ctr"] == 2.0 + assert strategy_stats["total_requests"] == 1 + assert strategy_stats["total_impressions"] == 2 + assert 
strategy_stats["total_interactions"] == 2 + assert strategy_stats["by_strategy"][0]["clicks"] == 2 + assert hourly_stats["current_hour"]["user_count"] == 0 + assert hourly_stats["current_hour"]["random_request_count"] == 1 + assert hourly_stats["current_hour"]["random_impression_count"] == 2 + assert hourly_stats["current_hour"]["detail_open_count"] == 1 + assert hourly_stats["current_hour"]["external_open_count"] == 1 + assert hourly_stats["current_hour"]["detail_ctr"] == 0.5 + assert hourly_stats["current_hour"]["external_ctr"] == 0.5 + assert hourly_stats["current_hour"]["total_click_ctr"] == 1.0 + with session_scope(repository.session_factory) as session: + assert session.scalar(select(RecommendationRequestModel).limit(1)) is not None + stored_impression = session.scalar(select(RecommendationImpressionModel).limit(1)) + stored_interaction = session.scalar(select(BlogInteractionModel).limit(1)) + stored_hourly_stats = session.scalar(select(AdminHourlyStatsModel).limit(1)) + assert stored_impression is not None + assert stored_impression.normalized_url == first["normalized_url"] + assert "blog_id" not in RecommendationImpressionModel.__table__.columns + assert stored_interaction is not None + assert stored_interaction.normalized_url == first["normalized_url"] + assert "blog_id" not in BlogInteractionModel.__table__.columns + assert stored_hourly_stats is not None + assert stored_hourly_stats.random_impression_count == 2 + + def test_repository_blog_catalog_uses_display_identity_fallbacks_for_legacy_rows(tmp_path: Path) -> None: - """Catalog should remain usable for older rows that were created before metadata capture existed.""" + """Catalog should keep title fallback but not synthesize unverified icons.""" repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") blog_id, inserted = repository.upsert_blog( url="https://legacy.example/posts/1", @@ -1293,9 +1401,9 @@ def 
test_repository_blog_catalog_uses_display_identity_fallbacks_for_legacy_rows title_filtered = repository.list_blogs_catalog(has_title=True) icon_filtered = repository.list_blogs_catalog(has_icon=True) assert [row["id"] for row in title_filtered["items"]] == [blog_id] - assert [row["id"] for row in icon_filtered["items"]] == [blog_id] + assert icon_filtered["items"] == [] assert title_filtered["items"][0]["title"] == "legacy.example" - assert icon_filtered["items"][0]["icon_url"] == "https://legacy.example/favicon.ico" + assert title_filtered["items"][0]["icon_url"] is None def test_repository_blog_catalog_has_title_filters_on_stored_title_only(tmp_path: Path) -> None: @@ -1339,41 +1447,6 @@ def test_repository_blog_catalog_has_title_filters_on_stored_title_only(tmp_path assert payload["items"][0]["title"] == "untitled.example" -def test_repository_priority_ingestion_list_hides_private_fields_and_orders_active_first(tmp_path: Path) -> None: - """Public priority list should expose queue state without leaking request secrets.""" - repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") - - queued = repository.create_ingestion_request( - homepage_url="https://queued.example/", - email="owner@example.com", - ) - processing = repository.create_ingestion_request( - homepage_url="https://processing.example/", - email="runner@example.com", - ) - repository.mark_ingestion_request_crawling(blog_id=processing["seed_blog_id"]) - repository.mark_blog_result( - blog_id=processing["seed_blog_id"], - crawl_status="FINISHED", - status_code=200, - friend_links_count=0, - metadata_captured=True, - title="Processing Blog", - icon_url="https://processing.example/favicon.ico", - ) - - items = repository.list_priority_ingestion_requests() - - assert [item["status"] for item in items] == ["QUEUED", "COMPLETED"] - assert items[0]["request_id"] == queued["request_id"] - assert items[0]["requested_url"] == "https://queued.example/" - assert 
items[0]["blog"]["crawl_status"] == "WAITING" - assert "email" not in items[0] - assert "request_token" not in items[0] - assert "priority" not in items[0] - assert "email" not in items[0]["blog"] - - def test_repository_blog_lookup_prefers_identity_match_and_returns_reason(tmp_path: Path) -> None: """Lookup should follow the frozen identity-first match ladder.""" repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") @@ -1666,7 +1739,7 @@ def test_repository_blog_labels_are_keyed_by_url_across_reset_and_recrawl(tmp_pa labeled = repository.list_blog_labeling_candidates(label="blog", labeled=True) assert first["blog_id"] == first_raw_id - assert reset["blog_link_labels_preserved"] == 1 + assert reset["raw_discovered_urls_deleted"] == 1 assert second_raw_id != first_raw_id assert [row["id"] for row in labeled["items"]] == [second_raw_id] assert labeled["items"][0]["label_id"] == {"1": 1} @@ -1729,11 +1802,8 @@ def test_repository_blog_labeling_upsert_rejects_non_labelable_raw_targets_and_r assert labeled["items"][0]["label_id"] == {"1": 1, "4": 1} reset = repository.reset() - assert reset["blog_link_labels_deleted"] == 0 - assert reset["blog_label_tags_deleted"] == 0 - assert reset["blog_link_labels_preserved"] == 1 - assert reset["blog_labels_preserved"] == 1 - assert reset["blog_label_tags_preserved"] >= 6 + assert reset["blogs_deleted"] == 2 + assert reset["raw_discovered_urls_deleted"] == 1 assert repository.list_blog_labeling_candidates()["items"] == [] with session_scope(repository.session_factory) as session: label = session.scalar( @@ -1741,6 +1811,8 @@ def test_repository_blog_labeling_upsert_rejects_non_labelable_raw_targets_and_r ) assert label is not None assert label.label_id == {"1": 1, "4": 1} + assert session.scalar(select(BlogLabelTagModel).where(BlogLabelTagModel.slug == "blog")) is not None + assert session.scalar(select(BlogLabelTagModel).where(BlogLabelTagModel.slug == "unknown")) is not None assert 
set(label.__table__.columns.keys()) == { "normalized_url", "title", @@ -1787,7 +1859,6 @@ def test_repository_blog_label_counts_use_all_persisted_url_labels(tmp_path: Pat blog_tag = repository.create_blog_label_tag(name="blog") company_tag = repository.create_blog_label_tag(name="company") other_tag = repository.create_blog_label_tag(name="other") - unknown_tag = repository.create_blog_label_tag(name="unknown") timestamp = repository_module.now_utc() with session_scope(repository.session_factory) as session: session.add_all( @@ -2335,6 +2406,13 @@ def test_repository_blog_detail_aggregates_bidirectional_relationships(tmp_path: "title": "delta.example title", "icon_url": "https://delta.example/favicon.ico", "status_code": 200, + "acceptance_status": "ACCEPTED", + "accepted_by": None, + "accepted_at": detail["recommended_blogs"][0]["blog"]["accepted_at"], + "crawl_error_kind": None, + "crawl_error_message": None, + "last_crawl_attempt_at": detail["recommended_blogs"][0]["blog"]["last_crawl_attempt_at"], + "successful_crawl_at": detail["recommended_blogs"][0]["blog"]["successful_crawl_at"], "crawl_status": "FINISHED", "friend_links_count": 1, "last_crawled_at": detail["recommended_blogs"][0]["blog"]["last_crawled_at"], @@ -2346,6 +2424,15 @@ def test_repository_blog_detail_aggregates_bidirectional_relationships(tmp_path: "activity_at": detail["recommended_blogs"][0]["blog"]["activity_at"], "identity_complete": True, } + assert detail["relation_graphs"]["incoming"]["focus_blog_id"] == alpha_id + assert [node["blog_id"] for node in detail["relation_graphs"]["incoming"]["nodes"]] == [alpha_id, gamma_id] + assert detail["relation_graphs"]["incoming"]["edges"][0]["from_blog_id"] == gamma_id + assert detail["relation_graphs"]["outgoing"]["focus_blog_id"] == alpha_id + assert {node["blog_id"] for node in detail["relation_graphs"]["outgoing"]["nodes"]} == { + alpha_id, + beta_id, + delta_id, + } assert detail["recommended_blogs"][0]["reason"] == "mutual_connection" assert 
detail["recommended_blogs"][0]["mutual_connection_count"] == 1 assert detail["recommended_blogs"][0]["via_blogs"] == [ @@ -2357,3 +2444,216 @@ def test_repository_blog_detail_aggregates_bidirectional_relationships(tmp_path: "icon_url": "https://beta.example/favicon.ico", } ] + + +def test_repository_blog_detail_relation_graph_keeps_all_edges_within_two_layers( + tmp_path: Path, +) -> None: + """Relation graphs should keep every edge reachable within the configured two-layer depth.""" + repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + focus_id, inserted = repository.upsert_blog( + url="https://focus.example/", + normalized_url="https://focus.example/", + domain="focus.example", + ) + assert inserted is True + + outgoing_first_ids: list[int] = [] + incoming_first_ids: list[int] = [] + for index in range(12): + outgoing_id, inserted = repository.upsert_blog( + url=f"https://out-first-{index}.example/", + normalized_url=f"https://out-first-{index}.example/", + domain=f"out-first-{index}.example", + ) + assert inserted is True + outgoing_first_ids.append(outgoing_id) + repository.add_edge( + from_blog_id=focus_id, + to_blog_id=outgoing_id, + link_url_raw=f"https://out-first-{index}.example/", + link_text=f"Out first {index}", + ) + + incoming_id, inserted = repository.upsert_blog( + url=f"https://in-first-{index}.example/", + normalized_url=f"https://in-first-{index}.example/", + domain=f"in-first-{index}.example", + ) + assert inserted is True + incoming_first_ids.append(incoming_id) + repository.add_edge( + from_blog_id=incoming_id, + to_blog_id=focus_id, + link_url_raw="https://focus.example/", + link_text=f"In first {index}", + ) + + outgoing_second_ids: list[int] = [] + incoming_second_ids: list[int] = [] + for index in range(11): + outgoing_id, inserted = repository.upsert_blog( + url=f"https://out-second-{index}.example/", + normalized_url=f"https://out-second-{index}.example/", + domain=f"out-second-{index}.example", + ) + assert 
inserted is True + outgoing_second_ids.append(outgoing_id) + repository.add_edge( + from_blog_id=outgoing_first_ids[0], + to_blog_id=outgoing_id, + link_url_raw=f"https://out-second-{index}.example/", + link_text=f"Out second {index}", + ) + + incoming_id, inserted = repository.upsert_blog( + url=f"https://in-second-{index}.example/", + normalized_url=f"https://in-second-{index}.example/", + domain=f"in-second-{index}.example", + ) + assert inserted is True + incoming_second_ids.append(incoming_id) + repository.add_edge( + from_blog_id=incoming_id, + to_blog_id=incoming_first_ids[0], + link_url_raw="https://in-first-0.example/", + link_text=f"In second {index}", + ) + + detail = repository.get_blog_detail(focus_id) + + assert detail is not None + outgoing_node_ids = {node["blog_id"] for node in detail["relation_graphs"]["outgoing"]["nodes"]} + assert set(outgoing_first_ids).issubset(outgoing_node_ids) + assert set(outgoing_second_ids).issubset(outgoing_node_ids) + + incoming_node_ids = {node["blog_id"] for node in detail["relation_graphs"]["incoming"]["nodes"]} + assert set(incoming_first_ids).issubset(incoming_node_ids) + assert set(incoming_second_ids).issubset(incoming_node_ids) + + +def test_repository_blog_detail_includes_discovery_path(tmp_path: Path) -> None: + """Detail payloads should explain manual origins and crawled discovery chains.""" + repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + seed_id, inserted = repository.upsert_blog( + url="https://seed.example/", + normalized_url="https://seed.example/", + domain="seed.example", + accepted_by="seed", + seed_source_path="seed.csv", + seed_source_row=2, + ) + assert inserted is True + middle_id, inserted = repository.upsert_blog( + url="https://middle.example/", + normalized_url="https://middle.example/", + domain="middle.example", + accepted_by="rss", + ) + assert inserted is True + target_id, inserted = repository.upsert_blog( + url="https://target.example/", + 
normalized_url="https://target.example/", + domain="target.example", + accepted_by="model", + ) + assert inserted is True + first_raw = repository.create_raw_discovered_url( + source_blog_id=seed_id, + normalized_url="https://middle.example/", + status="pending", + ) + repository.update_raw_discovered_url_status(record_id=first_raw, status="success", accepted_by="rss") + second_raw = repository.create_raw_discovered_url( + source_blog_id=middle_id, + normalized_url="https://target.example/", + status="pending", + ) + repository.update_raw_discovered_url_status(record_id=second_raw, status="success", accepted_by="model") + + seed_detail = repository.get_blog_detail(seed_id) + target_detail = repository.get_blog_detail(target_id) + + assert seed_detail is not None + assert seed_detail["discovery_path"]["mode"] == "manual" + assert seed_detail["discovery_path"]["steps"][0]["accepted_by"] == "seed" + assert target_detail is not None + assert target_detail["discovery_path"]["mode"] == "crawled" + assert [step["domain"] for step in target_detail["discovery_path"]["steps"]] == [ + "seed.example", + "middle.example", + "target.example", + ] + assert target_detail["discovery_path"]["steps"][0]["accepted_label"] == "种子导入" + assert target_detail["discovery_path"]["steps"][1]["raw_source_blog_id"] == seed_id + assert target_detail["discovery_path"]["steps"][2]["raw_source_blog_id"] == middle_id + + +def test_repository_blog_detail_discovery_path_keeps_full_history(tmp_path: Path) -> None: + """Discovery paths should return every historical source step, even for long chains.""" + repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + blog_ids: list[int] = [] + domains = [f"chain-{index}.example" for index in range(15)] + for index, domain in enumerate(domains): + blog_id, inserted = repository.upsert_blog( + url=f"https://{domain}/", + normalized_url=f"https://{domain}/", + domain=domain, + accepted_by="seed" if index == 0 else "rss", + ) + assert 
inserted is True + blog_ids.append(blog_id) + + for source_id, target_domain in zip(blog_ids[:-1], domains[1:], strict=True): + raw_id = repository.create_raw_discovered_url( + source_blog_id=source_id, + normalized_url=f"https://{target_domain}/", + status="pending", + ) + repository.update_raw_discovered_url_status(record_id=raw_id, status="success", accepted_by="rss") + + detail = repository.get_blog_detail(blog_ids[-1]) + + assert detail is not None + assert detail["discovery_path"]["truncated"] is False + assert [step["domain"] for step in detail["discovery_path"]["steps"]] == domains + + +def test_repository_blog_detail_discovery_path_uses_incoming_edge_for_alias_raw_url(tmp_path: Path) -> None: + """Discovery paths should follow incoming edges when raw URLs differ from canonical blog URLs.""" + repository = repository_module.build_repository(db_path=tmp_path / "db.sqlite") + seed_id, inserted = repository.upsert_blog( + url="https://seed.example/", + normalized_url="https://seed.example/", + domain="seed.example", + accepted_by="seed", + ) + assert inserted is True + target_id, inserted = repository.upsert_blog( + url="https://target.example/", + normalized_url="https://target.example/", + domain="target.example", + accepted_by="rss", + ) + assert inserted is True + raw_id = repository.create_raw_discovered_url( + source_blog_id=seed_id, + normalized_url="https://blog.target.example/", + status="pending", + ) + repository.update_raw_discovered_url_status(record_id=raw_id, status="success", accepted_by="rss") + repository.add_edge( + from_blog_id=seed_id, + to_blog_id=target_id, + link_url_raw="https://blog.target.example/", + link_text="Target", + ) + + detail = repository.get_blog_detail(target_id) + + assert detail is not None + assert [step["domain"] for step in detail["discovery_path"]["steps"]] == [ + "seed.example", + "target.example", + ] + assert detail["discovery_path"]["steps"][1]["raw_source_blog_id"] == seed_id diff --git a/tests/test_runtime.py 
b/tests/test_runtime.py index c3f8058..f1fd8bf 100644 --- a/tests/test_runtime.py +++ b/tests/test_runtime.py @@ -30,36 +30,6 @@ def stats(self) -> dict[str, int]: return {"raw_discovered_urls": self.raw_discovered_urls} -class PriorityQueueRepository: - """Support separate priority and normal queues for fairness tests.""" - - def __init__(self, *, priority_blog_ids: list[int], normal_blog_ids: list[int]) -> None: - self.priority_blog_ids = list(priority_blog_ids) - self.normal_blog_ids = list(normal_blog_ids) - self.lock = Lock() - self.claim_order: list[int] = [] - - def get_next_priority_blog(self) -> dict[str, object] | None: - with self.lock: - if not self.priority_blog_ids: - return None - blog_id = self.priority_blog_ids.pop(0) - self.claim_order.append(blog_id) - return {"id": blog_id, "url": f"https://priority{blog_id}.example.com/"} - - def get_next_waiting_blog(self, *, include_priority: bool = True) -> dict[str, object] | None: - with self.lock: - if self.normal_blog_ids: - blog_id = self.normal_blog_ids.pop(0) - self.claim_order.append(blog_id) - return {"id": blog_id, "url": f"https://blog{blog_id}.example.com/"} - if include_priority and self.priority_blog_ids: - blog_id = self.priority_blog_ids.pop(0) - self.claim_order.append(blog_id) - return {"id": blog_id, "url": f"https://priority{blog_id}.example.com/"} - return None - - class BlockingQueuePipeline: """Pipeline stub that blocks one claimed blog until the test releases it.""" @@ -160,9 +130,9 @@ def write_exports(self) -> dict[str, object]: class RecordingPipeline: - """A fast pipeline that records claim order for fairness assertions.""" + """A fast pipeline that records claim order for queue assertions.""" - def __init__(self, repository: PriorityQueueRepository) -> None: + def __init__(self, repository: QueueRepository) -> None: self.repository = repository self.processed_ids: list[int] = [] @@ -219,6 +189,33 @@ def write_exports(self) -> dict[str, object]: return {} +class 
IdleSchedulerPipeline: + """Pipeline stub that never has queued work but records start attempts.""" + + def __init__(self) -> None: + self.repository = QueueRepository([]) + self.capacity_gate = CrawlerCapacityGate(self.repository, raw_discovered_url_limit=-1) + self.export_calls = 0 + + def process_blog_row( + self, + row: dict[str, object], + *, + on_blog_start=None, + on_blog_finish=None, + on_blog_error=None, + ) -> dict[str, int]: + if on_blog_start is not None: + on_blog_start(row) + if on_blog_finish is not None: + on_blog_finish(row, {"discovered": 0}) + return {"processed": 1, "discovered": 0, "failed": 0} + + def write_exports(self) -> dict[str, object]: + self.export_calls += 1 + return {} + + def test_runtime_stop_waits_for_active_workers_to_finish_without_starting_more_blogs() -> None: """Stop should let the current worker set finish, then prevent any new blog from starting.""" pipeline = BlockingQueuePipeline([1, 2, 3, 4, 5, 6], target_active_runs=3) @@ -308,30 +305,15 @@ def test_runtime_records_fatal_worker_errors_and_clears_stale_current_task_field assert snapshot["workers"][0]["current_url"] is None -def test_runtime_prioritizes_seed_requests_before_normal_queue() -> None: - """Priority seeds should be claimed ahead of ordinary waiting blogs.""" - repository = PriorityQueueRepository(priority_blog_ids=[101], normal_blog_ids=[1, 2]) - runtime = CrawlerRuntimeService(RecordingPipeline(repository), worker_count=1) +def test_runtime_claims_waiting_blogs_in_queue_order() -> None: + """Runtime batches should keep claiming ordinary waiting blogs until the limit is reached.""" + pipeline = RecordingPipeline(QueueRepository([1, 2, 3])) + runtime = CrawlerRuntimeService(pipeline, worker_count=1) result = runtime.run_batch(3) assert result["accepted"] is True - assert repository.claim_order[0] == 101 - - -def test_runtime_releases_normal_queue_slots_after_each_priority_seed() -> None: - """After one priority seed, the runtime should release normal queue 
claims before taking the next priority.""" - repository = PriorityQueueRepository(priority_blog_ids=[101, 102], normal_blog_ids=[1, 2, 3]) - runtime = CrawlerRuntimeService( - RecordingPipeline(repository), - worker_count=1, - priority_seed_normal_queue_slots=2, - ) - - result = runtime.run_batch(5) - - assert result["accepted"] is True - assert repository.claim_order[:4] == [101, 1, 2, 102] + assert pipeline.processed_ids == [1, 2, 3] def test_runtime_continues_to_next_waiting_blog_after_one_timeout_failure() -> None: @@ -366,6 +348,92 @@ def test_runtime_rejects_start_when_raw_discovered_url_limit_is_reached() -> Non assert runtime.status()["runner_status"] == "idle" +def test_runtime_auto_scheduler_starts_idle_runtime() -> None: + """Hourly scheduler checks should wake an idle runtime by calling start.""" + pipeline = IdleSchedulerPipeline() + runtime = CrawlerRuntimeService( + pipeline, + worker_count=1, + auto_start_interval_seconds=0.01, + ) + + runtime.start_auto_scheduler() + try: + assert runtime._scheduler_thread is not None # noqa: SLF001 - test inspects scheduler thread. + assert runtime._scheduler_thread.is_alive() # noqa: SLF001 - test inspects scheduler thread. + assert pipeline.export_calls == 0 + + runtime._scheduler_stop_event.wait(0.05) # noqa: SLF001 - give the scheduler one tick window. 
+ + assert pipeline.export_calls >= 1 + runtime.stop_auto_scheduler() + if runtime._scheduler_thread is not None: + runtime._scheduler_thread.join(timeout=2) + if runtime._thread is not None: + runtime._thread.join(timeout=2) + assert runtime.status()["runner_status"] == "idle" + finally: + runtime.stop_auto_scheduler() + if runtime._scheduler_thread is not None: + runtime._scheduler_thread.join(timeout=2) + + +def test_runtime_auto_scheduler_skips_busy_runtime() -> None: + """Scheduler checks should not restart a runtime that is already running.""" + pipeline = BlockingQueuePipeline([1], target_active_runs=1) + runtime = CrawlerRuntimeService( + pipeline, + worker_count=1, + auto_start_interval_seconds=0.01, + ) + + runtime.start() + assert pipeline.started.wait(timeout=1) + + scheduler_result = runtime.start_auto_scheduler() + assert scheduler_result["accepted"] is True + runtime._scheduler_stop_event.wait(0.03) # noqa: SLF001 - let the scheduler tick once. + + assert pipeline.run_calls == 1 + assert runtime.status()["runner_status"] in {"running", "stopping"} + + pipeline.release.set() + runtime._thread.join(timeout=2) # noqa: SLF001 - test waits for the background loop. + runtime.stop_auto_scheduler() + if runtime._scheduler_thread is not None: + runtime._scheduler_thread.join(timeout=2) + + +def test_runtime_auto_scheduler_retries_after_error_state() -> None: + """Scheduler checks should treat an errored runtime as not working.""" + pipeline = RecordingPipeline(QueueRepository([1, 2])) + runtime = CrawlerRuntimeService( + pipeline, + worker_count=1, + auto_start_interval_seconds=0.01, + ) + with runtime._lock: # noqa: SLF001 - test seeds a prior failed runtime state. + runtime._snapshot.runner_status = "error" + runtime._snapshot.last_error = "previous export failure" + + runtime.start_auto_scheduler() + try: + runtime._scheduler_stop_event.wait(0.05) # noqa: SLF001 - let the scheduler tick once. 
+ + runtime.stop_auto_scheduler() + if runtime._scheduler_thread is not None: + runtime._scheduler_thread.join(timeout=2) + if runtime._thread is not None: + runtime._thread.join(timeout=2) + + assert pipeline.processed_ids == [1, 2] + assert runtime.status()["runner_status"] == "idle" + finally: + runtime.stop_auto_scheduler() + if runtime._scheduler_thread is not None: + runtime._scheduler_thread.join(timeout=2) + + def test_runtime_allows_start_when_raw_discovered_url_limit_is_disabled() -> None: """A -1 raw URL limit should disable crawler capacity gating.""" pipeline = BlockingQueuePipeline( diff --git a/tests/test_service_split.py b/tests/test_service_split.py index 328ab83..9ed72af 100644 --- a/tests/test_service_split.py +++ b/tests/test_service_split.py @@ -2,7 +2,6 @@ import json from pathlib import Path -from time import sleep import httpx import pytest @@ -10,6 +9,7 @@ from backend.main import BackendState from backend.main import create_app as create_backend_app +from persistence_api.email_delivery import EmailDeliveryError from frontend.server import create_app as create_frontend_app from persistence_api.main import PersistenceState from persistence_api.main import build_persistence_state @@ -109,6 +109,7 @@ def test_persistence_service_exposes_supported_repository_data(tmp_path: Path) - db_path=tmp_path / "heyblog.sqlite", seed_path=tmp_path / "seed.csv", export_dir=tmp_path / "exports", + email_dev_expose_tokens=True, ) state = build_persistence_state(settings) app = create_persistence_app(state) @@ -217,50 +218,35 @@ def test_persistence_service_exposes_supported_repository_data(tmp_path: Path) - } assert detail.json()["outgoing_edges"] == [] - request = client.post( - "/internal/ingestion-requests", - json={ - "homepage_url": "https://queued.example.com/", - "email": "owner@example.com", - }, - ) - assert request.status_code == 200 - assert request.json()["request_id"] == 1 - assert request.json()["status"] == "QUEUED" - - request_status = 
client.get( - "/internal/ingestion-requests/1", - params={"request_token": request.json()["request_token"]}, - ) - assert request_status.status_code == 200 - assert request_status.json()["email"] == "owner@example.com" - - priority_requests = client.get("/internal/ingestion-requests") - assert priority_requests.status_code == 200 - assert priority_requests.json()[0]["request_id"] == 1 - assert "email" not in priority_requests.json()[0] - assert "request_token" not in priority_requests.json()[0] - assert "email" not in priority_requests.json()[0]["blog"] - auth = client.post( "/internal/users/register", json={"email": "Member@Example.com", "password": "long enough"}, ) assert auth.status_code == 200 - assert auth.json()["user"]["email"] == "member@example.com" - token = auth.json()["token"] - assert client.get("/internal/users/me", params={"session_token": token}).json()["id"] == auth.json()["user"]["id"] + assert auth.json()["sent"] is True + assert client.post( + "/internal/users/login", + json={"email": "member@example.com", "password": "long enough"}, + ).status_code == 401 + verified_auth = client.post( + "/internal/users/email-verification/confirm", + json={"token": auth.json()["verification_token"]}, + ) + assert verified_auth.status_code == 200 + assert verified_auth.json()["email"] == "member@example.com" login = client.post( "/internal/users/login", json={"email": "member@example.com", "password": "long enough"}, ) assert login.status_code == 200 - assert login.json()["user"]["id"] == auth.json()["user"]["id"] + assert login.json()["user"]["id"] == verified_auth.json()["id"] + token = login.json()["token"] + assert client.get("/internal/users/me", params={"session_token": token}).json()["id"] == verified_auth.json()["id"] - lookup = client.get("/internal/blogs/lookup", params={"url": "https://queued.example.com/"}) + lookup = client.get("/internal/blogs/lookup", params={"url": "https://blog.example.com/"}) assert lookup.status_code == 200 assert 
lookup.json()["match_reason"] == "identity_key" - assert lookup.json()["items"][0]["id"] == request.json()["seed_blog_id"] + assert lookup.json()["items"][0]["id"] == 1 filter_stats = client.get("/internal/filter-stats") assert filter_stats.status_code == 200 @@ -268,16 +254,66 @@ def test_persistence_service_exposes_supported_repository_data(tmp_path: Path) - reset = client.post("/internal/database/reset") assert reset.status_code == 200 - assert reset.json()["blogs_deleted"] == 3 + assert reset.json()["blogs_deleted"] == 2 + assert reset.json()["edges_deleted"] == 1 + assert reset.json()["raw_discovered_urls_deleted"] == 0 assert reset.json()["logs_deleted"] == 0 - assert reset.json()["ingestion_requests_deleted"] == 1 - assert reset.json()["blog_link_labels_deleted"] == 0 empty_catalog = client.get("/internal/blogs/catalog") assert empty_catalog.status_code == 200 assert empty_catalog.json()["items"] == [] +def test_persistence_user_registration_translates_email_delivery_failure(tmp_path: Path) -> None: + """SMTP failures should return a stable API error instead of leaking provider details.""" + + class FailingEmailDelivery: + """Email sender that always fails during lifecycle delivery.""" + + def send_verification_email(self, *, to_email: str, verification_url: str) -> None: + """Raise a delivery error for one verification message. + + Args: + to_email: Recipient email address. + verification_url: One-time verification URL. + + Returns: + None. + """ + + del to_email, verification_url + raise EmailDeliveryError("email_delivery_failed") + + def send_password_reset_email(self, *, to_email: str, reset_url: str) -> None: + """Raise a delivery error for one password reset message. + + Args: + to_email: Recipient email address. + reset_url: One-time password reset URL. + + Returns: + None. 
+ """ + + del to_email, reset_url + raise EmailDeliveryError("email_delivery_failed") + + settings = Settings( + db_path=tmp_path / "heyblog.sqlite", + seed_path=tmp_path / "seed.csv", + export_dir=tmp_path / "exports", + ) + state = build_persistence_state(settings) + state.repository.email_delivery = FailingEmailDelivery() + app = create_persistence_app(state) + client = TestClient(app) + + response = client.post("/internal/users/register", json={"email": "user@example.com", "password": "long enough"}) + + assert response.status_code == 502 + assert response.json()["detail"] == "email_delivery_failed" + + def test_persistence_service_removes_legacy_read_shortcuts(tmp_path: Path) -> None: """Persistence service should not expose obsolete raw-read shortcut endpoints.""" settings = Settings( @@ -300,10 +336,10 @@ def test_persistence_service_queue_routes_preserve_optional_row_serialization() class StubRepository: def __init__(self) -> None: - self.include_priority_calls: list[bool] = [] + self.calls = 0 - def get_next_waiting_blog(self, *, include_priority: bool = True) -> dict[str, object] | None: - self.include_priority_calls.append(include_priority) + def get_next_waiting_blog(self) -> dict[str, object] | None: + self.calls += 1 return { "id": 11, "blog_id": 11, @@ -311,9 +347,6 @@ def get_next_waiting_blog(self, *, include_priority: bool = True) -> dict[str, o "crawl_status": "PROCESSING", } - def get_next_priority_blog(self) -> dict[str, object] | None: - return None - repository = StubRepository() app = create_persistence_app( PersistenceState( @@ -324,8 +357,7 @@ def get_next_priority_blog(self) -> dict[str, object] | None: ) client = TestClient(app) - waiting = client.get("/internal/queue/next", params={"include_priority": "false"}) - priority = client.get("/internal/queue/priority-next") + waiting = client.get("/internal/queue/next") assert waiting.status_code == 200 assert waiting.json() == { @@ -334,78 +366,13 @@ def get_next_priority_blog(self) -> 
dict[str, object] | None: "domain": "queued.example.com", "crawl_status": "PROCESSING", } - assert repository.include_priority_calls == [False] - - assert priority.status_code == 200 - assert priority.json() is None - - -def test_persistence_service_maintenance_run_create_routes_preserve_bool_passthrough() -> None: - """Maintenance create routes should keep bool passthrough and payload shape unchanged.""" - - class StubRepository: - def __init__(self) -> None: - self.blog_dedup_calls: list[bool] = [] - - def create_blog_dedup_scan_run(self, *, crawler_was_running: bool = False) -> dict[str, object]: - self.blog_dedup_calls.append(crawler_was_running) - return {"id": 34, "status": "RUNNING", "crawler_was_running": crawler_was_running} - - repository = StubRepository() - app = create_persistence_app( - PersistenceState( - repository=repository, # type: ignore[arg-type] - graph_service=object(), # type: ignore[arg-type] - stats_service=object(), # type: ignore[arg-type] - ) - ) - client = TestClient(app) - - blog_dedup = client.post("/internal/blog-dedup-scans/runs") - - assert blog_dedup.status_code == 200 - assert blog_dedup.json() == {"id": 34, "status": "RUNNING", "crawler_was_running": False} - assert repository.blog_dedup_calls == [False] - - -def test_persistence_service_maintenance_child_list_routes_preserve_run_id_passthrough() -> None: - """Maintenance child-list routes should keep run_id passthrough and list payloads unchanged.""" - - class StubRepository: - def __init__(self) -> None: - self.blog_dedup_calls: list[int] = [] - - def list_blog_dedup_scan_run_items(self, run_id: int) -> list[dict[str, object]]: - self.blog_dedup_calls.append(run_id) - return [{"id": 2, "run_id": run_id, "reason_code": "blog_alias_collapsed"}] - - repository = StubRepository() - app = create_persistence_app( - PersistenceState( - repository=repository, # type: ignore[arg-type] - graph_service=object(), # type: ignore[arg-type] - stats_service=object(), # type: ignore[arg-type] 
- ) - ) - client = TestClient(app) - - blog_dedup = client.get("/internal/blog-dedup-scans/9/items") - - assert blog_dedup.status_code == 200 - assert blog_dedup.json() == [{"id": 2, "run_id": 9, "reason_code": "blog_alias_collapsed"}] - assert repository.blog_dedup_calls == [9] + assert repository.calls == 1 def test_persistence_service_zero_arg_list_routes_preserve_payload_passthrough() -> None: """Zero-arg list routes should keep list payloads and ordering unchanged.""" class StubRepository: - def list_priority_ingestion_requests(self) -> list[dict[str, object]]: - return [ - {"request_id": 2, "status": "QUEUED"}, - {"request_id": 5, "status": "CRAWLING"}, - ] - def list_blog_label_tags(self) -> list[dict[str, object]]: return [ {"id": 7, "slug": "blog"}, @@ -421,15 +388,8 @@ def list_blog_label_tags(self) -> list[dict[str, object]]: ) client = TestClient(app) - ingestion_requests = client.get("/internal/ingestion-requests") blog_label_tags = client.get("/internal/blog-labeling/tags") - assert ingestion_requests.status_code == 200 - assert ingestion_requests.json() == [ - {"request_id": 2, "status": "QUEUED"}, - {"request_id": 5, "status": "CRAWLING"}, - ] - assert blog_label_tags.status_code == 200 assert blog_label_tags.json() == [ {"id": 7, "slug": "blog"}, @@ -514,12 +474,96 @@ def stats(self) -> dict[str, object]: } +def test_backend_icon_proxy_returns_valid_image(monkeypatch) -> None: + """Backend icon proxy should return image bytes through the same origin.""" + app = create_backend_app(BackendState(persistence=object(), crawler=StubCrawler(), search=StubSearch())) + client = TestClient(app) + + class FakeStreamResponse: + status_code = 200 + headers = {"content-type": "image/png", "content-length": "8"} + url = "https://icons.example.com/favicon.png" + + def __enter__(self) -> "FakeStreamResponse": + return self + + def __exit__(self, *args: object) -> None: + return None + + def raise_for_status(self) -> None: + return None + + def iter_bytes(self): + 
yield b"png-bytes" + + def fake_stream(method: str, url: str, **kwargs: object) -> FakeStreamResponse: + assert method == "GET" + assert url == "https://icons.example.com/favicon.png" + assert kwargs["follow_redirects"] is False + assert kwargs["timeout"] == 8.0 + return FakeStreamResponse() + + monkeypatch.setattr("backend.main._is_private_icon_proxy_host", lambda hostname: False) + monkeypatch.setattr("backend.main.httpx.stream", fake_stream) + + response = client.get("/api/icons/proxy", params={"url": "https://icons.example.com/favicon.png"}) + + assert response.status_code == 200 + assert response.content == b"png-bytes" + assert response.headers["content-type"].startswith("image/png") + assert response.headers["cache-control"] == "public, max-age=86400" + + +def test_backend_icon_proxy_rejects_unsafe_urls() -> None: + """Backend icon proxy should reject unsupported or private URL targets.""" + app = create_backend_app(BackendState(persistence=object(), crawler=StubCrawler(), search=StubSearch())) + client = TestClient(app) + + unsupported = client.get("/api/icons/proxy", params={"url": "file:///etc/passwd"}) + loopback = client.get("/api/icons/proxy", params={"url": "http://127.0.0.1/favicon.ico"}) + + assert unsupported.status_code == 422 + assert unsupported.json()["detail"] == "unsupported_icon_url" + assert loopback.status_code == 422 + assert loopback.json()["detail"] == "unsafe_icon_url" + + +def test_backend_icon_proxy_rejects_private_redirects(monkeypatch) -> None: + """Backend icon proxy should re-check redirect targets before fetching them.""" + app = create_backend_app(BackendState(persistence=object(), crawler=StubCrawler(), search=StubSearch())) + client = TestClient(app) + + class RedirectResponse: + status_code = 302 + headers = {"location": "http://127.0.0.1/favicon.ico"} + url = "https://icons.example.com/favicon.png" + + def __enter__(self) -> "RedirectResponse": + return self + + def __exit__(self, *args: object) -> None: + return None + + 
def fake_stream(method: str, url: str, **kwargs: object) -> RedirectResponse: + del method, url, kwargs + return RedirectResponse() + + monkeypatch.setattr("backend.main._is_private_icon_proxy_host", lambda hostname: hostname == "127.0.0.1") + monkeypatch.setattr("backend.main.httpx.stream", fake_stream) + + response = client.get("/api/icons/proxy", params={"url": "https://icons.example.com/favicon.png"}) + + assert response.status_code == 422 + assert response.json()["detail"] == "unsafe_icon_url" + + def test_persistence_service_exposes_blog_labeling_endpoints(tmp_path: Path) -> None: """Persistence service should expose multi-tag candidate listing and label management.""" settings = Settings( db_path=tmp_path / "heyblog.sqlite", seed_path=tmp_path / "seed.csv", export_dir=tmp_path / "exports", + email_dev_expose_tokens=True, ) app = create_persistence_app(build_persistence_state(settings)) client = TestClient(app) @@ -661,18 +705,23 @@ def test_persistence_service_exposes_blog_labeling_endpoints(tmp_path: Path) -> assert switched_user_label.json()["label_slugs"] == ["other"] account = client.post("/internal/users/register", json={"email": "voter@example.com", "password": "long enough"}) assert account.status_code == 200 + verified_account = client.post( + "/internal/users/email-verification/confirm", + json={"token": account.json()["verification_token"]}, + ) + assert verified_account.status_code == 200 account_user_label = client.post( f"/internal/blogs/{finished.json()['id']}/user-labels", - json={"label": "blog", "user_id": account.json()["user"]["id"]}, + json={"label": "blog", "user_id": verified_account.json()["id"]}, ) assert account_user_label.status_code == 200 account_user_label_switch = client.post( f"/internal/blogs/{finished.json()['id']}/user-labels", - json={"label": "other", "user_id": account.json()["user"]["id"]}, + json={"label": "other", "user_id": verified_account.json()["id"]}, ) assert account_user_label_switch.status_code == 200 
selections = client.get( - f"/internal/users/{account.json()['user']['id']}/label-selections", + f"/internal/users/{verified_account.json()['id']}/label-selections", params={"limit": 5}, ) assert selections.status_code == 200 @@ -845,13 +894,15 @@ def get(self, path: str, params: dict[str, object] | None = None, **kwargs: obje def post(self, path: str, json: dict[str, object], **kwargs: object) -> StubResponse: del kwargs self.post_calls.append((path, json)) + if path == "/internal/users/register": + return StubResponse({"sent": True, "verification_token": "verify-token"}) return StubResponse({"token": "token", "user": {"id": 7, "email": "user@example.com"}}) client = PersistenceHttpClient("http://persistence.internal") stub = StubClient() client.client = stub # type: ignore[assignment] - assert client.register_user(email="user@example.com", password="long enough")["token"] == "token" + assert client.register_user(email="user@example.com", password="long enough")["sent"] is True assert client.login_user(email="user@example.com", password="long enough")["token"] == "token" assert client.get_user_by_session_token(token="token")["id"] == 7 assert client.list_user_label_selections(user_id=7) == [] @@ -864,6 +915,185 @@ def post(self, path: str, json: dict[str, object], **kwargs: object) -> StubResp assert ("/internal/users/7/label-stats", None) in stub.get_calls +def test_persistence_http_client_can_manage_recommendation_data() -> None: + """The split-service HTTP client should expose recommendation data helpers.""" + + class StubResponse: + def __init__(self, payload: object) -> None: + self.payload = payload + + def raise_for_status(self) -> None: + return None + + def json(self) -> object: + return self.payload + + class StubClient: + def __init__(self) -> None: + self.get_calls: list[tuple[str, dict[str, object] | None]] = [] + self.post_calls: list[tuple[str, dict[str, object]]] = [] + + def get(self, path: str, params: dict[str, object] | None = None, **kwargs: 
object) -> StubResponse: + del kwargs + self.get_calls.append((path, params)) + return StubResponse({"ok": True}) + + def post(self, path: str, json: dict[str, object], **kwargs: object) -> StubResponse: + del kwargs + self.post_calls.append((path, json)) + return StubResponse({"ok": True, "items": []}) + + client = PersistenceHttpClient("http://persistence.internal") + stub = StubClient() + client.client = stub # type: ignore[assignment] + + client.create_random_recommendation_batch( + count=9, + visitor_id="visitor-1", + session_id="session-1", + source="random_page", + ) + client.record_blog_interaction( + event_uuid="event-1", + event_type="detail_open", + blog_id=42, + visitor_id="visitor-1", + session_id="session-1", + entrance_kind="test_detail", + entrance_url="http://localhost/random", + request_uuid="request-1", + impression_id=12, + position=1, + ) + assert client.get_blog_recommendation_stats(42) == {"ok": True} + assert client.get_recommendation_strategy_stats() == {"ok": True} + assert client.get_admin_hourly_stats(limit=6) == {"ok": True} + + assert stub.post_calls == [ + ( + "/internal/recommendations/random-blog-batches", + { + "count": 9, + "visitor_id": "visitor-1", + "session_id": "session-1", + "user_id": None, + "source": "random_page", + "page_url": None, + "context": None, + }, + ), + ( + "/internal/recommendation-events", + { + "event_uuid": "event-1", + "event_type": "detail_open", + "blog_id": 42, + "visitor_id": "visitor-1", + "session_id": "session-1", + "entrance_kind": "test_detail", + "entrance_url": "http://localhost/random", + "request_uuid": "request-1", + "impression_id": 12, + "position": 1, + "interaction_order": 1, + "user_id": None, + "client_event_at": None, + "attributes": None, + }, + ), + ] + assert stub.get_calls == [ + ("/internal/blogs/42/recommendation-stats", None), + ("/internal/recommendation-stats", None), + ("/internal/admin/hourly-stats", {"limit": 6}), + ] + + +def 
test_backend_routes_forward_recommendation_data_with_optional_user() -> None: + """Backend public recommendation routes should preserve attribution fields.""" + + class RecommendationPersistenceStub: + def __init__(self) -> None: + self.batch_payload: dict[str, object] | None = None + self.event_payload: dict[str, object] | None = None + + def get_user_by_session_token(self, *, token: str) -> dict[str, object] | None: + assert token == "session-token" + return {"id": 7, "email": "user@example.com"} + + def create_random_recommendation_batch(self, **kwargs: object) -> dict[str, object]: + self.batch_payload = kwargs + return {"request_uuid": "request-1", "items": []} + + def record_blog_interaction(self, **kwargs: object) -> dict[str, object]: + self.event_payload = kwargs + return {"event_uuid": kwargs["event_uuid"], "duplicate": False} + + def get_blog_recommendation_stats(self, blog_id: int) -> dict[str, object]: + return {"blog_id": blog_id, "impressions": 1} + + def get_recommendation_strategy_stats(self) -> dict[str, object]: + return {"total_requests": 1, "by_strategy": []} + + def get_admin_hourly_stats(self, *, limit: int = 24) -> dict[str, object]: + return {"limit": limit, "items": []} + + persistence = RecommendationPersistenceStub() + app = create_backend_app( + BackendState( + persistence=persistence, + crawler=StubCrawler(), + search=StubSearch(), + admin_token="secret-token", + ) + ) + client = TestClient(app) + + batch_response = client.post( + "/api/recommendations/random-blog-batches", + headers={"authorization": "Bearer session-token"}, + json={ + "count": 9, + "visitor_id": "visitor-1", + "session_id": "session-1", + "source": "random_page", + }, + ) + event_response = client.post( + "/api/recommendation-events", + headers={"authorization": "Bearer session-token"}, + json={ + "event_uuid": "event-1", + "event_type": "detail_open", + "blog_id": 42, + "visitor_id": "visitor-1", + "session_id": "session-1", + "entrance_kind": "test_detail", + 
"entrance_url": "http://localhost/random", + "request_uuid": "request-1", + "impression_id": 12, + "position": 1, + }, + ) + blog_stats = client.get("/api/blogs/42/stats") + admin_stats = client.get("/api/admin/recommendation-stats", headers=admin_headers()) + admin_hourly_stats = client.get("/api/admin/hourly-stats?limit=6", headers=admin_headers()) + + assert batch_response.status_code == 200 + assert event_response.status_code == 200 + assert blog_stats.json() == {"blog_id": 42, "impressions": 1} + assert admin_stats.json() == {"total_requests": 1, "by_strategy": []} + assert admin_hourly_stats.json() == {"limit": 6, "items": []} + assert persistence.batch_payload is not None + assert persistence.batch_payload["user_id"] == 7 + assert persistence.batch_payload["visitor_id"] == "visitor-1" + assert persistence.event_payload is not None + assert persistence.event_payload["user_id"] == 7 + assert persistence.event_payload["event_type"] == "detail_open" + assert persistence.event_payload["entrance_kind"] == "test_detail" + assert persistence.event_payload["entrance_url"] == "http://localhost/random" + + def test_settings_can_enable_postgres_runtime(tmp_path: Path, monkeypatch) -> None: """Environment loading should allow the split runtime to point at Postgres.""" monkeypatch.setenv("HEYBLOG_DB_DSN", "postgresql://heyblog:heyblog@persistence-db:5432/heyblog") @@ -885,6 +1115,78 @@ def test_settings_loads_candidate_link_page_limit(monkeypatch) -> None: assert settings.max_candidate_links_per_page == 17 +def test_settings_loads_runtime_auto_start_interval(monkeypatch) -> None: + """Environment loading should expose the crawler idle wakeup interval.""" + monkeypatch.setenv("HEYBLOG_RUNTIME_AUTO_START_INTERVAL_SECONDS", "42.5") + + settings = Settings.from_env() + + assert settings.runtime_auto_start_interval_seconds == 42.5 + + +def test_persistence_http_client_export_reads_use_search_snapshot() -> None: + """Crawler export compatibility reads should use the split 
persistence snapshot route.""" + seen_paths: list[str] = [] + + def handle_request(request: httpx.Request) -> httpx.Response: + seen_paths.append(request.url.path) + return httpx.Response( + 200, + request=request, + json={ + "blogs": [{"id": 1, "url": "https://blog.example.com/"}], + "edges": [{"id": 2, "from_blog_id": 1, "to_blog_id": 3}], + "logs": [], + }, + ) + + client = PersistenceHttpClient("http://persistence.test") + client.client = httpx.Client( + base_url="http://persistence.test", + transport=httpx.MockTransport(handle_request), + ) + + assert client.list_blogs() == [{"id": 1, "url": "https://blog.example.com/"}] + assert client.list_edges() == [{"id": 2, "from_blog_id": 1, "to_blog_id": 3}] + assert seen_paths == ["/internal/search-snapshot", "/internal/search-snapshot"] + + +def test_settings_loads_smtp_email_delivery_configuration(monkeypatch) -> None: + """Environment loading should expose SMTP lifecycle email settings.""" + monkeypatch.setenv("HEYBLOG_EMAIL_PROVIDER", "smtp") + monkeypatch.setenv("HEYBLOG_EMAIL_FROM", "no-reply@heyblog.example") + monkeypatch.setenv("HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS", "false") + monkeypatch.setenv("HEYBLOG_SMTP_HOST", "smtp.heyblog.example") + monkeypatch.setenv("HEYBLOG_SMTP_PORT", "465") + monkeypatch.setenv("HEYBLOG_SMTP_USERNAME", "smtp-user") + monkeypatch.setenv("HEYBLOG_SMTP_PASSWORD", "smtp-password") + monkeypatch.setenv("HEYBLOG_SMTP_USE_TLS", "false") + monkeypatch.setenv("HEYBLOG_SMTP_USE_SSL", "true") + monkeypatch.setenv("HEYBLOG_SMTP_TIMEOUT_SECONDS", "3.5") + + settings = Settings.from_env() + + assert settings.email_provider == "smtp" + assert settings.email_from == "no-reply@heyblog.example" + assert settings.email_dev_expose_tokens is False + assert settings.smtp_host == "smtp.heyblog.example" + assert settings.smtp_port == 465 + assert settings.smtp_username == "smtp-user" + assert settings.smtp_password == "smtp-password" + assert settings.smtp_use_tls is False + assert settings.smtp_use_ssl is 
True + assert settings.smtp_timeout_seconds == 3.5 + + +def test_settings_defaults_to_hiding_lifecycle_tokens(monkeypatch) -> None: + """Environment loading should keep lifecycle tokens hidden unless opted in.""" + monkeypatch.delenv("HEYBLOG_EMAIL_DEV_EXPOSE_TOKENS", raising=False) + + settings = Settings.from_env() + + assert settings.email_dev_expose_tokens is False + + def test_settings_default_runtime_model_root_uses_runtime_resources(monkeypatch) -> None: """Environment loading should default runtime model reads to published resources.""" monkeypatch.delenv("HEYBLOG_DECISION_MODEL_ROOT", raising=False) @@ -1112,15 +1414,9 @@ def test_backend_service_preserves_supported_public_api_shape(monkeypatch) -> No "is_labeled": bool(tag_ids or label_id), }, "register_user": lambda self, email, password: { - "token": "user-token", - "expires_at": "2026-06-25T00:00:00Z", - "user": { - "id": 42, - "email": email.lower(), - "display_name": email.split("@", 1)[0], - "created_at": "2026-05-26T00:00:00Z", - "updated_at": "2026-05-26T00:00:00Z", - }, + "sent": True, + "verification_token": "verify-token", + "expires_at": "2026-06-12T00:00:00Z", }, "login_user": lambda self, email, password: { "token": "login-token", @@ -1350,79 +1646,21 @@ def test_backend_service_preserves_supported_public_api_shape(monkeypatch) -> No }, }, "list_logs": lambda self: [], - "create_ingestion_request": lambda self, homepage_url, email: { - "id": 9, - "request_id": 9, - "requested_url": homepage_url, - "normalized_url": homepage_url, - "email": email, - "status": "QUEUED", - "priority": 100, - "seed_blog_id": 3, - "matched_blog_id": None, - "blog_id": 3, - "request_token": "token-123", - "expires_at": None, - "error_message": None, - "created_at": "2026-04-05T00:00:00Z", - "updated_at": "2026-04-05T00:00:00Z", - "seed_blog": None, - "matched_blog": None, - "blog": None, - }, - "get_ingestion_request": lambda self, request_id, request_token: { - "id": request_id, - "request_id": request_id, - 
"requested_url": "https://queued.example/", - "normalized_url": "https://queued.example/", - "email": "owner@example.com", + "create_user_seed": lambda self, homepage_url: { "status": "QUEUED", - "priority": 100, - "seed_blog_id": 3, - "matched_blog_id": None, - "blog_id": 3, - "request_token": request_token, - "expires_at": None, - "error_message": None, - "created_at": "2026-04-05T00:00:00Z", - "updated_at": "2026-04-05T00:00:00Z", - "seed_blog": None, - "matched_blog": None, - "blog": None, + "blog_id": 44, + "inserted": True, + "blog": { + "id": 44, + "blog_id": 44, + "url": homepage_url, + "normalized_url": homepage_url, + "domain": "queued-user.example", + "acceptance_status": "ACCEPTED", + "accepted_by": "user", + "crawl_status": "WAITING", + }, }, - "list_priority_ingestion_requests": lambda self: [ - { - "request_id": 9, - "requested_url": "https://queued.example/", - "normalized_url": "https://queued.example/", - "status": "QUEUED", - "seed_blog_id": 3, - "matched_blog_id": None, - "blog_id": 3, - "error_message": None, - "created_at": "2026-04-05T00:00:00Z", - "updated_at": "2026-04-05T00:00:00Z", - "blog": { - "id": 3, - "url": "https://queued.example/", - "normalized_url": "https://queued.example/", - "domain": "queued.example", - "title": "Queued Example", - "icon_url": None, - "status_code": None, - "crawl_status": "WAITING", - "friend_links_count": 0, - "last_crawled_at": None, - "created_at": "2026-04-05T00:00:00Z", - "updated_at": "2026-04-05T00:00:00Z", - "incoming_count": 0, - "outgoing_count": 0, - "connection_count": 0, - "activity_at": None, - "identity_complete": True, - }, - } - ], "lookup_blog_candidates": lambda self, url: { "query_url": url, "normalized_query_url": "https://queued.example/", @@ -1461,13 +1699,8 @@ def test_backend_service_preserves_supported_public_api_shape(monkeypatch) -> No "ok": True, "blogs_deleted": 3, "edges_deleted": 4, + "raw_discovered_urls_deleted": 5, "logs_deleted": 0, - "ingestion_requests_deleted": 1, - 
"blog_link_labels_deleted": 0, - "blog_label_tags_deleted": 0, - "blog_label_subjects_preserved": 1, - "blog_link_labels_preserved": 1, - "blog_label_tags_preserved": 2, }, "requeue_failed_blogs": lambda self: {"requeued": 7}, }, @@ -1513,8 +1746,8 @@ def test_backend_service_preserves_supported_public_api_shape(monkeypatch) -> No auth = client.post("/api/auth/register", json={"email": "Member@Example.com", "password": "long enough"}) assert auth.status_code == 200 - assert auth.json()["token"] == "user-token" - assert auth.json()["user"]["email"] == "member@example.com" + assert auth.json()["sent"] is True + assert auth.json()["verification_token"] == "verify-token" login = client.post("/api/auth/login", json={"email": "member@example.com", "password": "long enough"}) assert login.status_code == 200 assert login.json()["token"] == "login-token" @@ -1629,23 +1862,14 @@ def fake_get(url: str, **kwargs: object) -> httpx.Response: assert requeue.status_code == 200 assert requeue.json() == {"requeued": 7} - ingestion = client.post( - "/api/ingestion-requests", - json={"homepage_url": "https://queued.example/", "email": "owner@example.com"}, + user_seed = client.post( + "/api/blogs/user-seeds", + json={"homepage_url": "https://queued-user.example/"}, ) - assert ingestion.status_code == 200 - assert ingestion.json()["request_id"] == 9 - - ingestion_status = client.get("/api/ingestion-requests/9?request_token=token-123") - assert ingestion_status.status_code == 200 - assert ingestion_status.json()["status"] == "QUEUED" - - priority_ingestion = client.get("/api/ingestion-requests") - assert priority_ingestion.status_code == 200 - assert priority_ingestion.json()[0]["request_id"] == 9 - assert "email" not in priority_ingestion.json()[0] - assert "request_token" not in priority_ingestion.json()[0] - assert "email" not in priority_ingestion.json()[0]["blog"] + assert user_seed.status_code == 200 + assert user_seed.json()["blog_id"] == 44 + assert 
user_seed.json()["blog"]["accepted_by"] == "user" + assert user_seed.json()["blog"]["crawl_status"] == "WAITING" lookup = client.get("/api/blogs/lookup?url=https://queued.example/") assert lookup.status_code == 200 @@ -1655,11 +1879,8 @@ def fake_get(url: str, **kwargs: object) -> httpx.Response: reset = client.post("/api/admin/database/reset", headers=admin_headers()) assert reset.status_code == 200 assert reset.json()["blogs_deleted"] == 3 - assert reset.json()["ingestion_requests_deleted"] == 1 - assert reset.json()["blog_link_labels_deleted"] == 0 - assert reset.json()["blog_label_tags_deleted"] == 0 - assert reset.json()["blog_link_labels_preserved"] == 1 - assert reset.json()["blog_label_tags_preserved"] == 2 + assert reset.json()["edges_deleted"] == 4 + assert reset.json()["raw_discovered_urls_deleted"] == 5 assert reset.json()["search_reindexed"] is True assert search.reindex_calls == 3 @@ -1836,7 +2057,7 @@ def list_logs(self) -> list[dict[str, object]]: return [] def reset(self) -> dict[str, object]: - return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "logs_deleted": 0} + return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "raw_discovered_urls_deleted": 0, "logs_deleted": 0} app = create_backend_app( BackendState(persistence=LabelingValidationStub(), crawler=StubCrawler(), search=StubSearch(), admin_token="secret-token") @@ -1911,7 +2132,7 @@ def list_logs(self) -> list[dict[str, object]]: return [] def reset(self) -> dict[str, object]: - return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "logs_deleted": 0} + return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "raw_discovered_urls_deleted": 0, "logs_deleted": 0} app = create_backend_app( BackendState(persistence=CatalogValidationStub(), crawler=StubCrawler(), search=StubSearch()) @@ -1927,8 +2148,8 @@ def reset(self) -> dict[str, object]: assert response.json()["detail"] == "Unsupported crawl status: BAD" -def 
test_backend_lookup_and_priority_list_surface_upstream_validation_errors() -> None: - """Public lookup and priority list endpoints should preserve upstream failures.""" +def test_backend_lookup_and_user_seed_surface_upstream_validation_errors() -> None: + """Public lookup and user seed endpoints should preserve upstream failures.""" class LookupValidationStub: def stats(self) -> dict[str, object]: @@ -1954,9 +2175,9 @@ def lookup_blog_candidates(self, *, url: str) -> dict[str, object]: response = httpx.Response(422, request=request, json={"detail": "Unsupported homepage URL"}) raise httpx.HTTPStatusError("boom", request=request, response=response) - def list_priority_ingestion_requests(self) -> list[dict[str, object]]: - request = httpx.Request("GET", "http://persistence/internal/ingestion-requests") - response = httpx.Response(503, request=request, json={"detail": "upstream_unavailable"}) + def create_user_seed(self, *, homepage_url: str) -> dict[str, object]: + request = httpx.Request("POST", "http://persistence/internal/user-seeds") + response = httpx.Response(422, request=request, json={"detail": "rule:blocked_tld"}) raise httpx.HTTPStatusError("boom", request=request, response=response) def get_blog(self, blog_id: int) -> None: @@ -1987,7 +2208,7 @@ def list_logs(self) -> list[dict[str, object]]: return [] def reset(self) -> dict[str, object]: - return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "logs_deleted": 0} + return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "raw_discovered_urls_deleted": 0, "logs_deleted": 0} app = create_backend_app( BackendState(persistence=LookupValidationStub(), crawler=StubCrawler(), search=StubSearch()) @@ -1998,9 +2219,9 @@ def reset(self) -> dict[str, object]: assert lookup.status_code == 422 assert lookup.json()["detail"] == "Unsupported homepage URL" - priority = client.get("/api/ingestion-requests") - assert priority.status_code == 503 - assert priority.json()["detail"] == "upstream_unavailable" + 
user_seed = client.post("/api/blogs/user-seeds", json={"homepage_url": "https://blog.sayori.org/"}) + assert user_seed.status_code == 422 + assert user_seed.json()["detail"] == "rule:blocked_tld" def test_backend_graph_neighbors_surfaces_upstream_not_found() -> None: @@ -2052,7 +2273,7 @@ def list_logs(self) -> list[dict[str, object]]: return [] def reset(self) -> dict[str, object]: - return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "logs_deleted": 0} + return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "raw_discovered_urls_deleted": 0, "logs_deleted": 0} app = create_backend_app( BackendState(persistence=GraphNeighborNotFoundStub(), crawler=StubCrawler(), search=StubSearch()) @@ -2100,6 +2321,7 @@ def runtime_status(self) -> dict[str, object]: "ok": True, "blogs_deleted": 0, "edges_deleted": 0, + "raw_discovered_urls_deleted": 0, "logs_deleted": 0, }, }, @@ -2140,229 +2362,73 @@ def test_backend_admin_routes_require_valid_token() -> None: assert invalid.json()["detail"] == "admin_auth_invalid" -def test_backend_admin_routes_fail_when_auth_not_configured() -> None: +def test_backend_admin_routes_require_verified_admin_session_role() -> None: + """Admin APIs should reject non-admin sessions even when called directly.""" + + class PersistenceStub: + def stats(self) -> dict[str, object]: + return {} + + def get_user_by_session_token(self, *, token: str) -> dict[str, object] | None: + users = { + "plain-user-token": { + "id": 1, + "role": "user", + "is_active": True, + "email_verified": True, + }, + "unverified-admin-token": { + "id": 2, + "role": "admin", + "is_active": True, + "email_verified": False, + }, + "admin-session-token": { + "id": 3, + "role": "admin", + "is_active": True, + "email_verified": True, + }, + } + return users.get(token) + app = create_backend_app( BackendState( - persistence=type("PersistenceStub", (), {"stats": lambda self: {}})(), + persistence=PersistenceStub(), crawler=StubCrawler(), search=StubSearch(), + 
admin_token="secret-token", ) ) client = TestClient(app) - response = client.get("/api/admin/runtime/status", headers=admin_headers()) + user_response = client.get("/api/admin/runtime/status", headers=admin_headers("plain-user-token")) + assert user_response.status_code == 403 + assert user_response.json()["detail"] == "admin_auth_forbidden" - assert response.status_code == 503 - assert response.json()["detail"] == "admin_auth_not_configured" + unverified_response = client.get("/api/admin/runtime/status", headers=admin_headers("unverified-admin-token")) + assert unverified_response.status_code == 403 + assert unverified_response.json()["detail"] == "admin_auth_forbidden" + admin_response = client.get("/api/admin/runtime/status", headers=admin_headers("admin-session-token")) + assert admin_response.status_code == 200 + assert admin_response.json()["runner_status"] == "idle" -def test_persistence_service_exposes_blog_dedup_scan_endpoints(tmp_path: Path) -> None: - """Persistence should expose decision-rescan summary and removed item endpoints.""" - settings = Settings( - db_path=tmp_path / "heyblog.sqlite", - seed_path=tmp_path / "seed.csv", - export_dir=tmp_path / "exports", - ) - app = create_persistence_app(build_persistence_state(settings)) - client = TestClient(app) - first = client.post( - "/internal/blogs/upsert", - json={ - "url": "https://langhai.cc/", - "normalized_url": "https://langhai.cc/", - "domain": "langhai.cc", - }, +def test_backend_admin_routes_fail_when_auth_not_configured() -> None: + app = create_backend_app( + BackendState( + persistence=type("PersistenceStub", (), {"stats": lambda self: {}})(), + crawler=StubCrawler(), + search=StubSearch(), + ) ) - assert first.status_code == 200 - - run = client.post("/internal/blog-dedup-scans/runs", params={"crawler_was_running": "true"}) - assert run.status_code == 200 - assert run.json()["status"] == "RUNNING" - assert run.json()["total_count"] == 1 - - executed = 
client.post(f"/internal/blog-dedup-scans/{run.json()['id']}/execute") - assert executed.status_code == 200 - assert executed.json()["status"] == "SUCCEEDED" - assert executed.json()["total_count"] == 1 - - latest = client.get("/internal/blog-dedup-scans/latest") - assert latest.status_code == 200 - assert latest.json()["id"] == run.json()["id"] - - items = client.get(f"/internal/blog-dedup-scans/{run.json()['id']}/items") - assert items.status_code == 200 - assert isinstance(items.json(), list) - - legacy_shortcut = client.post("/internal/blog-dedup-scans", params={"crawler_was_running": "true"}) - assert legacy_shortcut.status_code == 404 - - -def test_backend_blog_dedup_scan_stops_and_restarts_crawler_and_blocks_runtime_actions() -> None: - """Admin scan should orchestrate stop/scan/restart and expose maintenance lock.""" - - class ScanPersistenceStub: - def __init__(self) -> None: - self.finalize_calls: list[dict[str, object]] = [] - self.run = { - "id": 7, - "status": "PENDING", - "ruleset_version": "2026-04-07-v2", - "total_count": 3, - "scanned_count": 0, - "removed_count": 0, - "kept_count": 0, - "crawler_was_running": True, - "crawler_restart_attempted": False, - "crawler_restart_succeeded": False, - "search_reindexed": False, - "error_message": None, - } - - def stats(self) -> dict[str, object]: - return { - "pending_tasks": 0, - "processing_tasks": 0, - "finished_tasks": 0, - "failed_tasks": 0, - "total_blogs": 0, - "total_edges": 0, - "status_counts": {}, - "average_friend_links": 0.0, - } - - def list_blogs(self) -> list[dict[str, object]]: - return [] - - def get_blog(self, blog_id: int) -> None: - return None - - def get_blog_detail(self, blog_id: int) -> None: - return None - - def list_edges(self) -> list[dict[str, object]]: - return [] - - def graph(self) -> dict[str, object]: - return {"nodes": [], "edges": []} - - def graph_view(self, **_: object) -> dict[str, object]: - return {"nodes": [], "edges": [], "meta": {}} - - def graph_neighbors(self, 
blog_id: int, hops: int = 1, limit: int = 120) -> dict[str, object]: - return {"nodes": [], "edges": [], "meta": {}} - - def latest_graph_snapshot(self) -> dict[str, object]: - return {"version": "v1"} - - def graph_snapshot(self, version: str) -> dict[str, object]: - return {"version": version, "nodes": [], "edges": [], "meta": {}} - - def list_logs(self) -> list[dict[str, object]]: - return [] - - def reset(self) -> dict[str, object]: - return {"ok": True, "blogs_deleted": 0, "edges_deleted": 0, "logs_deleted": 0} - - def create_blog_dedup_scan_run(self, *, crawler_was_running: bool = False) -> dict[str, object]: - self.run.update( - { - "status": "RUNNING", - "crawler_was_running": crawler_was_running, - "crawler_restart_attempted": False, - "crawler_restart_succeeded": False, - "search_reindexed": False, - "error_message": None, - } - ) - return dict(self.run) - - def execute_blog_dedup_scan_run(self, *, run_id: int) -> dict[str, object]: - sleep(0.05) - self.run.update( - { - "id": run_id, - "status": "SUCCEEDED", - "scanned_count": 3, - "removed_count": 2, - "kept_count": 1, - } - ) - return dict(self.run) - - def finalize_blog_dedup_scan_run(self, **payload: object) -> dict[str, object]: - self.finalize_calls.append(payload) - self.run.update( - { - "id": int(payload["run_id"]), - "crawler_restart_attempted": bool(payload["crawler_restart_attempted"]), - "crawler_restart_succeeded": bool(payload["crawler_restart_succeeded"]), - "search_reindexed": bool(payload["search_reindexed"]), - "error_message": payload.get("error_message"), - } - ) - return dict(self.run) - - def latest_blog_dedup_scan_run(self) -> dict[str, object]: - return dict(self.run) - - def list_blog_dedup_scan_run_items(self, run_id: int) -> list[dict[str, object]]: - return [ - { - "id": 1, - "run_id": run_id, - "removed_url": "http://blog.langhai.cc/index.html", - "reason_code": "blog_alias_collapsed", - "survivor_selection_basis": "FINISHED, created_at=2026-04-05T00:00:00Z, id=1", - } - ] - 
- class ToggleCrawler(StubCrawler): - def __init__(self) -> None: - self.runner_status = "running" - self.stop_calls = 0 - self.start_calls = 0 - - def runtime_status(self) -> dict[str, object]: - payload = super().runtime_status() - payload["runner_status"] = self.runner_status - return payload - - def stop(self) -> dict[str, object]: - self.stop_calls += 1 - self.runner_status = "idle" - return self.runtime_status() - - def start(self) -> dict[str, object]: - self.start_calls += 1 - self.runner_status = "running" - return self.runtime_status() - - persistence = ScanPersistenceStub() - crawler = ToggleCrawler() - search = StubSearch() - app = create_backend_app(BackendState(persistence=persistence, crawler=crawler, search=search, admin_token="secret-token")) client = TestClient(app) - response = client.post("/api/admin/blog-dedup-scans", headers=admin_headers()) + response = client.get("/api/admin/runtime/status", headers=admin_headers()) - assert response.status_code == 200 - assert response.json()["status"] == "RUNNING" - assert response.json()["total_count"] == 3 - assert crawler.stop_calls == 1 - for _ in range(20): - latest = client.get("/api/admin/blog-dedup-scans/latest", headers=admin_headers()) - assert latest.status_code == 200 - if latest.json()["status"] == "SUCCEEDED": - break - sleep(0.05) - assert latest.json()["crawler_restart_attempted"] is True - assert latest.json()["crawler_restart_succeeded"] is True - assert latest.json()["search_reindexed"] is True - assert crawler.start_calls == 1 - items = client.get("/api/admin/blog-dedup-scans/7/items", headers=admin_headers()) - assert items.status_code == 200 - assert items.json()[0]["reason_code"] == "blog_alias_collapsed" + assert response.status_code == 503 + assert response.json()["detail"] == "admin_auth_not_configured" def test_search_service_queries_rebuilt_snapshot(tmp_path: Path) -> None: @@ -2419,6 +2485,48 @@ def fake_get(url: str, timeout: float) -> OkResponse: assert 
health.json()["status"] == "ok" +def test_frontend_api_proxy_preserves_cache_control(tmp_path: Path, monkeypatch) -> None: + """Frontend API proxy should keep cache headers for proxied icon images.""" + + class AsyncClientStub: + def __init__(self, timeout: float) -> None: + self.timeout = timeout + + async def __aenter__(self) -> "AsyncClientStub": + return self + + async def __aexit__(self, *args: object) -> None: + return None + + async def request(self, method: str, target: str, **kwargs: object) -> httpx.Response: + assert method == "GET" + assert target == "http://backend:8000/api/icons/proxy" + assert self.timeout == 60.0 + return httpx.Response( + 200, + content=b"icon", + headers={"content-type": "image/png", "cache-control": "public, max-age=86400"}, + request=httpx.Request(method, target), + ) + + monkeypatch.setattr("frontend.server.httpx.AsyncClient", AsyncClientStub) + settings = Settings( + db_path=tmp_path / "heyblog.sqlite", + seed_path=tmp_path / "seed.csv", + export_dir=tmp_path / "exports", + backend_base_url="http://backend:8000", + ) + app = create_frontend_app(settings) + client = TestClient(app) + + response = client.get("/api/icons/proxy", params={"url": "https://icons.example.com/favicon.png"}) + + assert response.status_code == 200 + assert response.content == b"icon" + assert response.headers["content-type"].startswith("image/png") + assert response.headers["cache-control"] == "public, max-age=86400" + + def test_frontend_root_serves_spa_entry(tmp_path: Path) -> None: """Frontend root should serve the SPA entry instead of redirecting.""" settings = Settings( @@ -2608,7 +2716,7 @@ async def request( app = create_frontend_app(settings) client = TestClient(app) - response = client.post("/api/ingestion-requests", json={"homepage_url": "https://blog.example.com"}) + response = client.post("/api/blogs/user-seeds", json={"homepage_url": "https://blog.example.com"}) assert response.status_code == 200 assert 
captured["headers"].pop("x-request-id") diff --git a/tests/test_site_metadata.py b/tests/test_site_metadata.py index 4e1d1ba..93e1dad 100644 --- a/tests/test_site_metadata.py +++ b/tests/test_site_metadata.py @@ -4,7 +4,7 @@ def test_extract_site_metadata_ignores_non_http_icon_urls() -> None: - """Unsafe icon schemes should be skipped in favor of a safe fallback.""" + """Unsafe icon schemes should be skipped without synthesizing a fallback.""" metadata = extract_site_metadata( "https://blog.example.com/", """ @@ -19,11 +19,11 @@ def test_extract_site_metadata_ignores_non_http_icon_urls() -> None: ) assert metadata.title == "Alpha Blog" - assert metadata.icon_url == "https://blog.example.com/favicon.ico" + assert metadata.icon_url is None def test_extract_site_metadata_returns_none_when_page_url_is_not_http() -> None: - """Fallback favicon should only be synthesized for HTTP(S) page URLs.""" + """Non-HTTP page URLs should not produce icon candidates.""" metadata = extract_site_metadata( "ftp://blog.example.com/", "Alpha Blog", diff --git a/tests/test_visualization_benchmark.py b/tests/test_visualization_benchmark.py new file mode 100644 index 0000000..0eb29f5 --- /dev/null +++ b/tests/test_visualization_benchmark.py @@ -0,0 +1,34 @@ +import json +from pathlib import Path + +from scripts.generate_visualization_benchmark import main + + +def test_visualization_benchmark_has_planted_communities(tmp_path: Path, monkeypatch) -> None: + """Generated benchmark should contain 100 clustered blogs and sparse bridges.""" + + output = tmp_path / "benchmark.json" + monkeypatch.setattr( + "sys.argv", + ["generate_visualization_benchmark.py", "--output", str(output)], + ) + + main() + + payload = json.loads(output.read_text(encoding="utf-8")) + nodes = payload["nodes"] + edges = payload["edges"] + community_by_id = {node["id"]: node["component_id"] for node in nodes} + internal_edges = [ + edge + for edge in edges + if community_by_id[edge["from_blog_id"]] == 
community_by_id[edge["to_blog_id"]] + ] + bridge_edges = [edge for edge in edges if edge not in internal_edges] + + assert len(nodes) == 100 + assert 420 <= len(edges) <= 560 + assert len(internal_edges) > len(bridge_edges) * 12 + assert len(bridge_edges) <= 35 + assert payload["meta"]["benchmark"]["seed"] == 42 + assert all({"x", "y", "z"}.issubset(node) for node in nodes) diff --git a/tracker/log-system.md b/tracker/log-system.md deleted file mode 100644 index 0626477..0000000 --- a/tracker/log-system.md +++ /dev/null @@ -1,78 +0,0 @@ -# Unified Logging System - -Created: 2026-05-24 - -## Background - -The project had crawler-only lifecycle logs, legacy no-op database log endpoints, -and task-specific maintenance events. The goal is to build one shared logging -system used by every service, with unified fields, service-specific directories, -separate files for review, and moderate event granularity. - -## Goals - -- Use one shared module for Python service logging. -- Write readable service-specific files under a common root. -- Keep application, error, and access logs separate. -- Carry request ids across frontend, backend, crawler, search, and persistence. -- Keep domain maintenance events separate from application logs. -- Avoid new dependencies unless explicitly requested. - -## Decisions - -- Use Python standard `logging` rather than adding `structlog` or `loguru`. -- Default to JSON lines for production-friendly parsing. -- Store local logs under `logs/{app,error,access}//` as hourly - service slices, and Docker logs under `volumes/logs/{app,error,access}//`. -- Preserve `/internal/logs` as a legacy no-op compatibility endpoint. -- Keep URL refilter and blog dedup progress as persisted domain events. - -## Progress - -- Added `shared.observability` with logging setup, JSON formatter, request-id - middleware, access logging, and event helper. -- Added log configuration fields to `Settings`. 
-- Wired backend, crawler, persistence-api, search, and frontend service entrypoints. -- Added `x-request-id` propagation through frontend proxy and shared HTTP clients. -- Migrated crawler lifecycle logs and model-consensus warnings to stable events. -- Updated Docker Compose to mount shared log volume. -- Updated API/config/architecture documentation and `.env` / `.env.example`. -- Added observability regression tests. -- Added a dedicated `url-refilter` event logger so dangerous raw URL refilter - runs write service-parallel files under `logs/app/url-refilter/` and - `logs/error/url-refilter/` instead of being buried in normal persistence logs. -- 2026-05-25 follow-up: backend now configures the same `url-refilter` - service logger and logs the start, crawler stop, persistence execution - request, completion, and failure boundaries so clicking the Admin button - creates a dedicated log entry immediately. -- 2026-05-25 follow-up: URL refilter lifecycle logs now include explicit - start, finish, failed-exit, and backend-close events with `reason` plus a - human message; execution progress logs are emitted at each 10,000 scanned - raw URLs, with final completion represented by the finish event. - -## Validation - -- Passed: `./.venv/bin/pytest tests/test_observability_logging.py tests/test_crawler_service.py tests/test_service_split.py::test_persistence_service_exposes_supported_repository_data tests/test_service_split.py::test_backend_url_refilter_run_stops_crawler_and_persists_progress_events` -- Passed: `./.venv/bin/pytest tests/test_observability_logging.py tests/test_service_split.py tests/test_pipeline.py tests/test_crawler_model_consensus.py` -- Passed: `./.venv/bin/pytest` (`152 passed`) -- Verified by tests: type-specific hourly log directories, JSON fields, request-id middleware, and shared HTTP client propagation. -- 2026-05-25 update: added regression coverage for dedicated service-parallel - maintenance log directories. 
- -## Closure - -Completed on 2026-05-24. The logging system now has a shared implementation, -service entrypoint integration, Docker volume routing, documentation, and -regression coverage. - -Update 2026-05-24: log files now group by type first (`app`, `error`, -`access`), then by service, slice hourly using `-YYYYMMDD-HH.log`, -delete slices older than `HEYBLOG_LOG_RETENTION_DAYS`, and expose those settings -in `.env` and `.env.example`. - -## Remaining Risks - -- Uvicorn's own access logger still exists unless deployment disables or - redirects it; HeyBlog now writes its own normalized access hourly slices. -- This pass does not implement audit logs or metrics; the boundaries are - documented for future work.