Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""add document list order index

Revision ID: fae1b2c3d4e5
Revises: f9d0e1f2a3b4
Create Date: 2026-06-30 07:15:00.000000
"""

from __future__ import annotations

from typing import Sequence, Union

from alembic import op


# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the migration it
# revises; both mirror the "Revision ID" / "Revises" lines of the module docstring.
revision: str = "fae1b2c3d4e5"
down_revision: Union[str, Sequence[str], None] = "f9d0e1f2a3b4"
# No branch label and no cross-branch dependency are declared for this revision.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the composite ordering index on ``documents`` if it is missing.

    The index covers ``(user_id, namespace, status, updated_at DESC,
    document_id ASC)``. ``IF NOT EXISTS`` makes the statement safe to re-run
    against a database where the index was already created.
    """
    # NOTE(review): presumably backs a listing that filters by user/namespace,
    # excludes archived rows with an inequality on `status`, and orders by
    # updated_at DESC, document_id ASC. With a non-equality predicate on
    # `status` ahead of the sort columns the planner may still need a sort
    # step - confirm with EXPLAIN on the target database.
    create_index_sql = """
CREATE INDEX IF NOT EXISTS idx_documents_user_namespace_status_updated
ON documents (user_id, namespace, status, updated_at DESC, document_id ASC)
"""
    op.execute(create_index_sql)


def downgrade() -> None:
    """Drop the composite ordering index created by ``upgrade``.

    ``IF EXISTS`` keeps the downgrade from failing when the index is absent.
    """
    drop_index_sql = "DROP INDEX IF EXISTS idx_documents_user_namespace_status_updated"
    op.execute(drop_index_sql)
21 changes: 16 additions & 5 deletions apps/api/app/api/v1/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,20 @@ async def _archive_document_response(
@router.get("")
async def list_documents(
    namespace: str | None = Query(None, max_length=255),
    page: int = Query(1, ge=1, description="Page number"),
    page_size: int = Query(50, ge=1, le=200, description="Items per page"),
    current_user: CurrentUser = Depends(with_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Return one page of the current user's documents in a namespace.

    Args:
        namespace: Optional namespace filter (at most 255 characters); it is
            passed through ``normalize_retrieval_namespace`` before use.
        page: 1-based page number (validated ``>= 1``).
        page_size: Number of items per page (validated ``1..200``).
        current_user: Authenticated caller; only ``user_id`` is read here.
        db: Async database session for the request.

    Returns:
        The document service's response, returned unchanged. The service is
        responsible for the payload shape (namespace, documents and pagination
        metadata), so this handler no longer builds its own wrapper dict.
    """
    effective_namespace = normalize_retrieval_namespace(namespace)
    return await _document_service.list_documents(
        db,
        user_id=current_user.user_id,
        namespace=effective_namespace,
        page=page,
        page_size=page_size,
    )


@router.get("/{document_id}")
Expand Down Expand Up @@ -85,6 +86,10 @@ async def list_document_chunks(
page: int = Query(1, ge=1, description="Page number"),
page_size: int = Query(50, ge=1, le=200, description="Items per page"),
chunk_type: DocumentChunkType | None = Query(None, description="Chunk type filter"),
include_asset_urls: bool = Query(
False,
description="Generate 7-day asset URLs for image/table chunks when true",
),
current_user: CurrentUser = Depends(with_current_user),
db: AsyncSession = Depends(get_db),
):
Expand All @@ -95,6 +100,7 @@ async def list_document_chunks(
page=page,
page_size=page_size,
chunk_type=chunk_type,
include_asset_urls=include_asset_urls,
)
if response is None:
raise NotFoundException(
Expand All @@ -109,6 +115,10 @@ async def list_document_chunks(
async def get_document_chunk(
document_id: str,
document_chunk_id: str,
include_asset_urls: bool = Query(
False,
description="Generate 7-day asset URLs for image/table chunks when true",
),
current_user: CurrentUser = Depends(with_current_user),
db: AsyncSession = Depends(get_db),
):
Expand All @@ -117,6 +127,7 @@ async def get_document_chunk(
user_id=current_user.user_id,
document_id=document_id,
document_chunk_id=document_chunk_id,
include_asset_urls=include_asset_urls,
)
if response is None:
raise NotFoundException(
Expand Down
21 changes: 20 additions & 1 deletion apps/api/app/repositories/document_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,35 @@ async def list_by_user_namespace(
*,
user_id: str,
namespace: str,
limit: int,
offset: int,
) -> Sequence[Document]:
result = await db.execute(
select(Document)
.where(Document.user_id == user_id)
.where(Document.namespace == namespace)
.where(Document.status != "archived")
.order_by(Document.updated_at.desc())
.order_by(Document.updated_at.desc(), Document.document_id.asc())
.limit(limit)
.offset(offset)
)
return result.scalars().all()

async def count_by_user_namespace(
    self,
    db: AsyncSession,
    *,
    user_id: str,
    namespace: str,
) -> int:
    """Count the non-archived documents owned by a user in one namespace.

    Args:
        db: Async session used to execute the count query.
        user_id: Owner whose documents are counted.
        namespace: Namespace the documents must belong to.

    Returns:
        The number of matching rows as a plain ``int``.
    """
    # All criteria are AND-ed together: same owner, same namespace, and any
    # status other than "archived".
    count_stmt = select(func.count(Document.document_id)).where(
        Document.user_id == user_id,
        Document.namespace == namespace,
        Document.status != "archived",
    )
    outcome = await db.execute(count_stmt)
    total = outcome.scalar_one()
    return int(total)

async def get_document(
self,
db: AsyncSession,
Expand Down
74 changes: 72 additions & 2 deletions apps/api/app/services/documents/lifecycle_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,44 @@
invalidate_retrieval_cache_namespaces,
)
from shared.services.retrieval.graph.service import DocumentGraphService, GraphScope
from shared.services.storage.result_storage import ResultStorage, get_result_storage

# Expiry, in seconds (7 days), handed to the result storage as `expires_in`
# when an asset URL is generated for a document chunk.
_DOCUMENT_CHUNK_ASSET_URL_EXPIRES_SECONDS = 7 * 24 * 60 * 60
# Chunk types eligible for an asset URL; any other chunk type gets no URL.
_MEDIA_CHUNK_TYPES = frozenset({"image", "table"})


def _datetime_payload(value: datetime | None) -> str | None:
return value.isoformat() if value else None


def _document_chunk_asset_url(
    *,
    chunk_type: str,
    job_id: str | None,
    file_path: str | None,
    include_asset_urls: bool,
    result_storage: ResultStorage | None,
) -> str | None:
    """Build an expiring asset URL for a media chunk, or ``None``.

    A URL is only attempted when the caller opted in, the chunk type is one
    of ``_MEDIA_CHUNK_TYPES``, both ``job_id`` and ``file_path`` are non-empty,
    and a storage backend was supplied.

    Args:
        chunk_type: Chunk type checked against ``_MEDIA_CHUNK_TYPES``.
        job_id: Job identifier forwarded to the storage backend.
        file_path: Artifact reference forwarded as ``artifact_ref``.
        include_asset_urls: Opt-in flag; when false nothing is generated.
        result_storage: Backend exposing ``generate_artifact_url``.

    Returns:
        The generated URL, or ``None`` when a precondition is not met or the
        backend raised (the failure is logged as a warning and swallowed).
    """
    if not include_asset_urls:
        return None
    if chunk_type not in _MEDIA_CHUNK_TYPES:
        return None
    if not (job_id and file_path):
        return None
    if result_storage is None:
        return None

    try:
        asset_url = result_storage.generate_artifact_url(
            job_id=job_id,
            artifact_ref=file_path,
            expires_in=_DOCUMENT_CHUNK_ASSET_URL_EXPIRES_SECONDS,
        )
    except Exception as exc:
        # Best-effort: a URL failure must not break the chunk payload itself.
        logger.warning(f"Failed to generate document chunk asset URL (ignored): {exc}")
        return None
    return asset_url


def document_payload(document) -> dict[str, Any]:
return {
"document_id": document.document_id,
Expand Down Expand Up @@ -53,13 +85,31 @@ async def list_documents(
*,
user_id: str,
namespace: str,
) -> list[dict[str, Any]]:
page: int,
page_size: int,
) -> dict[str, Any]:
total = await self._repository.count_by_user_namespace(
db,
user_id=user_id,
namespace=namespace,
)
documents = await self._repository.list_by_user_namespace(
db,
user_id=user_id,
namespace=namespace,
limit=page_size,
offset=(page - 1) * page_size,
)
return [document_payload(document) for document in documents]
return {
"namespace": namespace,
"documents": [document_payload(document) for document in documents],
"pagination": {
"page": page,
"page_size": page_size,
"total": total,
"total_pages": math.ceil(total / page_size) if total else 0,
},
}

async def list_document_chunks(
self,
Expand All @@ -70,6 +120,7 @@ async def list_document_chunks(
page: int,
page_size: int,
chunk_type: str | None,
include_asset_urls: bool,
) -> dict[str, Any] | None:
document = await self._repository.get_document(
db,
Expand Down Expand Up @@ -110,10 +161,14 @@ async def list_document_chunks(
offset=(page - 1) * page_size,
chunk_type=normalized_chunk_type,
)
result_storage = get_result_storage() if include_asset_urls else None
chunks = [
self._chunk_payload(
chunk=chunk,
section=section,
job_id=job_result.job_id,
include_asset_urls=include_asset_urls,
result_storage=result_storage,
)
for chunk, section, job_result in rows
]
Expand All @@ -140,6 +195,7 @@ async def get_document_chunk(
user_id: str,
document_id: str,
document_chunk_id: str,
include_asset_urls: bool,
) -> dict[str, Any] | None:
document = await self._repository.get_document(
db,
Expand All @@ -159,6 +215,7 @@ async def get_document_chunk(
return None

chunk, section, job_result = row
result_storage = get_result_storage() if include_asset_urls else None
return {
"document_id": document.document_id,
"namespace": document.namespace,
Expand All @@ -167,6 +224,9 @@ async def get_document_chunk(
"chunk": self._chunk_payload(
chunk=chunk,
section=section,
job_id=job_result.job_id,
include_asset_urls=include_asset_urls,
result_storage=result_storage,
),
}

Expand All @@ -191,6 +251,9 @@ def _chunk_payload(
*,
chunk: DocumentChunk,
section: DocumentSection | None,
job_id: str | None,
include_asset_urls: bool,
result_storage: ResultStorage | None,
) -> dict[str, Any]:
chunk_type = _normalize_chunk_type(chunk.chunk_type)
file_path = chunk.file_path
Expand All @@ -205,6 +268,13 @@ def _chunk_payload(
"file_path": file_path,
"sort_order": chunk.sort_order,
"metadata": chunk.chunk_metadata,
"asset_url": _document_chunk_asset_url(
chunk_type=chunk_type,
job_id=job_id,
file_path=file_path,
include_asset_urls=include_asset_urls,
result_storage=result_storage,
),
"created_at": _datetime_payload(chunk.created_at),
}

Expand Down
Loading
Loading