def get_urls(self):
    """Return this admin's URL patterns with the zip-upload route added.

    The custom pattern is prepended to the patterns returned by the parent
    class so that "upload-zip/" is tried before the built-in routes.
    """
    urls = super().get_urls()
    custom = [
        path(
            "upload-zip/",
            # admin_view() wraps the view with the admin site's access checks.
            self.admin_site.admin_view(self.zip_upload_view),
            name="ask_pdfresource_upload_zip",
        ),
    ]
    return custom + urls


def zip_upload_view(self, request):
    """Bulk-import PDF resources from an uploaded zip archive.

    GET renders the upload form. POST expects a ``zip_file`` upload that
    contains the PDFs plus exactly one CSV metadata file with ``filename``
    and ``title`` columns. Every CSV row that resolves to a readable PDF in
    the archive is stored as a ``PDFResource`` and then pushed to the
    Knowledge Base; a failed push keeps the stored record and is reported
    as a warning.

    Args:
        request: The admin HTTP request.

    Returns:
        The rendered upload form for non-POST requests, a redirect back to
        this page when the upload itself is unusable (nothing is saved in
        that case), or a redirect to the changelist with a summary message
        once the rows have been processed.
    """
    # Local import: only this view needs zlib, for the error type below.
    import zlib

    changelist_url = reverse("admin:ask_pdfresource_changelist")

    if request.method != "POST":
        return render(
            request,
            "admin/ask/pdfresource/upload_zip.html",
            {
                **self.admin_site.each_context(request),
                "opts": self.model._meta,
                "title": "Upload zip of PDFs",
                "changelist_url": changelist_url,
            },
        )

    zip_file = request.FILES.get("zip_file")
    if not zip_file:
        messages.error(request, "Please select a zip file to upload.")
        return HttpResponseRedirect(request.path)

    try:
        archive = zipfile.ZipFile(zip_file)
    except zipfile.BadZipFile:
        messages.error(request, "The uploaded file is not a valid zip archive.")
        return HttpResponseRedirect(request.path)

    # Everything ZipFile.read() can raise for one member: missing name,
    # encrypted entry (RuntimeError), unsupported compression method,
    # CRC mismatch, or a corrupt deflate stream.
    read_errors = (
        KeyError,
        RuntimeError,
        NotImplementedError,
        zipfile.BadZipFile,
        zlib.error,
    )

    with archive:
        # skip macOS Finder metadata: __MACOSX/ dir and AppleDouble "._" twins
        def _is_real(name):
            base = os.path.basename(name)
            return not name.startswith("__MACOSX/") and not base.startswith("._") and base != ""

        real_names = [n for n in archive.namelist() if _is_real(n)]

        csv_names = [n for n in real_names if n.lower().endswith(".csv")]
        if not csv_names:
            messages.error(request, "Zip must contain one CSV metadata file (filename,title).")
            return HttpResponseRedirect(request.path)
        if len(csv_names) > 1:
            messages.error(request, f"Zip must contain exactly one CSV; found {len(csv_names)}.")
            return HttpResponseRedirect(request.path)

        try:
            # utf-8-sig drops the BOM that spreadsheet exports often prepend.
            csv_text = archive.read(csv_names[0]).decode("utf-8-sig")
        except UnicodeDecodeError:
            messages.error(request, "CSV metadata file must be UTF-8 encoded.")
            return HttpResponseRedirect(request.path)
        except read_errors:
            messages.error(request, "Could not read the CSV metadata file from the zip.")
            return HttpResponseRedirect(request.path)

        reader = csv.DictReader(io.StringIO(csv_text))
        headers = [(h or "").strip() for h in (reader.fieldnames or [])]
        if not {"filename", "title"}.issubset(headers):
            messages.error(request, "CSV must have 'filename' and 'title' columns.")
            return HttpResponseRedirect(request.path)
        # Key the rows by the stripped header names that were just validated;
        # otherwise a header such as "filename " passes the check above but
        # row.get("filename") finds nothing and every row is skipped.
        reader.fieldnames = headers

        try:
            # Parse the whole CSV before saving anything so that a malformed
            # file is rejected up front instead of aborting a partial import.
            rows = list(reader)
        except csv.Error:
            messages.error(request, "CSV metadata file could not be parsed.")
            return HttpResponseRedirect(request.path)

        zip_members = {n: n for n in real_names}
        # also index by basename so CSV can refer to bare filenames regardless of zip layout
        for n in real_names:
            zip_members.setdefault(os.path.basename(n), n)

        total = 0
        saved = 0
        kb_pushed = 0
        kb_failed = 0
        for total, row in enumerate(rows, start=1):
            filename = (row.get("filename") or "").strip()
            title = (row.get("title") or "").strip()
            if not filename or not title:
                messages.warning(request, f"Row {total}: missing filename or title; skipped.")
                continue

            base_name = os.path.basename(filename)
            member = zip_members.get(filename) or zip_members.get(base_name)
            if not member:
                messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.")
                continue

            try:
                pdf_bytes = archive.read(member)
            except read_errors:
                messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.")
                continue

            # The CSV may name any archive member (even the CSV itself), so
            # sniff for the PDF header before storing the bytes as a PDF.
            # The first 1024 bytes are searched because PDF readers commonly
            # tolerate leading junk before the "%PDF-" marker.
            if b"%PDF-" not in pdf_bytes[:1024]:
                messages.warning(request, f"Row {total}: '{filename}' is not a PDF; skipped.")
                continue

            obj = PDFResource(title=title, creator=request.user, modifier=request.user)
            obj.file.save(base_name, ContentFile(pdf_bytes), save=True)
            saved += 1

            try:
                result = add_pdf_to_kb(pdf_bytes, base_name, title)
                obj.mcp_kb_document_id = result.get("doc_id")
                obj.save(update_fields=["mcp_kb_document_id"])
                kb_pushed += 1
            except Exception as e:
                # Deliberately broad: the PDF is already stored locally, so a
                # KB failure of any kind is logged and reported, not fatal.
                logger.exception("Bulk: failed to send PDF to KB: %s", filename)
                messages.warning(request, f"Row {total}: '{title}' saved but KB push failed: {e}")
                kb_failed += 1

        if kb_failed:
            messages.warning(
                request,
                f"Saved {saved} of {total} PDFs; {kb_pushed} pushed to Knowledge Base, "
                f"{kb_failed} failed KB push (PDFs are stored locally but not searchable).",
            )
        else:
            messages.success(request, f"Imported {saved} of {total} PDFs.")
        return HttpResponseRedirect(changelist_url)
if attempt == attempts: + break + backoff = 2 ** (attempt - 1) + logger.warning( + "KB PDF push failed (attempt %d/%d) for %s: %s; retrying in %ds", + attempt, attempts, filename, e, backoff, + ) + time.sleep(backoff) + + raise last_exc def delete_kb_document(doc_id): diff --git a/hospexplorer/ask/templates/admin/ask/pdfresource/change_list.html b/hospexplorer/ask/templates/admin/ask/pdfresource/change_list.html new file mode 100644 index 0000000..adbea74 --- /dev/null +++ b/hospexplorer/ask/templates/admin/ask/pdfresource/change_list.html @@ -0,0 +1,8 @@ +{% extends "admin/change_list.html" %} + +{% block object-tools-items %} +
  • + Upload zip of PDFs +
  • + {{ block.super }} +{% endblock %} diff --git a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html new file mode 100644 index 0000000..acffced --- /dev/null +++ b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html @@ -0,0 +1,30 @@ +{% extends "admin/base_site.html" %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +

    {{ title }}

    +

    + Upload a .zip containing PDF files and a single CSV metadata file + with columns filename,title. Each row creates a PDF Resource + and pushes the file to the Knowledge Base. +

    +
    + {% csrf_token %} +

    + + +

    +
    + + Cancel +
    +
    +{% endblock %} diff --git a/hospexplorer/hospexplorer/settings.py b/hospexplorer/hospexplorer/settings.py index 1cbccad..36fd7d3 100644 --- a/hospexplorer/hospexplorer/settings.py +++ b/hospexplorer/hospexplorer/settings.py @@ -170,6 +170,9 @@ KB_MCP_HOST = os.getenv("KB_MCP_HOST", "http://localhost:8002") KB_MCP_JWT_TOKEN = os.getenv("KB_MCP_JWT_TOKEN", "") KB_MCP_TIMEOUT = int(os.getenv("KB_MCP_TIMEOUT", 30)) +KB_MCP_PDF_TIMEOUT = int(os.getenv("KB_MCP_PDF_TIMEOUT", 300)) # Timeout in seconds for PDF uploads; the default of 300 seconds is 5 minutes +# Maximum number of attempts (not additional retries) for the PDF upload, used in add_pdf_to_kb function in kb_connector.py +KB_MCP_PDF_RETRIES = int(os.getenv("KB_MCP_PDF_RETRIES", 2)) # Number of resources to fetch per page KB_RESOURCES_PAGE_SIZE = int(os.getenv("KB_RESOURCES_PAGE_SIZE", 20))