Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 122 additions & 1 deletion hospexplorer/ask/admin.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import csv
import io
import logging
import os
import zipfile

from django.contrib import admin
from django.contrib import admin, messages
from django.contrib.auth.admin import UserAdmin
from django.contrib.auth.models import User
from django.core.files.base import ContentFile
from django.http import HttpResponseRedirect
from django.shortcuts import render
from django.urls import path, reverse

from ask.models import Conversation, TermsAcceptance, QARecord, SimWorkflow, WebsiteResource, PDFResource
from ask.kb_connector import add_website_to_kb, add_pdf_to_kb, delete_kb_document

Expand Down Expand Up @@ -250,3 +259,115 @@ def save_model(self, request, obj, form, change):
logger.exception("Failed to send PDF to KB: %s", obj.file.name)
self.message_user(request, f"PDF saved but failed to send to Knowledge Base: {e}", level="warning")

def get_urls(self):
    """Return this admin's URL patterns with the zip-upload route first.

    The extra route maps ``upload-zip/`` to ``zip_upload_view`` (wrapped by
    ``admin_site.admin_view``) and is placed ahead of the inherited
    patterns in the returned list.
    """
    inherited_routes = super().get_urls()
    upload_route = path(
        "upload-zip/",
        self.admin_site.admin_view(self.zip_upload_view),
        name="ask_pdfresource_upload_zip",
    )
    return [upload_route, *inherited_routes]

def zip_upload_view(self, request):
    """Admin view that bulk-imports PDF resources from an uploaded zip.

    GET renders the upload form. POST expects a ``zip_file`` upload that
    contains the PDFs plus exactly one CSV with ``filename`` and ``title``
    columns. Each usable CSV row is saved as a ``PDFResource`` and then
    pushed to the Knowledge Base; a failed push is reported as a warning
    but the saved record is kept.

    Returns:
        The rendered form for non-POST requests; a redirect back to this
        form when the upload fails validation; otherwise a redirect to the
        changelist with summary messages attached to the request.
    """
    changelist_url = reverse("admin:ask_pdfresource_changelist")

    if request.method != "POST":
        return render(
            request,
            "admin/ask/pdfresource/upload_zip.html",
            {
                **self.admin_site.each_context(request),
                "opts": self.model._meta,
                "title": "Upload zip of PDFs",
                "changelist_url": changelist_url,
            },
        )

    zip_file = request.FILES.get("zip_file")
    if not zip_file:
        messages.error(request, "Please select a zip file to upload.")
        return HttpResponseRedirect(request.path)

    try:
        archive = zipfile.ZipFile(zip_file)
    except zipfile.BadZipFile:
        messages.error(request, "The uploaded file is not a valid zip archive.")
        return HttpResponseRedirect(request.path)

    with archive:
        # skip macOS Finder metadata: __MACOSX/ dir and AppleDouble "._" twins
        # (directory entries end in "/" and so have an empty basename)
        def _is_real(name):
            base = os.path.basename(name)
            return not name.startswith("__MACOSX/") and not base.startswith("._") and base != ""

        real_names = [n for n in archive.namelist() if _is_real(n)]

        csv_names = [n for n in real_names if n.lower().endswith(".csv")]
        if not csv_names:
            messages.error(request, "Zip must contain one CSV metadata file (filename,title).")
            return HttpResponseRedirect(request.path)
        if len(csv_names) > 1:
            messages.error(request, f"Zip must contain exactly one CSV; found {len(csv_names)}.")
            return HttpResponseRedirect(request.path)

        # A wrongly-encoded or unreadable CSV is a user error, not a server
        # error: report it instead of letting the exception escape.
        try:
            csv_text = archive.read(csv_names[0]).decode("utf-8-sig")
        except UnicodeDecodeError:
            messages.error(request, "CSV metadata file must be UTF-8 encoded.")
            return HttpResponseRedirect(request.path)
        except (RuntimeError, zipfile.BadZipFile):
            messages.error(request, "Could not read the CSV metadata file from the zip.")
            return HttpResponseRedirect(request.path)

        # newline="" lets the csv module do its own line-ending handling.
        reader = csv.DictReader(io.StringIO(csv_text, newline=""))
        headers = [(h or "").strip() for h in (reader.fieldnames or [])]
        if not {"filename", "title"}.issubset(headers):
            messages.error(request, "CSV must have 'filename' and 'title' columns.")
            return HttpResponseRedirect(request.path)
        # Key rows by the stripped header names so the row lookups below
        # agree with the validation above (e.g. a "filename, title" header).
        reader.fieldnames = headers

        zip_members = {n: n for n in real_names}
        # also index by basename so CSV can refer to bare filenames regardless of zip layout
        for n in real_names:
            zip_members.setdefault(os.path.basename(n), n)

        total = 0
        saved = 0
        kb_pushed = 0
        kb_failed = 0
        for row in reader:
            total += 1
            filename = (row.get("filename") or "").strip()
            title = (row.get("title") or "").strip()
            if not filename or not title:
                messages.warning(request, f"Row {total}: missing filename or title; skipped.")
                continue

            member = zip_members.get(filename) or zip_members.get(os.path.basename(filename))
            if not member:
                messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.")
                continue

            # Encrypted or corrupt members raise RuntimeError / BadZipFile;
            # skip just that row rather than aborting the whole import.
            try:
                pdf_bytes = archive.read(member)
            except (KeyError, RuntimeError, zipfile.BadZipFile):
                messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.")
                continue

            obj = PDFResource(title=title, creator=request.user, modifier=request.user)
            obj.file.save(os.path.basename(filename), ContentFile(pdf_bytes), save=True)
            saved += 1

            # The record is already saved at this point; a KB failure only
            # means the document id is not recorded on it.
            try:
                result = add_pdf_to_kb(pdf_bytes, os.path.basename(filename), title)
                obj.mcp_kb_document_id = result.get("doc_id")
                obj.save(update_fields=["mcp_kb_document_id"])
                kb_pushed += 1
            except Exception as e:
                logger.exception("Bulk: failed to send PDF to KB: %s", filename)
                messages.warning(request, f"Row {total}: '{title}' saved but KB push failed: {e}")
                kb_failed += 1

    if kb_failed:
        messages.warning(
            request,
            f"Saved {saved} of {total} PDFs; {kb_pushed} pushed to Knowledge Base, "
            f"{kb_failed} failed KB push (PDFs are stored locally but not searchable).",
        )
    else:
        messages.success(request, f"Imported {saved} of {total} PDFs.")
    return HttpResponseRedirect(changelist_url)

41 changes: 29 additions & 12 deletions hospexplorer/ask/kb_connector.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import time

import httpx
from django.conf import settings
Expand Down Expand Up @@ -66,22 +67,38 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None):
}
endpoint = f"{settings.KB_MCP_HOST}/docs/pdf/add"

files = {"file": (filename, file_bytes, "application/pdf")}
data = {"title": title}
if url:
data["url"] = url

with httpx.Client() as client:
response = client.post(
endpoint,
headers=headers,
files=files,
data=data,
timeout=settings.KB_MCP_TIMEOUT,
)

response.raise_for_status()
return response.json()
attempts = max(1, settings.KB_MCP_PDF_RETRIES)
last_exc = None
for attempt in range(1, attempts + 1):
# rebuild files each attempt: httpx consumes the stream on send
files = {"file": (filename, file_bytes, "application/pdf")}
try:
with httpx.Client() as client:
response = client.post(
endpoint,
headers=headers,
files=files,
data=data,
timeout=settings.KB_MCP_PDF_TIMEOUT,
)
response.raise_for_status()
return response.json()
except (httpx.TimeoutException, httpx.TransportError) as e:
last_exc = e
if attempt == attempts:
break
backoff = 2 ** (attempt - 1)
logger.warning(
"KB PDF push failed (attempt %d/%d) for %s: %s; retrying in %ds",
attempt, attempts, filename, e, backoff,
)
time.sleep(backoff)

raise last_exc


def delete_kb_document(doc_id):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{% extends "admin/change_list.html" %}

{% block object-tools-items %}
{# Bulk-import entry point: resolve the named admin route instead of hard-coding a relative path. #}
<li>
<a href="{% url 'admin:ask_pdfresource_upload_zip' %}" class="addlink">Upload zip of PDFs</a>
</li>
{{ block.super }}
{% endblock %}
30 changes: 30 additions & 0 deletions hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{% extends "admin/base_site.html" %}
{# Upload form for the bulk zip import. Uses the context variables opts, title and changelist_url. #}

{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">Home</a>
&rsaquo; <a href="{% url 'admin:app_list' app_label=opts.app_label %}">{{ opts.app_config.verbose_name }}</a>
&rsaquo; <a href="{{ changelist_url }}">{{ opts.verbose_name_plural|capfirst }}</a>
&rsaquo; {{ title }}
</div>
{% endblock %}

{% block content %}
<h1>{{ title }}</h1>
<p>
Upload a <code>.zip</code> containing PDF files and a single CSV metadata file
with columns <code>filename,title</code>. Each row creates a PDF Resource
and pushes the file to the Knowledge Base.
</p>
{# No action attribute: the form posts back to the URL that rendered it. #}
<form method="post" enctype="multipart/form-data">
{% csrf_token %}
<p>
<label for="zip_file">Zip file:</label>
<input type="file" name="zip_file" id="zip_file" accept=".zip" required>
</p>
<div class="submit-row">
<input type="submit" value="Upload" class="default">
<a href="{{ changelist_url }}" class="button cancel-link">Cancel</a>
</div>
</form>
{% endblock %}
3 changes: 3 additions & 0 deletions hospexplorer/hospexplorer/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@
# Knowledge Base service settings; each value can be overridden through the
# environment variable of the same name.
KB_MCP_HOST = os.getenv("KB_MCP_HOST", "http://localhost:8002")
KB_MCP_JWT_TOKEN = os.getenv("KB_MCP_JWT_TOKEN", "")
KB_MCP_TIMEOUT = int(os.getenv("KB_MCP_TIMEOUT", 30))
KB_MCP_PDF_TIMEOUT = int(os.getenv("KB_MCP_PDF_TIMEOUT", 300)) # Request timeout for PDF uploads, in seconds (default 300 = 5 minutes)
# Total number of upload attempts made by add_pdf_to_kb in kb_connector.py
# (values below 1 are treated as 1), so the default of 2 means one retry
# after a timeout or transport error.
KB_MCP_PDF_RETRIES = int(os.getenv("KB_MCP_PDF_RETRIES", 2))

# Number of resources to fetch per page
KB_RESOURCES_PAGE_SIZE = int(os.getenv("KB_RESOURCES_PAGE_SIZE", 20))
Expand Down