opea-project · lvliang-intel · May 8, 2025 · Apr 30, 2025 · May 1, 2025 · May 1, 2025
@@ -239,13 +239,16 @@ curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -F "language=en" \
 ```
 
+Note that the `-F "messages="` flag is required, even for file uploads. Multiple files can be uploaded in a single call with multiple `-F "files=@/path"` inputs.
+
 ### Query with audio and video
 
-> Audio and Video file uploads are not supported in docsum with curl request, please use the Gradio-UI.
+> Audio and video can be passed as base64 strings or uploaded by providing a local file path.
 
 Audio:
 
 ```bash
+# Send base64 string
 curl -X POST http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
@@ -257,11 +260,21 @@ curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=audio" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp3, .wav)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 Video:
 
 ```bash
+# Send base64 string
 curl -X POST http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "video", "messages": "convert your video to base64 data type"}'
@@ -273,6 +286,15 @@ curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=video" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp4)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 ### Query with long context

@@ -156,16 +156,19 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "messages=" \
    -F "files=@/path to your file (.txt, .docx, .pdf)" \
    -F "max_tokens=32" \
-   -F "language=en" \
+   -F "language=en"
 ```
 
+Note that the `-F "messages="` flag is required, even for file uploads. Multiple files can be uploaded in a single call with multiple `-F "files=@/path"` inputs.
+
 ### Query with audio and video
 
-> Audio and Video file uploads are not supported in docsum with curl request, please use the Gradio-UI.
+> Audio and video can be passed as base64 strings or uploaded by providing a local file path.
 
 Audio:
 
 ```bash
+# Send base64 string
 curl -X POST http://${host_ip}:8888/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
@@ -177,11 +180,21 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${host_ip}:8888/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=audio" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp3, .wav)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 Video:
 
 ```bash
+# Send base64 string
 curl -X POST http://${host_ip}:8888/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "video", "messages": "convert your video to base64 data type"}'
@@ -193,6 +206,15 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${host_ip}:8888/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=video" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp4)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 ### Query with long context

@@ -161,13 +161,16 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "language=en" \
 ```
 
+Note that the `-F "messages="` flag is required, even for file uploads. Multiple files can be uploaded in a single call with multiple `-F "files=@/path"` inputs.
+
 ### Query with audio and video
 
-> Audio and Video file uploads are not supported in docsum with curl request, please use the Gradio-UI.
+> Audio and video can be passed as base64 strings or uploaded by providing a local file path.
 
 Audio:
 
 ```bash
+# Send base64 string
 curl -X POST http://${host_ip}:8888/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
@@ -179,11 +182,21 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${host_ip}:8888/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=audio" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp3, .wav)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 Video:
 
 ```bash
+# Send base64 string
 curl -X POST http://${host_ip}:8888/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "video", "messages": "convert your video to base64 data type"}'
@@ -195,6 +208,15 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${host_ip}:8888/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=video" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp4)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 ### Query with long context

@@ -63,6 +63,20 @@ def read_pdf(file):
     return docs
 
 
+def encode_file_to_base64(file_path):
+    """Encode the content of a file to a base64 string.
+
+    Args:
+        file_path (str): The path to the file to be encoded.
+
+    Returns:
+        str: The base64 encoded string of the file content.
+    """
+    with open(file_path, "rb") as f:
+        base64_str = base64.b64encode(f.read()).decode("utf-8")
+    return base64_str
+
+
 def video2audio(
     video_base64: str,
 ) -> str:
@@ -163,7 +177,6 @@ def add_remote_service(self):
 
     async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)):
         """Accept pure text, or files .txt/.pdf.docx, audio/video base64 string."""
-
         if "application/json" in request.headers.get("content-type"):
             data = await request.json()
             stream_opt = data.get("stream", True)
@@ -193,25 +206,24 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
                     uid = str(uuid.uuid4())
                     file_path = f"/tmp/{uid}"
 
-                    if data_type is not None and data_type in ["audio", "video"]:
-                        raise ValueError(
-                            "Audio and Video file uploads are not supported in docsum with curl request, \
-                                please use the UI or pass base64 string of the content directly."
-                        )
-
-                    else:
-                        import aiofiles
+                    import aiofiles
 
-                        async with aiofiles.open(file_path, "wb") as f:
-                            await f.write(await file.read())
+                    async with aiofiles.open(file_path, "wb") as f:
+                        await f.write(await file.read())
 
+                    if data_type == "text":
                         docs = read_text_from_file(file, file_path)
-                        os.remove(file_path)
+                    elif data_type in ["audio", "video"]:
+                        docs = encode_file_to_base64(file_path)
+                    else:
+                        raise ValueError(f"Data type not recognized: {data_type}")
+
+                    os.remove(file_path)
 
-                        if isinstance(docs, list):
-                            file_summaries.extend(docs)
-                        else:
-                            file_summaries.append(docs)
+                    if isinstance(docs, list):
+                        file_summaries.extend(docs)
+                    else:
+                        file_summaries.append(docs)
 
             if file_summaries:
                 prompt = handle_message(chat_request.messages) + "\n".join(file_summaries)

@@ -237,6 +237,20 @@ function validate_megaservice_multimedia() {
         "language=en" \
         "stream=False"
 
+    echo ">>> Checking audio data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "well" \
+        "docsum-gaudi-backend-server" \
+        "docsum-gaudi-backend-server" \
+        "media" "" \
+        "type=audio" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.wav" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
+
     echo ">>> Checking video data in json format"
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
@@ -258,6 +272,20 @@ function validate_megaservice_multimedia() {
         "max_tokens=32" \
         "language=en" \
         "stream=False"
+
+    echo ">>> Checking video data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "bye" \
+        "docsum-gaudi-backend-server" \
+        "docsum-gaudi-backend-server" \
+        "media" "" \
+        "type=video" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.mp4" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
 }
 
 function validate_megaservice_long_text() {

@@ -237,6 +237,20 @@ function validate_megaservice_multimedia() {
         "language=en" \
         "stream=False"
 
+    echo ">>> Checking audio data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "well" \
+        "docsum-xeon-backend-server" \
+        "docsum-xeon-backend-server" \
+        "media" "" \
+        "type=audio" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.wav" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
+
     echo ">>> Checking video data in json format"
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
@@ -258,6 +272,20 @@ function validate_megaservice_multimedia() {
         "max_tokens=32" \
         "language=en" \
         "stream=False"
+
+    echo ">>> Checking video data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "bye" \
+        "docsum-xeon-backend-server" \
+        "docsum-xeon-backend-server" \
+        "media" "" \
+        "type=video" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.mp4" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
 }
 
 function validate_megaservice_long_text() {

@@ -229,6 +229,20 @@ function validate_megaservice_multimedia() {
         "language=en" \
         "stream=False"
 
+    echo ">>> Checking audio data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "well" \
+        "docsum-gaudi-backend-server" \
+        "docsum-gaudi-backend-server" \
+        "media" "" \
+        "type=audio" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.wav" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
+
     echo ">>> Checking video data in json format"
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
@@ -250,6 +264,20 @@ function validate_megaservice_multimedia() {
         "max_tokens=32" \
         "language=en" \
         "stream=False"
+
+    echo ">>> Checking video data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "bye" \
+        "docsum-gaudi-backend-server" \
+        "docsum-gaudi-backend-server" \
+        "media" "" \
+        "type=video" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.mp4" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
 }
 
 function validate_megaservice_long_text() {