From 054f982d33bf31a8d63dfd6d73bc4534455453ae Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Wed, 30 Apr 2025 16:39:58 -0700
Subject: [PATCH 1/6] Move all file processing from UI to DocSum backend
 service

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 DocSum/docker_compose/amd/gpu/rocm/README.md  |  22 +-
 .../docker_compose/intel/cpu/xeon/README.md   |  24 +-
 .../docker_compose/intel/hpu/gaudi/README.md  |  22 +-
 DocSum/docsum.py                              |  44 ++--
 DocSum/ui/gradio/docsum_ui_gradio.py          | 214 +++++++-----------
 5 files changed, 177 insertions(+), 149 deletions(-)

diff --git a/DocSum/docker_compose/amd/gpu/rocm/README.md b/DocSum/docker_compose/amd/gpu/rocm/README.md
index fe37f39d57..3b027bb92c 100644
--- a/DocSum/docker_compose/amd/gpu/rocm/README.md
+++ b/DocSum/docker_compose/amd/gpu/rocm/README.md
@@ -241,11 +241,10 @@ curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
 
 ### Query with audio and video
 
-> Audio and Video file uploads are not supported in docsum with curl request, please use the Gradio-UI.
-
 Audio:
 
 ```bash
+# Send base64 string
 curl -X POST http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
@@ -257,11 +256,21 @@ curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=audio" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp3, .wav)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 Video:
 
 ```bash
+# Send base64 string
 curl -X POST http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "video", "messages": "convert your video to base64 data type"}'
@@ -273,6 +282,15 @@ curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=video" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp4)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 ### Query with long context
diff --git a/DocSum/docker_compose/intel/cpu/xeon/README.md b/DocSum/docker_compose/intel/cpu/xeon/README.md
index 0930ab227e..b47bb292b5 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/README.md
+++ b/DocSum/docker_compose/intel/cpu/xeon/README.md
@@ -156,16 +156,15 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "messages=" \
    -F "files=@/path to your file (.txt, .docx, .pdf)" \
    -F "max_tokens=32" \
-   -F "language=en" \
+   -F "language=en"
 ```
 
 ### Query with audio and video
 
-> Audio and Video file uploads are not supported in docsum with curl request, please use the Gradio-UI.
-
 Audio:
 
 ```bash
+# Send base64 string
 curl -X POST http://${host_ip}:8888/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
@@ -177,11 +176,21 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${host_ip}:8888/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=audio" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp3, .wav)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 Video:
 
 ```bash
+# Send base64 string
 curl -X POST http://${host_ip}:8888/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "video", "messages": "convert your video to base64 data type"}'
@@ -193,6 +202,15 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${host_ip}:8888/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=video" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp4)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 ### Query with long context
diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md
index 7b552fd5b8..256bea262d 100644
--- a/DocSum/docker_compose/intel/hpu/gaudi/README.md
+++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md
@@ -163,11 +163,10 @@ curl http://${host_ip}:8888/v1/docsum \
 
 ### Query with audio and video
 
-> Audio and Video file uploads are not supported in docsum with curl request, please use the Gradio-UI.
-
 Audio:
 
 ```bash
+# Send base64 string
 curl -X POST http://${host_ip}:8888/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
@@ -179,11 +178,21 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${host_ip}:8888/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=audio" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp3, .wav)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 Video:
 
 ```bash
+# Send base64 string
 curl -X POST http://${host_ip}:8888/v1/docsum \
    -H "Content-Type: application/json" \
    -d '{"type": "video", "messages": "convert your video to base64 data type"}'
@@ -195,6 +204,15 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "max_tokens=32" \
    -F "language=en" \
    -F "stream=True"
+
+# Upload file
+curl http://${host_ip}:8888/v1/docsum \
+   -H "Content-Type: multipart/form-data" \
+   -F "type=video" \
+   -F "messages=" \
+   -F "files=@/path to your file (.mp4)" \
+   -F "max_tokens=32" \
+   -F "language=en"
 ```
 
 ### Query with long context
diff --git a/DocSum/docsum.py b/DocSum/docsum.py
index 34e58c1df0..1be5e322a7 100644
--- a/DocSum/docsum.py
+++ b/DocSum/docsum.py
@@ -63,6 +63,20 @@ def read_pdf(file):
     return docs
 
 
+def encode_file_to_base64(file_path):
+        """Encode the content of a file to a base64 string.
+
+        Args:
+            file_path (str): The path to the file to be encoded.
+
+        Returns:
+            str: The base64 encoded string of the file content.
+        """
+        with open(file_path, "rb") as f:
+            base64_str = base64.b64encode(f.read()).decode("utf-8")
+        return base64_str
+
+
 def video2audio(
     video_base64: str,
 ) -> str:
@@ -163,7 +177,6 @@ def add_remote_service(self):
 
     async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)):
         """Accept pure text, or files .txt/.pdf.docx, audio/video base64 string."""
-
         if "application/json" in request.headers.get("content-type"):
             data = await request.json()
             stream_opt = data.get("stream", True)
@@ -193,25 +206,24 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
                     uid = str(uuid.uuid4())
                     file_path = f"/tmp/{uid}"
 
-                    if data_type is not None and data_type in ["audio", "video"]:
-                        raise ValueError(
-                            "Audio and Video file uploads are not supported in docsum with curl request, \
-                                please use the UI or pass base64 string of the content directly."
-                        )
+                    import aiofiles
 
-                    else:
-                        import aiofiles
-
-                        async with aiofiles.open(file_path, "wb") as f:
-                            await f.write(await file.read())
+                    async with aiofiles.open(file_path, "wb") as f:
+                        await f.write(await file.read())
 
+                    if data_type == "text":
                         docs = read_text_from_file(file, file_path)
-                        os.remove(file_path)
+                    elif data_type in ["audio", "video"]:
+                        docs = encode_file_to_base64(file_path)
+                    else:
+                        raise ValueError(f"Data type not recognized: {data_type}")
+                    
+                    os.remove(file_path)
 
-                        if isinstance(docs, list):
-                            file_summaries.extend(docs)
-                        else:
-                            file_summaries.append(docs)
+                    if isinstance(docs, list):
+                        file_summaries.extend(docs)
+                    else:
+                        file_summaries.append(docs)
 
             if file_summaries:
                 prompt = handle_message(chat_request.messages) + "\n".join(file_summaries)
diff --git a/DocSum/ui/gradio/docsum_ui_gradio.py b/DocSum/ui/gradio/docsum_ui_gradio.py
index 5bb9a7091c..a477b31ff9 100644
--- a/DocSum/ui/gradio/docsum_ui_gradio.py
+++ b/DocSum/ui/gradio/docsum_ui_gradio.py
@@ -22,76 +22,12 @@
 class DocSumUI:
     def __init__(self):
         """Initialize the DocSumUI class with accepted file types, headers, and backend service endpoint."""
-        self.ACCEPTED_FILE_TYPES = ["pdf", "doc", "docx"]
+        self.ACCEPTED_TEXT_FILE_TYPES = [".pdf", ".doc", ".docx"]
+        self.ACCEPTED_AUDIO_FILE_TYPES = [".mp3", ".wav"]
+        self.ACCEPTED_VIDEO_FILE_TYPES = [".mp4"]
         self.HEADERS = {"Content-Type": "application/json"}
         self.BACKEND_SERVICE_ENDPOINT = os.getenv("BACKEND_SERVICE_ENDPOINT", "http://localhost:8888/v1/docsum")
 
-    def encode_file_to_base64(self, file_path):
-        """Encode the content of a file to a base64 string.
-
-        Args:
-            file_path (str): The path to the file to be encoded.
-
-        Returns:
-            str: The base64 encoded string of the file content.
-        """
-        logger.info(">>> Encoding file to base64: %s", file_path)
-        with open(file_path, "rb") as f:
-            base64_str = base64.b64encode(f.read()).decode("utf-8")
-        return base64_str
-
-    def read_file(self, file):
-        """Read and process the content of a file.
-
-        Args:
-            file (file-like object): The file to be read.
-
-        Returns:
-            str: The content of the file or an error message if the file type is unsupported.
-        """
-        self.page_content = ""
-        self.pages = []
-
-        if file.name.endswith(".pdf"):
-            loader = PyPDFLoader(file)
-        elif file.name.endswith((".doc", ".docx")):
-            loader = Docx2txtLoader(file)
-        else:
-            msg = f"Unsupported file type '{file.name}'. Choose from {self.ACCEPTED_FILE_TYPES}"
-            logger.error(msg)
-            return msg
-
-        for page in loader.lazy_load():
-            self.page_content += page.page_content
-
-        return self.page_content
-
-    def read_audio_file(self, file):
-        """Read and process the content of an audio file.
-
-        Args:
-            file (file-like object): The audio file to be read.
-
-        Returns:
-            str: The base64 encoded content of the audio file.
-        """
-        logger.info(">>> Reading audio file: %s", file.name)
-        base64_str = self.encode_file_to_base64(file)
-        return base64_str
-
-    def read_video_file(self, file):
-        """Read and process the content of a video file.
-
-        Args:
-            file (file-like object): The video file to be read.
-
-        Returns:
-            str: The base64 encoded content of the video file.
-        """
-        logger.info(">>> Reading video file: %s", file.name)
-        base64_str = self.encode_file_to_base64(file)
-        return base64_str
-
     def is_valid_url(self, url):
         try:
             result = urlparse(url)
@@ -128,78 +64,107 @@ def read_url(self, url):
 
         return self.page_content
 
-    def generate_summary(self, doc_content, document_type="text"):
+    def process_response(self, response):
+        if response.status_code == 200:
+            try:
+                # Check if the specific log path is in the response text
+                if "/logs/LLMChain/final_output" in response.text:
+                    # Extract the relevant part of the response
+                    temp = ast.literal_eval(
+                        [
+                            i.split("data: ")[1]
+                            for i in response.text.split("\n\n")
+                            if "/logs/LLMChain/final_output" in i
+                        ][0]
+                    )["ops"]
+
+                    # Find the final output value
+                    final_output = [i["value"] for i in temp if i["path"] == "/logs/LLMChain/final_output"][0]
+                    return final_output["text"]
+                else:
+                    # Perform string replacements to clean the response text
+                    cleaned_text = response.text
+                    replacements = [
+                        ("'\n\ndata: b'", ""),
+                        ("data: b' ", ""),
+                        ("</s>'\n\ndata: [DONE]\n\n", ""),
+                        ("\n\ndata: b", ""),
+                        ("'\n\n", ""),
+                        ("'\n", ""),
+                        ('''\'"''', ""),
+                    ]
+                    for old, new in replacements:
+                        cleaned_text = cleaned_text.replace(old, new)
+                    return cleaned_text
+            except (IndexError, KeyError, ValueError) as e:
+                # Handle potential errors during parsing
+                logger.error("Error parsing response: %s", e)
+                return response.text
+
+    def generate_summary(self, document, document_type="text"):
         """Generate a summary for the given document content.
 
         Args:
-            doc_content (str): The content of the document.
+            document (str): The content or path of the document.
             document_type (str): The type of the document (default is "text").
 
         Returns:
             str: The generated summary or an error message.
         """
-
         logger.info(">>> BACKEND_SERVICE_ENDPOINT - %s", self.BACKEND_SERVICE_ENDPOINT)
 
-        data = {"max_tokens": 256, "type": document_type, "messages": doc_content}
+        data = {"max_tokens": 256, "type": document_type, "messages": ""}
+
+        if os.path.exists(document):
+            file_header = "text/plain"
+            file_ext = os.path.splitext(document)[-1]
+            if file_ext == ".pdf":
+                file_header = "application/pdf"
+            elif file_ext in [".doc", ".docx"]:
+                file_header = "application/octet-stream"
+            elif file_ext in self.ACCEPTED_AUDIO_FILE_TYPES + self.ACCEPTED_VIDEO_FILE_TYPES:
+                file_header = f"{document_type}/{file_ext[-3:]}"
+            files = {"files": (os.path.basename(document), open(document, "rb"), file_header)}
+            try:
+                response = requests.post(
+                    url=self.BACKEND_SERVICE_ENDPOINT,
+                    headers={},
+                    files=files,
+                    data=data,
+                    proxies={"http_proxy": os.environ["http_proxy"], "https_proxy": os.environ["https_proxy"]},
+                )
 
-        try:
-            response = requests.post(
-                url=self.BACKEND_SERVICE_ENDPOINT,
-                headers=self.HEADERS,
-                data=json.dumps(data),
-                proxies={"http_proxy": os.environ["http_proxy"], "https_proxy": os.environ["https_proxy"]},
-            )
+                return self.process_response(response)
+
+            except requests.exceptions.RequestException as e:
+                logger.error("Request exception: %s", e)
+                return str(e)
+
+        else:
+            data["messages"] = document
+            try:
+                response = requests.post(
+                    url=self.BACKEND_SERVICE_ENDPOINT,
+                    headers=self.HEADERS,
+                    data=json.dumps(data),
+                    proxies={"http_proxy": os.environ["http_proxy"], "https_proxy": os.environ["https_proxy"]},
+                )
+
+                return self.process_response(response)
 
-            if response.status_code == 200:
-                try:
-                    # Check if the specific log path is in the response text
-                    if "/logs/LLMChain/final_output" in response.text:
-                        # Extract the relevant part of the response
-                        temp = ast.literal_eval(
-                            [
-                                i.split("data: ")[1]
-                                for i in response.text.split("\n\n")
-                                if "/logs/LLMChain/final_output" in i
-                            ][0]
-                        )["ops"]
-
-                        # Find the final output value
-                        final_output = [i["value"] for i in temp if i["path"] == "/logs/LLMChain/final_output"][0]
-                        return final_output["text"]
-                    else:
-                        # Perform string replacements to clean the response text
-                        cleaned_text = response.text
-                        replacements = [
-                            ("'\n\ndata: b'", ""),
-                            ("data: b' ", ""),
-                            ("</s>'\n\ndata: [DONE]\n\n", ""),
-                            ("\n\ndata: b", ""),
-                            ("'\n\n", ""),
-                            ("'\n", ""),
-                            ('''\'"''', ""),
-                        ]
-                        for old, new in replacements:
-                            cleaned_text = cleaned_text.replace(old, new)
-                        return cleaned_text
-                except (IndexError, KeyError, ValueError) as e:
-                    # Handle potential errors during parsing
-                    logger.error("Error parsing response: %s", e)
-                    return response.text
-
-        except requests.exceptions.RequestException as e:
-            logger.error("Request exception: %s", e)
-            return str(e)
+            except requests.exceptions.RequestException as e:
+                logger.error("Request exception: %s", e)
+                return str(e)
 
         return str(response.status_code)
 
-    def create_upload_ui(self, label, file_types, process_function, document_type="text"):
+    def create_upload_ui(self, label, file_types, document_type="text"):
         """Create a Gradio UI for file uploads.
 
         Args:
             label (str): The label for the upload button.
             file_types (list): The list of accepted file types.
-            process_function (function): The function to process the uploaded file.
+            document_type (str): The document type (text, audio, or video). Default is text.
 
         Returns:
             gr.Blocks: The Gradio Blocks object representing the upload UI.
@@ -214,7 +179,7 @@ def create_upload_ui(self, label, file_types, process_function, document_type="t
                         label="Text Summary", placeholder="Summarized text will be displayed here"
                     )
             upload_btn.upload(
-                lambda file: self.generate_summary(process_function(file), document_type=document_type),
+                lambda file: self.generate_summary(file, document_type=document_type),
                 upload_btn,
                 generated_text,
             )
@@ -264,23 +229,20 @@ def render(self):
         # File Upload UI
         file_ui = self.create_upload_ui(
             label="Please upload a document (.pdf, .doc, .docx)",
-            file_types=[".pdf", ".doc", ".docx"],
-            process_function=self.read_file,
+            file_types=self.ACCEPTED_TEXT_FILE_TYPES
         )
 
         # Audio Upload UI
         audio_ui = self.create_upload_ui(
             label="Please upload audio file (.wav, .mp3)",
-            file_types=[".wav", ".mp3"],
-            process_function=self.read_audio_file,
+            file_types=self.ACCEPTED_AUDIO_FILE_TYPES,
             document_type="audio",
         )
 
         # Video Upload UI
         video_ui = self.create_upload_ui(
             label="Please upload Video file (.mp4)",
-            file_types=[".mp4"],
-            process_function=self.read_video_file,
+            file_types=self.ACCEPTED_VIDEO_FILE_TYPES,
             document_type="video",
         )
 

From bc09a633d7ee539c4f1a9a50a589b5b90cb93d3f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 1 May 2025 22:54:04 +0000
Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 DocSum/docsum.py                     | 20 ++++++++++----------
 DocSum/ui/gradio/docsum_ui_gradio.py |  3 +--
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/DocSum/docsum.py b/DocSum/docsum.py
index 1be5e322a7..786e48a264 100644
--- a/DocSum/docsum.py
+++ b/DocSum/docsum.py
@@ -64,17 +64,17 @@ def read_pdf(file):
 
 
 def encode_file_to_base64(file_path):
-        """Encode the content of a file to a base64 string.
+    """Encode the content of a file to a base64 string.
 
-        Args:
-            file_path (str): The path to the file to be encoded.
+    Args:
+        file_path (str): The path to the file to be encoded.
 
-        Returns:
-            str: The base64 encoded string of the file content.
-        """
-        with open(file_path, "rb") as f:
-            base64_str = base64.b64encode(f.read()).decode("utf-8")
-        return base64_str
+    Returns:
+        str: The base64 encoded string of the file content.
+    """
+    with open(file_path, "rb") as f:
+        base64_str = base64.b64encode(f.read()).decode("utf-8")
+    return base64_str
 
 
 def video2audio(
@@ -217,7 +217,7 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
                         docs = encode_file_to_base64(file_path)
                     else:
                         raise ValueError(f"Data type not recognized: {data_type}")
-                    
+
                     os.remove(file_path)
 
                     if isinstance(docs, list):
diff --git a/DocSum/ui/gradio/docsum_ui_gradio.py b/DocSum/ui/gradio/docsum_ui_gradio.py
index a477b31ff9..f354e269d4 100644
--- a/DocSum/ui/gradio/docsum_ui_gradio.py
+++ b/DocSum/ui/gradio/docsum_ui_gradio.py
@@ -228,8 +228,7 @@ def render(self):
 
         # File Upload UI
         file_ui = self.create_upload_ui(
-            label="Please upload a document (.pdf, .doc, .docx)",
-            file_types=self.ACCEPTED_TEXT_FILE_TYPES
+            label="Please upload a document (.pdf, .doc, .docx)", file_types=self.ACCEPTED_TEXT_FILE_TYPES
         )
 
         # Audio Upload UI

From ca57350c622437aee27b85a1136168077f3bbc31 Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Thu, 1 May 2025 16:10:25 -0700
Subject: [PATCH 3/6] Improve display text of document types

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 DocSum/ui/gradio/docsum_ui_gradio.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/DocSum/ui/gradio/docsum_ui_gradio.py b/DocSum/ui/gradio/docsum_ui_gradio.py
index f354e269d4..5844a432b9 100644
--- a/DocSum/ui/gradio/docsum_ui_gradio.py
+++ b/DocSum/ui/gradio/docsum_ui_gradio.py
@@ -228,19 +228,20 @@ def render(self):
 
         # File Upload UI
         file_ui = self.create_upload_ui(
-            label="Please upload a document (.pdf, .doc, .docx)", file_types=self.ACCEPTED_TEXT_FILE_TYPES
+            label=f"Please upload a document ({', '.join(self.ACCEPTED_TEXT_FILE_TYPES)})",
+            file_types=self.ACCEPTED_TEXT_FILE_TYPES
         )
 
         # Audio Upload UI
         audio_ui = self.create_upload_ui(
-            label="Please upload audio file (.wav, .mp3)",
+            label=f"Please upload audio file ({', '.join(self.ACCEPTED_AUDIO_FILE_TYPES)})",
             file_types=self.ACCEPTED_AUDIO_FILE_TYPES,
             document_type="audio",
         )
 
         # Video Upload UI
         video_ui = self.create_upload_ui(
-            label="Please upload Video file (.mp4)",
+            label=f"Please upload video file ({', '.join(self.ACCEPTED_VIDEO_FILE_TYPES)})",
             file_types=self.ACCEPTED_VIDEO_FILE_TYPES,
             document_type="video",
         )

From caba592e93e06579ac620f1b0433236503054fce Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 1 May 2025 23:15:26 +0000
Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 DocSum/ui/gradio/docsum_ui_gradio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DocSum/ui/gradio/docsum_ui_gradio.py b/DocSum/ui/gradio/docsum_ui_gradio.py
index 5844a432b9..8d8a440ce3 100644
--- a/DocSum/ui/gradio/docsum_ui_gradio.py
+++ b/DocSum/ui/gradio/docsum_ui_gradio.py
@@ -229,7 +229,7 @@ def render(self):
         # File Upload UI
         file_ui = self.create_upload_ui(
             label=f"Please upload a document ({', '.join(self.ACCEPTED_TEXT_FILE_TYPES)})",
-            file_types=self.ACCEPTED_TEXT_FILE_TYPES
+            file_types=self.ACCEPTED_TEXT_FILE_TYPES,
         )
 
         # Audio Upload UI

From 5a5384d91ee049ec4dc0af5fa70a17cb2a3eb616 Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Mon, 5 May 2025 14:52:40 -0700
Subject: [PATCH 5/6] Updated docs and tests per feedback

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 DocSum/docker_compose/amd/gpu/rocm/README.md  |  4 +++
 .../docker_compose/intel/cpu/xeon/README.md   |  4 +++
 .../docker_compose/intel/hpu/gaudi/README.md  |  4 +++
 DocSum/tests/test_compose_on_gaudi.sh         | 28 +++++++++++++++++++
 DocSum/tests/test_compose_on_xeon.sh          | 28 +++++++++++++++++++
 DocSum/tests/test_compose_tgi_on_gaudi.sh     | 28 +++++++++++++++++++
 DocSum/tests/test_compose_tgi_on_xeon.sh      | 28 +++++++++++++++++++
 7 files changed, 124 insertions(+)

diff --git a/DocSum/docker_compose/amd/gpu/rocm/README.md b/DocSum/docker_compose/amd/gpu/rocm/README.md
index 3b027bb92c..da9d7d749f 100644
--- a/DocSum/docker_compose/amd/gpu/rocm/README.md
+++ b/DocSum/docker_compose/amd/gpu/rocm/README.md
@@ -239,8 +239,12 @@ curl http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum \
    -F "language=en" \
 ```
 
+Note that the `messages` form field (`-F "messages="`) is required even for file uploads, where it can be left empty. To upload multiple files in a single call, repeat the `-F "files=@/path"` option once per file.
+
 ### Query with audio and video
 
+> Audio and video can be passed as base64 strings or uploaded by providing a local file path.
+
 Audio:
 
 ```bash
diff --git a/DocSum/docker_compose/intel/cpu/xeon/README.md b/DocSum/docker_compose/intel/cpu/xeon/README.md
index b47bb292b5..06d3e4378d 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/README.md
+++ b/DocSum/docker_compose/intel/cpu/xeon/README.md
@@ -159,8 +159,12 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "language=en"
 ```
 
+Note that the `messages` form field (`-F "messages="`) is required even for file uploads, where it can be left empty. To upload multiple files in a single call, repeat the `-F "files=@/path"` option once per file.
+
 ### Query with audio and video
 
+> Audio and video can be passed as base64 strings or uploaded by providing a local file path.
+
 Audio:
 
 ```bash
diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md
index 256bea262d..5cf9e77477 100644
--- a/DocSum/docker_compose/intel/hpu/gaudi/README.md
+++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md
@@ -161,8 +161,12 @@ curl http://${host_ip}:8888/v1/docsum \
    -F "language=en" \
 ```
 
+Note that the `messages` form field (`-F "messages="`) is required even for file uploads, where it can be left empty. To upload multiple files in a single call, repeat the `-F "files=@/path"` option once per file.
+
 ### Query with audio and video
 
+> Audio and video can be passed as base64 strings or uploaded by providing a local file path.
+
 Audio:
 
 ```bash
diff --git a/DocSum/tests/test_compose_on_gaudi.sh b/DocSum/tests/test_compose_on_gaudi.sh
index aecdc006c7..fe3a3f7325 100644
--- a/DocSum/tests/test_compose_on_gaudi.sh
+++ b/DocSum/tests/test_compose_on_gaudi.sh
@@ -237,6 +237,20 @@ function validate_megaservice_multimedia() {
         "language=en" \
         "stream=False"
 
+    echo ">>> Checking audio data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "well" \
+        "docsum-xeon-backend-server" \
+        "docsum-xeon-backend-server" \
+        "media" "" \
+        "type=audio" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.wav" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
+
     echo ">>> Checking video data in json format"
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
@@ -258,6 +272,20 @@ function validate_megaservice_multimedia() {
         "max_tokens=32" \
         "language=en" \
         "stream=False"
+
+    echo ">>> Checking video data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "bye" \
+        "docsum-xeon-backend-server" \
+        "docsum-xeon-backend-server" \
+        "media" "" \
+        "type=video" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.mp4" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
 }
 
 function validate_megaservice_long_text() {
diff --git a/DocSum/tests/test_compose_on_xeon.sh b/DocSum/tests/test_compose_on_xeon.sh
index 5ff7add6be..c231e7264e 100644
--- a/DocSum/tests/test_compose_on_xeon.sh
+++ b/DocSum/tests/test_compose_on_xeon.sh
@@ -237,6 +237,20 @@ function validate_megaservice_multimedia() {
         "language=en" \
         "stream=False"
 
+    echo ">>> Checking audio data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "well" \
+        "docsum-xeon-backend-server" \
+        "docsum-xeon-backend-server" \
+        "media" "" \
+        "type=audio" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.wav" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
+
     echo ">>> Checking video data in json format"
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
@@ -258,6 +272,20 @@ function validate_megaservice_multimedia() {
         "max_tokens=32" \
         "language=en" \
         "stream=False"
+
+    echo ">>> Checking video data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "bye" \
+        "docsum-xeon-backend-server" \
+        "docsum-xeon-backend-server" \
+        "media" "" \
+        "type=video" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.mp4" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
 }
 
 function validate_megaservice_long_text() {
diff --git a/DocSum/tests/test_compose_tgi_on_gaudi.sh b/DocSum/tests/test_compose_tgi_on_gaudi.sh
index 6859e5354a..06dd9b7292 100644
--- a/DocSum/tests/test_compose_tgi_on_gaudi.sh
+++ b/DocSum/tests/test_compose_tgi_on_gaudi.sh
@@ -229,6 +229,20 @@ function validate_megaservice_multimedia() {
         "language=en" \
         "stream=False"
 
+    echo ">>> Checking audio data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "well" \
+        "docsum-gaudi-backend-server" \
+        "docsum-gaudi-backend-server" \
+        "media" "" \
+        "type=audio" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.wav" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
+
     echo ">>> Checking video data in json format"
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
@@ -250,6 +264,20 @@ function validate_megaservice_multimedia() {
         "max_tokens=32" \
         "language=en" \
         "stream=False"
+
+    echo ">>> Checking video data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "bye" \
+        "docsum-gaudi-backend-server" \
+        "docsum-gaudi-backend-server" \
+        "media" "" \
+        "type=video" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.mp4" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
 }
 
 function validate_megaservice_long_text() {
diff --git a/DocSum/tests/test_compose_tgi_on_xeon.sh b/DocSum/tests/test_compose_tgi_on_xeon.sh
index f94eabf0c8..52edea31f8 100644
--- a/DocSum/tests/test_compose_tgi_on_xeon.sh
+++ b/DocSum/tests/test_compose_tgi_on_xeon.sh
@@ -229,6 +229,20 @@ function validate_megaservice_multimedia() {
         "language=en" \
         "stream=False"
 
+    echo ">>> Checking audio data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "well" \
+        "docsum-xeon-backend-server" \
+        "docsum-xeon-backend-server" \
+        "media" "" \
+        "type=audio" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.wav" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
+
     echo ">>> Checking video data in json format"
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
@@ -250,6 +264,20 @@ function validate_megaservice_multimedia() {
         "max_tokens=32" \
         "language=en" \
         "stream=False"
+
+    echo ">>> Checking video data in form format, upload file"
+    validate_service \
+        "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
+        "bye" \
+        "docsum-xeon-backend-server" \
+        "docsum-xeon-backend-server" \
+        "media" "" \
+        "type=video" \
+        "messages=" \
+        "files=@$ROOT_FOLDER/data/test.mp4" \
+        "max_tokens=32" \
+        "language=en" \
+        "stream=False"
 }
 
 function validate_megaservice_long_text() {

From 79c2c0ed2bf0e894cc2c70ee904df57346091031 Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Mon, 5 May 2025 15:11:39 -0700
Subject: [PATCH 6/6] Fix service name

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 DocSum/tests/test_compose_on_gaudi.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/DocSum/tests/test_compose_on_gaudi.sh b/DocSum/tests/test_compose_on_gaudi.sh
index fe3a3f7325..3c0f3d695b 100644
--- a/DocSum/tests/test_compose_on_gaudi.sh
+++ b/DocSum/tests/test_compose_on_gaudi.sh
@@ -241,8 +241,8 @@ function validate_megaservice_multimedia() {
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
         "well" \
-        "docsum-xeon-backend-server" \
-        "docsum-xeon-backend-server" \
+        "docsum-gaudi-backend-server" \
+        "docsum-gaudi-backend-server" \
         "media" "" \
         "type=audio" \
         "messages=" \
@@ -277,8 +277,8 @@ function validate_megaservice_multimedia() {
     validate_service \
         "${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum" \
         "bye" \
-        "docsum-xeon-backend-server" \
-        "docsum-xeon-backend-server" \
+        "docsum-gaudi-backend-server" \
+        "docsum-gaudi-backend-server" \
         "media" "" \
         "type=video" \
         "messages=" \