55from pathlib import Path
66from dataclasses import dataclass
77import PIL
8- from enum import StrEnum
9-
8+ from enum import Enum
109import openai
1110from .base_llm import LLM , Message , validate_only_first_message_has_files
1211import cv2
@@ -21,8 +20,12 @@ def get_image_as_base64(image_bytes: bytes):
2120 return base64 .b64encode (image_bytes ).decode ("utf-8" )
2221
2322
24- def convert_message_to_hf_format (message : Message , max_n_frames_per_video : int ) -> dict :
25- """Convert a Message to HuggingFace chat format."""
23+ def convert_message_to_openai_format (message : Message , max_n_frames_per_video : int ) -> dict :
24+ """
25+ Convert a Message to OpenAI chat format.
26+ Images become base64 encoded strings.
27+ Videos are processed like a list of images, each of which becomes a base64 encoded string.
28+ """
2629 content = []
2730
2831 # Add text content if present
@@ -54,8 +57,9 @@ def convert_message_to_hf_format(message: Message, max_n_frames_per_video: int)
5457
5558
5659def video_to_imgs (video_path : Path , max_n_frames : int ) -> list [PIL .Image .Image ]:
57- assert isinstance (video_path , Path ), video_path
5860 """From https://github.com/agustoslu/simple-inference-benchmark/blob/5cec55787d34af65f0d11efc429c3d4de92f051a/utils.py#L79"""
61+ assert isinstance (video_path , Path ), video_path
62+ assert video_path .exists (), video_path
5963 cap = cv2 .VideoCapture (str (video_path ))
6064 total_frames = int (cap .get (cv2 .CAP_PROP_FRAME_COUNT ))
6165 fps = int (cap .get (cv2 .CAP_PROP_FPS ))
@@ -129,7 +133,7 @@ def extract_bytes(img: PIL.Image.Image | str | Path) -> bytes:
129133 raise ValueError (f"Unsupported image type: { type (img )} " )
130134
131135
132- class HuggingFaceVLMs (StrEnum ):
136+ class HuggingFaceVLMs (str , Enum ):
133137 gemma_3_27b_it = "google/gemma-3-27b-it"
134138
135139
@@ -171,7 +175,7 @@ def complete_msgs(self, msgs: list[Message]) -> str:
171175 """Complete a conversation with the model."""
172176 validate_only_first_message_has_files (msgs )
173177 hf_messages = [
174- convert_message_to_hf_format (
178+ convert_message_to_openai_format (
175179 msg , max_n_frames_per_video = self .max_n_frames_per_video
176180 )
177181 for msg in msgs
0 commit comments