From af406c743d4249f353187d580f503704144b8a62 Mon Sep 17 00:00:00 2001 From: timmermansjoy <61321383+timmermansjoy@users.noreply.github.com> Date: Sat, 9 Aug 2025 10:52:10 +0200 Subject: [PATCH 01/11] starting structure for new video class --- pyproject.toml | 3 + supervision/__init__.py | 3 +- supervision/utils/video.py | 10 +- supervision/video/__init__.py | 4 + supervision/video/backends/__init__.py | 7 + supervision/video/backends/base.py | 118 +++++++++++++++ supervision/video/backends/opencv.py | 196 +++++++++++++++++++++++++ supervision/video/backends/pyav.py | 11 ++ supervision/video/core.py | 75 ++++++++++ supervision/video/utils.py | 47 ++++++ 10 files changed, 469 insertions(+), 5 deletions(-) create mode 100644 supervision/video/__init__.py create mode 100644 supervision/video/backends/__init__.py create mode 100644 supervision/video/backends/base.py create mode 100644 supervision/video/backends/opencv.py create mode 100644 supervision/video/backends/pyav.py create mode 100644 supervision/video/core.py create mode 100644 supervision/video/utils.py diff --git a/pyproject.toml b/pyproject.toml index 9bf3b24aa..29a231030 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,9 @@ build = [ "wheel>=0.40,<0.46", "build>=0.10,<1.3" ] +video = [ + "av>=15.0.0" +] [tool.bandit] target = ["test", "supervision"] diff --git a/supervision/__init__.py b/supervision/__init__.py index 04d3fb254..3eba1208d 100644 --- a/supervision/__init__.py +++ b/supervision/__init__.py @@ -131,11 +131,11 @@ from supervision.utils.notebook import plot_image, plot_images_grid from supervision.utils.video import ( FPSMonitor, - VideoInfo, VideoSink, get_video_frames_generator, process_video, ) +from supervision.video.core import Video, VideoInfo __all__ = [ "LMM", @@ -193,6 +193,7 @@ "TriangleAnnotator", "VertexAnnotator", "VertexLabelAnnotator", + "Video", "VideoInfo", "VideoSink", "approximate_polygon", diff --git a/supervision/utils/video.py b/supervision/utils/video.py index 
3b281b4e2..10ea2dc29 100644 --- a/supervision/utils/video.py +++ b/supervision/utils/video.py @@ -9,6 +9,8 @@ import numpy as np from tqdm.auto import tqdm +from supervision.utils.internal import deprecated + @dataclass class VideoInfo: @@ -59,7 +61,7 @@ def from_video_path(cls, video_path: str) -> VideoInfo: def resolution_wh(self) -> tuple[int, int]: return self.width, self.height - +@deprecated class VideoSink: """ Context manager that saves video frames to a file using OpenCV. @@ -116,7 +118,7 @@ def write_frame(self, frame: np.ndarray): def __exit__(self, exc_type, exc_value, exc_traceback): self.__writer.release() - +@deprecated def _validate_and_setup_video( source_path: str, start: int, end: int | None, iterative_seek: bool = False ): @@ -140,7 +142,7 @@ def _validate_and_setup_video( return video, start, end - +@deprecated def get_video_frames_generator( source_path: str, stride: int = 1, @@ -191,7 +193,7 @@ def get_video_frames_generator( frame_position += stride video.release() - +@deprecated def process_video( source_path: str, target_path: str, diff --git a/supervision/video/__init__.py b/supervision/video/__init__.py new file mode 100644 index 000000000..445a57c74 --- /dev/null +++ b/supervision/video/__init__.py @@ -0,0 +1,4 @@ +from .core import Video +from .utils import VideoInfo + +__all__ = ["Video", "VideoInfo"] diff --git a/supervision/video/backends/__init__.py b/supervision/video/backends/__init__.py new file mode 100644 index 000000000..d834269c1 --- /dev/null +++ b/supervision/video/backends/__init__.py @@ -0,0 +1,7 @@ +BACKENDS = { + "opencv": "supervision.video.backends.opencv", + "pyav": "supervision.video.backends.pyav", +} + + +__all__ = ["BACKENDS"] diff --git a/supervision/video/backends/base.py b/supervision/video/backends/base.py new file mode 100644 index 000000000..6afcf0df2 --- /dev/null +++ b/supervision/video/backends/base.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from collections.abc import Iterator +from 
typing import Protocol, runtime_checkable + +import numpy as np + +from ..utils import VideoInfo + + +@runtime_checkable +class Backend(Protocol): + """ + The high-level :py:class:`~supervision.video.Video` adapter instantiates a + backend - selected by name - and then only calls the methods defined + below. Anything else is considered a private implementation detail. + """ + + def __init__(self, source: str | int): + """Create a new backend for source. + + ``source`` can be + * ``str`` - file path, RTSP/HTTP URL … + * ``int`` - webcam index (OpenCV-style) + """ + + def info(self) -> VideoInfo: + """Return static information (width / height / fps / total_frames).""" + + def read(self) -> tuple[bool, np.ndarray]: + """Decode the next frame. + + Returns ``(success, frame)`` where frame is a ``np.ndarray`` (HxWx3). + """ + + def grab(self) -> bool: + """Grab the next frame without decoding pixels. + + Equivalent to OpenCV's ``VideoCapture.grab``. Useful if the user only + wants to skip frames quickly (stride > 1 for example). + """ + + def seek(self, frame_idx: int) -> None: + """Seek to frame_idx so that the next :py:meth:`read` returns it.""" + + # Encoding --------------------------------------------------------------- + + def writer( + self, + path: str, + info: VideoInfo, + codec: str | None = None, + ) -> Writer: + """Return a writer that encodes frames to path. + + Parameters + ---------- + path: + Target file path. + info: + Expected output resolution / fps (copied from source by default). + codec: + FourCC / codec name to override the backend default. + """ + + # Iterator convenience --------------------------------------------------- + + def __iter__(self) -> Iterator[np.ndarray]: + """Yield successive frames until exhaustion. + + This is considered convenience behaviour; the default implementation + below is fine for most back-ends.
+ """ + + +@runtime_checkable +class Writer(Protocol): + """Protocol for an encoded video writer returned by :py:meth:`Backend.writer`.""" + + def write(self, frame: np.ndarray, frame_number: int, callback) -> None: + """Write a single BGR / RGB frame to the output stream.""" + + def close(self) -> None: + """Flush and close the underlying container / file descriptor.""" + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.close() + return False # propagate exception (if any) + + +# --------------------------------------------------------------------------- +# Utility - a dummy writer that does nothing. Useful for testing. +# --------------------------------------------------------------------------- + + +class _NullWriter: + """Fallback Writer that silently drops every frame.""" + + def write(self, frame: np.ndarray, frame_number: int, callback) -> None: + pass + + def close(self) -> None: + pass + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + +__all__ = [ + "Backend", + "Writer", +] diff --git a/supervision/video/backends/opencv.py b/supervision/video/backends/opencv.py new file mode 100644 index 000000000..70d1285cc --- /dev/null +++ b/supervision/video/backends/opencv.py @@ -0,0 +1,196 @@ +from collections.abc import Iterator +from typing import Any, Optional +from collections.abc import Callable + +import cv2 +import numpy as np + +from ..utils import VideoInfo +from .base import Writer + + +class OpenCVWriter: + def __init__(self, vw: cv2.VideoWriter, info: VideoInfo): + self._vw = vw + self.info = info + + def write( + self, + frame: np.ndarray, + frame_number: int, + callback: Callable[[np.ndarray], None] | None = None, + ) -> None: + if callback: + frame = callback(frame, frame_number) + if frame.shape[0] != self.info.height or frame.shape[1] != self.info.width: + frame = cv2.resize(frame, (self.info.width, self.info.height)) + self._vw.write(frame) + + def close(self) 
-> None: + self._vw.release() + + def __enter__(self) -> Writer: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.close() + + +class Backend: + def __init__(self, source_path: str | int): + """Create a new backend for source. + + ``source`` can be + * ``str`` - file path, RTSP/HTTP URL … + * ``int`` - webcam index (OpenCV-style) + """ + self.source_path = source_path + self.cap = cv2.VideoCapture(self.source_path) + if not self.cap.isOpened(): + raise ValueError(f"Could not open video source {self.source_path}") + + def info(self) -> VideoInfo: + """Return static information (width / height / fps / total_frames).""" + from ..core import VideoInfo + + w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + precise_fps = self.cap.get(cv2.CAP_PROP_FPS) + fps = int(round(precise_fps, 0)) + n = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + return VideoInfo(w, h, fps, precise_fps, n) + + def read(self) -> tuple[bool, np.ndarray]: + """Decode the next frame.""" + return self.cap.read() + + def grab(self) -> bool: + """Grab the next frame without decoding pixels.""" + return self.cap.grab() + + def seek(self, frame_idx: int) -> None: + """Seek to frame_idx so that the next :py:meth:`read` returns it.""" + self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) + + # ? Do we want to mix and match different writers to different backends? + def writer(self, path: str, info: VideoInfo, codec: str | None = None) -> Writer: + """Return a writer that encodes frames to path. + + Parameters + ---------- + path: + Target file path. + info: + Expected output resolution / fps (copied from source by default). + codec: + FourCC / codec name to override the backend default.
+ """ + fourcc = ( + cv2.VideoWriter_fourcc(*codec) if codec else cv2.VideoWriter_fourcc(*"mp4v") + ) + vw = cv2.VideoWriter(path, fourcc, info.fps, (info.width, info.height)) + return OpenCVWriter(vw, info) + + def frames( + self, + stride: int = 1, + start: int = 0, + end: int | None = None, + resolution_wh: tuple[int, int] | None = None, + interpolation=cv2.INTER_LINEAR, + ) -> Iterator[np.ndarray]: + """Yield frames lazily, with optional skipping and resizing. + + Parameters + ---------- + stride: + Number of frames to skip between yielded frames (``1`` yields every frame). + start: + First frame index (0-based) to yield. + end: + Index after the last frame to yield. ``None`` means until exhaustion. + resolution_wh: + Optional ``(width, height)`` to resize each yielded frame to. + + Yields + ------ + np.ndarray + The next decoded (and optionally resized) video frame. + """ + if stride < 1: + raise ValueError("stride must be >= 1") + + info = self.info() + total = ( + info.total_frames if info.total_frames and info.total_frames > 0 else None + ) + if end is None and total is not None: + end = total + if start < 0 or start >= end: + return + + # Position capture at the start frame + self.seek(start) + current_idx = start + infinate_stream = end is None + + while infinate_stream or current_idx < end: + success, frame = self.read() + if not success: + break + + if resolution_wh is not None and ( + frame.shape[1] != resolution_wh[0] or frame.shape[0] != resolution_wh[1] + ): + frame = cv2.resize(frame, resolution_wh, interpolation=interpolation) + + yield frame + current_idx += 1 + + # Efficiently skip stride-1 frames with grab() + skip = stride - 1 + while skip and current_idx < end: + grabbed = self.grab() + if not grabbed: + return + current_idx += 1 + skip -= 1 + + def __iter__(self) -> Iterator[np.ndarray]: + """Yield successive frames until exhaustion. + + This is considered convenience behaviour; the default implementation + below is fine for most back-ends. 
+ """ + while True: + success, frame = self.read() + if not success: + break + yield frame + + def release(self): + """Release the video file.""" + self.cap.release() + + def __enter__(self): + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + self.release() + + def __len__(self) -> int: + n = self.info().total_frames + if n is None or n < 0: + raise TypeError("length is unknown for this stream") + return n + + def __getitem__(self, index: int) -> np.ndarray: + current = int(self.cap.get(cv2.CAP_PROP_POS_FRAMES)) + self.cap.set(cv2.CAP_PROP_POS_FRAMES, index) + success, frame = self.read() + self.cap.set( + cv2.CAP_PROP_POS_FRAMES, current + ) # ? Do we want to restore the video to the original position? + if not success: + raise IndexError(f"Failed to read frame {index}") + return frame diff --git a/supervision/video/backends/pyav.py b/supervision/video/backends/pyav.py new file mode 100644 index 000000000..bdc6a8e8c --- /dev/null +++ b/supervision/video/backends/pyav.py @@ -0,0 +1,11 @@ +try: + import pyav +except ImportError: + raise ImportError( + "The pyav backend is not installed, please install it using `pip install supervision[video]`" + ) + + +class Backend: + def __init__(self, source: str | int): + raise NotImplementedError("The pyav backend is not implemented yet") diff --git a/supervision/video/core.py b/supervision/video/core.py new file mode 100644 index 000000000..c07f883b9 --- /dev/null +++ b/supervision/video/core.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import importlib +from collections.abc import Generator +from dataclasses import replace + +import numpy as np +from tqdm.auto import tqdm + +from .utils import VideoInfo + + +class Video: + def __init__(self, video_path: str, backend: str | None = None): + self.video_path = video_path + self._backend_name = backend or "opencv" + self._backend = self.__get_backend() + + def __len__(self) -> int: + return len(self._backend) + + def 
__iter__(self): + return iter(self._backend) + + def __getitem__(self, index: int) -> np.ndarray: + return self._backend[index] + + def __repr__(self) -> str: + return f"