From 41c9bfe4e5507d2ed5c6038ee7c9f4086ffeed79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Sm=C3=B3=C5=82ka?= Date: Sat, 15 Feb 2025 01:01:59 +0100 Subject: [PATCH] Implemented `video-io` commit-id:319ebcfb --- packages/video-io/pyproject.toml | 17 +- packages/video-io/src/video_io/__init__.py | 15 ++ packages/video-io/src/video_io/annotation.py | 250 ++++++++++++++++++ packages/video-io/src/video_io/calibration.py | 49 ++++ packages/video-io/src/video_io/frame.py | 11 + packages/video-io/src/video_io/metadata.py | 26 ++ packages/video-io/src/video_io/reader.py | 94 +++++++ packages/video-io/src/video_io/visualizer.py | 63 +++++ packages/video-io/src/video_io/writer.py | 108 ++++++++ packages/video-io/tests/__init__.py | 1 + packages/video-io/tests/helpers.py | 18 ++ .../video-io/tests/performance/__init__.py | 0 .../tests/performance/opencv_baseline.py | 33 +++ .../video-io/tests/performance/test_reader.py | 50 ++++ uv.lock | 101 +++++++ 15 files changed, 835 insertions(+), 1 deletion(-) create mode 100644 packages/video-io/src/video_io/annotation.py create mode 100644 packages/video-io/src/video_io/calibration.py create mode 100644 packages/video-io/src/video_io/frame.py create mode 100644 packages/video-io/src/video_io/metadata.py create mode 100644 packages/video-io/src/video_io/reader.py create mode 100644 packages/video-io/src/video_io/visualizer.py create mode 100644 packages/video-io/src/video_io/writer.py create mode 100644 packages/video-io/tests/__init__.py create mode 100644 packages/video-io/tests/helpers.py create mode 100644 packages/video-io/tests/performance/__init__.py create mode 100644 packages/video-io/tests/performance/opencv_baseline.py create mode 100644 packages/video-io/tests/performance/test_reader.py diff --git a/packages/video-io/pyproject.toml b/packages/video-io/pyproject.toml index 5461022..a64f682 100644 --- a/packages/video-io/pyproject.toml +++ b/packages/video-io/pyproject.toml @@ -7,8 +7,23 @@ authors = [ { name = "Jan Smółka", email = "jp.smolka@gmail.com" } ] requires-python = ">=3.12.7" -dependencies = [] +dependencies = [ + "annotated-types>=0.7.0", + "attrs>=25.1.0", + "jaxtyping>=0.2.37", + "more-itertools>=10.6.0", + "opencv-python>=4.11.0.86", + "torch>=2.5.1", + "torchcodec>=0.2.0", + "torchvision>=0.21.0", +] [build-system] requires = ["hatchling"] build-backend = "hatchling.build" + +[dependency-groups] +dev = [ + "icecream>=2.1.4", + "pytest-benchmark[histogram]>=5.1.0", +] diff --git a/packages/video-io/src/video_io/__init__.py b/packages/video-io/src/video_io/__init__.py index 8b13789..a5c2c1e 100644 --- a/packages/video-io/src/video_io/__init__.py +++ b/packages/video-io/src/video_io/__init__.py @@ -1 +1,16 @@ +from . 
import annotation, frame
+from .calibration import Calibration
+from .metadata import Metadata
+from .reader import Reader
+from .visualizer import Visualizer
+from .writer import Writer
+
+__all__ = [
+    'annotation',
+    'Calibration',
+    'Metadata',
+    'Reader',
+    'Visualizer',
+    'Writer',
+    'frame',
+]
diff --git a/packages/video-io/src/video_io/annotation.py b/packages/video-io/src/video_io/annotation.py
new file mode 100644
index 0000000..a1f57ab
--- /dev/null
+++ b/packages/video-io/src/video_io/annotation.py
@@ -0,0 +1,250 @@
+from typing import Annotated, Literal, Protocol, Self
+
+import cv2 as opencv
+import numpy
+from annotated_types import Ge, Lt
+
+# `Lt(256)` rather than `Lt(255)`: 255 is a valid byte value (see `WHITE` below).
+type Byte = Annotated[int, Ge(0), Lt(256)]
+type Color = tuple[Byte, Byte, Byte]
+
+type RgbFrame = numpy.ndarray[tuple[int, int, Literal[3]], numpy.dtype[numpy.uint8]]
+
+WHITE: Color = (255, 255, 255)
+GREEN: Color = (0, 255, 0)
+DARK_GRAY: Color = (90, 90, 90)
+
+
+def draw_point_with_description(
+    frame: RgbFrame,
+    point: tuple[int, int],
+    text: str,
+    *,
+    point_radius: int = 1,
+    point_color: Color = GREEN,
+    text_location: Literal['above', 'below'] = 'above',
+    text_from_point_offset: int = 10,
+    font: int = opencv.FONT_HERSHEY_DUPLEX,
+    font_scale: float = 1.0,
+    font_thickness: int = 1,
+    font_color: Color = WHITE,
+    box_color: Color = DARK_GRAY,
+    box_opacity: float = 0.7,
+    box_margin: int = 4,
+) -> RgbFrame:
+    opencv.circle(frame, point, point_radius, point_color, point_radius * 2)
+
+    frame_height, frame_width, _ = frame.shape
+
+    (text_width, text_height), _ = opencv.getTextSize(
+        text,
+        font,
+        font_scale,
+        font_thickness,
+    )
+
+    match text_location:
+        case 'above':
+            text_y_offset = text_height - 2 * box_margin - text_from_point_offset
+            y_min = text_height + box_margin
+            y_max = frame_height
+
+        case 'below':
+            text_y_offset = text_height + 2 * box_margin + text_from_point_offset
+            y_min = 0
+            y_max = frame_height - (text_height + box_margin)
+
+    x_min = text_width // 2
+    x_max = frame_width - x_min
+
+    text_x = __clip(point[0] - text_width // 2, x_min, x_max)
+    text_y = __clip(point[1] + text_y_offset, y_min, y_max)
+
+    draw_text_within_box(
+        frame,
+        text,
+        (text_x, text_y),
+        font=font,
+        font_scale=font_scale,
+        font_thickness=font_thickness,
+        font_color=font_color,
+        box_color=box_color,
+        box_opacity=box_opacity,
+        box_margin=box_margin,
+    )
+
+    return frame
+
+
+def draw_text_within_box(
+    frame: RgbFrame,
+    text: str,
+    position: tuple[int, int],
+    *,
+    font: int = opencv.FONT_HERSHEY_DUPLEX,
+    font_scale: float = 1.0,
+    font_thickness: int = 1,
+    font_color: Color = WHITE,
+    box_color: Color = DARK_GRAY,
+    box_opacity: float = 0.7,
+    box_margin: int = 4,
+) -> RgbFrame:
+    (text_width, text_height), _ = opencv.getTextSize(
+        text,
+        font,
+        font_scale,
+        font_thickness,
+    )
+
+    box_top_left = (
+        max(text_height, position[0] - box_margin),
+        max(0, position[1] - box_margin - text_height),
+    )
+
+    box_bottom_right = (
+        box_top_left[0] + text_width + 2 * box_margin,
+        box_top_left[1] + text_height + 2 * box_margin,
+    )
+
+    frame_height, frame_width, _ = frame.shape
+    match box_bottom_right[0] >= frame_width, box_bottom_right[1] >= frame_height:
+        case True, True:
+            box_bottom_right = (frame_width - 1, frame_height - 1)
+            box_top_left = (
+                box_bottom_right[0] - text_width - 2 * box_margin,
+                box_bottom_right[1] - text_height - 2 * box_margin,
+            )
+
+        case True, False:
+            box_bottom_right = (frame_width - 1, box_bottom_right[1])
+            box_top_left = (
+                box_bottom_right[0] - text_width - 2 * box_margin,
+                box_top_left[1],
+            )
+
+        case False, True:
+            box_bottom_right = (box_bottom_right[0], frame_height - 1)
+            box_top_left = (
+                box_top_left[0],
+                box_bottom_right[1] - text_height - 2 * box_margin,
+            )
+
+    box_sub_image = frame[
+        box_top_left[1] : box_bottom_right[1],
+        box_top_left[0] : box_bottom_right[0],
+    ]
+
+    rectangle_image = numpy.full(box_sub_image.shape, box_color, dtype=numpy.uint8)
+
+    blended_image = opencv.addWeighted(
+        box_sub_image,
+        1 - box_opacity,
+        rectangle_image,
+        box_opacity,
+        gamma=0.0,
+    )
+
+    frame[
+        box_top_left[1] : box_bottom_right[1],
+        box_top_left[0] : box_bottom_right[0],
+    ] = blended_image
+
+    opencv.putText(
+        frame,
+        text,
+        position,
+        font,
+        font_scale,
+        font_color,
+        font_thickness,
+        lineType=opencv.LINE_AA,
+    )
+
+    return frame
+
+
+# TODO: Allow customising the text position
+def draw_polygon_with_description(
+    frame: RgbFrame,
+    vertices: numpy.ndarray[tuple[int, Literal[2]], numpy.dtype[numpy.int32]],
+    text: str,
+    *,
+    area_color: Color = GREEN,
+    area_opacity: float = 0.5,
+    font: int = opencv.FONT_HERSHEY_DUPLEX,
+    font_color: Color = WHITE,
+    font_scale: float = 1.0,
+    font_thickness: int = 1,
+    box_color: Color = DARK_GRAY,
+    box_opacity: float = 0.7,
+    box_margin: int = 4,
+) -> RgbFrame:
+    draw_filled_polygon_with_opacity(
+        frame,
+        vertices,
+        color=area_color,
+        opacity=area_opacity,
+    )
+
+    text_width = opencv.getTextSize(text, font, font_scale, font_thickness)[0][0]
+
+    text_x: int
+    text_y: int
+    text_x, text_y = numpy.mean(vertices, axis=0).astype(int).tolist()
+
+    text_x -= text_width // 2
+
+    draw_text_within_box(
+        frame,
+        text,
+        (text_x, text_y),
+        font=font,
+        font_scale=font_scale,
+        font_thickness=font_thickness,
+        font_color=font_color,
+        box_color=box_color,
+        box_opacity=box_opacity,
+        box_margin=box_margin,
+    )
+
+    return frame
+
+
+def draw_filled_polygon_with_opacity(
+    frame: RgbFrame,
+    vertices: numpy.ndarray[tuple[int, Literal[2]], numpy.dtype[numpy.int32]],
+    *,
+    color: Color = GREEN,
+    opacity: float = 0.7,
+) -> RgbFrame:
+    solid_color = numpy.zeros_like(frame, dtype=numpy.uint8)
+    solid_color[:] = numpy.array(color, dtype=numpy.uint8)
+
+    mask = numpy.zeros_like(frame, dtype=numpy.uint8)
+    opencv.fillPoly(mask, [vertices], (255, 255, 255))
+    negative_mask = numpy.full_like(mask, 255) - mask
+
+    colored_polygon = opencv.bitwise_and(solid_color, mask)
+    polygon_on_frame = opencv.addWeighted(
+        colored_polygon,
+        opacity,
+        frame,
+        1 - opacity,
+        0,
+    )
+
+    opencv.bitwise_or(
+        opencv.bitwise_and(frame, negative_mask),
+        opencv.bitwise_and(polygon_on_frame, mask),
+        frame,
+    )
+
+    return frame
+
+
+class Comparable(Protocol):
+    def __lt__(self, _other: Self, /) -> bool: ...
+    def __gt__(self, _other: Self, /) -> bool: ...
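+
+
+# A minimal usage sketch (illustrative only; the canvas, coordinates and labels
+# below are made up and not part of the package):
+#
+#   canvas: RgbFrame = numpy.zeros((480, 640, 3), dtype=numpy.uint8)
+#   draw_point_with_description(canvas, (320, 240), 'marker 1')
+#   draw_polygon_with_description(
+#       canvas,
+#       numpy.array([[50, 50], [150, 50], [100, 150]], dtype=numpy.int32),
+#       'zone A',
+#   )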
+
+
+def __clip[T: Comparable](value: T, min_value: T, max_value: T) -> T:
+    # Clamp `value` to the closed interval [`min_value`, `max_value`].
+    return max(min_value, min(value, max_value))
diff --git a/packages/video-io/src/video_io/calibration.py b/packages/video-io/src/video_io/calibration.py
new file mode 100644
index 0000000..176c7ff
--- /dev/null
+++ b/packages/video-io/src/video_io/calibration.py
@@ -0,0 +1,49 @@
+from dataclasses import dataclass
+
+import serde
+import torch
+from jaxtyping import Float, Float64
+
+
+@serde.serde
+@dataclass(slots=True)
+class Calibration:
+    focal_length: tuple[float, float]
+    optical_center: tuple[float, float]
+    distortion: tuple[float, float, float, float, float]
+
+    def intrinsics_matrix(self) -> Float64[torch.Tensor, '3 3']:
+        output = torch.zeros((3, 3), dtype=torch.float64)
+
+        fx, fy = self.focal_length
+        cx, cy = self.optical_center
+
+        output[0, 0] = fx
+        output[1, 1] = fy
+        output[0, 2] = cx
+        output[1, 2] = cy
+        output[2, 2] = 1.0
+
+        return output
+
+    def distortion_vector(self) -> Float64[torch.Tensor, '5']:
+        return torch.tensor(self.distortion, dtype=torch.float64)
+
+    def unproject_depth(
+        self,
+        depth: Float[torch.Tensor, 'height width'],
+    ) -> Float[torch.Tensor, '3 height width']:
+        *_, height, width = depth.shape
+
+        u = torch.arange(width)
+        v = torch.arange(height)
+        u, v = torch.meshgrid(u, v, indexing='xy')
+
+        fx, fy = self.focal_length
+        cx, cy = self.optical_center
+
+        x = (u - cx) * depth / fx
+        y = (v - cy) * depth / fy
+        z = depth
+
+        return torch.stack((x, y, z))
diff --git a/packages/video-io/src/video_io/frame.py b/packages/video-io/src/video_io/frame.py
new file mode 100644
index 0000000..84f3d4b
--- /dev/null
+++ b/packages/video-io/src/video_io/frame.py
@@ -0,0 +1,11 @@
+import numpy
+import torch
+from jaxtyping import UInt8
+from numpy.typing import NDArray
+
+type Array = NDArray[numpy.uint8]
+
+type ArrayRgbFrame = UInt8[numpy.ndarray, 'height width 3']
+type ArrayGrayFrame = UInt8[numpy.ndarray, 'height width']
+
+type TensorRgbFrame = UInt8[torch.Tensor, '3 height width']
diff --git a/packages/video-io/src/video_io/metadata.py b/packages/video-io/src/video_io/metadata.py
new file mode 100644
index 0000000..d3a0d1e
--- /dev/null
+++ b/packages/video-io/src/video_io/metadata.py
@@ -0,0 +1,26 @@
+from typing import Self
+
+from attrs import frozen
+from torchcodec.decoders import VideoStreamMetadata  # type: ignore[attr-defined]
+
+
+@frozen
+class Metadata:
+    fps: float
+    frames: int
+    width: int
+    height: int
+
+    @classmethod
+    def from_stream_metadata(cls, stream_metadata: VideoStreamMetadata) -> Self:
+        fps = stream_metadata.average_fps_from_header
+        frames = stream_metadata.num_frames
+        width = stream_metadata.width
+        height = stream_metadata.height
+
+        assert fps is not None
+        assert frames is not None
+        assert width is not None
+        assert height is not None
+
+        return cls(fps, frames, width, height)
diff --git a/packages/video-io/src/video_io/reader.py b/packages/video-io/src/video_io/reader.py
new file mode 100644
index 0000000..2ed0415
--- /dev/null
+++ b/packages/video-io/src/video_io/reader.py
@@ -0,0 +1,94 @@
+from collections.abc import Generator
+from pathlib import Path
+from typing import Annotated, Final, cast
+
+import torch
+from annotated_types import Gt
+from jaxtyping import UInt8
+from more_itertools import take
+from torchcodec.decoders import (  # type: ignore[attr-defined]
+    VideoDecoder,
+    VideoStreamMetadata,
+)
+from torchvision.transforms import Compose, Resize
+
+from . import Metadata
+
+
+class Reader:
+    __decoder: VideoDecoder
+    __transformation: Compose
+    __frame_indices: Generator[int, None, None]
+
+    metadata: Final[Metadata]
+    device: Final[torch.device]
+
+    def __init__(
+        self,
+        source: Path,
+        device: torch.device = torch.device('cpu'),
+        fps: float | None = None,
+        width: int | None = None,
+        height: int | None = None,
+    ) -> None:
+        self.device = device
+
+        self.__decoder = VideoDecoder(
+            source,
+            # Decoding on MPS is not supported; fall back to the CPU in that case.
+            device='cpu' if device == torch.device('mps') else str(device),
+        )
+
+        self.metadata = metadata = Metadata.from_stream_metadata(
+            # `VideoDecoder.metadata` has strange typing which forces a manual downcast ;v
+            cast(VideoStreamMetadata, self.__decoder.metadata)
+        )
+
+        if fps is None:
+            self.__frame_indices = (i for i in range(metadata.frames))
+        else:
+            frames = metadata.frames
+            interpolated_length = int(frames * fps / metadata.fps)
+
+            # Subsample frame indices evenly; the endpoint is `frames - 1`
+            # to stay within the valid index range.
+            self.__frame_indices = (
+                int(i)
+                for i in torch.linspace(
+                    0,
+                    frames - 1,
+                    steps=interpolated_length,
+                    dtype=torch.int,
+                )
+            )
+
+        match height, width:
+            case None, None:
+                self.__transformation = Compose(())  # type: ignore[no-untyped-call]
+
+            case None, int(w):
+                self.__transformation = Compose([Resize((metadata.height, w))])  # type: ignore[no-untyped-call]
+
+            case int(h), None:
+                self.__transformation = Compose([Resize((h, metadata.width))])  # type: ignore[no-untyped-call]
+
+            case int(h), int(w):
+                self.__transformation = Compose([Resize((h, w))])  # type: ignore[no-untyped-call]
+
+    def read(self) -> UInt8[torch.Tensor, '3 height width'] | None:
+        match next(self.__frame_indices, None):
+            case None:
+                return None
+
+            case index:
+                # Apply the same device placement and resizing as `read_batch`.
+                frame = self.__decoder.get_frame_at(index).data.to(self.device)
+                return cast(torch.Tensor, self.__transformation(frame))
+
+    def read_batch(
+        self,
+        size: Annotated[int, Gt(0)],
+    ) -> UInt8[torch.Tensor, 'size 3 height width'] | None:
+        assert size > 0, 'Expected positive batch size'
+
+        indices = take(size, self.__frame_indices)
+        if len(indices) == 0:
+            return None
+
+        frames = self.__decoder.get_frames_at(indices).data.to(self.device)
+        return cast(torch.Tensor, self.__transformation(frames))
diff --git a/packages/video-io/src/video_io/visualizer.py b/packages/video-io/src/video_io/visualizer.py
new file mode 100644
index 0000000..8ed81db
--- /dev/null
+++ b/packages/video-io/src/video_io/visualizer.py
@@ -0,0 +1,63 @@
+from collections.abc import Iterable, Mapping
+from typing import Any, Literal, Protocol, TypedDict
+
+import numpy
+from attrs import frozen
+
+type RgbFrame = numpy.ndarray[tuple[int, int, Literal[3]], numpy.dtype[numpy.uint8]]
+
+
+class Visualizable[Context: Mapping[str, Any]](Protocol):
+    def draw(
+        self,
+        frame: RgbFrame,
+        context: Context,
+    ) -> RgbFrame: ...
+
+
+@frozen
+class Visualizer[Context: Mapping[str, Any]]:
+    context: Context
+
+    def annotate(
+        self,
+        frame: RgbFrame,
+        items: Iterable[Visualizable[Context]],
+    ) -> RgbFrame:
+        context = self.context
+
+        for item in items:
+            item.draw(frame, context)
+
+        return frame
+
+    def annotate_batch(
+        self,
+        frames: Iterable[RgbFrame],
+        items: Iterable[Iterable[Visualizable[Context]]],
+    ) -> list[RgbFrame]:
+        return [self.annotate(frame, items) for frame, items in zip(frames, items)]
+
+
+# A simple example of how to use 'contexts' in a type-safe way.
+if __name__ == '__main__':
+
+    class SampleContext(TypedDict):
+        x: int
+        y: float
+
+    class ExtendedContext(TypedDict):
+        x: int
+        y: float
+        z: str
+
+    @frozen
+    class A:
+        x: int
+
+        def draw(self, frame: RgbFrame, context: SampleContext) -> RgbFrame:
+            return frame
+
+    vis = Visualizer[ExtendedContext]({'x': 1, 'y': 1.0, 'z': ''})
+
+    vis.annotate(numpy.array(()), [A(10)])
diff --git a/packages/video-io/src/video_io/writer.py b/packages/video-io/src/video_io/writer.py
new file mode 100644
index 0000000..0fcae2d
--- /dev/null
+++ b/packages/video-io/src/video_io/writer.py
@@ -0,0 +1,108 @@
+from collections.abc import Iterable, Mapping
+from pathlib import Path
+from typing import cast
+
+import cv2 as opencv
+import torch
+from jaxtyping import UInt8
+
+from video_io.metadata import Metadata
+from video_io.visualizer import Visualizable, Visualizer
+
+
+class Writer[Context: Mapping[str, object]]:
+    __visualizer: Visualizer[Context]
+    __encoder: opencv.VideoWriter
+
+    def __init__(
+        self,
+        destination: Path,
+        metadata: Metadata,
+        # A trick to make passing the visualizer optional while keeping type safety.
+        # It works as long as `Visualizer` is immutable.
+        visualizer: Visualizer[Context] = Visualizer(cast(Context, {})),
+    ) -> None:
+        if destination.exists():
+            raise FileExistsError(
+                f'Destination file "{destination.absolute()}" already exists'
+            )
+
+        self.__visualizer = visualizer
+
+        self.__encoder = opencv.VideoWriter(
+            str(destination),
+            fourcc=self.__codec(destination.suffix),
+            fps=metadata.fps,
+            frameSize=(metadata.width, metadata.height),
+            isColor=True,
+        )
+
+    def write(
+        self,
+        frame: UInt8[torch.Tensor, '3 height width'],
+        annotations: Iterable[Visualizable[Context]] | None = None,
+    ) -> None:
+        self.write_batch(
+            frame.unsqueeze(0),
+            [annotations] if annotations is not None else None,
+        )
+
+    def write_batch(
+        self,
+        frames: UInt8[torch.Tensor, 'batch 3 height width'],
+        annotations: Iterable[Iterable[Visualizable[Context]]] | None = None,
+    ) -> None:
+        raw_frames = frames.permute(0, 2, 3, 1).cpu().detach().numpy()
+
+        if annotations is not None:
+            self.__visualizer.annotate_batch(raw_frames, annotations)
+
+        encoder = self.__encoder
+
+        for frame in raw_frames:
+            # `VideoWriter` expects BGR input, while the decoded tensors are RGB.
+            encoder.write(opencv.cvtColor(frame, opencv.COLOR_RGB2BGR))
+
+    @staticmethod
+    def __codec(file_extension: str) -> int:
+        match file_extension:
+            case '.avi':
+                # FourCC codes are case-sensitive; Motion JPEG is 'MJPG'.
+                return opencv.VideoWriter.fourcc(*'MJPG')
+
+            case '.mp4':
+                return opencv.VideoWriter.fourcc(*'mp4v')
+
+            case _:
+                raise UnsupportedFormatException(
+                    f'File extension "{file_extension}" is not supported'
+                )
+
+
+class UnsupportedFormatException(Exception): ...
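+
+
+# A minimal round-trip sketch (hypothetical file names; assumes `input.mp4`
+# exists and is readable): decode batches with `Reader` and re-encode them.
+#
+#   from video_io.reader import Reader
+#
+#   reader = Reader(Path('input.mp4'))
+#   writer = Writer(Path('output.mp4'), reader.metadata)
+#
+#   while (batch := reader.read_batch(32)) is not None:
+#       writer.write_batch(batch)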
+ + +if __name__ == '__main__': + from typing import Literal, TypedDict + + import numpy + from attrs import define + + class ContextA(TypedDict): + z: float + + @define + class A: + x: int + + def draw( + self, + frame: numpy.ndarray[tuple[int, int, Literal[3]], numpy.dtype[numpy.uint8]], + context: ContextA, + ) -> numpy.ndarray[tuple[int, int, Literal[3]], numpy.dtype[numpy.uint8]]: + return frame + + # w = Writer(Path('nothing.mp4'), Metadata(0.0, 0, 0, 0)) + # w.write(torch.tensor(()), [A(10)]) # typing error + + v = Visualizer[ContextA]({'z': 10.0}) + w = Writer(Path('nothing2.mp4'), Metadata(0.0, 0, 0, 0), v) + w.write(torch.tensor(()), [A(10)]) # ok diff --git a/packages/video-io/tests/__init__.py b/packages/video-io/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/packages/video-io/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/packages/video-io/tests/helpers.py b/packages/video-io/tests/helpers.py new file mode 100644 index 0000000..d53dcd9 --- /dev/null +++ b/packages/video-io/tests/helpers.py @@ -0,0 +1,18 @@ +from pathlib import Path + +from more_itertools import first_true + + +def __workspace_root() -> Path: + for parent in Path(__file__).parents: + content = parent.glob('*') + + if first_true(content, None, lambda file: file.name == 'uv.lock') is not None: + return parent + + assert False, 'unreachable' + + +WORKSPACE_ROOT = __workspace_root() +DEVELOPMENT_DIRECTORY = WORKSPACE_ROOT / 'development' +TEST_DATA_DIRECTORY = DEVELOPMENT_DIRECTORY / 'test_data' diff --git a/packages/video-io/tests/performance/__init__.py b/packages/video-io/tests/performance/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/video-io/tests/performance/opencv_baseline.py b/packages/video-io/tests/performance/opencv_baseline.py new file mode 100644 index 0000000..d2733c3 --- /dev/null +++ b/packages/video-io/tests/performance/opencv_baseline.py @@ -0,0 +1,33 @@ +from pathlib import Path +from typing import Literal + +import cv2 as opencv +import numpy + + +class PoorMansReader: + __decoder: opencv.VideoCapture + __height: int + __width: int + + def __init__(self, source: Path) -> None: + assert source.is_file() + + self.__decoder = decoder = opencv.VideoCapture(str(source)) + self.__height = int(decoder.get(opencv.CAP_PROP_FRAME_HEIGHT)) + self.__width = int(decoder.get(opencv.CAP_PROP_FRAME_WIDTH)) + + def read_batch( + self, + size: int, + ) -> numpy.ndarray[tuple[int, int, int, Literal[3]], numpy.dtype[numpy.uint8]] | None: + batch = numpy.empty((size, self.__height, self.__width, 3), dtype=numpy.uint8) + + decoder = self.__decoder + + for i in range(size): + success, _ = decoder.read(batch[i, ...]) + if not success: + return None + + return batch diff --git a/packages/video-io/tests/performance/test_reader.py b/packages/video-io/tests/performance/test_reader.py new file mode 100644 index 0000000..60bcbb1 --- /dev/null +++ b/packages/video-io/tests/performance/test_reader.py @@ -0,0 +1,50 @@ +from pathlib import Path +from typing import Literal, Protocol + +import pytest +import torch +from pytest_benchmark.fixture import BenchmarkFixture +from video_io.reader import Reader + +from ..helpers import TEST_DATA_DIRECTORY +from .opencv_baseline import PoorMansReader + + +@pytest.fixture(scope='module') +def calibration_video() -> Path: + return TEST_DATA_DIRECTORY / 'calibration' / 'lab_ceiling.avi' + + +@pytest.mark.benchmark( + group='reader-benchmark', + disable_gc=True, + min_rounds=10, +) +@pytest.mark.skip(reason='Performance test is 
not a part of the standard suite.') +@pytest.mark.usefixtures('calibration_video') +@pytest.mark.parametrize('batch_size', [10, 30, 50]) +@pytest.mark.parametrize('reader_type', ['torch', 'opencv']) +def test_reader( + benchmark: BenchmarkFixture, + calibration_video: Path, + batch_size: int, + reader_type: Literal['torch', 'opencv'], +) -> None: + reader = ( + Reader(calibration_video, torch.device('cpu')) + if reader_type == 'torch' + else PoorMansReader(calibration_video) + ) + benchmark(lambda: read_whole_video(reader, batch_size)) + + +class AnyReader(Protocol): + def read_batch(self, size: int) -> object | None: ... + + +def read_whole_video(reader: AnyReader, batch_size: int) -> None: + while True: + batch = reader.read_batch(batch_size) + + if batch is None: + break diff --git a/uv.lock b/uv.lock index e5f59be..5af24e2 100644 --- a/uv.lock +++ b/uv.lock @@ -22,6 +22,15 @@ members = [ "vpc", ] +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, +] + [[package]] name = "asttokens" version = "3.0.0" @@ -377,6 +386,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 }, ] +[[package]] +name = "jaxtyping" +version = "0.2.38" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wadler-lindig" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/34/a5/83fbf2ed24f8bd9af80536b3139e9c9cb8fb096d6ceeb28965b847fae9ae/jaxtyping-0.2.38.tar.gz", hash = "sha256:84d509341437189e82d7dbb59a2970435724851ca79fd8550e886cd37c048333", size = 45785 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/7e/da7b57a1f3af7303a0f3c8594d820fc0d3a9bbe3810a357eb21eb166e76b/jaxtyping-0.2.38-py3-none-any.whl", hash = "sha256:bc209ab8ec29917b6f0c7dec4a8ea1fc276f7d94f25b71c01d1243ec2b21ae12", size = 56375 }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -806,6 +827,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335 }, ] +[[package]] +name = "pygal" +version = "3.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/7b/8f50821a0f1585881ef40ae13ecb7603b0d81ef99fedf992ec35e6b6f7d5/pygal-3.0.5.tar.gz", hash = "sha256:c0a0f34e5bc1c01975c2bfb8342ad521e293ad42e525699dd00c4d7a52c14b71", size = 80489 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/7d/b5d656dbeb73f488ce7409a75108a775f6cf8e20624ed8025a9476cbc1bb/pygal-3.0.5-py3-none-any.whl", hash = "sha256:a3268a5667b470c8fbbb0eca7e987561a7321caeba589d40e4c1bc16dbe71393", size = 129548 }, +] + 
+[[package]] +name = "pygaljs" +version = "1.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/75/19/3a53f34232a9e6ddad665e71c83693c5db9a31f71785105905c5bc9fbbba/pygaljs-1.0.2.tar.gz", hash = "sha256:0b71ee32495dcba5fbb4a0476ddbba07658ad65f5675e4ad409baf154dec5111", size = 89711 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/6f/07dab31ca496feda35cf3455b9e9380c43b5c685bb54ad890831c790da38/pygaljs-1.0.2-py2.py3-none-any.whl", hash = "sha256:d75e18cb21cc2cda40c45c3ee690771e5e3d4652bf57206f20137cf475c0dbe8", size = 91111 }, +] + [[package]] name = "pygments" version = "2.19.1" @@ -852,6 +894,13 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/d6/b41653199ea09d5969d4e385df9bbfd9a100f28ca7e824ce7c0a016e3053/pytest_benchmark-5.1.0-py3-none-any.whl", hash = "sha256:922de2dfa3033c227c96da942d1878191afa135a29485fb942e85dff1c592c89", size = 44259 }, ] +[package.optional-dependencies] +histogram = [ + { name = "pygal" }, + { name = "pygaljs" }, + { name = "setuptools" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -1101,6 +1150,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538 }, ] +[[package]] +name = "torchcodec" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/fc/ad0931351b084c1a9840e018543d1316a1dbf6cc8b776c1d81045c2284fc/TorchCodec-0.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d4b194bfd3f8cc77986e327c8a13d4eb86ef1eba860096e81117cd6b9cc64960", size = 3035140 }, + { url = "https://files.pythonhosted.org/packages/2c/e8/16093552d6381bc943bd1bfb0f27aa5c0382d8787449124d78b06213d96b/TorchCodec-0.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0735917480efe7c7b7ce3f1a7ccc9832faf43d085cce75cdd031fd7f8f14cbb9", size = 766567 }, +] + [[package]] name = "torchvision" version = "0.21.0" @@ -1249,12 +1307,55 @@ wheels = [ name = "video-io" version = "0.1.0" source = { editable = "packages/video-io" } +dependencies = [ + { name = "annotated-types" }, + { name = "attrs" }, + { name = "jaxtyping" }, + { name = "more-itertools" }, + { name = "opencv-python" }, + { name = "torch" }, + { name = "torchcodec" }, + { name = "torchvision" }, +] + +[package.dev-dependencies] +dev = [ + { name = "icecream" }, + { name = "pytest-benchmark", extra = ["histogram"] }, +] + +[package.metadata] +requires-dist = [ + { name = "annotated-types", specifier = ">=0.7.0" }, + { name = "attrs", specifier = ">=25.1.0" }, + { name = "jaxtyping", specifier = ">=0.2.37" }, + { name = "more-itertools", specifier = ">=10.6.0" }, + { name = "opencv-python", specifier = ">=4.11.0.86" }, + { name = "torch", specifier = ">=2.5.1" }, + { name = "torchcodec", specifier = ">=0.2.0" }, + { name = "torchvision", specifier = ">=0.21.0" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "icecream", specifier = ">=2.1.4" }, + { name = "pytest-benchmark", extras = ["histogram"], specifier = ">=5.1.0" }, +] [[package]] name = "vpc" version = "0.1.0" source = { editable = "packages/vpc" } +[[package]] +name = "wadler-lindig" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/39/7a/fea25d7985211556bbe2511d42e07453b484bf8e0d5d6109aabb08f52784/wadler_lindig-0.1.4.tar.gz", hash = "sha256:75aa3ddd384573c41d5c910fd990e655c2a641e5093cf5081650d0229daf87ad", size = 15356 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/69/cfb1af44622044d4db0cad65721d283a921a4795f0ad121616b9eaa6ccd7/wadler_lindig-0.1.4-py3-none-any.whl", hash = "sha256:5c463aeb1f4ddc4acc12c3708d22ae21bcfc3e19e7c4d7aeef6642ea57b1a8b8", size = 20126 }, +] + [[package]] name = "zipp" version = "3.21.0"