Implemented video-io #156

Open · wants to merge 1 commit into base: spr/main/73e24d58
17 changes: 16 additions & 1 deletion packages/video-io/pyproject.toml
@@ -7,8 +7,23 @@ authors = [
{ name = "Jan Smółka", email = "[email protected]" }
]
requires-python = ">=3.12.7"
dependencies = []
dependencies = [
    "annotated-types>=0.7.0",
    "attrs>=25.1.0",
    "jaxtyping>=0.2.37",
    "more-itertools>=10.6.0",
    "opencv-python>=4.11.0.86",
    "torch>=2.5.1",
    "torchcodec>=0.2.0",
    "torchvision>=0.21.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[dependency-groups]
dev = [
    "icecream>=2.1.4",
    "pytest-benchmark[histogram]>=5.1.0",
]
15 changes: 15 additions & 0 deletions packages/video-io/src/video_io/__init__.py
@@ -1 +1,16 @@
from . import annotation, frame
from .calibration import Calibration
from .metadata import Metadata
from .reader import Reader
from .visualizer import Visualizer
from .writer import Writer

__all__ = [
    'annotation',
    'frame',
    'Calibration',
    'Metadata',
    'Reader',
    'Visualizer',
    'Writer',
]
250 changes: 250 additions & 0 deletions packages/video-io/src/video_io/annotation.py
@@ -0,0 +1,250 @@
from typing import Annotated, Literal, Protocol, Self

import cv2 as opencv
import numpy
from annotated_types import Ge, Lt

type Byte = Annotated[int, Ge(0), Lt(256)]
type Color = tuple[Byte, Byte, Byte]

type RgbFrame = numpy.ndarray[tuple[int, int, Literal[3]], numpy.dtype[numpy.uint8]]

WHITE: Color = (255, 255, 255)
GREEN: Color = (0, 255, 0)
DARK_GRAY: Color = (90, 90, 90)


def draw_point_with_description(
    frame: RgbFrame,
    point: tuple[int, int],
    text: str,
    *,
    point_radius: int = 1,
    point_color: Color = GREEN,
    text_location: Literal['above', 'below'] = 'above',
    text_from_point_offset: int = 10,
    font: int = opencv.FONT_HERSHEY_DUPLEX,
    font_scale: float = 1.0,
    font_thickness: int = 1,
    font_color: Color = WHITE,
    box_color: Color = DARK_GRAY,
    box_opacity: float = 0.7,
    box_margin: int = 4,
) -> RgbFrame:
    opencv.circle(frame, point, point_radius, point_color, point_radius * 2)

    frame_height, frame_width, _ = frame.shape

    (text_width, text_height), _ = opencv.getTextSize(
        text,
        font,
        font_scale,
        font_thickness,
    )

    match text_location:
        case 'above':
            text_y_offset = text_height - 2 * box_margin - text_from_point_offset
            y_min = text_height + box_margin
            y_max = frame_height

        case 'below':
            text_y_offset = text_height + 2 * box_margin + text_from_point_offset
            y_min = 0
            y_max = frame_height - (text_height + box_margin)

    x_min = text_width // 2
    x_max = frame_width - x_min

    text_x = __clip(point[0] - text_width // 2, x_min, x_max)
    text_y = __clip(point[1] + text_y_offset, y_min, y_max)

    draw_text_within_box(
        frame,
        text,
        (text_x, text_y),
        font=font,
        font_scale=font_scale,
        font_thickness=font_thickness,
        font_color=font_color,
        box_color=box_color,
        box_opacity=box_opacity,
        box_margin=box_margin,
    )

    return frame


def draw_text_within_box(
    frame: RgbFrame,
    text: str,
    position: tuple[int, int],
    *,
    font: int = opencv.FONT_HERSHEY_DUPLEX,
    font_scale: float = 1.0,
    font_thickness: int = 1,
    font_color: Color = WHITE,
    box_color: Color = DARK_GRAY,
    box_opacity: float = 0.7,
    box_margin: int = 4,
) -> RgbFrame:
    (text_width, text_height), _ = opencv.getTextSize(
        text,
        font,
        font_scale,
        font_thickness,
    )

    box_top_left = (
        max(text_height, position[0] - box_margin),
        max(0, position[1] - box_margin - text_height),
    )

    box_bottom_right = (
        box_top_left[0] + text_width + 2 * box_margin,
        box_top_left[1] + text_height + 2 * box_margin,
    )

    frame_height, frame_width, _ = frame.shape

    # Shift the box back inside the frame if it overflows the right and/or bottom edge.
    match box_bottom_right[0] >= frame_width, box_bottom_right[1] >= frame_height:
        case True, True:
            box_bottom_right = (frame_width - 1, frame_height - 1)
            box_top_left = (
                box_bottom_right[0] - text_width - 2 * box_margin,
                box_bottom_right[1] - text_height - 2 * box_margin,
            )

        case True, False:
            box_bottom_right = (frame_width - 1, box_bottom_right[1])
            box_top_left = (
                box_bottom_right[0] - text_width - 2 * box_margin,
                box_top_left[1],
            )

        case False, True:
            box_bottom_right = (box_bottom_right[0], frame_height - 1)
            box_top_left = (
                box_top_left[0],
                box_bottom_right[1] - text_height - 2 * box_margin,
            )

    box_sub_image = frame[
        box_top_left[1] : box_bottom_right[1],
        box_top_left[0] : box_bottom_right[0],
    ]

    rectangle_image = numpy.full(box_sub_image.shape, box_color, dtype=numpy.uint8)

    blended_image = opencv.addWeighted(
        box_sub_image,
        1 - box_opacity,
        rectangle_image,
        box_opacity,
        gamma=0.0,
    )

    frame[
        box_top_left[1] : box_bottom_right[1],
        box_top_left[0] : box_bottom_right[0],
    ] = blended_image

    opencv.putText(
        frame,
        text,
        position,
        font,
        font_scale,
        font_color,
        font_thickness,
        lineType=opencv.LINE_AA,
    )

    return frame


# TODO: Allow customising the text position
def draw_polygon_with_description(
    frame: RgbFrame,
    vertices: numpy.ndarray[tuple[int, Literal[2]], numpy.dtype[numpy.int32]],
    text: str,
    *,
    area_color: Color = GREEN,
    area_opacity: float = 0.5,
    font: int = opencv.FONT_HERSHEY_DUPLEX,
    font_color: Color = WHITE,
    font_scale: float = 1.0,
    font_thickness: int = 1,
    box_color: Color = DARK_GRAY,
    box_opacity: float = 0.7,
    box_margin: int = 4,
) -> RgbFrame:
    draw_filled_polygon_with_opacity(
        frame,
        vertices,
        color=area_color,
        opacity=area_opacity,
    )

    text_width = opencv.getTextSize(text, font, font_scale, font_thickness)[0][0]

    text_x: int
    text_y: int
    text_x, text_y = numpy.mean(vertices, axis=0).astype(int).tolist()

    text_x -= text_width // 2

    draw_text_within_box(
        frame,
        text,
        (text_x, text_y),
        font=font,
        font_scale=font_scale,
        font_thickness=font_thickness,
        font_color=font_color,
        box_color=box_color,
        box_opacity=box_opacity,
        box_margin=box_margin,
    )

    return frame


def draw_filled_polygon_with_opacity(
    frame: RgbFrame,
    vertices: numpy.ndarray[tuple[int, Literal[2]], numpy.dtype[numpy.int32]],
    *,
    color: Color = GREEN,
    opacity: float = 0.7,
) -> RgbFrame:
    solid_color = numpy.zeros_like(frame, dtype=numpy.uint8)
    solid_color[:] = numpy.array(color, dtype=numpy.uint8)

    mask = numpy.zeros_like(frame, dtype=numpy.uint8)
    opencv.fillPoly(mask, [vertices], (255, 255, 255))
    negative_mask = numpy.full_like(mask, 255) - mask

    colored_polygon = opencv.bitwise_and(solid_color, mask)
    polygon_on_frame = opencv.addWeighted(
        colored_polygon,
        opacity,
        frame,
        1 - opacity,
        0,
    )

    # Keep the original pixels outside the polygon and write the blended
    # pixels inside it back into `frame` in place.
    opencv.bitwise_or(
        opencv.bitwise_and(frame, negative_mask),
        opencv.bitwise_and(polygon_on_frame, mask),
        frame,
    )

    return frame


class Comparable(Protocol):
    def __lt__(self, _other: Self, /) -> bool: ...
    def __gt__(self, _other: Self, /) -> bool: ...


def __clip[T: Comparable](value: T, min_value: T, max_value: T) -> T:
    return max(min_value, min(value, max_value))
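
A quick usage sketch for the annotation helpers above; the frame size, coordinates, and labels are arbitrary illustration values, not from the package:

```python
import numpy

from video_io.annotation import (
    draw_point_with_description,
    draw_polygon_with_description,
)

# Blank 480x640 RGB frame; any uint8 height-by-width-by-3 array works.
frame = numpy.zeros((480, 640, 3), dtype=numpy.uint8)

# Mark a point and label it; the label box is clamped to stay inside the frame.
draw_point_with_description(frame, (320, 240), 'camera centre')

# Shade a triangle and place its label at the centroid.
vertices = numpy.array([[100, 100], [200, 120], [150, 220]], dtype=numpy.int32)
draw_polygon_with_description(frame, vertices, 'zone A', area_opacity=0.4)
```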
49 changes: 49 additions & 0 deletions packages/video-io/src/video_io/calibration.py
@@ -0,0 +1,49 @@
from dataclasses import dataclass

import serde
import torch
from jaxtyping import Float, Float64


@serde.serde
@dataclass(slots=True)
class Calibration:
    focal_length: tuple[float, float]
    optical_center: tuple[float, float]
    distortion: tuple[float, float, float, float, float]

    def intrinsics_matrix(self) -> Float64[torch.Tensor, '3 3']:
        output = torch.zeros((3, 3), dtype=torch.float64)

        fx, fy = self.focal_length
        cx, cy = self.optical_center

        output[0, 0] = fx
        output[1, 1] = fy
        output[0, 2] = cx
        output[1, 2] = cy
        output[2, 2] = 1.0

        return output

    def distortion_vector(self) -> Float64[torch.Tensor, '5']:
        return torch.tensor(self.distortion, dtype=torch.float64)

    def unproject_depth(
        self,
        depth: Float[torch.Tensor, 'height width'],
    ) -> Float[torch.Tensor, '3 height width']:
        *_, height, width = depth.shape

        # Pixel-coordinate grids; `indexing='xy'` yields (height, width) shapes.
        u = torch.arange(width)
        v = torch.arange(height)
        u, v = torch.meshgrid(u, v, indexing='xy')

        fx, fy = self.focal_length
        cx, cy = self.optical_center

        # Invert the pinhole projection: x = (u - cx) * z / fx, y = (v - cy) * z / fy.
        x = (u - cx) * depth / fx
        y = (v - cy) * depth / fy
        z = depth

        return torch.stack((x, y, z))
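
A minimal sketch of `unproject_depth`; the intrinsics and the constant two-metre depth map below are made-up values:

```python
import torch

from video_io.calibration import Calibration

calibration = Calibration(
    focal_length=(525.0, 525.0),
    optical_center=(319.5, 239.5),
    distortion=(0.0, 0.0, 0.0, 0.0, 0.0),
)

# A flat depth map two metres from the camera.
depth = torch.full((480, 640), 2.0)

points = calibration.unproject_depth(depth)
assert points.shape == (3, 480, 640)  # (x, y, z) per pixel
assert calibration.intrinsics_matrix().shape == (3, 3)
```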
11 changes: 11 additions & 0 deletions packages/video-io/src/video_io/frame.py
@@ -0,0 +1,11 @@
import numpy
import torch
from jaxtyping import UInt8
from numpy.typing import NDArray

type Array = NDArray[numpy.uint8]

type ArrayRgbFrame = UInt8[numpy.ndarray, 'height width 3']
type ArrayGrayFrame = UInt8[numpy.ndarray, 'height width']

type TensorRgbFrame = UInt8[torch.Tensor, '3 height width']
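
The aliases make the layout convention explicit: numpy frames are height × width × channel, while tensor frames are channel-first. A sketch of converting one to the other; the conversion itself is an assumption about intended use, not part of this module:

```python
import numpy
import torch

from video_io.frame import ArrayRgbFrame, TensorRgbFrame

array_frame: ArrayRgbFrame = numpy.zeros((480, 640, 3), dtype=numpy.uint8)

# HWC uint8 array -> CHW uint8 tensor, as the aliases suggest.
tensor_frame: TensorRgbFrame = torch.from_numpy(array_frame).permute(2, 0, 1)

assert tensor_frame.shape == (3, 480, 640)
```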
26 changes: 26 additions & 0 deletions packages/video-io/src/video_io/metadata.py
@@ -0,0 +1,26 @@
from typing import Self

from attrs import frozen
from torchcodec.decoders import VideoStreamMetadata # type: ignore[attr-defined]


@frozen
class Metadata:
    fps: float
    frames: int
    width: int
    height: int

    @classmethod
    def from_stream_metadata(cls, stream_metadata: VideoStreamMetadata) -> Self:
        fps = stream_metadata.average_fps_from_header
        frames = stream_metadata.num_frames
        width = stream_metadata.width
        height = stream_metadata.height

        assert fps is not None
        assert frames is not None
        assert width is not None
        assert height is not None

        return cls(fps, frames, width, height)
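
A sketch of building `Metadata` from a torchcodec decoder; `'video.mp4'` is a placeholder path, and the asserts in `from_stream_metadata` will fail for streams whose header lacks any of these fields:

```python
from torchcodec.decoders import VideoDecoder

from video_io.metadata import Metadata

decoder = VideoDecoder('video.mp4')  # placeholder path
metadata = Metadata.from_stream_metadata(decoder.metadata)

print(metadata.fps, metadata.frames, metadata.width, metadata.height)
```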