Updating ffmpeg to 8.0 (#9552)

Eldies · SpecLad · zhiltsov-max · web-flow · commit 462b1f08cd7b · 2025-10-13T14:09:28.000+02:00
- Updated FFmpeg to 8.0, OpenH264 to 2.6.0 and av to 15.1.0
- Added an error resolution script to help users in the case of issues
with the updated version in old tasks

---------

Co-authored-by: Roman Donchenko <roman@cvat.ai>
Co-authored-by: Maxim Zhiltsov <zhiltsov.max35@gmail.com>
diff --git a/Dockerfile b/Dockerfile
@@ -37,8 +37,8 @@ FROM build-image-base AS build-image-av
 ARG PREFIX=/opt/ffmpeg
 ARG PKG_CONFIG_PATH=${PREFIX}/lib/pkgconfig
 
-ENV FFMPEG_VERSION=4.3.1 \
-    OPENH264_VERSION=2.1.1
+ENV FFMPEG_VERSION=8.0 \
+    OPENH264_VERSION=2.6.0
 
 WORKDIR /tmp/openh264
 RUN curl -sL https://github.com/cisco/openh264/archive/v${OPENH264_VERSION}.tar.gz --output - | \
@@ -61,11 +61,8 @@ COPY utils/dataset_manifest/requirements.txt /tmp/utils/dataset_manifest/require
 RUN grep -q '^av==' /tmp/utils/dataset_manifest/requirements.txt
 RUN sed -i '/^av==/!d' /tmp/utils/dataset_manifest/requirements.txt
 
-# Work around https://github.com/PyAV-Org/PyAV/issues/1140
-RUN pip install setuptools wheel 'cython<3'
-
 RUN --mount=type=cache,target=/root/.cache/pip/http-v2 \
-    python3 -m pip wheel --no-binary=av --no-build-isolation \
+    python3 -m pip wheel --no-binary=av \
     -r /tmp/utils/dataset_manifest/requirements.txt \
     -w /tmp/wheelhouse
 
diff --git a/changelog.d/20250926_160254_dmitrii.lavrukhin_ffmpeg.md b/changelog.d/20250926_160254_dmitrii.lavrukhin_ffmpeg.md
@@ -0,0 +1,4 @@
+### Changed
+
+- FFmpeg updated to 8.0
+  (<https://github.com/cvat-ai/cvat/pull/9552>)
diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py
@@ -15,9 +15,10 @@
 from abc import ABC, abstractmethod
 from bisect import bisect
 from collections.abc import Generator, Iterable, Iterator, Sequence
-from contextlib import AbstractContextManager, ExitStack, closing, contextmanager
+from contextlib import ExitStack, closing
 from dataclasses import dataclass
 from enum import IntEnum
+from fractions import Fraction
 from random import shuffle
 from typing import Any, Callable, Optional, Protocol, TypeVar, Union
 
@@ -571,28 +572,14 @@ def extract(self):
             os.remove(self._zip_source.filename)
 
 class _AvVideoReading:
-    @contextmanager
     def read_av_container(
         self, source: Union[str, io.BytesIO]
-    ) -> Generator[av.container.InputContainer, None, None]:
+    ) -> av.container.InputContainer:
         if isinstance(source, io.BytesIO):
             source.seek(0) # required for re-reading
 
-        container = av.open(source)
-        try:
-            yield container
-        finally:
-            # fixes a memory leak in input container closing
-            # https://github.com/PyAV-Org/PyAV/issues/1117
-            for stream in container.streams:
-                context = stream.codec_context
-                if context and context.is_open:
-                    # Currently, context closing may get stuck on some videos for an unknown reason,
-                    # so the thread_type == 'AUTO' setting is disabled for future investigation
-                    context.close()
-
-            if container.open_files:
-                container.close()
+        return av.open(source)
+
 
     def decode_stream(
         self, container: av.container.Container, video_stream: av.video.stream.VideoStream
@@ -679,13 +666,10 @@ def iterate_frames(
             with closing(self._decode_stream(container, video_stream)) as stream_decoder:
                 for frame, frame_number in zip(stream_decoder, frame_counter):
                     if frame_number == next_frame_filter_frame:
-                        if video_stream.metadata.get('rotate'):
+                        if frame.rotation:
                             pts = frame.pts
                             frame = av.VideoFrame().from_ndarray(
-                                rotate_image(
-                                    frame.to_ndarray(format='bgr24'),
-                                    360 - int(video_stream.metadata.get('rotate'))
-                                ),
+                                rotate_image(frame.to_ndarray(format='bgr24'), frame.rotation),
                                 format ='bgr24'
                             )
                             frame.pts = pts
@@ -707,7 +691,7 @@ def get_progress(self, pos):
         duration = self._get_duration()
         return pos / duration if duration else None
 
-    def _read_av_container(self) -> AbstractContextManager[av.container.InputContainer]:
+    def _read_av_container(self) -> av.container.InputContainer:
         return _AvVideoReading().read_av_container(self._source_path[0])
 
     def _decode_stream(
@@ -798,7 +782,7 @@ def __init__(self, manifest_path: str, source_path: str, *, allow_threading: boo
 
         self.allow_threading = allow_threading
 
-    def _read_av_container(self) -> AbstractContextManager[av.container.InputContainer]:
+    def _read_av_container(self) -> av.container.InputContainer:
         return _AvVideoReading().read_av_container(self.source_path)
 
     def _decode_stream(
@@ -843,12 +827,9 @@ def iterate_frames(self, *, frame_filter: Iterable[int]) -> Iterable[av.VideoFra
             with closing(self._decode_stream(container, video_stream)) as stream_decoder:
                 for frame, frame_number in zip(stream_decoder, frame_counter):
                     if frame_number == next_frame_filter_frame:
-                        if video_stream.metadata.get('rotate'):
+                        if frame.rotation:
                             frame = av.VideoFrame().from_ndarray(
-                                rotate_image(
-                                    frame.to_ndarray(format='bgr24'),
-                                    360 - int(video_stream.metadata.get('rotate'))
-                                ),
+                                rotate_image(frame.to_ndarray(format='bgr24'), frame.rotation),
                                 format ='bgr24'
                             )
 
@@ -1050,7 +1031,7 @@ def __init__(self, quality=67):
                 "preset": "ultrafast",
             }
 
-    def _add_video_stream(self, container: av.container.OutputContainer, w, h, rate, options):
+    def _add_video_stream(self, container: av.container.OutputContainer, w, h, rate, options) -> av.video.stream.VideoStream:
         # x264 requires width and height must be divisible by 2 for yuv420p
         if h % 2:
             h += 1
@@ -1067,6 +1048,14 @@ def _add_video_stream(self, container: av.container.OutputContainer, w, h, rate,
         video_stream.pix_fmt = "yuv420p"
         video_stream.width = w
         video_stream.height = h
+
+        if "profile" in options:
+            video_stream.profile = options["profile"]
+        if "qmin" in options:
+            video_stream.codec_context.qmin = int(options["qmin"])
+            video_stream.codec_context.qmax = int(options["qmax"])
+        options = {k: options[k] for k in options if k not in ("profile", "qmin", "qmax")}
+
         video_stream.options = options
 
         return video_stream
@@ -1103,8 +1092,7 @@ def save_as_chunk(
                 options=self._codec_opts,
             )
 
-            with closing(output_v_stream):
-                self._encode_images(images, output_container, output_v_stream)
+            self._encode_images(images, output_container, output_v_stream)
 
         return [(input_w, input_h)]
 
@@ -1115,7 +1103,7 @@ def _encode_images(
         for frame, _, _ in images:
             # let libav set the correct pts and time_base
             frame.pts = None
-            frame.time_base = None
+            frame.time_base = Fraction(0, 1)
 
             for packet in stream.encode(frame):
                 container.mux(packet)
@@ -1160,8 +1148,7 @@ def save_as_chunk(self, images, chunk_path):
                 options=self._codec_opts,
             )
 
-            with closing(output_v_stream):
-                self._encode_images(images, output_container, output_v_stream)
+            self._encode_images(images, output_container, output_v_stream)
 
         return [(input_w, input_h)]
 
diff --git a/cvat/requirements/base.in b/cvat/requirements/base.in
@@ -2,11 +2,7 @@
 
 attrs==21.4.0
 
-# This is the last version of av that supports ffmpeg we depend on.
-# Changing ffmpeg is undesirable, as there might be video decoding differences
-# between versions.
-# TODO: try to move to the newer version
-av==9.2.0
+av==15.1.0
 
 azure-storage-blob==12.13.0
 boto3~=1.37
diff --git a/site/content/en/docs/administration/advanced/upgrade_guide.md b/site/content/en/docs/administration/advanced/upgrade_guide.md
@@ -56,6 +56,13 @@ To upgrade CVAT, follow these steps:
   docker logs cvat_server -f
   ```
 
+## How to upgrade CVAT from v2.46.0 to v2.47.0
+
+In version 2.47.0, CVAT upgraded the FFmpeg library it uses to split videos into frames from 4.3.1 to 8.0.
+There is a small chance that some video files may be processed differently by the new FFmpeg version.
+
+If one of your tasks is affected, follow the guide in `./utils/ffmpeg_compatibility/README.md`.
+
 ## Upgrade CVAT after v2.26.0
 
 In version 2.26.0, CVAT changed the location where the export cache is stored.
diff --git a/utils/dataset_manifest/core.py b/utils/dataset_manifest/core.py
@@ -36,16 +36,13 @@ def __init__(self, source_path, chunk_size, force):
             for packet in container.demux(video_stream):
                 for frame in packet.decode():
                     # check type of first frame
-                    if not frame.pict_type.name == "I":
+                    if frame.pict_type != av.video.frame.PictureType.I:
                         raise InvalidVideoError("The first frame is not a key frame")
 
                     # get video resolution
-                    if video_stream.metadata.get("rotate"):
+                    if frame.rotation:
                         frame = av.VideoFrame().from_ndarray(
-                            rotate_image(
-                                frame.to_ndarray(format="bgr24"),
-                                360 - int(container.streams.video[0].metadata.get("rotate")),
-                            ),
+                            rotate_image(frame.to_ndarray(format="bgr24"), frame.rotation),
                             format="bgr24",
                         )
                     self.height, self.width = (frame.height, frame.width)
diff --git a/utils/dataset_manifest/requirements.in b/utils/dataset_manifest/requirements.in
@@ -1,4 +1,4 @@
-av==9.2.0  # Pinned for the whole CVAT
+av==15.1.0  # Pinned for the whole CVAT
 natsort>=8.0.0
 opencv-python-headless>=4.4.0.42
 Pillow>=10.3.0
diff --git a/utils/dataset_manifest/requirements.txt b/utils/dataset_manifest/requirements.txt
@@ -1,9 +1,9 @@
-# SHA1:3671835f743ca6c6c8d49b36eda2bb7e0763fa0b
+# SHA1:c994feeb8bd193f610d522cfea809387b62748ab
 #
 # This file is automatically generated.
 # To update it, refer to cvat/requirements/README.txt.
 #
-av==9.2.0
+av==15.1.0
     # via -r utils/dataset_manifest/requirements.in
 natsort==8.0.0
     # via -r utils/dataset_manifest/requirements.in
diff --git a/utils/ffmpeg_compatibility/README.md b/utils/ffmpeg_compatibility/README.md
@@ -0,0 +1,22 @@
+# Description
+In version 2.47.0, CVAT upgraded the FFmpeg library it uses to split videos into frames from 4.3.1 to 8.0.
+
+There is a small chance that some video files may be split into frames differently by different FFmpeg versions.
+
+If the frames of a task are decoded differently after the upgrade,
+this script can be used to switch the task to static chunks and generate its frames with the old FFmpeg version.
+
+> NOTE: This option requires administrator
+> access to the server instance. If you do not have such access, please try
+> to contact the server administration.
+
+# Usage
+
+If your CVAT is deployed with Docker, run the command below
+(optionally add `-f docker-compose.dev.yml` right after the first `-f` option if your deployment uses that file):
+```shell
+docker compose \
+  -f docker-compose.yml \
+  -f ./utils/ffmpeg_compatibility/docker-compose.yml \
+  run --rm generate_chunks_for_task <task_id>
+```
diff --git a/utils/ffmpeg_compatibility/docker-compose.yml b/utils/ffmpeg_compatibility/docker-compose.yml
@@ -0,0 +1,13 @@
+services:
+  generate_chunks_for_task:
+    image: cvat/server:v2.46.0
+    environment:
+      CVAT_POSTGRES_HOST: cvat_db
+    depends_on:
+      - cvat_db
+    volumes:
+      - cvat_data:/home/django/data
+      - ./utils/ffmpeg_compatibility/switch_task_to_static_cache.py:/home/django/switch_task_to_static_cache.py:ro
+    networks:
+      - cvat
+    entrypoint: ["python", "switch_task_to_static_cache.py"]
diff --git a/utils/ffmpeg_compatibility/switch_task_to_static_cache.py b/utils/ffmpeg_compatibility/switch_task_to_static_cache.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+import argparse
+import os
+import shutil
+from unittest.mock import Mock, PropertyMock, patch
+
+import django
+from django.db import connection, transaction
+from django.db.migrations.recorder import MigrationRecorder
+from tqdm import tqdm
+
+django.setup()
+
+from cvat.apps.engine import models
+from cvat.apps.engine.media_extractors import MEDIA_TYPES
+from cvat.apps.engine.task import _create_static_chunks
+
+EXPECTED_LAST_ENGINE_MIGRATION = "0093_issue_assignee_updated_date_alter_issue_assignee_and_more"
+
+
+def _ensure_last_engine_applied_migration_name():
+    """
+    Verify that the database schema is the one this script was written for.
+
+    Reads the names of the applied migrations of the "engine" app through
+    Django's MigrationRecorder and checks that the one with the highest
+    numeric prefix equals EXPECTED_LAST_ENGINE_MIGRATION.
+
+    Raises:
+        RuntimeError: if no "engine" migrations are recorded, or if the latest
+            applied one differs from EXPECTED_LAST_ENGINE_MIGRATION.
+    """
+    app_name = "engine"
+    recorder = MigrationRecorder(connection)
+    applied_migrations_names = list(
+        recorder.Migration.objects.filter(app=app_name).values_list("name", flat=True)
+    )
+
+    # Explicit raises instead of `assert`: these are runtime safety checks and
+    # must not be stripped when Python runs with -O.
+    if not applied_migrations_names:
+        raise RuntimeError(f"No migrations applied for app '{app_name}'")
+
+    # Names are expected to start with a numeric prefix (e.g. "0093_..."),
+    # which is used to pick the most recent migration.
+    highest_by_number = max(applied_migrations_names, key=lambda name: int(name.split("_")[0]))
+
+    if highest_by_number != EXPECTED_LAST_ENGINE_MIGRATION:
+        raise RuntimeError(
+            f"Last applied migration for app '{app_name}' is '{highest_by_number}', "
+            f"expected '{EXPECTED_LAST_ENGINE_MIGRATION}'. "
+            f"Manually verify that the script still works correctly "
+            f"and update EXPECTED_LAST_ENGINE_MIGRATION var in the script."
+        )
+
+
+def _build_extractor(data: models.Data):
+    """
+    Build the video extractor for the video file referenced by ``data``.
+
+    The extractor class is looked up in MEDIA_TYPES["video"]["extractor"] and is
+    given the path of the video inside the upload directory together with the
+    frame step and the start/stop frames stored on ``data``.
+    """
+    video_path = os.path.join(data.get_upload_dirname(), data.video.path)
+    extractor_kwargs = dict(
+        source_path=[video_path],
+        step=data.get_frame_step(),
+        start=data.start_frame,
+        stop=data.stop_frame,
+    )
+    video_extractor_cls = MEDIA_TYPES["video"]["extractor"]
+    return video_extractor_cls(**extractor_kwargs)
+
+
+def _cleanup_static_cache(data: models.Data):
+    """Reset the compressed and original cache directories of ``data`` to empty directories."""
+    cache_dirs = [data.get_compressed_cache_dirname(), data.get_original_cache_dirname()]
+    for cache_dir in cache_dirs:
+        # Remove whatever was generated before, then recreate the directory.
+        if os.path.exists(cache_dir):
+            shutil.rmtree(cache_dir)
+        os.makedirs(cache_dir)
+
+
+def main():
+    """
+    Switch a video task to static chunk storage and regenerate its chunks.
+
+    Parses the task ID from the command line, verifies the database schema
+    version, and then, inside a single database transaction, locks the task
+    row, resets the static cache directories, generates the chunks with
+    ``_create_static_chunks`` and saves the new storage method.
+
+    Raises:
+        ValueError: if the task does not exist or is not a video task.
+        Errors from the schema version check propagate unchanged.
+    """
+    parser = argparse.ArgumentParser(
+        description="Switch a video task to static cache storage and generate static chunks."
+    )
+    parser.add_argument("task_id", type=int, help="Task ID (integer)")
+    args = parser.parse_args()
+    task_id = args.task_id
+
+    _ensure_last_engine_applied_migration_name()
+
+    with transaction.atomic():
+        try:
+            # select_for_update() locks the task row until the transaction ends.
+            task: models.Task = models.Task.objects.select_for_update().get(pk=task_id)
+        except models.Task.DoesNotExist as ex:
+            raise ValueError(f"Task #{task_id} not found.") from ex
+
+        # Explicit raise instead of `assert`, so the check survives `python -O`
+        # and matches the error type used for a missing task above.
+        if task.mode != "interpolation":
+            raise ValueError(f"Task #{task_id} is not a video task (mode={task.mode}).")
+
+        data: models.Data = task.data
+
+        extractor = _build_extractor(data)
+
+        data.storage_method = models.StorageMethodChoice.FILE_SYSTEM
+        # NOTE: this is a filesystem operation; it is not undone if the
+        # database transaction is rolled back later.
+        _cleanup_static_cache(data)
+
+        with tqdm(total=1.0, bar_format="{l_bar}{bar}| {n:.2f}/{total:.2f}") as pbar:
+
+            class _PropertyMock(PropertyMock):
+                # Forward every value assigned to the mocked property to the progress bar.
+                def __set__(self, instance, value):
+                    pbar.n = value
+                    pbar.refresh()
+
+            # The script runs outside of a background job, so ImportRQMeta is replaced
+            # with a mock. Presumably _create_static_chunks reports progress by assigning
+            # to ImportRQMeta.for_job(...).task_progress - TODO confirm against task.py.
+            with patch("cvat.apps.engine.task.ImportRQMeta", return_value=Mock()) as mock:
+                type(mock.for_job.return_value).task_progress = _PropertyMock()
+                _create_static_chunks(
+                    task, media_extractor=extractor, upload_dir=data.get_upload_dirname()
+                )
+                # Persist the storage method only after the chunks were generated.
+                data.save(update_fields=["storage_method"])
+
+    print(f"Task #{task_id}: switched to static cache.")
+
+
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-av==9.2.0 # Pinned for the whole CVAT`
	`1`	`+av==15.1.0 # Pinned for the whole CVAT`
`2`	`2`	`natsort>=8.0.0`
`3`	`3`	`opencv-python-headless>=4.4.0.42`
`4`	`4`	`Pillow>=10.3.0`
Original file line number	Diff line number	Diff line change
`@@ -1,9 +1,9 @@`
`1`		`-# SHA1:3671835f743ca6c6c8d49b36eda2bb7e0763fa0b`
	`1`	`+# SHA1:c994feeb8bd193f610d522cfea809387b62748ab`
`2`	`2`	`#`
`3`	`3`	`# This file is automatically generated.`
`4`	`4`	`# To update it, refer to cvat/requirements/README.txt.`
`5`	`5`	`#`
`6`		`-av==9.2.0`
	`6`	`+av==15.1.0`
`7`	`7`	`# via -r utils/dataset_manifest/requirements.in`
`8`	`8`	`natsort==8.0.0`
`9`	`9`	`# via -r utils/dataset_manifest/requirements.in`