Commit 4849a5e

Optimize task creation from CS without manifest (#9827)
Related: #9757

When a raw images task is created from a CS without a manifest attached, CVAT downloads image headers to get the image resolution. This operation can be quite time-consuming for big tasks, but it can be optimized quite simply.

- Improved performance of CS image header downloading and manifest creation ~2-6x

The default chunk size used in the downloader was 64 KB. For most image formats the required information is available in the first 1 KB, while 64 KB (the previous value) can be the size of the whole file. It is tempting to change it to a lower value, e.g. 1500 bytes (the default Ethernet v2 MTU size), and that works fine, except for JPEGs that include an embedded thumbnail (preview) image in the header, which can be of essentially any size. <s>Probably, this could be handled with a more advanced JPEG parser. It doesn't seem reasonable to use the reduced chunk size and then download the whole image for such files, as such JPEGs seem to be quite common, but maybe it can be implemented as an exception just for the jpg format.</s> Now, multiple header sizes are attempted per file.

AWS connection limits are floating and depend on the data filenames. In the worst case, we can expect about 100 connections per prefix; in the best case (random prefixes), the limit is effectively unbounded. It's also possible to get throttled by AWS (e.g. 503 Slow Down); this should be handled by the boto library itself.

- https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html
- https://stackoverflow.com/questions/37432285/maximum-no-of-connections-that-can-be-held-by-s3
- https://repost.aws/knowledge-center/http-5xx-errors-s3
- https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html

Details (test dataset: 26822 .jpg images, ~10 GB):

- baseline: 550s
- with the queue: 320-350s
- with the reduced chunk size (1 MTU): 220-280s
- with improved connection reuse for AWS: up to 82s (64 connections for 16 cores, up from 10 kept alive with the default config)

Sample script for testing:

```python
from time import perf_counter
from tempfile import TemporaryDirectory

from tqdm import tqdm

from cvat.apps.engine import models, cloud_provider
from utils.dataset_manifest.core import ImageManifestManager

cloud_storage = models.CloudStorage.objects.get(id=yourcs)
storage_client = cloud_provider.db_storage_to_storage_instance(cloud_storage)
media = [v["name"] for v in storage_client.list_files(prefix="images/", _use_flat_listing=True)]

header_downloader = cloud_provider.HeaderFirstMediaDownloader.create(
    models.DimensionType.DIM_2D, client=storage_client
)
content_generator = (
    v
    for v in tqdm(
        storage_client.bulk_download_to_memory(media, object_downloader=header_downloader.download),
        total=len(media),
    )
)

with TemporaryDirectory() as tempdir:
    start_time = perf_counter()

    manifest = ImageManifestManager(tempdir, upload_dir=tempdir, create_index=False)
    manifest.link(
        sources=content_generator,
        stop=len(media) - 1,
        DIM_3D=False,
    )
    manifest.create()

    duration = perf_counter() - start_time
    print(
        f"Manifest for {len(media)} files created in",
        duration,
        "seconds",
        f"avg. {duration / (len(media) or 1)}s.",
    )

# run with:
# cat test_cs_downloading.py | python manage.py shell
```
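For illustration, here is a minimal sketch of the header-first idea using Pillow's incremental parser (the same `ImageFile.Parser` the patch relies on). The `resolution_from_prefix` helper and the local `sample.jpg` path are invented for this example:

```python
# Sketch: check whether a file prefix is enough for Pillow to recover the
# image resolution, escalating through the same header sizes the patch tries.
from PIL import ImageFile


def resolution_from_prefix(path: str, prefix_size: int):
    """Return (width, height) if prefix_size bytes suffice, else None."""
    with open(path, "rb") as f:
        prefix = f.read(prefix_size)

    parser = ImageFile.Parser()
    parser.feed(prefix)
    # parser.image is set once the format header has been recognized
    return parser.image.size if parser.image else None


for size in (2048, 16384, 65536):
    if res := resolution_from_prefix("sample.jpg", size):  # hypothetical local file
        print(f"{size} bytes were enough, resolution: {res}")
        break
else:
    print("No prefix was enough; the whole file would be downloaded")
```

A plain JPEG usually succeeds on the first attempt; one with a large embedded EXIF thumbnail typically needs one of the bigger prefixes, which is the case that motivated the escalation.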
1 parent 4f5be6c commit 4849a5e

File tree

2 files changed: +106 -42 lines changed
Lines changed: 4 additions & 0 deletions

```diff
@@ -0,0 +1,4 @@
+### Fixed
+
+- Improved performance of task creation from cloud without manifest
+  (<https://github.com/cvat-ai/cvat/pull/9827>)
```

cvat/apps/engine/cloud_provider.py

Lines changed: 102 additions & 42 deletions
```diff
@@ -6,14 +6,16 @@
 from __future__ import annotations
 
 import functools
+import io
 import json
 import os
 from abc import ABC, abstractmethod
-from collections.abc import Iterator
-from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait
+from collections.abc import Iterator, Sequence
+from concurrent.futures import FIRST_EXCEPTION, Future, ThreadPoolExecutor, wait
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
+from queue import Queue
 from typing import Any, BinaryIO, Callable, Optional, TypeVar
 
 import boto3
@@ -35,7 +37,7 @@
 from cvat.apps.engine.log import ServerLogManager
 from cvat.apps.engine.models import CloudProviderChoice, CredentialsTypeChoice, DimensionType
 from cvat.apps.engine.rq import ExportRQMeta
-from cvat.apps.engine.utils import get_cpu_number, take_by
+from cvat.apps.engine.utils import get_cpu_number
 from cvat.utils.http import PROXIES_FOR_UNTRUSTED_URLS
 from utils.dataset_manifest.utils import InvalidPcdError, PcdReader
 
```
```diff
@@ -214,9 +216,26 @@ def bulk_download_to_memory(
         func = object_downloader or self.download_fileobj
         threads_number = get_max_threads_number(len(files))
 
+        # We're using a custom queue to limit the maximum number of downloaded unprocessed
+        # files stored in the memory.
+        # For example, the builtin executor.map() could also be used here, but it
+        # would enqueue all the file list in one go, and the downloaded files
+        # would all be stored in memory until processed.
+        queue: Queue[Future] = Queue(maxsize=threads_number)
+        input_iter = iter(files)
         with ThreadPoolExecutor(max_workers=threads_number) as executor:
-            for batch_links in take_by(files, chunk_size=threads_number):
-                yield from executor.map(func, batch_links)
+            while not queue.empty() or input_iter is not None:
+                while not queue.full() and input_iter is not None:
+                    next_job_params = next(input_iter, None)
+                    if next_job_params is None:
+                        input_iter = None
+                        break
+
+                    next_job = executor.submit(func, next_job_params)
+                    queue.put(next_job)
+
+                top_job = queue.get()
+                yield top_job.result()
 
     def bulk_download_to_dir(
         self,
```
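As a standalone illustration of the bounded-queue pattern above, here is a runnable sketch; `fake_download` and `bounded_map` are invented names, and the simulated delay stands in for a real network call:

```python
# Sketch of the bounded-prefetch pattern: at most max_workers results are
# buffered in memory at any time, unlike executor.map() over the full list.
from collections.abc import Callable, Iterable, Iterator
from concurrent.futures import Future, ThreadPoolExecutor
from queue import Queue
from time import sleep


def fake_download(key: str) -> bytes:
    sleep(0.01)  # stand-in for a real network request
    return key.encode()


def bounded_map(
    func: Callable[[str], bytes], items: Iterable[str], max_workers: int = 4
) -> Iterator[bytes]:
    queue: Queue[Future] = Queue(maxsize=max_workers)
    input_iter = iter(items)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        while not queue.empty() or input_iter is not None:
            # Top up the queue while input remains and there are free slots
            while not queue.full() and input_iter is not None:
                item = next(input_iter, None)
                if item is None:
                    input_iter = None
                    break
                queue.put(executor.submit(func, item))

            if queue.empty():
                break  # no input left and nothing in flight
            # Yield results in submission order, freeing a buffer slot
            yield queue.get().result()


for data in bounded_map(fake_download, (f"file{i}.jpg" for i in range(10))):
    print(data)
```

Compared with `executor.map()`, which enqueues the whole input up front, at most `max_workers` downloaded results are ever held in memory at once.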
```diff
@@ -354,26 +373,38 @@ def write_access(self):
 
 
 class HeaderFirstDownloader(ABC):
-    def __init__(
-        self,
-        *,
-        client: _CloudStorage,
-        chunk_size: int = 65536,
-    ):
+    def __init__(self, *, client: _CloudStorage):
         self.client = client
-        self.chunk_size = chunk_size
 
     @abstractmethod
-    def try_parse_header(self, header: bytes, *, key: str) -> Any | None: ...
+    def try_parse_header(self, header: NamedBytesIO) -> Any | None: ...
 
-    def log_header_miss(self, file: NamedBytesIO, *, key: str):
-        full_object_size = len(file.getvalue())
+    def log_header_miss(self, key: str, header_size: int, *, full_file: NamedBytesIO | None = None):
+        message = (
+            f'The first {header_size} bytes were not enough to parse the "{key}" object header. '
+        )
+
+        if full_file:
+            full_object_size = len(full_file.getvalue())
+
+            message += (
+                f'Object size was {full_object_size} bytes. '
+                f'Downloaded percentage was '
+                f'{min(header_size, full_object_size) / full_object_size:.0%}'
+            )
 
-        slogger.glob.warning(
-            f'The {self.chunk_size} bytes were not enough to parse the "{key}" object header. '
-            f'Object size was {full_object_size} bytes. '
-            f'Downloaded percent was '
-            f'{round(min(self.chunk_size, full_object_size) / full_object_size):%}'
+        slogger.glob.warning(message)
+
+    def get_header_sizes_to_try(self) -> Sequence[int]:
+        return (
+            # The first 1-2Kb are typically enough for most formats with the static header size.
+            # Unfortunately, it's not enough for some popular formats, such as jpeg,
+            # which can optionally include a preview image embedded in the header, so we try
+            # other bigger sizes, but less than the whole file.
+            # For comparison, the standard Ethernet v2 MTU size is 1500 bytes.
+            2048,
+            16384,
+            65536,
         )
 
     def download(self, key: str) -> NamedBytesIO:
```
```diff
@@ -388,34 +419,54 @@ def download(self, key: str) -> NamedBytesIO:
         Returns:
             buffer with the image
         """
-        chunk = self.client.download_range_of_bytes(key, stop_byte=self.chunk_size - 1)
 
-        if self.try_parse_header(chunk, key=key):
-            buff = NamedBytesIO(chunk)
-            buff.filename = key
-        else:
-            buff = self.client.download_fileobj(key)
-            self.log_header_miss(file=buff, key=key)
+        buff = NamedBytesIO()
+        buff.filename = key
 
+        headers_to_try = self.get_header_sizes_to_try()
+        for i, header_size in enumerate(headers_to_try):
+            buff.seek(0, io.SEEK_END)
+            cur_pos = buff.tell()
+            chunk = self.client.download_range_of_bytes(
+                key, start_byte=cur_pos, stop_byte=header_size - 1
+            )
+            buff.write(chunk)
+            buff.seek(0)
+
+            if self.try_parse_header(buff):
+                buff.seek(0)
+                return buff
+
+            if i + 1 < len(headers_to_try):
+                self.log_header_miss(key=key, header_size=header_size)
+
+        buff = self.client.download_fileobj(key)
+        self.log_header_miss(key=key, header_size=header_size, full_file=buff)
         return buff
 
 class _HeaderFirstImageDownloader(HeaderFirstDownloader):
-    def try_parse_header(self, header, *, key):
+    def try_parse_header(self, header):
         image_parser = ImageFile.Parser()
-        image_parser.feed(header)
+        image_parser.feed(header.getvalue())
         return image_parser.image
 
-    def log_header_miss(self, file, *, key):
-        full_object_size = len(file.getvalue())
-
-        slogger.glob.warning(
-            f'The {self.chunk_size} bytes were not enough to parse the "{key}" object header. '
-            f'Object size was {full_object_size} bytes. '
-            f'Image resolution was {Image.open(file).size}. '
-            f'Downloaded percent was '
-            f'{round(min(self.chunk_size, full_object_size) / full_object_size):.0%}'
+    def log_header_miss(self, key, header_size, *, full_file = None):
+        message = (
+            f'The first {header_size} bytes were not enough to parse the "{key}" object header. '
         )
 
+        if full_file:
+            full_object_size = len(full_file.getvalue())
+
+            message += (
+                f'Object size was {full_object_size} bytes. '
+                f'Image resolution was {Image.open(full_file).size}. '
+                f'Downloaded percentage was '
+                f'{min(header_size, full_object_size) / full_object_size:.0%}'
+            )
+
+        slogger.glob.warning(message)
+
     def download(self, key):
         try:
             return super().download(key)
```
```diff
@@ -426,10 +477,10 @@ def download(self, key):
 
 
 class _HeaderFirstPcdDownloader(HeaderFirstDownloader):
-    def try_parse_header(self, header, *, key):
+    def try_parse_header(self, header):
         pcd_parser = PcdReader()
-        file = NamedBytesIO(header)
-        file_ext = os.path.splitext(key)[1].lower()
+        file = header
+        file_ext = os.path.splitext(file.filename)[1].lower()
 
         if file_ext == ".bin":
             # We need to ensure the file is a valid .bin file
```
```diff
@@ -551,7 +602,16 @@ def __init__(self,
 
         session = boto3.Session(**kwargs)
         self._s3 = session.resource("s3", endpoint_url=endpoint_url,
-            config=Config(proxies=PROXIES_FOR_UNTRUSTED_URLS or {}),
+            config=Config(
+                proxies=PROXIES_FOR_UNTRUSTED_URLS or {},
+                max_pool_connections=(
+                    # AWS can throttle the requests if there are too many of them,
+                    # the SDK handles it with the retry policy:
+                    # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html
+                    # 10 is the default value
+                    max(10, CPU_NUMBER * settings.CLOUD_DATA_DOWNLOADING_MAX_THREADS_NUMBER_PER_CPU)
+                )
+            ),
         )
 
         # anonymous access
```
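For context, here is a minimal sketch of this client configuration in isolation; the pool sizing and the retry settings below are illustrative values, not CVAT's exact defaults:

```python
# Sketch: an S3 resource with a larger keep-alive connection pool and the
# SDK's built-in retry handling for throttling errors such as 503 Slow Down.
import os

import boto3
from botocore.config import Config

# Illustrative sizing: a few download threads per CPU, never below the default of 10
pool_size = max(10, (os.cpu_count() or 1) * 4)

s3 = boto3.Session().resource(
    "s3",
    config=Config(
        max_pool_connections=pool_size,
        # "adaptive" is one retry mode option; it adds client-side rate limiting
        retries={"max_attempts": 10, "mode": "adaptive"},
    ),
)
```

Connections beyond `max_pool_connections` are discarded after use instead of being kept alive, which is why raising the pool size above the default of 10 helps multi-threaded downloads.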
