Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
### Fixed

- Heavyweight backups created from tasks using cloud storage that have
images as frames and non-default start frame, stop frame or frame step
settings no longer fail to import. Note that the fix is for backup
creation; as such, CVAT will still not be able to import backups of
such tasks created by previous versions
(<https://github.com/cvat-ai/cvat/pull/10004>)
50 changes: 49 additions & 1 deletion cvat/apps/engine/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ def __init__(self, pk, version=Version.V1, *, lightweight: bool):
)
self._label_mapping = _get_label_mapping(db_labels)
self._lightweight = lightweight
self._manifest_was_filtered = False

def _write_annotation_guide(self, zip_object: ZipFile, target_dir: str) -> None:
annotation_guide = (
Expand All @@ -436,6 +437,46 @@ def _write_annotation_guide(self, zip_object: ZipFile, target_dir: str) -> None:
target_dir=target_dir,
)

def _write_filtered_media_manifest(self, zip_object: ZipFile, target_dir: str) -> None:
Copy link
Contributor

@zhiltsov-max zhiltsov-max Nov 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like these frame filtering parameters (start, stop, and step) only make sense and should only apply to video tasks. This might be a big change for a bug fix, but could you check if we can move this into task creation instead? I'd probably prefer having lightweight and regular backups with the same data configurations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like these frame filtering parameters (start, stop, and step) only make sense and should only apply to video tasks.

I don't think I agree. For example, when a user creates a task, they can select a directory full of images and then set these parameters to only use a subset of them.

Also, regardless of whether creating tasks like this should be supported, de facto it already works. So we should be able to back up and restore such tasks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think I agree. For example, when a user creates a task, they can select a directory full of images and then set these parameters to only use a subset of them.

My point was only that we probably should stop including all the unused images into task manifests for non-video tasks. Basically, do the same manifest filtering that was added in this PR, but during the task creation after we apply all the frame filters (these parameters, wildcards etc.).

# When making a heavyweight backup of a task with images, we only include those frames
# that match the task's frame range. This function filters the manifest so that it also
# includes only those frames. That way, we don't have a manifest referencing nonexistent
# images in the backup.

target_data_dir = os.path.join(target_dir, self.DATA_DIRNAME)

if hasattr(self._db_data, "video"):
# No filtering necessary; just use the original manifest.
self._write_files(
source_dir=self._db_data.get_upload_dirname(),
zip_object=zip_object,
files=[self._db_data.get_manifest_path()],
target_dir=target_data_dir,
)
return

with tempfile.TemporaryDirectory() as tmp_dir:
present_frame_nums = {im.frame for im in self._db_data.images.all()}

filtered_manifest_path = os.path.join(tmp_dir, self.MEDIA_MANIFEST_FILENAME)

imm_original = ImageManifestManager(
self._db_data.get_manifest_path(), create_index=False
)
imm_filtered = ImageManifestManager(filtered_manifest_path, create_index=False)
imm_filtered.create(
entry for frame_num, entry in imm_original if frame_num in present_frame_nums
)

self._write_files(
source_dir=tmp_dir,
zip_object=zip_object,
files=[filtered_manifest_path],
target_dir=target_data_dir,
)

self._manifest_was_filtered = True

def _write_data(self, zip_object: ZipFile, target_dir: str) -> None:
target_data_dir = os.path.join(target_dir, self.DATA_DIRNAME)

Expand Down Expand Up @@ -480,7 +521,9 @@ def _write_data(self, zip_object: ZipFile, target_dir: str) -> None:
target_dir=target_data_dir,
)
else:
files_for_local_copy = [self._db_data.get_manifest_path()]
self._write_filtered_media_manifest(zip_object=zip_object, target_dir=target_dir)

files_for_local_copy = []

media_files_to_download = []
for media_file in self._db_data.related_files.all():
Expand Down Expand Up @@ -669,6 +712,11 @@ def serialize_data():
else:
data["storage"] = self._db_data.storage

if self._manifest_was_filtered:
del data["start_frame"]
del data["stop_frame"]
del data["frame_filter"]

return self._prepare_data_meta(data)

task = serialize_task()
Expand Down
4 changes: 2 additions & 2 deletions tests/docker-compose.minio.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ services:
IMPORT_EXPORT_BUCKET: "importexportbucket"
volumes:
- ./tests/cypress/e2e/actions_tasks/assets/case_65_manifest/:/mnt/images_with_manifest:ro
- ./tests/mounted_file_share/pcd_with_related/:/mnt/pcd_with_related:ro
- ./tests/mounted_file_share/:/mnt/mounted_file_share:ro
networks:
- cvat
entrypoint: >
Expand All @@ -64,7 +64,7 @@ services:
else
FULL_PATH=$${BUCKET};
fi
$${MC_PATH} cp --recursive /mnt/ $${FULL_PATH};
$${MC_PATH} cp --recursive /mnt/mounted_file_share/ /mnt/images_with_manifest $${FULL_PATH}/
for i in 1 2;
do
$${MC_PATH} cp /mnt/images_with_manifest/manifest.jsonl $${FULL_PATH}/images_with_manifest/manifest_$${i}.jsonl;
Expand Down
Binary file added tests/mounted_file_share/images/image_0.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/mounted_file_share/images/image_4.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/mounted_file_share/images/image_5.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 2 additions & 1 deletion tests/python/rest_api/test_cloud_storages.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,8 @@ def test_iterate_over_cloud_storage_content(
if not next_token:
break

assert expected_content == current_content
# Each page is currently sorted individually, so we have to resort after combining.
assert expected_content == sorted(current_content, key=lambda el: el["type"].value)

@pytest.mark.parametrize("cloud_storage_id", [2])
def test_can_get_storage_content_with_manually_created_dirs(
Expand Down
2 changes: 1 addition & 1 deletion tests/python/rest_api/test_task_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,7 @@ def test_user_cannot_create_task_with_cloud_storage_without_access(
("abc_manifest.jsonl", "[a-c]*.jpeg", False, 2, ""),
("abc_manifest.jsonl", "[d]*.jpeg", False, 1, ""),
("abc_manifest.jsonl", "[e-z]*.jpeg", False, 0, "No media data found"),
(None, "*", True, 0, "Combined media types are not supported"),
(None, "*", True, 0, "Only one video, archive, pdf, zip"),
(None, "test/*", True, 3, ""),
(None, "test/sub*1.jpeg", True, 1, ""),
(None, "*image*.jpeg", True, 3, ""),
Expand Down
26 changes: 18 additions & 8 deletions tests/python/rest_api/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1301,10 +1301,6 @@ def test_cannot_export_backup_for_task_without_data(self, tasks):
@pytest.mark.with_external_services
@pytest.mark.parametrize("lightweight_backup", [True, False])
def test_can_export_and_import_backup_task_with_cloud_storage(self, lightweight_backup):
cloud_storage_content = [
"images_with_manifest/image_case_65_1.png",
"images_with_manifest/image_case_65_2.png",
]
task_spec = {
"name": "Task with files from cloud storage",
"labels": [
Expand All @@ -1317,7 +1313,10 @@ def test_can_export_and_import_backup_task_with_cloud_storage(self, lightweight_
"image_quality": 75,
"use_cache": False,
"cloud_storage_id": 1,
"server_files": cloud_storage_content,
"server_files": [f"images/image_{i}.jpg" for i in range(0, 6)],
"start_frame": 1,
"stop_frame": 4,
"frame_filter": "step=2",
}
task_id, _ = create_task(self.user, task_spec, data_spec)

Expand All @@ -1335,7 +1334,7 @@ def test_can_export_and_import_backup_task_with_cloud_storage(self, lightweight_

expected_media = {"manifest.jsonl"}
if not lightweight_backup:
expected_media.update(cloud_storage_content)
expected_media.update(["images/image_1.jpg", "images/image_3.jpg"])
assert files_in_data == expected_media

self._test_can_restore_task_from_backup(task_id, lightweight_backup=lightweight_backup)
Expand Down Expand Up @@ -1401,9 +1400,20 @@ def _test_can_restore_task_from_backup(self, task_id: int, lightweight_backup: b
exclude_regex_paths = [r"root\['chunks_updated_date'\]"] # must be different

if old_meta["storage"] == "cloud_storage":
assert new_meta["storage"] == ("cloud_storage" if lightweight_backup else "local")
assert new_meta["cloud_storage_id"] is None
exclude_regex_paths.extend([r"root\['cloud_storage_id'\]", r"root\['storage'\]"])
exclude_regex_paths.append(r"root\['cloud_storage_id'\]")

if not lightweight_backup:
assert new_meta["storage"] == "local"
assert new_meta["start_frame"] == 0
assert new_meta["stop_frame"] == len(old_meta["frames"]) - 1
assert new_meta["frame_filter"] == ""
exclude_regex_paths += [
r"root\['storage'\]",
r"root\['start_frame'\]",
r"root\['stop_frame'\]",
r"root\['frame_filter'\]",
]

assert (
DeepDiff(
Expand Down
Loading