Skip to content

Commit aeccf57

Browse files
committed
Fix heavyweight backups of tasks that have images and a non-default frame range
Let's say we have a cloud storage-based tasks that was created with 6 input images, and the following settings: start frame 1, stop frame 4, frame step 2. A heavyweight backup of such a task will then contain only frames 1 and 3 in the `data` directory; however, the manifest will still contain all 6 frames. If you try to restore such a backup, CVAT will fail, because it checks that the files in the `data` directory correspond 1:1 to the manifest. This could potentially be fixed in the restore code; however, it seems to me that the backups created in this case are incorrect, as they have manifests referencing nonexistent files. As such, I think it's more appropriate to fix this in the backup code. The fix is to filter the manifest during backup, leaving only entries corresponding to frames that actually get backed up. We also need to reset the frame range to the default, so that it matches the filtered manifest. Note that the same bug also affects backups of tasks based on the mounted share. It should be reasonably easy to fix (just use `_write_filtered_media_manifest` in the `StorageChoice.SHARE` branch), but I cannot test such a fix, because share backups are currently broken entirely. So I will defer this fix until cvat-ai#9972.
1 parent 98c0a2a commit aeccf57

File tree

7 files changed

+77
-11
lines changed

7 files changed

+77
-11
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
### Fixed
2+
3+
- Heavyweight backups created from tasks using cloud storage that have
4+
images as frames and non-default start frame, stop frame or frame step
5+
settings no longer fail to import. Note that the fix is for backup
6+
creation; as such, CVAT will still not be able to import backups of
7+
such tasks created by previous versions
8+
(<https://github.com/cvat-ai/cvat/pull/10004>)

cvat/apps/engine/backup.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,7 @@ def __init__(self, pk, version=Version.V1, *, lightweight: bool):
423423
)
424424
self._label_mapping = _get_label_mapping(db_labels)
425425
self._lightweight = lightweight
426+
self._manifest_was_filtered = False
426427

427428
def _write_annotation_guide(self, zip_object: ZipFile, target_dir: str) -> None:
428429
annotation_guide = (
@@ -436,6 +437,46 @@ def _write_annotation_guide(self, zip_object: ZipFile, target_dir: str) -> None:
436437
target_dir=target_dir,
437438
)
438439

440+
def _write_filtered_media_manifest(self, zip_object: ZipFile, target_dir: str) -> None:
441+
# When making a heavyweight backup of a task with images, we only include those frames
442+
# that match the task's frame range. This function filters the manifest so that it also
443+
# includes only those frames. That way, we don't have a manifest referencing nonexistent
444+
# images in the backup.
445+
446+
target_data_dir = os.path.join(target_dir, self.DATA_DIRNAME)
447+
448+
if hasattr(self._db_data, "video"):
449+
# No filtering necessary; just use the original manifest.
450+
self._write_files(
451+
source_dir=self._db_data.get_upload_dirname(),
452+
zip_object=zip_object,
453+
files=[self._db_data.get_manifest_path()],
454+
target_dir=target_data_dir,
455+
)
456+
return
457+
458+
with tempfile.TemporaryDirectory() as tmp_dir:
459+
present_frame_nums = {im.frame for im in self._db_data.images.all()}
460+
461+
filtered_manifest_path = os.path.join(tmp_dir, self.MEDIA_MANIFEST_FILENAME)
462+
463+
imm_original = ImageManifestManager(
464+
self._db_data.get_manifest_path(), create_index=False
465+
)
466+
imm_filtered = ImageManifestManager(filtered_manifest_path, create_index=False)
467+
imm_filtered.create(
468+
entry for frame_num, entry in imm_original if frame_num in present_frame_nums
469+
)
470+
471+
self._write_files(
472+
source_dir=tmp_dir,
473+
zip_object=zip_object,
474+
files=[filtered_manifest_path],
475+
target_dir=target_data_dir,
476+
)
477+
478+
self._manifest_was_filtered = True
479+
439480
def _write_data(self, zip_object: ZipFile, target_dir: str) -> None:
440481
target_data_dir = os.path.join(target_dir, self.DATA_DIRNAME)
441482

@@ -480,7 +521,9 @@ def _write_data(self, zip_object: ZipFile, target_dir: str) -> None:
480521
target_dir=target_data_dir,
481522
)
482523
else:
483-
files_for_local_copy = [self._db_data.get_manifest_path()]
524+
self._write_filtered_media_manifest(zip_object=zip_object, target_dir=target_dir)
525+
526+
files_for_local_copy = []
484527

485528
media_files_to_download = []
486529
for media_file in self._db_data.related_files.all():
@@ -669,6 +712,11 @@ def serialize_data():
669712
else:
670713
data["storage"] = self._db_data.storage
671714

715+
if self._manifest_was_filtered:
716+
del data["start_frame"]
717+
del data["stop_frame"]
718+
del data["frame_filter"]
719+
672720
return self._prepare_data_meta(data)
673721

674722
task = serialize_task()

tests/docker-compose.minio.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ services:
4949
IMPORT_EXPORT_BUCKET: "importexportbucket"
5050
volumes:
5151
- ./tests/cypress/e2e/actions_tasks/assets/case_65_manifest/:/mnt/images_with_manifest:ro
52-
- ./tests/mounted_file_share/pcd_with_related/:/mnt/pcd_with_related:ro
52+
- ./tests/mounted_file_share/:/mnt/mounted_file_share:ro
5353
networks:
5454
- cvat
5555
entrypoint: >
@@ -64,7 +64,7 @@ services:
6464
else
6565
FULL_PATH=$${BUCKET};
6666
fi
67-
$${MC_PATH} cp --recursive /mnt/ $${FULL_PATH};
67+
$${MC_PATH} cp --recursive /mnt/mounted_file_share/* /mnt/images_with_manifest/ $${FULL_PATH};
6868
for i in 1 2;
6969
do
7070
$${MC_PATH} cp /mnt/images_with_manifest/manifest.jsonl $${FULL_PATH}/images_with_manifest/manifest_$${i}.jsonl;
7.88 KB
Loading
7.88 KB
Loading
7.88 KB
Loading

tests/python/rest_api/test_tasks.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,10 +1301,6 @@ def test_cannot_export_backup_for_task_without_data(self, tasks):
13011301
@pytest.mark.with_external_services
13021302
@pytest.mark.parametrize("lightweight_backup", [True, False])
13031303
def test_can_export_and_import_backup_task_with_cloud_storage(self, lightweight_backup):
1304-
cloud_storage_content = [
1305-
"images_with_manifest/image_case_65_1.png",
1306-
"images_with_manifest/image_case_65_2.png",
1307-
]
13081304
task_spec = {
13091305
"name": "Task with files from cloud storage",
13101306
"labels": [
@@ -1317,7 +1313,10 @@ def test_can_export_and_import_backup_task_with_cloud_storage(self, lightweight_
13171313
"image_quality": 75,
13181314
"use_cache": False,
13191315
"cloud_storage_id": 1,
1320-
"server_files": cloud_storage_content,
1316+
"server_files": [f"images/image_{i}.jpg" for i in range(0, 6)],
1317+
"start_frame": 1,
1318+
"stop_frame": 4,
1319+
"frame_filter": "step=2",
13211320
}
13221321
task_id, _ = create_task(self.user, task_spec, data_spec)
13231322

@@ -1335,7 +1334,7 @@ def test_can_export_and_import_backup_task_with_cloud_storage(self, lightweight_
13351334

13361335
expected_media = {"manifest.jsonl"}
13371336
if not lightweight_backup:
1338-
expected_media.update(cloud_storage_content)
1337+
expected_media.update(["images/image_1.jpg", "images/image_3.jpg"])
13391338
assert files_in_data == expected_media
13401339

13411340
self._test_can_restore_task_from_backup(task_id, lightweight_backup=lightweight_backup)
@@ -1401,9 +1400,20 @@ def _test_can_restore_task_from_backup(self, task_id: int, lightweight_backup: b
14011400
exclude_regex_paths = [r"root\['chunks_updated_date'\]"] # must be different
14021401

14031402
if old_meta["storage"] == "cloud_storage":
1404-
assert new_meta["storage"] == ("cloud_storage" if lightweight_backup else "local")
14051403
assert new_meta["cloud_storage_id"] is None
1406-
exclude_regex_paths.extend([r"root\['cloud_storage_id'\]", r"root\['storage'\]"])
1404+
exclude_regex_paths.append(r"root\['cloud_storage_id'\]")
1405+
1406+
if not lightweight_backup:
1407+
assert new_meta["storage"] == "local"
1408+
assert new_meta["start_frame"] == 0
1409+
assert new_meta["stop_frame"] == len(old_meta["frames"]) - 1
1410+
assert new_meta["frame_filter"] == ""
1411+
exclude_regex_paths += [
1412+
r"root\['storage'\]",
1413+
r"root\['start_frame'\]",
1414+
r"root\['stop_frame'\]",
1415+
r"root\['frame_filter'\]",
1416+
]
14071417

14081418
assert (
14091419
DeepDiff(

0 commit comments

Comments
 (0)