diff --git a/kart/lfs_commands/__init__.py b/kart/lfs_commands/__init__.py
index 4f19f7f54..eda2e0e47 100644
--- a/kart/lfs_commands/__init__.py
+++ b/kart/lfs_commands/__init__.py
@@ -15,6 +15,7 @@
     get_hash_from_pointer_file,
     get_local_path_from_lfs_oid,
 )
+from kart.lfs_commands.url_redirector import UrlRedirector
 from kart.object_builder import ObjectBuilder
 from kart.rev_list_objects import rev_list_tile_pointer_files
 from kart.repo import KartRepoState
@@ -276,29 +277,32 @@ def fetch_lfs_blobs_for_commits(
         repo.spatial_filter if do_spatial_filter else SpatialFilter.MATCH_ALL
     )
 
-    pointer_file_oids = set()
+    dataset_to_pointer_file_oids = {}
     for commit in commits:
         for dataset in repo.datasets(
             commit, filter_dataset_type=ALL_TILE_DATASET_TYPES
         ):
-            pointer_file_oids.update(
-                blob.hex
-                for blob in dataset.tile_pointer_blobs(spatial_filter=spatial_filter)
+            pointer_file_oids = dataset_to_pointer_file_oids.setdefault(
+                dataset.path, set()
             )
+            for blob in dataset.tile_pointer_blobs(spatial_filter=spatial_filter):
+                pointer_file_oids.add(blob.hex)
 
     fetch_lfs_blobs_for_pointer_files(
-        repo, pointer_file_oids, dry_run=dry_run, quiet=quiet
+        repo, dataset_to_pointer_file_oids, dry_run=dry_run, quiet=quiet
    )
 
 
 def fetch_lfs_blobs_for_pointer_files(
-    repo, pointer_files, *, remote_name=None, dry_run=False, quiet=False
+    repo, dataset_to_pointer_file_oids, *, remote_name=None, dry_run=False, quiet=False
 ):
     """
-    Given a list of pointer files (or OIDs of pointer files themselves - not the OIDs they point to)
-    fetch all the tiles that those pointer files point to that are not already present in the local cache.
+    Given a dict in the format: {dataset-path: set(pointer-file-oid-1, pointer-file-oid-2, ...)}
+    where dataset-path is the path to a dataset, and each pointer-file-oid is the OID of the pointer file itself
+    (not the LFS oid that the pointer file points to) that is present in that dataset:
+    fetches all the tiles that those pointer files point to that are not already present in the local cache.
     """
-    if not pointer_files:
+    if not dataset_to_pointer_file_oids:
         return
 
     if not remote_name:
@@ -312,7 +316,12 @@ def fetch_lfs_blobs_for_pointer_files(
     urls_sizes = {}
     non_urls_sizes = {}
 
-    for pointer_file in pointer_files:
+    pointer_files_to_datasets = _invert_pointer_file_oid_dict(
+        dataset_to_pointer_file_oids
+    )
+    url_redirector = UrlRedirector(repo)
+
+    for pointer_file, datasets in pointer_files_to_datasets.items():
         if isinstance(pointer_file, str):
             pointer_blob = repo[pointer_file]
         elif getattr(pointer_file, "type", None) == pygit2.GIT_OBJ_BLOB:
@@ -322,6 +331,8 @@ def fetch_lfs_blobs_for_pointer_files(
 
         pointer_dict = pointer_file_bytes_to_dict(pointer_blob)
         url = pointer_dict.get("url")
+        url = url_redirector.apply_redirect(url, datasets)
+
         lfs_oid = get_hash_from_pointer_file(pointer_dict)
         pointer_file_oid = pointer_blob.hex
         lfs_path = get_local_path_from_lfs_oid(repo, lfs_oid)
@@ -369,6 +380,20 @@ def fetch_lfs_blobs_for_pointer_files(
     _do_fetch_from_remote(repo, non_urls, remote_name, quiet=quiet)
 
 
+def _invert_pointer_file_oid_dict(dataset_to_pointer_file_oids):
+    result = {}
+    for dataset, pointer_file_oids in dataset_to_pointer_file_oids.items():
+        assert isinstance(dataset, str)
+        for pointer_file_oid in pointer_file_oids:
+            existing = result.setdefault(pointer_file_oid, dataset)
+            if dataset != existing:
+                if isinstance(existing, str):
+                    result[pointer_file_oid] = {existing, dataset}
+                elif isinstance(existing, set):
+                    existing.add(dataset)
+    return result
+
+
 def _do_fetch_from_urls(repo, urls_and_lfs_oids, quiet=False):
     non_s3_url = next(
         (url for (url, lfs_oid) in urls_and_lfs_oids if not url.startswith("s3://")),
diff --git a/kart/lfs_commands/url_redirector.py b/kart/lfs_commands/url_redirector.py
new file mode 100644
index 000000000..101193915
--- /dev/null
+++ b/kart/lfs_commands/url_redirector.py
@@ -0,0 +1,86 @@
+from collections.abc import Iterable
+
+from kart.tile import ALL_TILE_DATASET_TYPES
+
+
+class UrlRedirector:
+    """
+    Loads a set of redirect rules that apply to linked-datasets from a given commit.
+
+    Suppose, for example, a user migrates all their data from one S3 region to another, for whatever reason.
+    And suppose the bucket in the new region has a new name, since bucket names are globally unique.
+    (It may be possible to migrate the bucket name, but for the purpose of this example, the new bucket has a new name).
+    That will break a linked-dataset where the URLs embedded in each tile point to the original bucket.
+
+    The workaround: each linked-dataset has a meta-item called "linked-storage.json", which may contain a mapping
+    called "urlRedirects". If these redirect rules are updated appropriately, then URLs that point to the old bucket
+    will be treated as if they point to the new bucket, without needing to update the URL in every single tile
+    individually and retroactively.
+
+    Here is an example urlRedirects mapping that contains 3 rules:
+    {
+        "s3://old/and/broken/": "s3://new/and/shiny/",
+        "s3://old/path/to/tile.laz": "s3://new/path/to/same/tile.laz",
+        "s3://old/": "s3://new/"
+    }
+
+    This would be applied to a URL as follows - each rule is attempted in turn.
+    If a rule applies, the url is updated, and subsequent rules are attempted against the updated url.
+    Eventually the url - which may have been updated by zero, one, or many rules - is returned.
+
+    - The first rule ends with a '/' so it does prefix matching:
+      If the url starts with "s3://old/and/broken/", this prefix will be replaced with "s3://new/and/shiny/"
+    - The second rule does not end with a '/' so it does exact matching:
+      If the url is now exactly "s3://old/path/to/tile.laz", it will be set to "s3://new/path/to/same/tile.laz"
+    - The third rule ends with a '/' so it does prefix matching:
+      If the url now starts with "s3://old/", this prefix will be replaced with "s3://new/"
+
+    Currently url redirect rules are only loaded from the HEAD commit - this is subject to change.
+    """
+
+    def __init__(self, repo, commit=None):
+        # TODO - improve redirect-commit finding logic - probably do some of the following:
+        # - find the tip of the default branch
+        # - find the local tip of the branch that the remote HEAD was pointing to when we last fetched
+        # - find a branch specified somehow in the config as the url-redirect branch
+
+        self.commit = commit if commit is not None else repo.head_commit
+
+        self.dataset_to_redirects = {}
+
+        if not self.commit:
+            return
+
+        for dataset in repo.datasets(
+            self.commit, filter_dataset_type=ALL_TILE_DATASET_TYPES
+        ):
+            linked_storage = dataset.get_meta_item("linked-storage.json")
+            redirects = linked_storage.get("urlRedirects") if linked_storage else None
+            if redirects:
+                self.dataset_to_redirects[dataset.path] = redirects
+
+    def apply_redirect(self, url, dataset):
+        # It could be the case that a single LFS object is in more than one dataset.
+        # In this case, we just try to find any set of redirect rules that applies to the object.
+        if isinstance(dataset, Iterable) and not isinstance(dataset, str):
+            for d in dataset:
+                result = self.apply_redirect(url, d)
+                if result != url:
+                    return result
+            return url
+
+        if not isinstance(dataset, str):
+            dataset = dataset.path
+        redirects = self.dataset_to_redirects.get(dataset)
+        if not redirects:
+            return url
+
+        for from_, to_ in redirects.items():
+            if from_.endswith("/"):
+                if url.startswith(from_):
+                    url = to_ + url[len(from_) :]
+            else:
+                if url == from_:
+                    url = to_
+
+        return url
diff --git a/kart/workdir.py b/kart/workdir.py
index 6dc8570f3..6edb75de8 100644
--- a/kart/workdir.py
+++ b/kart/workdir.py
@@ -303,19 +303,20 @@ def _do_reset_datasets(
         track_changes_as_dirty=False,
         quiet=False,
     ):
-        pointer_files_to_fetch = set()
+        dataset_to_pointer_oids_to_fetch = {}
         workdir_diff_cache = self.workdir_diff_cache()
         update_diffs = {}
 
         # First pass - make sure the LFS blobs are present in the local LFS cache:
         # - For the datasets that will be inserted (written from scratch):
         for ds_path in ds_inserts:
-            pointer_files_to_fetch.update(
-                blob.hex
-                for blob in target_datasets[ds_path].tile_pointer_blobs(
-                    self.repo.spatial_filter
-                )
+            pointer_file_oids = dataset_to_pointer_oids_to_fetch.setdefault(
+                ds_path, set()
             )
+            for blob in target_datasets[ds_path].tile_pointer_blobs(
+                self.repo.spatial_filter
+            ):
+                pointer_file_oids.add(blob.hex)
 
         # - For the datasets that will be updated:
         for ds_path in ds_updates:
@@ -326,17 +327,18 @@ def _do_reset_datasets(
                 workdir_diff_cache,
                 repo_key_filter[ds_path],
             )
-            pointer_files_to_fetch.update(
-                blob.hex
-                for blob in self._list_new_pointer_blobs_for_diff(
-                    update_diffs[ds_path], target_datasets[ds_path]
-                )
+            pointer_file_oids = dataset_to_pointer_oids_to_fetch.setdefault(
+                ds_path, set()
             )
+            for blob in self._list_new_pointer_blobs_for_diff(
+                update_diffs[ds_path], target_datasets[ds_path]
+            ):
+                pointer_file_oids.add(blob.hex)
 
         # We fetch the LFS tiles immediately before writing them to the working copy -
         # unlike ODB objects that are already fetched.
         fetch_lfs_blobs_for_pointer_files(
-            self.repo, pointer_files_to_fetch, quiet=quiet
+            self.repo, dataset_to_pointer_oids_to_fetch, quiet=quiet
         )
 
         # Second pass - actually update the working copy:
diff --git a/tests/data/linked-dataset.tgz b/tests/data/linked-dataset.tgz
new file mode 100644
index 000000000..32da6a0a0
Binary files /dev/null and b/tests/data/linked-dataset.tgz differ
diff --git a/tests/linked_storage/test_url_redirects.py b/tests/linked_storage/test_url_redirects.py
new file mode 100644
index 000000000..faddb81a7
--- /dev/null
+++ b/tests/linked_storage/test_url_redirects.py
@@ -0,0 +1,66 @@
+import json
+
+from kart.repo import KartRepo
+
+
+def test_s3_url_redirects(
+    data_archive,
+    cli_runner,
+    s3_test_data_point_cloud,
+    check_lfs_hashes,
+    check_tile_is_reflinked,
+):
+    with data_archive("linked-dataset") as repo_path:
+        r = cli_runner.invoke(["lfs+", "fetch", "HEAD", "--dry-run"])
+        assert r.exit_code == 0, r.stderr
+        assert r.stdout.splitlines()[:8] == [
+            "Running fetch with --dry-run:",
+            " Found 16 LFS blobs (373KiB) to fetch from specific URLs",
+            "",
+            "LFS blob OID: (Pointer file OID):",
+            "03e3d4dc6fc8e75c65ffdb39b630ffe26e4b95982b9765c919e34fb940e66fc0 (8d2362d8f14ea34aaebdede6602dcca0bcdd8297) → s3://example-bucket/example-path/auckland_3_2.laz",
+            "06bd15fbb6616cf63a4a410c5ba4666dab76177a58cb99c3fa2afb46c9dd6379 (f129df999b5aea453ace9d4fcd1496dcebf97fe1) → s3://example-bucket/example-path/auckland_1_3.laz",
+            "09701813661e369395d088a9a44f1201200155e652a8b6e291e71904f45e32a6 (553775bcbaa9c067e8ad611270d53d4f37ac37da) → s3://example-bucket/example-path/auckland_3_0.laz",
+            "111579edfe022ebfd3388cc47d911c16c72c7ebd84c32a7a0c1dab6ed9ec896a (76cff04b9c7ffb01bb99ac42a6e94612fdea605f) → s3://example-bucket/example-path/auckland_0_2.laz",
+        ]
+
+        s3_test_data_point_cloud_prefix = s3_test_data_point_cloud.split("*")[0]
+
+        linked_storage_json = {
+            "urlRedirects": {
+                "s3://example-bucket/example-path/": s3_test_data_point_cloud_prefix
+            }
+        }
+        r = cli_runner.invoke(
+            [
+                "meta",
+                "set",
+                "auckland",
+                f"linked-storage.json={json.dumps(linked_storage_json)}",
+            ]
+        )
+        assert r.exit_code == 0, r.stderr
+
+        r = cli_runner.invoke(["lfs+", "fetch", "HEAD", "--dry-run"])
+        assert r.exit_code == 0, r.stderr
+        assert r.stdout.splitlines()[:8] == [
+            "Running fetch with --dry-run:",
+            " Found 16 LFS blobs (373KiB) to fetch from specific URLs",
+            "",
+            "LFS blob OID: (Pointer file OID):",
+            f"03e3d4dc6fc8e75c65ffdb39b630ffe26e4b95982b9765c919e34fb940e66fc0 (8d2362d8f14ea34aaebdede6602dcca0bcdd8297) → {s3_test_data_point_cloud_prefix}auckland_3_2.laz",
+            f"06bd15fbb6616cf63a4a410c5ba4666dab76177a58cb99c3fa2afb46c9dd6379 (f129df999b5aea453ace9d4fcd1496dcebf97fe1) → {s3_test_data_point_cloud_prefix}auckland_1_3.laz",
+            f"09701813661e369395d088a9a44f1201200155e652a8b6e291e71904f45e32a6 (553775bcbaa9c067e8ad611270d53d4f37ac37da) → {s3_test_data_point_cloud_prefix}auckland_3_0.laz",
+            f"111579edfe022ebfd3388cc47d911c16c72c7ebd84c32a7a0c1dab6ed9ec896a (76cff04b9c7ffb01bb99ac42a6e94612fdea605f) → {s3_test_data_point_cloud_prefix}auckland_0_2.laz",
+        ]
+
+        r = cli_runner.invoke(["checkout", "--dataset=auckland"])
+        assert r.exit_code == 0, r.stderr
+
+        repo = KartRepo(repo_path)
+        check_lfs_hashes(repo, expected_file_count=16)
+        for x in range(4):
+            for y in range(4):
+                check_tile_is_reflinked(
+                    repo_path / "auckland" / f"auckland_{x}_{y}.laz", repo
+                )
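The redirect semantics that `UrlRedirector.apply_redirect` implements in the patch above - rules ending in '/' do prefix replacement, other rules do exact matching, and rules are applied in order so later rules see the output of earlier ones - can be illustrated in isolation. The following is a minimal standalone sketch using the placeholder `urlRedirects` mapping from the docstring; the rule table, example URLs, and the `apply_rules` helper are illustrative only and are not part of the patch.

```python
# Standalone sketch of the urlRedirects rule semantics (illustrative only;
# the rules and URLs below are the placeholder examples from the docstring).

RULES = {
    "s3://old/and/broken/": "s3://new/and/shiny/",
    "s3://old/path/to/tile.laz": "s3://new/path/to/same/tile.laz",
    "s3://old/": "s3://new/",
}


def apply_rules(url, rules=RULES):
    # Rules are applied in order; a later rule sees the URL as already
    # rewritten by any earlier rule that matched.
    for from_, to_ in rules.items():
        if from_.endswith("/"):
            # Trailing '/' means prefix matching: swap the prefix, keep the rest.
            if url.startswith(from_):
                url = to_ + url[len(from_):]
        elif url == from_:
            # No trailing '/' means exact matching: replace the whole URL.
            url = to_
    return url


assert apply_rules("s3://old/and/broken/auckland_0_0.laz") == "s3://new/and/shiny/auckland_0_0.laz"
assert apply_rules("s3://old/path/to/tile.laz") == "s3://new/path/to/same/tile.laz"
assert apply_rules("s3://old/other/tile.laz") == "s3://new/other/tile.laz"
assert apply_rules("s3://unrelated/tile.laz") == "s3://unrelated/tile.laz"  # no rule applies
```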