
feat: delete orphaned files #1958


Open: jayceslesar wants to merge 27 commits into main from feat/orphan-files

Changes from all commits (27 commits)
9dcb580
feat: delete orphaned files
jayceslesar Apr 29, 2025
e43505c
simpler and a test
jayceslesar Apr 29, 2025
eed5ea8
remove
jayceslesar Apr 29, 2025
8cca600
updates from review!
jayceslesar May 2, 2025
75b1240
include dry run and older than
jayceslesar May 2, 2025
6379480
add case for dry run
jayceslesar May 2, 2025
0c2822e
use .path so we get paths pack
jayceslesar May 3, 2025
aaf8fc2
actually pass in iterable
jayceslesar May 3, 2025
b09641b
capture manifest_list files
jayceslesar May 3, 2025
beec233
refactor into `all_known_files`
jayceslesar May 3, 2025
b888c56
fix type in docstring
jayceslesar May 3, 2025
ff461ed
mildly more readable
jayceslesar May 3, 2025
3b3b10e
beef up tests
jayceslesar May 3, 2025
a62c8cf
make `older_than` required
jayceslesar May 4, 2025
07cbf1b
move under `optimize` namespace
jayceslesar May 4, 2025
54e1e00
add some better logging about what was/was not deleted
jayceslesar May 4, 2025
7c780d3
Merge branch 'main' into feat/orphan-files
jayceslesar May 10, 2025
9b6c9ed
Merge branch 'main' into feat/orphan-files
jayceslesar May 13, 2025
34d10b9
rename optimize -> maintenance
jayceslesar May 17, 2025
0335957
make orphaned_files private
jayceslesar May 17, 2025
9f8145c
correctly coerce list
jayceslesar May 17, 2025
fbdcbd3
add metadata files
jayceslesar May 28, 2025
85b4ab3
Merge branch 'main' into feat/orphan-files
jayceslesar May 28, 2025
c414df8
Merge branch 'main' into feat/orphan-files
jayceslesar Jun 10, 2025
aa9d536
Merge branch 'main' into feat/orphan-files
jayceslesar Jun 21, 2025
b4c14fc
fix test
jayceslesar Jun 21, 2025
f4d98d2
allow older_than to be None
jayceslesar Jun 21, 2025
10 changes: 10 additions & 0 deletions pyiceberg/table/__init__.py
@@ -80,6 +80,7 @@
from pyiceberg.schema import Schema
from pyiceberg.table.inspect import InspectTable
from pyiceberg.table.locations import LocationProvider, load_location_provider
from pyiceberg.table.maintenance import MaintenanceTable
from pyiceberg.table.metadata import (
INITIAL_SEQUENCE_NUMBER,
TableMetadata,
@@ -1038,6 +1039,15 @@ def inspect(self) -> InspectTable:
"""
return InspectTable(self)

@property
def maintenance(self) -> MaintenanceTable:
"""Return the MaintenanceTable object for maintenance.

Returns:
MaintenanceTable object based on this Table.
"""
return MaintenanceTable(self)

def refresh(self) -> Table:
"""Refresh the current table metadata.

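For context, the new property mirrors the existing inspect accessor: each access constructs a MaintenanceTable bound to the current Table. A minimal sketch of reaching it, where the catalog name and table identifier are illustrative rather than taken from this PR:

from pyiceberg.catalog import load_catalog

# Illustrative names; any existing catalog and table will do.
catalog = load_catalog("default")
tbl = catalog.load_table("default.my_table")

# Like tbl.inspect, tbl.maintenance is a thin accessor that constructs
# a MaintenanceTable wrapping this Table instance.
maintenance = tbl.maintenance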
43 changes: 40 additions & 3 deletions pyiceberg/table/inspect.py
@@ -17,7 +17,8 @@
from __future__ import annotations

from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple
from functools import reduce
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple, Union

from pyiceberg.conversions import from_bytes
from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestFile, PartitionFieldSummary
@@ -665,10 +666,20 @@ def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES})

def all_manifests(self) -> "pa.Table":
def all_manifests(self, snapshots: Optional[Union[list[Snapshot], list[int]]] = None) -> "pa.Table":
import pyarrow as pa

snapshots = self.tbl.snapshots()
# coerce into Snapshot objects if the user passes in snapshot IDs
if snapshots is not None:
if snapshots and isinstance(snapshots[0], int):
snapshots = [
snapshot
for snapshot_id in snapshots
if (snapshot := self.tbl.metadata.snapshot_by_id(snapshot_id)) is not None
]
else:
snapshots = self.tbl.snapshots()

if not snapshots:
return pa.Table.from_pylist([], schema=self._get_all_manifests_schema())

@@ -678,6 +689,32 @@ def all_manifests(self) -> "pa.Table":
)
return pa.concat_tables(manifests_by_snapshots)

def _all_known_files(self) -> dict[str, set[str]]:
Contributor: Do we still need this now we have all_files?

Contributor Author: I don't think that all_files includes manifest lists or statistics files, right?

Contributor (@Fokko, May 16, 2025): It should reference deletes, but thinking of it, this is probably much more efficient because it does not pull in all the other data.

Contributor Author: Any updates here? What else is blocking this PR -- happy to address.

"""Get all the known files in the table.

Returns:
dict of {file_type: set of file paths} for each file type.
"""
snapshots = self.tbl.snapshots()

_all_known_files = {}
_all_known_files["manifests"] = set(self.all_manifests(snapshots)["path"].to_pylist())
_all_known_files["manifest_lists"] = {snapshot.manifest_list for snapshot in snapshots}
_all_known_files["statistics"] = {statistic.statistics_path for statistic in self.tbl.metadata.statistics}
Contributor: Nice! Looking at the Java side, there seems to be a focus (especially in tests) on partition statistics files not being deleted by the procedure either. I don't think this is a concern here because PyIceberg doesn't support reading tables with partition statistics in the first place. But I think it should soon (I put up #2034 / #2033), in which case this logic would have to include that.

Contributor Author: Nice, if yours merges first I can add that here.

Contributor Author: hahah nevermind -- yours definitely needs to go first.


metadata_files = {entry.metadata_file for entry in self.tbl.metadata.metadata_log}
metadata_files.add(self.tbl.metadata_location) # Include current metadata file
_all_known_files["metadata"] = metadata_files

executor = ExecutorFactory.get_or_create()
snapshot_ids = [snapshot.snapshot_id for snapshot in snapshots]
files_by_snapshots: Iterator[Set[str]] = executor.map(
lambda snapshot_id: set(self.files(snapshot_id)["file_path"].to_pylist()), snapshot_ids
Contributor: Might be nice if InspectTable.files or InspectTable._files took an Optional[Union[int, Snapshot]] so we didn't have to get the id from a snapshot and then turn it back into a Snapshot inside InspectTable._files.

Contributor Author: Yeah, I think there are a lot of places where we arbitrarily use one over the other and IMO it would be nice to standardize. Probably out of scope for this PR, but I think it would definitely clean things up.

)
_all_known_files["datafiles"] = reduce(set.union, files_by_snapshots, set())

return _all_known_files

def _all_files(self, data_file_filter: Optional[Set[DataFileContent]] = None) -> "pa.Table":
import pyarrow as pa

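Roughly, the inspect.py changes do two things: all_manifests now optionally accepts a list of Snapshot objects or snapshot IDs, and _all_known_files gathers every path still referenced by table metadata (manifests, manifest lists, statistics, metadata files, data files). A short sketch, assuming the illustrative tbl from above with at least one snapshot:

from functools import reduce

# Limit the manifests listing to specific snapshot IDs; previously
# all_manifests always walked every snapshot of the table.
snapshot_ids = [s.snapshot_id for s in tbl.snapshots()]
manifests = tbl.inspect.all_manifests(snapshot_ids)
print(manifests["path"].to_pylist())

# _all_known_files maps a file type ("manifests", "manifest_lists",
# "statistics", "metadata", "datafiles") to the set of referenced paths.
known_files = tbl.inspect._all_known_files()
all_referenced = reduce(set.union, known_files.values(), set())
print(f"{len(all_referenced)} files are still referenced by table metadata")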
119 changes: 119 additions & 0 deletions pyiceberg/table/maintenance.py
@@ -0,0 +1,119 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import logging
from datetime import datetime, timedelta, timezone
from functools import reduce
from typing import TYPE_CHECKING, Optional, Set

from pyiceberg.utils.concurrent import ExecutorFactory

logger = logging.getLogger(__name__)


if TYPE_CHECKING:
from pyiceberg.table import Table


class MaintenanceTable:
tbl: Table

def __init__(self, tbl: Table) -> None:
self.tbl = tbl

try:
import pyarrow as pa # noqa
except ModuleNotFoundError as e:
raise ModuleNotFoundError("For metadata operations PyArrow needs to be installed") from e

def _orphaned_files(self, location: str, older_than: Optional[timedelta] = None) -> Set[str]:
"""Get all files which are not referenced in any metadata files of an Iceberg table and can thus be considered "orphaned".

Args:
location: The location to check for orphaned files.
older_than: Only consider files older than this age. Defaults to None, which applies no age cutoff.

Returns:
A set of orphaned file paths.
"""
try:
import pyarrow as pa # noqa: F401
except ModuleNotFoundError as e:
raise ModuleNotFoundError("For deleting orphaned files PyArrow needs to be installed") from e

from pyarrow.fs import FileSelector, FileType

from pyiceberg.io.pyarrow import PyArrowFileIO

all_known_files = self.tbl.inspect._all_known_files()
flat_known_files: set[str] = reduce(set.union, all_known_files.values(), set())

scheme, _, _ = PyArrowFileIO.parse_location(location)
pyarrow_io = PyArrowFileIO()
fs = pyarrow_io.fs_by_scheme(scheme, None)

_, _, path = pyarrow_io.parse_location(location)
selector = FileSelector(path, recursive=True)

# filter to just files as it may return directories, and filter on time
if older_than is None:
older_than = timedelta(0)
as_of = datetime.now(timezone.utc) - older_than
all_files = [f.path for f in fs.get_file_info(selector) if f.type == FileType.File and f.mtime < as_of]

orphaned_files = set(all_files).difference(flat_known_files)

return orphaned_files

def remove_orphaned_files(self, older_than: Optional[timedelta] = None, dry_run: bool = False) -> None:
"""Remove files which are not referenced in any metadata files of an Iceberg table and can thus be considered "orphaned".

Args:
older_than: Only consider files older than this age. Defaults to None, which applies no age cutoff.
dry_run: If True, only log the files that would be deleted. Defaults to False.
"""
location = self.tbl.location()
orphaned_files = self._orphaned_files(location, older_than)
logger.info(f"Found {len(orphaned_files)} orphaned files at {location}!")
deleted_files = set()
failed_to_delete_files = set()

def _delete(file: str) -> None:
# swallow per-file failures (e.g. the file is already gone) so one bad path
# doesn't abort the cleanup; KeyboardInterrupt and SystemExit still propagate
try:
self.tbl.io.delete(file)
deleted_files.add(file)
except Exception:
failed_to_delete_files.add(file)

if orphaned_files:
if dry_run:
logger.info(f"(Dry Run) Deleted {len(orphaned_files)} orphaned files at {location}!")
else:
executor = ExecutorFactory.get_or_create()
deletes = executor.map(_delete, orphaned_files)
# exhaust
list(deletes)
logger.info(f"Deleted {len(deleted_files)} orphaned files at {location}!")
logger.info(f"Files:\n{deleted_files}")
if failed_to_delete_files:
logger.warning(f"Failed to delete {len(failed_to_delete_files)} orphaned files at {location}!")
logger.warning(f"Files:\n{failed_to_delete_files}")
else:
logger.info(f"No orphaned files found at {location}!")
120 changes: 120 additions & 0 deletions tests/table/test_remove_orphans.py
@@ -0,0 +1,120 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
from datetime import datetime, timedelta
from pathlib import Path, PosixPath
from unittest.mock import patch

import pyarrow as pa
import pytest

from pyiceberg.catalog import Catalog
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType, NestedField, StringType
from tests.catalog.test_base import InMemoryCatalog


@pytest.fixture
def catalog(tmp_path: PosixPath) -> InMemoryCatalog:
catalog = InMemoryCatalog("test.in_memory.catalog", warehouse=tmp_path.absolute().as_posix())
catalog.create_namespace("default")
return catalog


def test_remove_orphaned_files(catalog: Catalog) -> None:
identifier = "default.test_remove_orphaned_files"

schema = Schema(
NestedField(1, "city", StringType(), required=True),
NestedField(2, "inhabitants", IntegerType(), required=True),
# Mark City as the identifier field, also known as the primary-key
identifier_field_ids=[1],
)

tbl = catalog.create_table(identifier, schema=schema)

arrow_schema = pa.schema(
[
pa.field("city", pa.string(), nullable=False),
pa.field("inhabitants", pa.int32(), nullable=False),
]
)

df = pa.Table.from_pylist(
[
{"city": "Drachten", "inhabitants": 45019},
{"city": "Drachten", "inhabitants": 45019},
],
schema=arrow_schema,
)
tbl.append(df)

orphaned_file = Path(tbl.location()) / "orphan.txt"

orphaned_file.touch()
assert orphaned_file.exists()

# assert no files deleted if dry run...
tbl.maintenance.remove_orphaned_files(dry_run=True)
assert orphaned_file.exists()

# backdate the file's modification time so it is older than any age cutoff
five_days_ago = (datetime.now() - timedelta(days=5)).timestamp()
os.utime(orphaned_file, (five_days_ago, five_days_ago))
tbl.maintenance.remove_orphaned_files()
assert not orphaned_file.exists()

# assert that all known files still exist...
all_known_files = tbl.inspect._all_known_files()
for files in all_known_files.values():
for file in files:
assert Path(file).exists()


def test_remove_orphaned_files_with_invalid_file_doesnt_error(catalog: Catalog) -> None:
identifier = "default.test_remove_orphaned_files"

schema = Schema(
NestedField(1, "city", StringType(), required=True),
NestedField(2, "inhabitants", IntegerType(), required=True),
# Mark City as the identifier field, also known as the primary-key
identifier_field_ids=[1],
)

tbl = catalog.create_table(identifier, schema=schema)

arrow_schema = pa.schema(
[
pa.field("city", pa.string(), nullable=False),
pa.field("inhabitants", pa.int32(), nullable=False),
]
)

df = pa.Table.from_pylist(
[
{"city": "Drachten", "inhabitants": 45019},
{"city": "Drachten", "inhabitants": 45019},
],
schema=arrow_schema,
)
tbl.append(df)

file_that_does_not_exist = "foo/bar.baz"
with patch.object(type(tbl.maintenance), "_orphaned_files", return_value={file_that_does_not_exist}):
with patch.object(tbl.io, "delete", wraps=tbl.io.delete) as mock_delete:
tbl.maintenance.remove_orphaned_files(timedelta(days=3))
mock_delete.assert_called_with(file_that_does_not_exist)