From 78656073f99fb45d71e6d5bb4a76e676f186c8e6 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Nov 2023 18:56:15 +0100 Subject: [PATCH 1/4] Cleanup /data/tmp/ci/dev_preview of files older than 30 days and empty directories --- zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml | 38 ++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml diff --git a/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml b/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml new file mode 100644 index 0000000..16eba86 --- /dev/null +++ b/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml @@ -0,0 +1,38 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: tmp-cleanup-files + namespace: zim +spec: + schedule: "0 2 * * *" + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 1 + concurrencyPolicy: Forbid + jobTemplate: + spec: + backoffLimit: 2 + template: + spec: + restartPolicy: Never + containers: + - image: docker.io/nginx:1.21 + imagePullPolicy: IfNotPresent + name: jobrunner + volumeMounts: + - mountPath: "/data/tmp" + name: tmp-kiwix-volume + readOnly: true + workingDir: /data/tmp + command: ["/bin/sh","-c"] + args: ["find /data/tmp/ci/dev_preview -type f -mtime +30 | wc -l; find /data/tmp/ci/dev_preview -type f -mtime +30 --delete; find /data/tmp/ci/dev_preview -type d -empty | wc -l; find /data/tmp/ci/dev_preview -type d -empty --delete"] + resources: + requests: + cpu: 100m + memory: 64Mi + volumes: + - name: tmp-kiwix-volume + persistentVolumeClaim: + # /!\ name is inverted compared to this file + claimName: kiwix-tmp-pvc + nodeSelector: + k8s.kiwix.org/role: "storage" From 01db32502c5c93ead46fb41aa06994e54f5871f2 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 13 Nov 2023 13:36:27 +0100 Subject: [PATCH 2/4] Create a Python script to cleanup old files and directories --- zim/tmp-kiwix/cleanup-old-files.py | 120 +++++++++++++++++++ zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml | 10 +- 2 files changed, 126 insertions(+), 4 deletions(-) create mode 100644 zim/tmp-kiwix/cleanup-old-files.py diff --git a/zim/tmp-kiwix/cleanup-old-files.py b/zim/tmp-kiwix/cleanup-old-files.py new file mode 100644 index 0000000..84a2919 --- /dev/null +++ b/zim/tmp-kiwix/cleanup-old-files.py @@ -0,0 +1,120 @@ +#!/usr/local/bin/python3 + +""" +File and Directory Deletion Script + +This script performs the deletion of files older than a specified number of days and +deletes empty subdirectories in a specified folder. + +Usage: + python cleanup-old-files.py -f /path/to/folder -n 30 [-d] + +Options: + -d, --dry-run Perform a dry run to list the files and directories that would be deleted. + If not specified, the script performs the actual deletion. + -f, --folder Path to the target folder. + -n, --days Delete files older than this many days. Default is 30 days. +""" + +import os +import argparse +import datetime + +def list_files_to_delete(folder_path, days): + """ + List files that are older than a specified number of days. + + Args: + folder_path (str): The path to the target folder. + days (int): Delete files older than this many days. + + Returns: + list: A list of file paths to be deleted. + """ + current_time = datetime.datetime.now() + files_to_delete = [] + + def list_files(folder_path): + for foldername, subfolders, filenames in os.walk(folder_path): + for filename in filenames: + file_path = os.path.join(foldername, filename) + yield file_path + + for file_path in list_files(folder_path): + # Get the last modification time of the file + file_time = datetime.datetime.fromtimestamp(os.path.getmtime(file_path)) + + # Calculate the difference in days + days_difference = (current_time - file_time).days + + if days_difference > days: + files_to_delete.append(file_path) + + return files_to_delete + +def list_directories_to_delete(folder_path, files_to_delete): + """ + List empty subdirectories or subdirectories containing only files to be deleted. + + Args: + folder_path (str): The path to the target folder. + files_to_delete (list): A list of file paths to be deleted. + + Returns: + list: A list of empty subdirectories to be deleted. + """ + empty_directories_to_delete = [] + + for foldername, subfolders, filenames in os.walk(folder_path): + for subfolder in subfolders: + subfolder_path = os.path.join(foldername, subfolder) + + # Check if the directory is already empty or contains only files that will be deleted + if not os.listdir(subfolder_path) or all(os.path.join(subfolder_path, filename) in files_to_delete for filename in os.listdir(subfolder_path)): + empty_directories_to_delete.append(subfolder_path) + + return empty_directories_to_delete + +def process_deletion(dry_run, files_to_delete, empty_directories_to_delete): + """ + Process file and directory deletion based on the dry-run status. + + Args: + dry_run (bool): True for dry-run, False for actual deletion. + files_to_delete (list): A list of file paths to be deleted. + empty_directories_to_delete (list): A list of empty subdirectories to be deleted. + """ + if dry_run: + print(f"These files would be deleted:") + print("\n".join(files_to_delete)) + print("\nEmpty subdirectories that would be deleted:") + print("\n".join(empty_directories_to_delete)) + else: + print(f"Deleting files:") + for file_path in files_to_delete: + os.remove(file_path) + print(f"Deleted: {file_path}") + + print(f"\nDeleting empty subdirectories:") + for directory_path in empty_directories_to_delete: + os.rmdir(directory_path) + print(f"Deleted empty directory: {directory_path}") + +def main(): + """ + Main function to parse arguments and initiate the deletion process. + """ + parser = argparse.ArgumentParser(description="Delete old files and empty subdirectories.") + parser.add_argument("-d", "--dry-run", action="store_true", default=False, help="Perform a dry run (default: False)") + parser.add_argument("-f", "--folder", required=True, help="Path to the target folder") + parser.add_argument("-n", "--days", type=int, default=30, help="Delete files older than this many days (default: 30)") + + args = parser.parse_args() + + files_to_delete = list_files_to_delete(args.folder, args.days) + empty_directories_to_delete = list_directories_to_delete(args.folder, files_to_delete) + + process_deletion(args.dry_run, files_to_delete, empty_directories_to_delete) + +if __name__ == "__main__": + main() diff --git a/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml b/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml index 16eba86..95fe061 100644 --- a/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml +++ b/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml @@ -15,16 +15,18 @@ spec: spec: restartPolicy: Never containers: - - image: docker.io/nginx:1.21 - imagePullPolicy: IfNotPresent + - image: ghcr.io/kiwix/maintenance:latest + imagePullPolicy: Always name: jobrunner + environment: + - name: INSTALL_SCRIPTS + value: https://raw.githubusercontent.com/kiwix/k8s/cleanup_tmp_dev_preview/zim/tmp-kiwix/cleanup-old-files.py volumeMounts: - mountPath: "/data/tmp" name: tmp-kiwix-volume readOnly: true workingDir: /data/tmp - command: ["/bin/sh","-c"] - args: ["find /data/tmp/ci/dev_preview -type f -mtime +30 | wc -l; find /data/tmp/ci/dev_preview -type f -mtime +30 --delete; find /data/tmp/ci/dev_preview -type d -empty | wc -l; find /data/tmp/ci/dev_preview -type d -empty --delete"] + command: ["cleanup-old-files.py -f /data/tmp/ci/dev_preview -n 30 -d"] resources: requests: cpu: 100m From 13b9e2992a078a8ec356a63f377fae6a6f51a7d5 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 14 Nov 2023 14:35:37 +0100 Subject: [PATCH 3/4] Fixes following review --- .gitignore | 1 + zim/tmp-kiwix/cleanup-old-files.py | 133 ++++++++++--------- zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml | 6 +- 3 files changed, 72 insertions(+), 68 deletions(-) diff --git a/.gitignore b/.gitignore index 3742ad4..643e489 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,5 @@ kubeconfig-*.yaml *.secret.yaml *.secrets.yaml +# editor config not maintained .vscode \ No newline at end of file diff --git a/zim/tmp-kiwix/cleanup-old-files.py b/zim/tmp-kiwix/cleanup-old-files.py index 84a2919..8ccc8e8 100644 --- a/zim/tmp-kiwix/cleanup-old-files.py +++ b/zim/tmp-kiwix/cleanup-old-files.py @@ -19,76 +19,54 @@ import os import argparse import datetime +from pathlib import Path -def list_files_to_delete(folder_path, days): - """ - List files that are older than a specified number of days. - Args: - folder_path (str): The path to the target folder. - days (int): Delete files older than this many days. - - Returns: - list: A list of file paths to be deleted. - """ - current_time = datetime.datetime.now() - files_to_delete = [] - - def list_files(folder_path): - for foldername, subfolders, filenames in os.walk(folder_path): - for filename in filenames: - file_path = os.path.join(foldername, filename) - yield file_path - - for file_path in list_files(folder_path): - # Get the last modification time of the file - file_time = datetime.datetime.fromtimestamp(os.path.getmtime(file_path)) - - # Calculate the difference in days - days_difference = (current_time - file_time).days - - if days_difference > days: +def list_files_to_delete(folder_path: Path, days: int) -> list[Path]: + """List files that are older than a specified number of days.""" + minimum_time = (datetime.datetime.now() - datetime.timedelta(days=days)).timestamp() + files_to_delete: list[Path] = [] + for file_path in folder_path.rglob("*"): + if file_path.stat().st_mtime < minimum_time: files_to_delete.append(file_path) return files_to_delete -def list_directories_to_delete(folder_path, files_to_delete): - """ - List empty subdirectories or subdirectories containing only files to be deleted. - Args: - folder_path (str): The path to the target folder. - files_to_delete (list): A list of file paths to be deleted. +def list_directories_to_delete( + folder_path: Path, files_to_delete: list[Path] +) -> list[Path]: + """List empty subdirectories or subdirectories containing only files to be deleted.""" + empty_directories_to_delete: list[Path] = [] + + for file in files_to_delete: + parent = Path(file).parent - Returns: - list: A list of empty subdirectories to be deleted. - """ - empty_directories_to_delete = [] + # Check recursively if the parent directory is already empty or contains only + # files/directories that will be deleted + while all( + item in files_to_delete or item in empty_directories_to_delete + for item in parent.iterdir() + ): + # Abort if parent is already marked for deletion + if parent in empty_directories_to_delete: + break + empty_directories_to_delete.append(parent) - for foldername, subfolders, filenames in os.walk(folder_path): - for subfolder in subfolders: - subfolder_path = os.path.join(foldername, subfolder) - - # Check if the directory is already empty or contains only files that will be deleted - if not os.listdir(subfolder_path) or all(os.path.join(subfolder_path, filename) in files_to_delete for filename in os.listdir(subfolder_path)): - empty_directories_to_delete.append(subfolder_path) + parent = parent.parent return empty_directories_to_delete -def process_deletion(dry_run, files_to_delete, empty_directories_to_delete): - """ - Process file and directory deletion based on the dry-run status. - Args: - dry_run (bool): True for dry-run, False for actual deletion. - files_to_delete (list): A list of file paths to be deleted. - empty_directories_to_delete (list): A list of empty subdirectories to be deleted. - """ +def process_deletion( + dry_run: bool, files_to_delete: list[Path], empty_directories_to_delete: list[Path] +): + """Process file and directory deletion based on the dry-run status.""" if dry_run: print(f"These files would be deleted:") - print("\n".join(files_to_delete)) + print("\n".join([str(path) for path in files_to_delete])) print("\nEmpty subdirectories that would be deleted:") - print("\n".join(empty_directories_to_delete)) + print("\n".join([str(path) for path in empty_directories_to_delete])) else: print(f"Deleting files:") for file_path in files_to_delete: @@ -97,24 +75,49 @@ def process_deletion(dry_run, files_to_delete, empty_directories_to_delete): print(f"\nDeleting empty subdirectories:") for directory_path in empty_directories_to_delete: - os.rmdir(directory_path) - print(f"Deleted empty directory: {directory_path}") + try: + os.rmdir(directory_path) + print(f"Deleted empty directory: {directory_path}") + except OSError as ex: + # do not fail script when we fail to delete a directory since this has + # almost zero impact on storage + it might happen that a file appears + # between our inventory and the real deletion + print(f"Failed to delete directory {directory_path}:\n{ex}") + def main(): - """ - Main function to parse arguments and initiate the deletion process. - """ - parser = argparse.ArgumentParser(description="Delete old files and empty subdirectories.") - parser.add_argument("-d", "--dry-run", action="store_true", default=False, help="Perform a dry run (default: False)") - parser.add_argument("-f", "--folder", required=True, help="Path to the target folder") - parser.add_argument("-n", "--days", type=int, default=30, help="Delete files older than this many days (default: 30)") + """Main function to parse arguments and initiate the deletion process.""" + parser = argparse.ArgumentParser( + description="Delete old files and empty subdirectories." + ) + parser.add_argument( + "-d", + "--dry-run", + action="store_true", + default=False, + help="Perform a dry run (default: False)", + ) + parser.add_argument( + "-f", "--folder", required=True, help="Path to the target folder" + ) + parser.add_argument( + "-n", + "--days", + type=int, + default=30, + help="Delete files older than this many days (default: 30)", + ) args = parser.parse_args() - files_to_delete = list_files_to_delete(args.folder, args.days) - empty_directories_to_delete = list_directories_to_delete(args.folder, files_to_delete) + folder_path = Path(args.folder) + files_to_delete = list_files_to_delete(folder_path, args.days) + empty_directories_to_delete = list_directories_to_delete( + folder_path, files_to_delete + ) process_deletion(args.dry_run, files_to_delete, empty_directories_to_delete) + if __name__ == "__main__": main() diff --git a/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml b/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml index 95fe061..f720d6d 100644 --- a/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml +++ b/zim/tmp-kiwix/tmp-kiwix-cleanup.cronjob.yaml @@ -20,13 +20,13 @@ spec: name: jobrunner environment: - name: INSTALL_SCRIPTS - value: https://raw.githubusercontent.com/kiwix/k8s/cleanup_tmp_dev_preview/zim/tmp-kiwix/cleanup-old-files.py + value: github://kiwix/k8s/zim/tmp-kiwix/cleanup-old-files.py\n volumeMounts: - mountPath: "/data/tmp" name: tmp-kiwix-volume readOnly: true - workingDir: /data/tmp - command: ["cleanup-old-files.py -f /data/tmp/ci/dev_preview -n 30 -d"] + workingDir: /data/tmp + args: ["cleanup-old-files", "-f", "/data/tmp/ci/dev_preview", "-n", "30", "-d"] resources: requests: cpu: 100m From 55089f47027a9cd9c2bbc213fad65295cf84f84f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 14 Nov 2023 17:56:04 +0100 Subject: [PATCH 4/4] More elegant code - not using os package to delete files and directories - not marking a directory as a file to be deleted ... - not deleting directories at input folder or higher in the hierarchy - all conditions moved in the while loop for directory selection - display of directories and files sorted alphabetically for dry run --- zim/tmp-kiwix/cleanup-old-files.py | 44 ++++++++++++++++-------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/zim/tmp-kiwix/cleanup-old-files.py b/zim/tmp-kiwix/cleanup-old-files.py index 8ccc8e8..08fc7e9 100644 --- a/zim/tmp-kiwix/cleanup-old-files.py +++ b/zim/tmp-kiwix/cleanup-old-files.py @@ -16,7 +16,6 @@ -n, --days Delete files older than this many days. Default is 30 days. """ -import os import argparse import datetime from pathlib import Path @@ -27,6 +26,8 @@ def list_files_to_delete(folder_path: Path, days: int) -> list[Path]: minimum_time = (datetime.datetime.now() - datetime.timedelta(days=days)).timestamp() files_to_delete: list[Path] = [] for file_path in folder_path.rglob("*"): + if not file_path.is_file(): + continue if file_path.stat().st_mtime < minimum_time: files_to_delete.append(file_path) @@ -40,20 +41,23 @@ def list_directories_to_delete( empty_directories_to_delete: list[Path] = [] for file in files_to_delete: - parent = Path(file).parent - - # Check recursively if the parent directory is already empty or contains only - # files/directories that will be deleted - while all( - item in files_to_delete or item in empty_directories_to_delete - for item in parent.iterdir() + current_parent = file.parent + + # Check recursively if the parent directory is not the root processing folder + # and if it is still relative to this root folder (probably a no brainer) + # and it is not already selected for deletion + # and it is already empty or contains only files/directories that will be deleted + while ( + current_parent != folder_path + and current_parent.is_relative_to(folder_path) + and current_parent not in empty_directories_to_delete + and all( + item in files_to_delete or item in empty_directories_to_delete + for item in current_parent.iterdir() + ) ): - # Abort if parent is already marked for deletion - if parent in empty_directories_to_delete: - break - empty_directories_to_delete.append(parent) - - parent = parent.parent + empty_directories_to_delete.append(current_parent) + current_parent = current_parent.parent return empty_directories_to_delete @@ -64,24 +68,24 @@ def process_deletion( """Process file and directory deletion based on the dry-run status.""" if dry_run: print(f"These files would be deleted:") - print("\n".join([str(path) for path in files_to_delete])) + print("\n".join(sorted(str(path) for path in files_to_delete))) print("\nEmpty subdirectories that would be deleted:") - print("\n".join([str(path) for path in empty_directories_to_delete])) + print("\n".join(sorted(str(path) for path in empty_directories_to_delete))) else: print(f"Deleting files:") for file_path in files_to_delete: - os.remove(file_path) + file_path.unlink() print(f"Deleted: {file_path}") print(f"\nDeleting empty subdirectories:") for directory_path in empty_directories_to_delete: try: - os.rmdir(directory_path) + directory_path.rmdir() print(f"Deleted empty directory: {directory_path}") except OSError as ex: # do not fail script when we fail to delete a directory since this has - # almost zero impact on storage + it might happen that a file appears - # between our inventory and the real deletion + # almost zero impact on storage + it might happen that a file is created + # between our inventory and the real directory deletion print(f"Failed to delete directory {directory_path}:\n{ex}")