diff --git a/README.md b/README.md index 0146a60..503d046 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,23 @@ Print FastCDC rolling hash chunks and checksums. ``` -Usage: chunksum [] [] +usage: chunksum [-h] [-n ALG_NAME] [-f CHUNKSUMS_FILE] [-i INCR_FILE] dir -alg_name: +Print FastCDC rolling hash chunks and checksums. + +positional arguments: + dir directory + +optional arguments: + -h, --help show this help message and exit + -n ALG_NAME, --alg-name ALG_NAME + chunksum algorithm name. + -f CHUNKSUMS_FILE, --chunksums-file CHUNKSUMS_FILE + chunksum file path, `-' for standard output. + -i INCR_FILE, --incr-file INCR_FILE + incremental updates file path + +alg-name: Format "fc[k|m|g][0-9][sha2|blake2b|blake2s][32]". For example, "fck4sha2", means using FastCDC("fc") with an @@ -27,17 +41,16 @@ alg_name: (default: fck4sha2) -prev_chunksums_file: +chunksums-file and incr-file: You can specify the previous chunksums file if you want to resume a previous check, or if you want to find the incremental updates (new files) of the directory. - Examples: $ chunksum /etc > ~/etc.chunksums - $ chunksum ~/Videos fcm4blake2b32 > ~/Videos/chunksums + $ chunksum -n fcm4blake2b32 -f ~/Videos/chunksums ~/Videos - $ chunksum ~/Videos fcm4blake2b32 ~/chunksums > ~/chunksums.incr + $ chunksum -n fcm4blake2b32 -f ~/chunksums -i ~/chunksums.incr ~/Videos ``` diff --git a/chunksum/chunksum.py b/chunksum/chunksum.py index 4c1ea19..bbede61 100644 --- a/chunksum/chunksum.py +++ b/chunksum/chunksum.py @@ -1,21 +1,16 @@ #!/usr/bin/env python -import os import re -import sys -from hashlib import blake2b -from hashlib import blake2s -from hashlib import sha256 from os.path import getsize -from os.path import join from tqdm.auto import tqdm -from tqdm.utils import _screen_shape_wrapper -from tqdm.utils import CallbackIOWrapper from .cdc import Chunker from .chunksize import GIGA from .chunksize import KILO from .chunksize import MEGA +from .hash import hash_digest_size +from .iter import iter_file_content +from .utils import sorted_walk UNITS = { @@ -24,76 +19,6 @@ "g": GIGA, } -HASH_FUNCTIONS = { - "sha2": sha256, - "blake2b": blake2b, - "blake2s": blake2s, -} - - -def iter_file_content(file, size=1024): - if hasattr(file, "name"): - yield from _iter_file_content_progress(file, file.name, size=size) - else: - yield from _iter_file_content(file, size=size) - - -def _iter_file_content(file, size=1024): - """ - >>> import io - >>> stream = io.StringIO('abcdefg') - >>> list(_iter_file_content(stream, size=3)) - ['abc', 'def', 'g'] - """ - - while True: - content = file.read(size) - if not content: - break - yield content - - -def get_screen_width(fd=sys.stdout): - """ - >>> get_screen_width(None) - (None, None) - """ - dynamic = _screen_shape_wrapper() - return dynamic(fd) - - -def get_tqdm_limited_desc(desc, fd=sys.stdout): - """ - >>> get_tqdm_limited_desc(str(list(range(100))), None) - '...93, 94, 95, 96, 97, 98, 99]' - """ - default_screen_width = 80 - reserve_size_for_tqdm = 50 - - width = get_screen_width() - if width and width[0]: - cols = width[0] # pragma: no cover - else: - cols = default_screen_width - desc_limit = cols - reserve_size_for_tqdm - if len(desc) > desc_limit: - return f"...{desc[3 - desc_limit: ]}" - else: - return desc - - -def _iter_file_content_progress(file, path, size=1024): - with tqdm( - total=getsize(path), - desc=get_tqdm_limited_desc(path), - unit="B", - unit_scale=True, - unit_divisor=1024, - delay=1.0, - ) as t: - fobj = CallbackIOWrapper(t.update, file, "read") - yield from _iter_file_content(fobj, size) - def get_chunker(size_name="", avg=1024, min=256, max=4096): """ @@ -128,61 +53,6 @@ def get_chunker(size_name="", avg=1024, min=256, max=4096): return Chunker(size.avg, size.min, size.max) -def get_hasher(name): - """ - >>> get_hasher('sha2') - - >>> get_hasher('blake2b') - <_blake2.blake2b ...> - >>> get_hasher('blake2b32') - <_blake2.blake2b ...> - >>> get_hasher('blake2s') - <_blake2.blake2s ...> - >>> get_hasher('badname') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE - Traceback (most recent call last): - ... - Exception: unsupported hash name: badname - >>> get_hasher('blake2x') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE - Traceback (most recent call last): - ... - Exception: unsupported hash name: blake2x - >>> get_hasher('blake2') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE - Traceback (most recent call last): - ... - Exception: unsupported hash name: blake2 - >>> get_hasher('sha256') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE - Traceback (most recent call last): - ... - Exception: unsupported hash name: sha256 - """ - name = name.lower() - pattern = r"(?Psha2|blake2b|blake2s)(?P\d+)?" - - mo = re.match(pattern, name) - if not mo: - raise Exception(f"unsupported hash name: {name}") - - groups = mo.groupdict() - hash_name = groups["hash_name"] - digest_size = groups["digest_size"] - - if hash_name == "sha2" and digest_size: - raise Exception(f"unsupported hash name: {name}") - - func = HASH_FUNCTIONS[hash_name] - if digest_size: - return func(digest_size=int(digest_size)) - else: - return func() - - -def hash_digest_size(data, hasher_name): - size = len(data) - h = get_hasher(hasher_name) - h.update(data) - return (h.digest(), size) - - def compute_file(file, alg_name="fck4sha2"): """ @@ -236,34 +106,6 @@ def format_a_result(path, result, alg_name): return f"{digest.hex()} {path} {alg_name}!{chunks}" -def get_total_size(dir): - """ - >>> import tempfile - >>> import os.path - >>> dir = tempfile.TemporaryDirectory() - >>> file1 = os.path.join(dir.name, 'testfile') - >>> _ = open(file1, 'wb').write(b'hello') - >>> get_total_size(dir.name) - 5 - """ - total = 0 - with tqdm(desc="get total file size", delay=0.5) as t: - for root, dirs, files in os.walk(dir): - for file in files: - path = join(root, file) - total += getsize(path) - t.update() - return total - - -def sorted_walk(dir): - for root, dirs, files in os.walk(dir): - for file in sorted(files): - path = join(root, file) - yield path - dirs.sort() - - def walk(target, output_file, alg_name="fck4sha2", skip_func=None, total=0): """ >>> import os.path @@ -289,6 +131,7 @@ def walk(target, output_file, alg_name="fck4sha2", skip_func=None, total=0): for path in sorted_walk(target): if skip_func and skip_func(path): + t.update(getsize(path)) continue chunks = compute_file(open(path, "rb"), alg_name) print( diff --git a/chunksum/cli.py b/chunksum/cli.py index e444363..3f5b40d 100644 --- a/chunksum/cli.py +++ b/chunksum/cli.py @@ -1,17 +1,14 @@ +import argparse import sys +from os.path import exists -from .chunksum import get_total_size from .chunksum import walk from .parser import parse_chunksums +from .utils import get_total_size - -def help(): - print( - """Print FastCDC rolling hash chunks and checksums. - -Usage: {cmd} [] [] - -alg_name: +command_desc = "Print FastCDC rolling hash chunks and checksums." +command_long_desc = """ +alg-name: Format "fc[k|m|g][0-9][sha2|blake2b|blake2s][32]". For example, "fck4sha2", means using FastCDC("fc") with an @@ -27,7 +24,7 @@ def help(): (default: fck4sha2) -prev_chunksums_file: +chunksums-file and incr-file: You can specify the previous chunksums file if you want to resume a previous check, or if you want to find the incremental updates (new files) of the directory. @@ -35,15 +32,12 @@ def help(): Examples: - $ {cmd} /etc > ~/etc.chunksums + $ %(prog)s /etc > ~/etc.chunksums - $ {cmd} ~/Videos fcm4blake2b32 > ~/Videos/chunksums + $ %(prog)s -n fcm4blake2b32 -f ~/Videos/chunksums ~/Videos - $ {cmd} ~/Videos fcm4blake2b32 ~/chunksums > ~/chunksums.incr -""".format( - cmd=sys.argv[0], - ), - ) + $ %(prog)s -n fcm4blake2b32 -f ~/chunksums -i ~/chunksums.incr ~/Videos +""" def included_in_chunksums(chunksums_file): @@ -59,10 +53,13 @@ def included(path): def main(): """ # help - >>> sys.argv = ['chunksup'] - >>> main() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE + >>> sys.argv = ['chunksum', '-h'] + >>> try: + ... main() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE + ... except: + ... pass + usage: chunksum ... Print ... - Usage: ... ... # compute chunksums @@ -71,39 +68,89 @@ def main(): >>> dir = tempfile.TemporaryDirectory() >>> file1 = os.path.join(dir.name, 'testfile') >>> _ = open(file1, 'wb').write(b'hello') - >>> sys.argv = ['chunksum', dir.name] + >>> sys.argv = ['chunksum', '-f', '-', dir.name] # output to stdout >>> main() 9595...3d50 .../testfile fck4sha2!2cf2...9824:5 - >>> sys.argv = ['chunksum', dir.name, 'fcm0blake2b32'] + >>> sys.argv = ['chunksum', '-n', 'fcm0blake2b32', '-f', '-', dir.name] >>> main() 901c...ce59 .../testfile fcm0blake2b32!324d...72cf:5 - >>> sys.argv = ['chunksum', dir.name, 'fcm0blake2s'] + >>> sys.argv = ['chunksum', '-n', 'fcm0blake2s', '-f', '-', dir.name] >>> main() 8d95...5ee5 .../testfile fcm0blake2s!1921...ca25:5 + >>> dir2 = tempfile.TemporaryDirectory() + >>> chunksums = os.path.join(dir2.name, 'chunksums') + >>> sys.argv = ['chunksum', '-f', chunksums, dir.name] # output to a file + >>> main() - # skip files - >>> file2 = os.path.join(dir.name, 'newfile') - >>> _ = open(file2, 'wb').write(b'hello') + # incremental / skip file >>> chunksums = tempfile.NamedTemporaryFile() - >>> _ = open(chunksums.name, 'w').write(f'sum {file1} fck4sha2!') - >>> sys.argv = ['chunksum', dir.name, 'fck4sha2', chunksums.name] + >>> sys.argv = ['chunksum', '-f', chunksums.name, dir.name] + >>> main() + >>> file2 = os.path.join(dir.name, 'newfile') + >>> _ = open(file2, 'wb').write(b'world') + >>> incr = chunksums.name + '.incr' + >>> sys.argv = ['chunksum', '-f', chunksums.name, '-i', '-', dir.name] >>> main() - 9595...3d50 .../newfile fck4sha2!2cf2...9824:5 + 63...06 .../newfile fck4sha2!48...a7:5 + >>> sys.argv = ['chunksum', '-f', chunksums.name, '-i', incr, dir.name] + >>> main() + >>> open(incr).read().strip() + '63...06 .../newfile fck4sha2!48...a7:5' + + # resume + >>> sys.argv = ['chunksum', '-f', chunksums.name, dir.name] + >>> main() + >>> for line in open(chunksums.name).readlines(): + ... print(line.strip()) + 95...50 .../testfile fck4sha2!2c...24:5 + 63...06 .../newfile fck4sha2!48...a7:5 """ - if len(sys.argv) == 1: - help() - return + parser = argparse.ArgumentParser( + description=command_desc, + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=command_long_desc, + ) + parser.add_argument( + "-n", + "--alg-name", + default="fck4sha2", + help="chunksum algorithm name.", + ) + parser.add_argument( + "-f", + "--chunksums-file", + default="chunksums", + help="chunksum file path, `-' for standard output.", + ) + parser.add_argument( + "-i", + "--incr-file", + help="incremental updates file path", + ) + parser.add_argument("dir", nargs=1, help="directory") + args = parser.parse_args() skip_func = None - if len(sys.argv) > 3: - path, alg_name, prev_version_chunksums = sys.argv[1:4] - skip_func = included_in_chunksums(open(prev_version_chunksums)) - if len(sys.argv) > 2: - path, alg_name = sys.argv[1:3] + if exists(args.chunksums_file): + skip_func = included_in_chunksums(open(args.chunksums_file)) + + if args.chunksums_file == "-" or args.incr_file == "-": + output_file = sys.stdout + elif args.incr_file: + output_file = open(args.incr_file, "a") + elif exists(args.chunksums_file): + output_file = open(args.chunksums_file, "a") else: - path, alg_name = sys.argv[1], "fck4sha2" - total = get_total_size(path) - walk(path, sys.stdout, alg_name, skip_func=skip_func, total=total) + output_file = open(args.chunksums_file, "w") + + total = get_total_size(args.dir[0]) + walk( + args.dir[0], + output_file, + args.alg_name, + skip_func=skip_func, + total=total, + ) if __name__ == "__main__": diff --git a/chunksum/hash.py b/chunksum/hash.py new file mode 100644 index 0000000..4c096a7 --- /dev/null +++ b/chunksum/hash.py @@ -0,0 +1,66 @@ +import re +from hashlib import blake2b +from hashlib import blake2s +from hashlib import sha256 + + +HASH_FUNCTIONS = { + "sha2": sha256, + "blake2b": blake2b, + "blake2s": blake2s, +} + + +def get_hasher(name): + """ + >>> get_hasher('sha2') + + >>> get_hasher('blake2b') + <_blake2.blake2b ...> + >>> get_hasher('blake2b32') + <_blake2.blake2b ...> + >>> get_hasher('blake2s') + <_blake2.blake2s ...> + >>> get_hasher('badname') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE + Traceback (most recent call last): + ... + Exception: unsupported hash name: badname + >>> get_hasher('blake2x') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE + Traceback (most recent call last): + ... + Exception: unsupported hash name: blake2x + >>> get_hasher('blake2') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE + Traceback (most recent call last): + ... + Exception: unsupported hash name: blake2 + >>> get_hasher('sha256') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE + Traceback (most recent call last): + ... + Exception: unsupported hash name: sha256 + """ + name = name.lower() + pattern = r"(?Psha2|blake2b|blake2s)(?P\d+)?" + + mo = re.match(pattern, name) + if not mo: + raise Exception(f"unsupported hash name: {name}") + + groups = mo.groupdict() + hash_name = groups["hash_name"] + digest_size = groups["digest_size"] + + if hash_name == "sha2" and digest_size: + raise Exception(f"unsupported hash name: {name}") + + func = HASH_FUNCTIONS[hash_name] + if digest_size: + return func(digest_size=int(digest_size)) + else: + return func() + + +def hash_digest_size(data, hasher_name): + size = len(data) + h = get_hasher(hasher_name) + h.update(data) + return (h.digest(), size) diff --git a/chunksum/iter.py b/chunksum/iter.py new file mode 100644 index 0000000..7eba77b --- /dev/null +++ b/chunksum/iter.py @@ -0,0 +1,41 @@ +from os.path import getsize + +from tqdm.auto import tqdm +from tqdm.utils import CallbackIOWrapper + +from .utils import get_tqdm_limited_desc + + +def iter_file_content(file, size=1024): + if hasattr(file, "name"): + yield from _iter_file_content_progress(file, file.name, size=size) + else: + yield from _iter_file_content(file, size=size) + + +def _iter_file_content(file, size=1024): + """ + >>> import io + >>> stream = io.StringIO('abcdefg') + >>> list(_iter_file_content(stream, size=3)) + ['abc', 'def', 'g'] + """ + + while True: + content = file.read(size) + if not content: + break + yield content + + +def _iter_file_content_progress(file, path, size=1024): + with tqdm( + total=getsize(path), + desc=get_tqdm_limited_desc(path), + unit="B", + unit_scale=True, + unit_divisor=1024, + delay=1.0, + ) as t: + fobj = CallbackIOWrapper(t.update, file, "read") + yield from _iter_file_content(fobj, size) diff --git a/chunksum/utils.py b/chunksum/utils.py new file mode 100644 index 0000000..3e66654 --- /dev/null +++ b/chunksum/utils.py @@ -0,0 +1,64 @@ +import os +import sys +from os.path import getsize +from os.path import join + +from tqdm.auto import tqdm +from tqdm.utils import _screen_shape_wrapper + + +def get_screen_width(fd=sys.stdout): + """ + >>> get_screen_width(None) + (None, None) + """ + dynamic = _screen_shape_wrapper() + return dynamic(fd) + + +def get_tqdm_limited_desc(desc, fd=sys.stdout): + """ + >>> get_tqdm_limited_desc(str(list(range(100))), None) + '...93, 94, 95, 96, 97, 98, 99]' + """ + default_screen_width = 80 + reserve_size_for_tqdm = 50 + + width = get_screen_width() + if width and width[0]: + cols = width[0] # pragma: no cover + else: + cols = default_screen_width + desc_limit = cols - reserve_size_for_tqdm + if len(desc) > desc_limit: + return f"...{desc[3 - desc_limit: ]}" + else: + return desc + + +def get_total_size(dir): + """ + >>> import tempfile + >>> import os.path + >>> dir = tempfile.TemporaryDirectory() + >>> file1 = os.path.join(dir.name, 'testfile') + >>> _ = open(file1, 'wb').write(b'hello') + >>> get_total_size(dir.name) + 5 + """ + total = 0 + with tqdm(desc="get total file size", delay=0.5) as t: + for root, dirs, files in os.walk(dir): + for file in files: + path = join(root, file) + total += getsize(path) + t.update() + return total + + +def sorted_walk(dir): + for root, dirs, files in os.walk(dir): + for file in sorted(files): + path = join(root, file) + yield path + dirs.sort() diff --git a/poetry.lock b/poetry.lock index 65bc2a7..675073a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -83,7 +83,7 @@ python-versions = "*" [[package]] name = "exceptiongroup" -version = "1.0.3" +version = "1.0.4" description = "Backport of PEP 654 (exception groups)" category = "dev" optional = false @@ -490,8 +490,8 @@ distlib = [ {file = "distlib-0.3.6.tar.gz", hash = "sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46"}, ] exceptiongroup = [ - {file = "exceptiongroup-1.0.3-py3-none-any.whl", hash = "sha256:6002703c7d31fb9950ddc8780840f67880c440895dc1151dd551553aa1246e4a"}, - {file = "exceptiongroup-1.0.3.tar.gz", hash = "sha256:76cac74b5207c5997678a1c7105cb6f14213c9c63c096a38cfcb529d83ce5c02"}, + {file = "exceptiongroup-1.0.4-py3-none-any.whl", hash = "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828"}, + {file = "exceptiongroup-1.0.4.tar.gz", hash = "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"}, ] fastcdc = [ {file = "fastcdc-1.4.2-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:6869c2db8dfb9ab7aa29173cf95201e1b12fab2a0bdd4dd41d01377405e7b31e"},