diff --git a/README.md b/README.md
index 0146a60..503d046 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,23 @@
Print FastCDC rolling hash chunks and checksums.
```
-Usage: chunksum <dir> [<alg_name>] [<prev_chunksums_file>]
+usage: chunksum [-h] [-n ALG_NAME] [-f CHUNKSUMS_FILE] [-i INCR_FILE] dir
-alg_name:
+Print FastCDC rolling hash chunks and checksums.
+
+positional arguments:
+ dir directory
+
+optional arguments:
+ -h, --help show this help message and exit
+ -n ALG_NAME, --alg-name ALG_NAME
+ chunksum algorithm name.
+ -f CHUNKSUMS_FILE, --chunksums-file CHUNKSUMS_FILE
+ chunksum file path, `-' for standard output.
+ -i INCR_FILE, --incr-file INCR_FILE
+ incremental updates file path
+
+alg-name:
Format "fc[k|m|g][0-9][sha2|blake2b|blake2s][32]".
For example, "fck4sha2", means using FastCDC("fc") with an
@@ -27,17 +41,16 @@ alg_name:
(default: fck4sha2)
-prev_chunksums_file:
+chunksums-file and incr-file:
You can specify the previous chunksums file if you want to
resume a previous check, or if you want to find the incremental
updates (new files) of the directory.
-
Examples:
- $ chunksum /etc > ~/etc.chunksums
+ $ chunksum -f - /etc > ~/etc.chunksums
- $ chunksum ~/Videos fcm4blake2b32 > ~/Videos/chunksums
+ $ chunksum -n fcm4blake2b32 -f ~/Videos/chunksums ~/Videos
- $ chunksum ~/Videos fcm4blake2b32 ~/chunksums > ~/chunksums.incr
+ $ chunksum -n fcm4blake2b32 -f ~/chunksums -i ~/chunksums.incr ~/Videos
```
diff --git a/chunksum/chunksum.py b/chunksum/chunksum.py
index 4c1ea19..bbede61 100644
--- a/chunksum/chunksum.py
+++ b/chunksum/chunksum.py
@@ -1,21 +1,16 @@
#!/usr/bin/env python
-import os
import re
-import sys
-from hashlib import blake2b
-from hashlib import blake2s
-from hashlib import sha256
from os.path import getsize
-from os.path import join
from tqdm.auto import tqdm
-from tqdm.utils import _screen_shape_wrapper
-from tqdm.utils import CallbackIOWrapper
from .cdc import Chunker
from .chunksize import GIGA
from .chunksize import KILO
from .chunksize import MEGA
+from .hash import hash_digest_size
+from .iter import iter_file_content
+from .utils import sorted_walk
UNITS = {
@@ -24,76 +19,6 @@
"g": GIGA,
}
-HASH_FUNCTIONS = {
- "sha2": sha256,
- "blake2b": blake2b,
- "blake2s": blake2s,
-}
-
-
-def iter_file_content(file, size=1024):
- if hasattr(file, "name"):
- yield from _iter_file_content_progress(file, file.name, size=size)
- else:
- yield from _iter_file_content(file, size=size)
-
-
-def _iter_file_content(file, size=1024):
- """
- >>> import io
- >>> stream = io.StringIO('abcdefg')
- >>> list(_iter_file_content(stream, size=3))
- ['abc', 'def', 'g']
- """
-
- while True:
- content = file.read(size)
- if not content:
- break
- yield content
-
-
-def get_screen_width(fd=sys.stdout):
- """
- >>> get_screen_width(None)
- (None, None)
- """
- dynamic = _screen_shape_wrapper()
- return dynamic(fd)
-
-
-def get_tqdm_limited_desc(desc, fd=sys.stdout):
- """
- >>> get_tqdm_limited_desc(str(list(range(100))), None)
- '...93, 94, 95, 96, 97, 98, 99]'
- """
- default_screen_width = 80
- reserve_size_for_tqdm = 50
-
- width = get_screen_width()
- if width and width[0]:
- cols = width[0] # pragma: no cover
- else:
- cols = default_screen_width
- desc_limit = cols - reserve_size_for_tqdm
- if len(desc) > desc_limit:
- return f"...{desc[3 - desc_limit: ]}"
- else:
- return desc
-
-
-def _iter_file_content_progress(file, path, size=1024):
- with tqdm(
- total=getsize(path),
- desc=get_tqdm_limited_desc(path),
- unit="B",
- unit_scale=True,
- unit_divisor=1024,
- delay=1.0,
- ) as t:
- fobj = CallbackIOWrapper(t.update, file, "read")
- yield from _iter_file_content(fobj, size)
-
def get_chunker(size_name="", avg=1024, min=256, max=4096):
"""
@@ -128,61 +53,6 @@ def get_chunker(size_name="", avg=1024, min=256, max=4096):
return Chunker(size.avg, size.min, size.max)
-def get_hasher(name):
- """
- >>> get_hasher('sha2')
- <sha256 ...>
- >>> get_hasher('blake2b')
- <_blake2.blake2b ...>
- >>> get_hasher('blake2b32')
- <_blake2.blake2b ...>
- >>> get_hasher('blake2s')
- <_blake2.blake2s ...>
- >>> get_hasher('badname') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
- Traceback (most recent call last):
- ...
- Exception: unsupported hash name: badname
- >>> get_hasher('blake2x') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
- Traceback (most recent call last):
- ...
- Exception: unsupported hash name: blake2x
- >>> get_hasher('blake2') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
- Traceback (most recent call last):
- ...
- Exception: unsupported hash name: blake2
- >>> get_hasher('sha256') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
- Traceback (most recent call last):
- ...
- Exception: unsupported hash name: sha256
- """
- name = name.lower()
- pattern = r"(?P<hash_name>sha2|blake2b|blake2s)(?P<digest_size>\d+)?"
-
- mo = re.match(pattern, name)
- if not mo:
- raise Exception(f"unsupported hash name: {name}")
-
- groups = mo.groupdict()
- hash_name = groups["hash_name"]
- digest_size = groups["digest_size"]
-
- if hash_name == "sha2" and digest_size:
- raise Exception(f"unsupported hash name: {name}")
-
- func = HASH_FUNCTIONS[hash_name]
- if digest_size:
- return func(digest_size=int(digest_size))
- else:
- return func()
-
-
-def hash_digest_size(data, hasher_name):
- size = len(data)
- h = get_hasher(hasher_name)
- h.update(data)
- return (h.digest(), size)
-
-
def compute_file(file, alg_name="fck4sha2"):
"""
@@ -236,34 +106,6 @@ def format_a_result(path, result, alg_name):
return f"{digest.hex()} {path} {alg_name}!{chunks}"
-def get_total_size(dir):
- """
- >>> import tempfile
- >>> import os.path
- >>> dir = tempfile.TemporaryDirectory()
- >>> file1 = os.path.join(dir.name, 'testfile')
- >>> _ = open(file1, 'wb').write(b'hello')
- >>> get_total_size(dir.name)
- 5
- """
- total = 0
- with tqdm(desc="get total file size", delay=0.5) as t:
- for root, dirs, files in os.walk(dir):
- for file in files:
- path = join(root, file)
- total += getsize(path)
- t.update()
- return total
-
-
-def sorted_walk(dir):
- for root, dirs, files in os.walk(dir):
- for file in sorted(files):
- path = join(root, file)
- yield path
- dirs.sort()
-
-
def walk(target, output_file, alg_name="fck4sha2", skip_func=None, total=0):
"""
>>> import os.path
@@ -289,6 +131,7 @@ def walk(target, output_file, alg_name="fck4sha2", skip_func=None, total=0):
for path in sorted_walk(target):
if skip_func and skip_func(path):
+ t.update(getsize(path))
continue
chunks = compute_file(open(path, "rb"), alg_name)
print(
diff --git a/chunksum/cli.py b/chunksum/cli.py
index e444363..3f5b40d 100644
--- a/chunksum/cli.py
+++ b/chunksum/cli.py
@@ -1,17 +1,14 @@
+import argparse
import sys
+from os.path import exists
-from .chunksum import get_total_size
from .chunksum import walk
from .parser import parse_chunksums
+from .utils import get_total_size
-
-def help():
- print(
- """Print FastCDC rolling hash chunks and checksums.
-
-Usage: {cmd} <dir> [<alg_name>] [<prev_chunksums_file>]
-
-alg_name:
+command_desc = "Print FastCDC rolling hash chunks and checksums."
+command_long_desc = """
+alg-name:
Format "fc[k|m|g][0-9][sha2|blake2b|blake2s][32]".
For example, "fck4sha2", means using FastCDC("fc") with an
@@ -27,7 +24,7 @@ def help():
(default: fck4sha2)
-prev_chunksums_file:
+chunksums-file and incr-file:
You can specify the previous chunksums file if you want to
resume a previous check, or if you want to find the incremental
updates (new files) of the directory.
@@ -35,15 +32,12 @@ def help():
Examples:
- $ {cmd} /etc > ~/etc.chunksums
+ $ %(prog)s -f - /etc > ~/etc.chunksums
- $ {cmd} ~/Videos fcm4blake2b32 > ~/Videos/chunksums
+ $ %(prog)s -n fcm4blake2b32 -f ~/Videos/chunksums ~/Videos
- $ {cmd} ~/Videos fcm4blake2b32 ~/chunksums > ~/chunksums.incr
-""".format(
- cmd=sys.argv[0],
- ),
- )
+ $ %(prog)s -n fcm4blake2b32 -f ~/chunksums -i ~/chunksums.incr ~/Videos
+"""
def included_in_chunksums(chunksums_file):
@@ -59,10 +53,13 @@ def included(path):
def main():
"""
# help
- >>> sys.argv = ['chunksup']
- >>> main() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+ >>> sys.argv = ['chunksum', '-h']
+ >>> try:
+ ... main() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+ ... except:
+ ... pass
+ usage: chunksum ...
Print ...
- Usage: ...
...
# compute chunksums
@@ -71,39 +68,89 @@ def main():
>>> dir = tempfile.TemporaryDirectory()
>>> file1 = os.path.join(dir.name, 'testfile')
>>> _ = open(file1, 'wb').write(b'hello')
- >>> sys.argv = ['chunksum', dir.name]
+ >>> sys.argv = ['chunksum', '-f', '-', dir.name] # output to stdout
>>> main()
9595...3d50 .../testfile fck4sha2!2cf2...9824:5
- >>> sys.argv = ['chunksum', dir.name, 'fcm0blake2b32']
+ >>> sys.argv = ['chunksum', '-n', 'fcm0blake2b32', '-f', '-', dir.name]
>>> main()
901c...ce59 .../testfile fcm0blake2b32!324d...72cf:5
- >>> sys.argv = ['chunksum', dir.name, 'fcm0blake2s']
+ >>> sys.argv = ['chunksum', '-n', 'fcm0blake2s', '-f', '-', dir.name]
>>> main()
8d95...5ee5 .../testfile fcm0blake2s!1921...ca25:5
+ >>> dir2 = tempfile.TemporaryDirectory()
+ >>> chunksums = os.path.join(dir2.name, 'chunksums')
+ >>> sys.argv = ['chunksum', '-f', chunksums, dir.name] # output to a file
+ >>> main()
- # skip files
- >>> file2 = os.path.join(dir.name, 'newfile')
- >>> _ = open(file2, 'wb').write(b'hello')
+ # incremental / skip file
>>> chunksums = tempfile.NamedTemporaryFile()
- >>> _ = open(chunksums.name, 'w').write(f'sum {file1} fck4sha2!')
- >>> sys.argv = ['chunksum', dir.name, 'fck4sha2', chunksums.name]
+ >>> sys.argv = ['chunksum', '-f', chunksums.name, dir.name]
+ >>> main()
+ >>> file2 = os.path.join(dir.name, 'newfile')
+ >>> _ = open(file2, 'wb').write(b'world')
+ >>> incr = chunksums.name + '.incr'
+ >>> sys.argv = ['chunksum', '-f', chunksums.name, '-i', '-', dir.name]
>>> main()
- 9595...3d50 .../newfile fck4sha2!2cf2...9824:5
+ 63...06 .../newfile fck4sha2!48...a7:5
+ >>> sys.argv = ['chunksum', '-f', chunksums.name, '-i', incr, dir.name]
+ >>> main()
+ >>> open(incr).read().strip()
+ '63...06 .../newfile fck4sha2!48...a7:5'
+
+ # resume
+ >>> sys.argv = ['chunksum', '-f', chunksums.name, dir.name]
+ >>> main()
+ >>> for line in open(chunksums.name).readlines():
+ ... print(line.strip())
+ 95...50 .../testfile fck4sha2!2c...24:5
+ 63...06 .../newfile fck4sha2!48...a7:5
"""
- if len(sys.argv) == 1:
- help()
- return
+ parser = argparse.ArgumentParser(
+ description=command_desc,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=command_long_desc,
+ )
+ parser.add_argument(
+ "-n",
+ "--alg-name",
+ default="fck4sha2",
+ help="chunksum algorithm name.",
+ )
+ parser.add_argument(
+ "-f",
+ "--chunksums-file",
+ default="chunksums",
+ help="chunksum file path, `-' for standard output.",
+ )
+ parser.add_argument(
+ "-i",
+ "--incr-file",
+ help="incremental updates file path",
+ )
+ parser.add_argument("dir", nargs=1, help="directory")
+ args = parser.parse_args()
skip_func = None
- if len(sys.argv) > 3:
- path, alg_name, prev_version_chunksums = sys.argv[1:4]
- skip_func = included_in_chunksums(open(prev_version_chunksums))
- if len(sys.argv) > 2:
- path, alg_name = sys.argv[1:3]
+ if exists(args.chunksums_file):
+ skip_func = included_in_chunksums(open(args.chunksums_file))
+
+ if args.chunksums_file == "-" or args.incr_file == "-":
+ output_file = sys.stdout
+ elif args.incr_file:
+ output_file = open(args.incr_file, "a")
+ elif exists(args.chunksums_file):
+ output_file = open(args.chunksums_file, "a")
else:
- path, alg_name = sys.argv[1], "fck4sha2"
- total = get_total_size(path)
- walk(path, sys.stdout, alg_name, skip_func=skip_func, total=total)
+ output_file = open(args.chunksums_file, "w")
+
+ total = get_total_size(args.dir[0])
+ walk(
+ args.dir[0],
+ output_file,
+ args.alg_name,
+ skip_func=skip_func,
+ total=total,
+ )
if __name__ == "__main__":
diff --git a/chunksum/hash.py b/chunksum/hash.py
new file mode 100644
index 0000000..4c096a7
--- /dev/null
+++ b/chunksum/hash.py
@@ -0,0 +1,66 @@
+import re
+from hashlib import blake2b
+from hashlib import blake2s
+from hashlib import sha256
+
+
+HASH_FUNCTIONS = {
+ "sha2": sha256,
+ "blake2b": blake2b,
+ "blake2s": blake2s,
+}
+
+
+def get_hasher(name):
+ """
+ >>> get_hasher('sha2')
+ <sha256 ...>
+ >>> get_hasher('blake2b')
+ <_blake2.blake2b ...>
+ >>> get_hasher('blake2b32')
+ <_blake2.blake2b ...>
+ >>> get_hasher('blake2s')
+ <_blake2.blake2s ...>
+ >>> get_hasher('badname') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+ Traceback (most recent call last):
+ ...
+ Exception: unsupported hash name: badname
+ >>> get_hasher('blake2x') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+ Traceback (most recent call last):
+ ...
+ Exception: unsupported hash name: blake2x
+ >>> get_hasher('blake2') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+ Traceback (most recent call last):
+ ...
+ Exception: unsupported hash name: blake2
+ >>> get_hasher('sha256') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+ Traceback (most recent call last):
+ ...
+ Exception: unsupported hash name: sha256
+ """
+ name = name.lower()
+ pattern = r"(?P<hash_name>sha2|blake2b|blake2s)(?P<digest_size>\d+)?"
+
+ mo = re.match(pattern, name)
+ if not mo:
+ raise Exception(f"unsupported hash name: {name}")
+
+ groups = mo.groupdict()
+ hash_name = groups["hash_name"]
+ digest_size = groups["digest_size"]
+
+ if hash_name == "sha2" and digest_size:
+ raise Exception(f"unsupported hash name: {name}")
+
+ func = HASH_FUNCTIONS[hash_name]
+ if digest_size:
+ return func(digest_size=int(digest_size))
+ else:
+ return func()
+
+
+def hash_digest_size(data, hasher_name):
+ size = len(data)
+ h = get_hasher(hasher_name)
+ h.update(data)
+ return (h.digest(), size)
diff --git a/chunksum/iter.py b/chunksum/iter.py
new file mode 100644
index 0000000..7eba77b
--- /dev/null
+++ b/chunksum/iter.py
@@ -0,0 +1,41 @@
+from os.path import getsize
+
+from tqdm.auto import tqdm
+from tqdm.utils import CallbackIOWrapper
+
+from .utils import get_tqdm_limited_desc
+
+
+def iter_file_content(file, size=1024):
+ if hasattr(file, "name"):
+ yield from _iter_file_content_progress(file, file.name, size=size)
+ else:
+ yield from _iter_file_content(file, size=size)
+
+
+def _iter_file_content(file, size=1024):
+ """
+ >>> import io
+ >>> stream = io.StringIO('abcdefg')
+ >>> list(_iter_file_content(stream, size=3))
+ ['abc', 'def', 'g']
+ """
+
+ while True:
+ content = file.read(size)
+ if not content:
+ break
+ yield content
+
+
+def _iter_file_content_progress(file, path, size=1024):
+ with tqdm(
+ total=getsize(path),
+ desc=get_tqdm_limited_desc(path),
+ unit="B",
+ unit_scale=True,
+ unit_divisor=1024,
+ delay=1.0,
+ ) as t:
+ fobj = CallbackIOWrapper(t.update, file, "read")
+ yield from _iter_file_content(fobj, size)
diff --git a/chunksum/utils.py b/chunksum/utils.py
new file mode 100644
index 0000000..3e66654
--- /dev/null
+++ b/chunksum/utils.py
@@ -0,0 +1,64 @@
+import os
+import sys
+from os.path import getsize
+from os.path import join
+
+from tqdm.auto import tqdm
+from tqdm.utils import _screen_shape_wrapper
+
+
+def get_screen_width(fd=sys.stdout):
+ """
+ >>> get_screen_width(None)
+ (None, None)
+ """
+ dynamic = _screen_shape_wrapper()
+ return dynamic(fd)
+
+
+def get_tqdm_limited_desc(desc, fd=sys.stdout):
+ """
+ >>> get_tqdm_limited_desc(str(list(range(100))), None)
+ '...93, 94, 95, 96, 97, 98, 99]'
+ """
+ default_screen_width = 80
+ reserve_size_for_tqdm = 50
+
+ width = get_screen_width()
+ if width and width[0]:
+ cols = width[0] # pragma: no cover
+ else:
+ cols = default_screen_width
+ desc_limit = cols - reserve_size_for_tqdm
+ if len(desc) > desc_limit:
+ return f"...{desc[3 - desc_limit: ]}"
+ else:
+ return desc
+
+
+def get_total_size(dir):
+ """
+ >>> import tempfile
+ >>> import os.path
+ >>> dir = tempfile.TemporaryDirectory()
+ >>> file1 = os.path.join(dir.name, 'testfile')
+ >>> _ = open(file1, 'wb').write(b'hello')
+ >>> get_total_size(dir.name)
+ 5
+ """
+ total = 0
+ with tqdm(desc="get total file size", delay=0.5) as t:
+ for root, dirs, files in os.walk(dir):
+ for file in files:
+ path = join(root, file)
+ total += getsize(path)
+ t.update()
+ return total
+
+
+def sorted_walk(dir):
+ for root, dirs, files in os.walk(dir):
+ for file in sorted(files):
+ path = join(root, file)
+ yield path
+ dirs.sort()
diff --git a/poetry.lock b/poetry.lock
index 65bc2a7..675073a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -83,7 +83,7 @@ python-versions = "*"
[[package]]
name = "exceptiongroup"
-version = "1.0.3"
+version = "1.0.4"
description = "Backport of PEP 654 (exception groups)"
category = "dev"
optional = false
@@ -490,8 +490,8 @@ distlib = [
{file = "distlib-0.3.6.tar.gz", hash = "sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46"},
]
exceptiongroup = [
- {file = "exceptiongroup-1.0.3-py3-none-any.whl", hash = "sha256:6002703c7d31fb9950ddc8780840f67880c440895dc1151dd551553aa1246e4a"},
- {file = "exceptiongroup-1.0.3.tar.gz", hash = "sha256:76cac74b5207c5997678a1c7105cb6f14213c9c63c096a38cfcb529d83ce5c02"},
+ {file = "exceptiongroup-1.0.4-py3-none-any.whl", hash = "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828"},
+ {file = "exceptiongroup-1.0.4.tar.gz", hash = "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"},
]
fastcdc = [
{file = "fastcdc-1.4.2-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:6869c2db8dfb9ab7aa29173cf95201e1b12fab2a0bdd4dd41d01377405e7b31e"},