Skip to content

Commit 1a70b75

Browse files
committed
Initial commit
0 parents  commit 1a70b75

15 files changed

+726
-0
lines changed

.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
*.swp
2+
*.swo
3+
*.pyc
4+
.venv
5+
test-basic

LICENSE

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
Modified BSD License
2+
3+
Copyright 2022 Mark Gordon
4+
5+
Redistribution and use in source and binary forms, with or without modification,
6+
are permitted provided that the following conditions are met:
7+
8+
1. Redistributions of source code must retain the above copyright notice, this
9+
list of conditions and the following disclaimer.
10+
11+
2. Redistributions in binary form must reproduce the above copyright notice,
12+
this list of conditions and the following disclaimer in the documentation
13+
and/or other materials provided with the distribution.
14+
15+
3. Neither the name of the copyright holder nor the names of its contributors
16+
may be used to endorse or promote products derived from this software
17+
without specific prior written permission.
18+
19+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
23+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Makefile

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Developer task runner for the dirdiff package.
#
# Every target below is a command name rather than a file, so all of them are
# declared phony; the names must match the rule names exactly or make falls
# back to file-based up-to-date checks for the misspelled ones.
.PHONY: all format format-check pylint typecheck lint test docs build clean pypi-test pypi-live

PYTHON ?= python3
# NOTE(review): PLATFORM and PROFILE are not referenced by any rule in this
# file -- confirm whether they are still needed.
PLATFORM ?= linux/amd64
PROFILE ?= dev

# NOTE(review): `docs` is listed as a prerequisite but has no rule in this
# file -- confirm whether a docs target is still to be added.
all: format lint test docs

# Rewrite sources in place with the project's formatters.
format:
	$(PYTHON) -m black .
	$(PYTHON) -m isort --profile=black .

# Same formatters in check-only mode (no files are modified).
format-check:
	$(PYTHON) -m black --check .
	$(PYTHON) -m isort --profile=black --check .

pylint:
	$(PYTHON) -m pylint dirdiff tests

typecheck:
	$(PYTHON) -m mypy dirdiff tests

lint: format-check pylint typecheck

# Coverage is collected for the dirdiff package, the same package that the
# pylint and typecheck targets operate on.
test:
	$(PYTHON) -m pytest -sv --cov=dirdiff -m unit tests

build:
	$(PYTHON) -m build

clean:
	rm -rf build dist *.egg-info

# Upload to the TestPyPI index; the API token is decrypted with gpg on demand.
pypi-test: build
	TWINE_USERNAME=__token__ TWINE_PASSWORD="$(shell gpg -d test.pypi-token.gpg)" \
		$(PYTHON) -m twine upload --repository testpypi dist/*

# Upload to the live PyPI index; the API token is decrypted with gpg on demand.
pypi-live: build
	TWINE_USERNAME=__token__ TWINE_PASSWORD="$(shell gpg -d live.pypi-token.gpg)" \
		$(PYTHON) -m twine upload dist/*

README.md

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
WIP
2+
3+
CLI utility to compute the diff between two directories.
4+
5+
The initially supported output format will be overlayfs tar or direct file
6+
output. These outputs are constructed so that they can be used as the upper layer in an
7+
overlayfs mount alongside the left operand of the diff to produce the same
8+
contents present in the right operand as the merged mount.
9+
10+
This tool is intended to be used by systems directly managing container image
11+
contents.

dirdiff/__init__.py

Whitespace-only changes.

dirdiff/__main__.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import logging
2+
import sys
3+
import tarfile
4+
5+
from dirdiff.differ import Differ
6+
from dirdiff.output_overlay import DiffOutputOverlayTar
7+
8+
9+
def main():
    """
    Placeholder command-line entry point.

    Diffs the two directories named by the first two command-line arguments
    and streams the result to stdout as a tar archive, using
    ``DiffOutputOverlayTar`` as the output backend. Log messages are sent to
    stderr so they never mix with the archive bytes written to stdout.

    Raises:
        SystemExit: with status 2, after printing a usage line to stderr,
            when fewer than two directory arguments were supplied.
    """
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stderr,
    )

    # Validate argv up front: indexing it blindly would show the user a bare
    # IndexError traceback instead of a hint about the expected arguments.
    if len(sys.argv) < 3:
        prog = sys.argv[0] if sys.argv else "dirdiff"
        print(f"usage: {prog} LEFT_DIR RIGHT_DIR", file=sys.stderr)
        raise SystemExit(2)

    # Mode "w|" writes the archive as a plain forward-only stream, which is
    # what a pipe such as stdout needs (no seeking back into the output).
    with tarfile.open(mode="w|", fileobj=sys.stdout.buffer) as tf:
        output = DiffOutputOverlayTar(tf)
        differ = Differ(sys.argv[1], sys.argv[2], output)
        differ.diff()


if __name__ == "__main__":
    main()

dirdiff/differ.py

+240
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
import dataclasses
2+
import logging
3+
import os
4+
import stat
5+
from enum import Enum
6+
from typing import List, Optional, Tuple
7+
8+
from dirdiff.filelib import DirectoryManager, FileManager, PathManager
9+
from dirdiff.output import DiffOutput, StatInfo
10+
11+
LOGGER = logging.getLogger(__name__)
12+
13+
14+
class DirEntryType(Enum):
    """Coarse classification of a directory entry, as used while diffing."""

    # A real directory (not a symlink pointing at one).
    DIRECTORY = 1
    # A regular file (not a symlink pointing at one).
    REGULAR_FILE = 2
    # Everything else, e.g. symlinks and device nodes.
    OTHER = 3
18+
19+
20+
def _dir_entry_type(dir_entry: os.DirEntry) -> DirEntryType:
    """
    Classify ``dir_entry`` as a directory, a regular file or something else.

    Symlinks are never followed, so a symlink is reported as
    ``DirEntryType.OTHER`` regardless of what it points at.
    """
    if dir_entry.is_dir(follow_symlinks=False):
        return DirEntryType.DIRECTORY
    if dir_entry.is_file(follow_symlinks=False):
        return DirEntryType.REGULAR_FILE
    return DirEntryType.OTHER
26+
27+
28+
@dataclasses.dataclass
class DifferOptions:
    """
    Options controlling how stat data is normalised for output and compared.
    """

    # When set, replaces the uid of every entry passed through stats_filter.
    output_uid: Optional[int] = None
    # When set, replaces the gid of every entry passed through stats_filter.
    output_gid: Optional[int] = None
    # When truthy, modification times are reported as 0 instead of the real
    # value.
    scrub_mtime: Optional[bool] = True

    def stats_filter(self, x: os.stat_result) -> StatInfo:
        """
        Return a ``StatInfo`` built from ``x`` with these options applied.

        The uid and gid are replaced by ``output_uid`` / ``output_gid`` when
        those are set, and the mtime is either zeroed (``scrub_mtime``) or
        truncated to whole seconds. Mode, size and rdev are copied unchanged.
        """
        return StatInfo(
            st_mode=x.st_mode,
            st_uid=self.output_uid if self.output_uid is not None else x.st_uid,
            st_gid=self.output_gid if self.output_gid is not None else x.st_gid,
            st_size=x.st_size,
            st_mtime=0 if self.scrub_mtime else int(x.st_mtime),
            st_rdev=x.st_rdev,
        )

    def stats_differ(self, st_x: os.stat_result, st_y: os.stat_result) -> bool:
        """
        Return True if the two stat results differ for the purposes of
        performing a diff, and False if they are equivalent. This takes into
        account the configuration options set in this object.

        Note that this does not perform any deeper diffing that may be
        necessary for some file types. In particular the contents of symlinks
        and regular files should be inspected as well.
        """
        # Compare the filtered views so that uid/gid overrides are honoured:
        # an overridden field is identical on both sides and never differs.
        x = self.stats_filter(st_x)
        y = self.stats_filter(st_y)
        if (x.st_uid, x.st_gid, x.st_mode) != (y.st_uid, y.st_gid, y.st_mode):
            return True

        # From here on both modes are equal, so testing x's file type is
        # enough. Size is only meaningful for regular files and symlinks.
        if stat.S_ISREG(x.st_mode) or stat.S_ISLNK(x.st_mode):
            if x.st_size != y.st_size:
                return True
        # The device number only matters for character/block device nodes.
        if stat.S_ISCHR(x.st_mode) or stat.S_ISBLK(x.st_mode):
            if x.st_rdev != y.st_rdev:
                return True
        # NOTE(review): st_mtime is never compared, even when scrub_mtime is
        # falsy -- confirm that an mtime-only change is meant to be ignored.
        return False
73+
74+
75+
class Differ:
    """
    Compute the difference between two directory trees and feed it to a
    ``DiffOutput``.

    The tree at ``merged_path`` is compared against the tree at
    ``upper_path``. Every entry that is new or changed in ``merged_path`` is
    written to ``output``, and every entry that exists only under
    ``upper_path`` is recorded through ``output.delete_marker``.

    NOTE(review): despite its name, ``upper_path`` is the tree being diffed
    *against*; the generated output, not this tree, appears to be what the
    README calls the overlayfs upper layer -- confirm the naming.
    """

    # Number of bytes read from each file per comparison step.
    _CHUNK_SIZE = 2**16

    def __init__(
        self,
        upper_path,
        merged_path,
        output: DiffOutput,
        *,
        options: Optional[DifferOptions] = None,
    ) -> None:
        """
        Args:
            upper_path: root of the tree to diff against; opened with
                ``DirectoryManager``.
            merged_path: root of the tree whose contents the diff should
                reproduce; opened with ``DirectoryManager``.
            output: sink that receives directories, files, symlinks, other
                entries and delete markers.
            options: stat normalisation/comparison options; defaults to
                ``DifferOptions()``.
        """
        self.upper_path = upper_path
        self.merged_path = merged_path
        self.output = output
        self.options = options or DifferOptions()
        # Directories that have been entered but not yet written to the
        # output, outermost first. See _diff_dirs for how it is maintained.
        self._dir_pending: List[Tuple[str, os.stat_result]] = []

    def diff(self) -> None:
        """Walk both trees from their roots and emit every difference."""
        # Start from a clean slate so that entries left behind by an earlier
        # run that raised part-way through cannot leak into this one.
        self._dir_pending.clear()
        with DirectoryManager(self.upper_path) as upper:
            with DirectoryManager(self.merged_path) as merged:
                self._diff_dirs(".", upper, merged)

    def _diff_dirs(
        self,
        archive_path: str,
        upper: DirectoryManager,
        merged: DirectoryManager,
    ) -> None:
        """
        Recursively diff two directories that exist on both sides.

        ``archive_path`` is the path under which entries of this directory
        are reported to the output.
        """
        # Names still in this map after walking ``merged`` exist only in
        # ``upper`` and therefore need delete markers.
        upper_map = {entry.name: _dir_entry_type(entry) for entry in upper}

        # If stats differ write dir now. Otherwise wait until we find an actual
        # difference underneath this directory. Note that a directory should be
        # written to the output if *any* child object has changed and it should
        # be written *before* that child. Therefore we push it to `_dir_pending`
        # which must be flushed before anything else can be written.
        self._dir_pending.append((archive_path, merged.stat))
        if self.options.stats_differ(upper.stat, merged.stat):
            self._flush_pending()

        for dir_entry in merged:
            name = dir_entry.name
            merged_type = _dir_entry_type(dir_entry)
            cpath = os.path.join(archive_path, name)
            # None means the name does not exist in ``upper`` at all. A type
            # mismatch is handled like a brand-new entry: the merged object
            # is inserted wholesale.
            upper_type = upper_map.pop(name, None)

            if merged_type == DirEntryType.DIRECTORY:
                with merged.child_dir(name) as merged_cdir:
                    if upper_type == DirEntryType.DIRECTORY:
                        with upper.child_dir(name) as upper_cdir:
                            self._diff_dirs(cpath, upper_cdir, merged_cdir)
                    else:
                        self._insert_dir(cpath, merged_cdir)
            elif merged_type == DirEntryType.REGULAR_FILE:
                with merged.child_file(name) as merged_cfile:
                    if upper_type == DirEntryType.REGULAR_FILE:
                        with upper.child_file(name) as upper_cfile:
                            self._diff_files(cpath, upper_cfile, merged_cfile)
                    else:
                        self._insert_file(cpath, merged_cfile)
            else:
                with merged.child_path(name) as merged_cpath:
                    if upper_type == DirEntryType.OTHER:
                        with upper.child_path(name) as upper_cpath:
                            self._diff_other(cpath, upper_cpath, merged_cpath)
                    else:
                        self._insert_other(cpath, merged_cpath)

        # Whatever is left exists only in ``upper``. One flush is enough:
        # writing delete markers never adds to the pending list.
        if upper_map:
            self._flush_pending()
            for name in upper_map:
                self.output.delete_marker(os.path.join(archive_path, name))

        # Remove ourselves from _dir_pending if we're still there. Note at this
        # point if _dir_pending isn't empty we must be at the end of it.
        if self._dir_pending:
            self._dir_pending.pop()

    def _diff_files(
        self,
        archive_path: str,
        upper: FileManager,
        merged: FileManager,
    ) -> None:
        """
        Write ``merged`` to the output if it differs from ``upper`` either in
        its (filtered) stat data or in its contents.
        """
        if self.options.stats_differ(upper.stat, merged.stat):
            self._insert_file(archive_path, merged)
            return

        # Stats match, so compare the contents chunk by chunk and stop at the
        # first mismatch or when both sides are exhausted.
        differs = False
        with upper.reader() as upper_reader:
            with merged.reader() as merged_reader:
                while True:
                    upper_data = upper_reader.read(self._CHUNK_SIZE)
                    merged_data = merged_reader.read(self._CHUNK_SIZE)
                    if upper_data != merged_data:
                        differs = True
                        break
                    if not upper_data:
                        break

        # Inserted only after both readers are closed, because _insert_file
        # opens its own reader on ``merged``.
        if differs:
            self._insert_file(archive_path, merged)

    def _diff_other(
        self,
        archive_path: str,
        upper: PathManager,
        merged: PathManager,
    ) -> None:
        """
        Write ``merged`` (a non-directory, non-regular-file entry) to the
        output if it differs from ``upper``.

        Entries with equal stats are considered unchanged, except symlinks,
        whose targets are compared as well.
        """
        if not self.options.stats_differ(upper.stat, merged.stat):
            if not stat.S_ISLNK(merged.stat.st_mode):
                return
            if upper.linkname == merged.linkname:
                return
        self._insert_other(archive_path, merged)

    def _flush_pending(self) -> None:
        """Write all pending directories, outermost first, then forget them."""
        for archive_path, dir_stat in self._dir_pending:
            self.output.write_dir(archive_path, self.options.stats_filter(dir_stat))
        self._dir_pending.clear()

    def _insert_dir(
        self,
        archive_path: str,
        obj: DirectoryManager,
    ) -> None:
        """Write directory ``obj`` and, recursively, everything beneath it."""
        self._flush_pending()
        self.output.write_dir(archive_path, self.options.stats_filter(obj.stat))

        for dir_entry in obj:
            cpath = os.path.join(archive_path, dir_entry.name)
            # Same classification as _diff_dirs uses, so both code paths agree
            # on what counts as a directory, a regular file or "other".
            entry_type = _dir_entry_type(dir_entry)
            if entry_type == DirEntryType.DIRECTORY:
                with obj.child_dir(dir_entry.name) as child_dir:
                    self._insert_dir(cpath, child_dir)
            elif entry_type == DirEntryType.REGULAR_FILE:
                with obj.child_file(dir_entry.name) as child_file:
                    self._insert_file(cpath, child_file)
            else:
                with obj.child_path(dir_entry.name) as child_path:
                    self._insert_other(cpath, child_path)

    def _insert_file(
        self,
        archive_path: str,
        obj: FileManager,
    ) -> None:
        """Write regular file ``obj`` (stat data and contents) to the output."""
        self._flush_pending()
        with obj.reader() as reader:
            self.output.write_file(
                archive_path, self.options.stats_filter(obj.stat), reader
            )

    def _insert_other(
        self,
        archive_path: str,
        obj: PathManager,
    ) -> None:
        """
        Write ``obj`` to the output as a symlink if it is one, otherwise
        through the output's generic ``write_other``.
        """
        self._flush_pending()
        if stat.S_ISLNK(obj.stat.st_mode):
            self.output.write_symlink(
                archive_path, self.options.stats_filter(obj.stat), obj.linkname
            )
            return
        self.output.write_other(archive_path, self.options.stats_filter(obj.stat))

0 commit comments

Comments
 (0)