Skip to content

Commit c0ee548

Browse files
committed
Add a tool which can clean up whitespace in a commit intelligently
Will ONLY modify lines that were changed unless asked to do the whole file. Will ignore files containing "DO NOT EDIT THIS FILE". Will convert tabs to spaces in files ending in .pl, .pm, .xs, .c, .h Will remove trailing whitespace from lines in all file types. Will remove blank lines at EOF in all file types.
1 parent 5079aff commit c0ee548

File tree

1 file changed

+341
-0
lines changed

1 file changed

+341
-0
lines changed

Porting/clean-commit

Lines changed: 341 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,341 @@
1+
#!/usr/bin/perl
2+
use Data::Dumper;
3+
use strict;
4+
use warnings;
5+
use Getopt::Long;
6+
use Pod::Usage;
7+
8+
my $DEBUG=0;
9+
10+
use constant {
11+
NULL_SHA1 => ("0" x 40),
12+
TAB => " " x 8,
13+
DO_NOT_EDIT => uc("do not edit this file"),
14+
};
15+
16+
my $TAB= TAB; # for regexes
17+
my $DO_NOT_EDIT= DO_NOT_EDIT; # for interpolation
18+
19+
sub read_blame_file {
20+
my ($args, $file, $callback)= @_;
21+
print "executing: git blame -p $args $file\n" if $DEBUG > 1;
22+
open my $fh,"git blame -p $args $file 2>/dev/null |"
23+
or die "Failed to open pipe: $?";
24+
my ($line_props, $sha1);
25+
my %commit_props;
26+
my $read= 0;
27+
while (<$fh>) {
28+
$read++;
29+
if (/^([0-9a-fA-F]{40}) (\d+) (\d+)(?: (\d+))/) {
30+
$line_props= { sha1=> ($sha1 = $1), src_line => $2, dst_line => $3, group_size => $4 };
31+
} elsif (/^(\S+)(?: (.*))?/) {
32+
$commit_props{$sha1}{$1}= $2;
33+
} elsif ( s/^\t// ) {
34+
$line_props->{text}= $_;
35+
$callback->($sha1, $line_props, \%commit_props);
36+
}
37+
}
38+
return $read;
39+
}
40+
41+
sub clean_file {
42+
my $file= shift
43+
or die "Must have a file name to clean\n";
44+
if (-B $file) {
45+
warn "skipping $file as it is binary\n";
46+
return;
47+
}
48+
my $full= shift;
49+
print "clean_file($file)\n" if $DEBUG > 1;
50+
my $clean_tabs= $file=~/\.(?:p[lm]|[ch]|xs)\z/;
51+
my $read;
52+
my @out_lines;
53+
unless ( $full ) {
54+
my $noted;
55+
$read= read_blame_file("--since=HEAD", $file, sub {
56+
my ($sha1, $line_props, $commit_props)= @_;
57+
# the following trickery is so if blame fails for an unchecked in file
58+
# we dont print "cleaning using blame" when we will do the full file.
59+
print "\tcleaning using blame '$file'\n" unless $noted++;
60+
my $line= $line_props->{text};
61+
if ($full or $sha1 eq NULL_SHA1) {
62+
my $modified= 0;
63+
$modified += $line =~ s/\t/$TAB/g if $clean_tabs and $file=~/\./;
64+
$modified += $line =~ s/\s+\z/\n/;
65+
print "\tcleaned line $line_props->{dst_line}\n"
66+
if $DEBUG > 2 and $modified;
67+
}
68+
push @out_lines, $line;
69+
});
70+
}
71+
if ($read) {
72+
write_file($file, \@out_lines);
73+
} else {
74+
print "\tcleaning new file '$file'\n";
75+
clean_new($file);
76+
}
77+
}
78+
79+
sub write_file {
80+
my ($file, $out_lines)= @_;
81+
pop @$out_lines while @$out_lines and $out_lines->[-1] eq "\n";
82+
my $mode= (stat $file)[2];
83+
open my $ofh, ">", "$file.out"
84+
or die "Failed to open '$file.out' for write";
85+
print $ofh @$out_lines;
86+
close $ofh or die "Failed to close '$file.out':$!";
87+
rename "$file.out", $file or die "Failed to rename '$file.out' to '$file':$!";
88+
chmod $mode, $file or die sprintf "Failed to chmod '%s' to %3o: %s", $file, $mode, $!;
89+
}
90+
91+
sub clean_new {
92+
my ($file)= @_;
93+
print "clean_new($file)\n" if $DEBUG > 1;
94+
my $clean_tabs= $file=~/\.(?:p[lm]|[ch]|xs)\z/;
95+
open my $ifh, "<", $file
96+
or die "Failed to open '$file' for read";
97+
my @out_lines;
98+
while (<$ifh>) {
99+
my $modified= 0;
100+
$modified += s/\t/$TAB/g if $clean_tabs;
101+
$modified += s/\s+\z/\n/;
102+
print "\tcleaned line $.\n"
103+
if $DEBUG > 2 and $modified;
104+
push @out_lines, $_;
105+
}
106+
close $ifh or die "Failed to close '$file':$1";
107+
write_file($file, \@out_lines);
108+
}
109+
110+
sub get_modified_files {
111+
my ($autodetect,$status)= @_;
112+
return [] if !defined $autodetect;
113+
$autodetect ||= "MA";
114+
if ($autodetect and $autodetect=~/([^ MADRCU])/) {
115+
die "Unknown mode '$1' in '$autodetect', must be one of [ MADRCU]\n",
116+
"See git status --help for more information\n";
117+
}
118+
print "looking for files with mode [$autodetect]...\n" if $DEBUG;
119+
print Dumper($status) if $DEBUG > 1;
120+
$status||= get_status();
121+
my @files;
122+
foreach my $file (sort keys %$status) {
123+
push @files, $file
124+
if $status->{$file}=~m/[$autodetect]/;
125+
}
126+
return filter_do_not_edit_files(\@files);
127+
}
128+
129+
sub filter_do_not_edit_files {
130+
my ($files)= @_;
131+
my @ret_files;
132+
# we hide this next line from the check by storing
133+
# it in lower case and then using uc to fix it for the grep
134+
while (@$files) {
135+
my @these_files= splice @$files,0,50;
136+
open my $cmd, "git grep -L '$DO_NOT_EDIT' @these_files |";
137+
while (<$cmd>) {
138+
chomp;
139+
push @ret_files, $_;
140+
}
141+
}
142+
return \@ret_files;
143+
}
144+
145+
sub get_status {
146+
open my $cmd, "git status --porcelain |"
147+
or die "No status?";
148+
my %files;
149+
while (<$cmd>) {
150+
print if $DEBUG > 2;
151+
my ($mode,$file1,$file2)= /(..) (.*?)(?: -> (.*))?$/
152+
or die "Can't parse: $_";
153+
$file2 ||= $file1;
154+
if ($mode =~ /[MARC]/) {
155+
$files{$file2}=$mode;
156+
}
157+
}
158+
#die Dumper(\%files);
159+
close $cmd;
160+
return \%files
161+
}
162+
163+
sub clean_files {
164+
my ($files, $status, $skip, $full)=@_;
165+
my @todo;
166+
FILE:
167+
for my $file (@$files) {
168+
print "\tchecking $file\n" if $DEBUG > 1;
169+
if (!-f $file) {
170+
print "ignoring '$file': not a regular file\n";
171+
next FILE;
172+
}
173+
for my $pat (@$skip) {
174+
if ($file =~ m/$pat/) {
175+
print "skipping '$file': it matches 'no' pattern $pat\n" if $DEBUG;
176+
next FILE;
177+
}
178+
}
179+
if ($full or $status->{$file}) {
180+
push @todo, $file;
181+
} else {
182+
print "leaving '$file': it is unchanged\n" if $DEBUG;
183+
}
184+
}
185+
if (@todo) {
186+
# might put stuff here
187+
clean_file($_,$full) for @todo;
188+
}
189+
}
190+
191+
my $full = 0; # if true clean the full file
192+
my $autodetect= 0; # '0' means DWIM (use @ARGV if it has stuff otherwise autodetect) ,
193+
# undef means use @ARGV regardless,
194+
# '' or anything else means autdetect regardless.
195+
Getopt::Long::Configure("bundling");
196+
GetOptions(
197+
'n|no=s' => \my @no,
198+
'v|verbose+' => \($DEBUG),
199+
'h|help|?' => \my $help,
200+
'man' => \my $man,
201+
'f|full' => \$full,
202+
'F|no-full' => sub { undef $full},
203+
'a|auto:s' => \$autodetect,
204+
'A|no-auto' => sub { undef $autodetect },
205+
) or pod2usage(2);
206+
pod2usage(1) if $help;
207+
pod2usage(-exitstatus => 0, -verbose => 2) if $man;
208+
209+
warn Data::Dumper->Dump([\@ARGV,$autodetect,$full,\@no,$DEBUG],[qw(*ARGV *autodetect *full *no *DEBUG)])
210+
if $DEBUG>2;
211+
exit(0) if $DEBUG > 9;
212+
213+
chomp(my $path= `git rev-parse --git-dir`);
214+
die "Not a git repo" if !$path;
215+
chdir( $path . "/.." )
216+
or die "Failed to chdir to '$path/..': $!";
217+
218+
my %seen;
219+
@no= map { !$seen{$_}++ ? qr/$_/ : () } @no;
220+
221+
my $status= get_status();
222+
exit(0) if !%$status && !@ARGV;
223+
224+
my $files= (!defined($autodetect) || # if autodetect is undef - do NOT use git status
225+
$autodetect eq '0' && @ARGV) # or autodetect eq '0' and @ARGV has stuff in it
226+
? \@ARGV # then use ARGV
227+
: get_modified_files($autodetect,$status); # otherwise use git status to find the files
228+
229+
clean_files($files,$status,\@no,$full);
230+
231+
__END__
232+
233+
=head1 NAME
234+
235+
clean-commit - whitespace clean modified files in a git repository
236+
237+
=head1 SYNOPSIS
238+
239+
clean-commit [options] [file ...]
240+
241+
Options:
242+
--no=REGEX ignore anything matching this
243+
-a --auto=MODE use git to find modified files - MODE is one of [MADRCU]
244+
-A --no-auto do not use git to find modified files if the arg list is empty
245+
-f --full clean the full file, not just the changed bits
246+
--help brief help message
247+
--man full documentation
248+
--verbose print debugging information
249+
250+
=head1 OPTIONS
251+
252+
Either processes the provided list of files or if none are provided then uses
253+
C<git status> to find the files. You can use the C<--auto> and C<--no-auto> to
254+
fine-tune this behaviour.
255+
256+
=over 8
257+
258+
=item B<--no=REGEX>
259+
260+
Any file matching this will be ignored. May be used more than once.
261+
REGEX is a perl syntax regular expression.
262+
263+
=item B<-a>
264+
265+
=item B<--auto>
266+
267+
=item B<--auto=MODES>
268+
269+
Use git status to find modified files. Defaults to 'M', legal values
270+
are as follows (most can be combined).
271+
272+
MODE Meaning
273+
'0' use @ARGV if its there, otherwise use default mode ('M')
274+
'' use default mode 'M'
275+
' ' unchanged
276+
'M' modified
277+
'A' added
278+
'R' renamed
279+
'C' copied
280+
'U' unmerged
281+
282+
The default behaviour of the tool is C<--auto=0>, which causes the tool
283+
to process any files passed in on the command line, and to otherwise use
284+
C<git status> with the default mode ('M') to find the files. Any other use
285+
of this option causes the tool to ignore any file on the command line.
286+
The use of the C<--no-auto> option overrides this behaviour the other way
287+
and causes the tool to process only the files passed in, even if that
288+
means doing nothing. If used together the last used wins.
289+
290+
See the documentation for the C<git status> command, and the C<--porcelain> option
291+
for more details on the mode values.
292+
293+
=item B<-A>
294+
295+
=item B<--no-auto>
296+
297+
Do not use C<git status>, only process files passed in on the command line, even if
298+
that means processing nothing. If combined with B<--auto> which is used last wins.
299+
See also the documentation for C<--auto>
300+
301+
=item B<-v>
302+
303+
=item B<--verbose>
304+
305+
Output debugging information. Right now this is not very pretty.
306+
307+
=item B<-help>
308+
309+
Print a brief help message and exits.
310+
311+
=item B<-man>
312+
313+
Prints the manual page and exits.
314+
315+
=back
316+
317+
=head1 DESCRIPTION
318+
319+
B<clean-commit> will read the given input file(s) and use git to determine which
320+
parts have been modified, and then clean any whitespace issues in the modified parts.
321+
Cleaning that is performed is to eliminate trailing whitespace and convert tabs to
322+
spaces.
323+
324+
The default behaviour is to DWIM, and either process the specified files, or use
325+
C<git status> to find them. The C<--no-auto> option means the tool will only process
326+
the explicitly provided files, even if that means doing nothing. The C<--auto> option
327+
can be used to force C<git status> to be used to find the files, even if a list of
328+
files have been provided, and can be used to change which types of file modifications
329+
are chosen to be cleaned, for instance C<--mode=MARC> would clean any modified, added,
330+
renamed, or copied files, instead of the normal default of just cleaning modified
331+
files.
332+
333+
Files which contain the string 'do not edit this file' in *uppercase* are excluded
334+
from cleanups.
335+
336+
Trailing whitespace is removed from all files, tabs are converted to spaces in
337+
files ending in C<.pl>, C<.pm>, C<.xs>, C<.c> and C<.h>, any totally blank lines
338+
at the end of a file will be removed (either when the file is new or, when the lines
339+
were added to an existing file).
340+
341+
=cut

0 commit comments

Comments
 (0)