Skip to content

Commit acbfd96

Browse files
committed
Add a tool which can clean up whitespace in a commit intelligently
Will ONLY modify lines that were changed unless asked to do the whole file. Will ignore files containing "DO NOT EDIT THIS FILE". Will convert tabs to spaces in files ending in .pl, .pm, .xs, .c, .h Will remove trailing whitespace from lines in all file types. Will remove blank lines at EOF in all file types.
1 parent 5079aff commit acbfd96

File tree

4 files changed

+357
-1
lines changed

4 files changed

+357
-1
lines changed

MANIFEST

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5403,6 +5403,7 @@ Porting/checkcfgvar.pl Check that config scripts define all symbols
54035403
Porting/checkpodencoding.pl Check POD encoding
54045404
Porting/checkURL.pl Check whether we have working URLs
54055405
Porting/checkVERSION.pl Check whether we have $VERSIONs
5406+
Porting/clean-commit Cleanup whitespace issues in a commit using git blame
54065407
Porting/cmpVERSION.pl Compare whether two trees have changed modules
54075408
Porting/config.sh Sample config.sh
54085409
Porting/config_H Sample config.h

Porting/README.pod

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,19 @@ Checks that all the URLs in the Perl source are valid.
8282
Used by F<t/porting/cmp_version.t> to ensure changed modules have had their
8383
versions updated.
8484

85+
=head2 F<clean-commit>
86+
87+
This tool can be used to intelligently clean up whitespace issues in a
88+
patch. For files already under git control it will only clean lines that
89+
have been modified as part of the commit. It will not modify generated
90+
files which have the marker "do not edit this file" (in uppercase) in
91+
them. It will convert tabs to spaces in .pl, .pm, .xs, .c and .h
92+
files, and it will remove trailing whitespace from all file types. It will
93+
also remove blank lines at the end of a file should the commit add any.
94+
95+
By default it will not modify files not under git control, but if new
96+
files are `git add`ed then it will clean the entire file.
97+
8598
=head2 F<cmpVERSION.pl>
8699

87100
Compare the current Perl source tree and a given tag for modules that have
@@ -394,4 +407,3 @@ leaks.
394407
Guide for Vote Administrators for running Steering Council elections.
395408

396409
=cut
397-

Porting/clean-commit

Lines changed: 342 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,342 @@
1+
#!/usr/bin/perl
2+
use Data::Dumper;
3+
use strict;
4+
use warnings;
5+
use Getopt::Long;
6+
use Pod::Usage;
7+
8+
# Verbosity level. 0 is quiet; the -v/--verbose option increments it and
# the subs below print progressively more detail as it grows.
my $DEBUG = 0;

# Object name made of forty zeros. clean_file() treats blame lines that
# carry this id as the ones that need cleaning.
use constant NULL_SHA1 => "0" x 40;

# Text that replaces a single tab character.
use constant TAB => " " x 8;

# Marker identifying files that must be left alone. It is written in lower
# case and upper-cased here so that this script does not contain the
# marker itself.
use constant DO_NOT_EDIT => uc("do not edit this file");

# Scalar copies of the constants, since constants do not interpolate.
my $TAB         = TAB;            # for regexes
my $DO_NOT_EDIT = DO_NOT_EDIT;    # for interpolation
18+
19+
# Run "git blame -p" on a file and feed every blamed line to a callback.
#
# Arguments:
#   $args     - extra git blame options as a single string; it is passed
#               to the shell unquoted so it may hold several options
#   $file     - the file to blame
#   $callback - code ref called once per content line as
#               $callback->($sha1, \%line_props, \%commit_props).
#               %line_props has the keys sha1, src_line, dst_line,
#               group_size and text (the line including its newline);
#               %commit_props maps a sha1 to the header fields seen for it.
#
# Returns the number of lines of output read from git. This is 0 when git
# printed nothing, for example because blame failed (its stderr is
# discarded).
#
# Dies if the pipe cannot be opened.
sub read_blame_file {
    my ($args, $file, $callback) = @_;
    print "executing: git blame -p $args $file\n" if $DEBUG > 1;

    # The command goes through the shell (for the stderr redirect), so
    # single-quote the file name to survive spaces and metacharacters.
    (my $quoted_file = $file) =~ s/'/'\\''/g;
    open my $fh, "git blame -p $args '$quoted_file' 2>/dev/null |"
        or die "Failed to open pipe: $!";

    my ($line_props, $sha1);
    my %commit_props;
    my $read = 0;
    while (my $line = <$fh>) {
        $read++;
        if ($line =~ /^([0-9a-fA-F]{40}) (\d+) (\d+)(?: (\d+))?/) {
            # Header line. The trailing group size is only present on the
            # first line of a group, so it has to be optional; otherwise
            # the remaining headers of the group would be mistaken for
            # commit properties and dst_line would go stale.
            $sha1       = $1;
            $line_props = {
                sha1       => $sha1,
                src_line   => $2,
                dst_line   => $3,
                group_size => $4,
            };
        }
        elsif ($line =~ /^(\S+)(?: (.*))?/) {
            # "key value" header belonging to the current commit.
            $commit_props{$sha1}{$1} = $2;
        }
        elsif ($line =~ s/^\t//) {
            # Content line: the file text prefixed with a single tab.
            $line_props->{text} = $line;
            $callback->($sha1, $line_props, \%commit_props);
        }
    }

    # A non-zero exit from git makes close() return false. That is an
    # expected outcome here (callers fall back when nothing was read), so
    # the result is deliberately not checked.
    close $fh;
    return $read;
}
40+
41+
# Clean the whitespace of one file.
#
# Arguments:
#   $file - name of the file to clean (required)
#   $full - when true the whole file is cleaned via clean_new()
#
# When $full is false the file is run through read_blame_file() and only
# the lines whose blame id equals NULL_SHA1 are touched: tabs are expanded
# if the name ends in .pl, .pm, .c, .h or .xs, and trailing whitespace is
# removed. If blame yields no output at all the file is handed to
# clean_new() instead. Binary files (-B) are skipped with a warning.
sub clean_file {
    my ($file, $full) = @_;
    die "Must have a file name to clean\n" unless $file;

    if (-B $file) {
        warn "skipping $file as it is binary\n";
        return;
    }
    print "clean_file($file)\n" if $DEBUG > 1;

    my $expand_tabs = $file =~ /\.(?:p[lm]|[ch]|xs)\z/;
    my @cleaned;
    my $blame_lines;

    if (!$full) {
        # Announce blame based cleaning lazily, from inside the callback,
        # so nothing is printed when blame fails for an unchecked in file
        # and the full file gets cleaned instead.
        my $announced = 0;
        my $handle_line = sub {
            my ($sha1, $props) = @_;
            print "\tcleaning using blame '$file'\n" if !$announced;
            $announced = 1;

            my $text = $props->{text};
            if ($sha1 eq NULL_SHA1) {
                my $changes = 0;
                if ($expand_tabs) {
                    $changes += ($text =~ s/\t/$TAB/g);
                }
                $changes += ($text =~ s/\s+\z/\n/);
                print "\tcleaned line $props->{dst_line}\n"
                    if $changes and $DEBUG > 2;
            }
            push @cleaned, $text;
        };
        $blame_lines = read_blame_file("--since=HEAD", $file, $handle_line);
    }

    if (!$blame_lines) {
        print "\tcleaning new file '$file'\n";
        return clean_new($file);
    }
    return write_file($file, \@cleaned);
}
78+
79+
# Replace the contents of $file with the given lines.
#
# Arguments:
#   $file      - path of the file to overwrite
#   $out_lines - array ref of lines (each including its newline). Trailing
#                elements that are exactly "\n" are popped off this array
#                before writing, so the caller's array is modified.
#
# The data is written to "$file.out", which is then renamed over $file,
# and the permission bits $file had beforehand are restored.
#
# Dies if the temporary file cannot be opened, written or closed, or if
# the rename or chmod fails.
sub write_file {
    my ($file, $out_lines) = @_;

    # Drop blank lines at the end of the file.
    pop @$out_lines while @$out_lines and $out_lines->[-1] eq "\n";

    # Remember the permissions before the rename replaces the file. The
    # raw st_mode also carries the file type, which chmod must not be
    # given, so keep only the permission bits.
    my $mode = (stat $file)[2];
    $mode &= 07777 if defined $mode;

    my $tmp_file = "$file.out";
    open my $ofh, ">", $tmp_file
        or die "Failed to open '$tmp_file' for write: $!";
    print {$ofh} @$out_lines
        or die "Failed to write to '$tmp_file': $!";
    close $ofh
        or die "Failed to close '$tmp_file': $!";

    rename $tmp_file, $file
        or die "Failed to rename '$tmp_file' to '$file': $!";

    # If stat failed there is no previous mode to restore.
    if (defined $mode) {
        chmod $mode, $file
            or die sprintf "Failed to chmod '%s' to %04o: %s", $file, $mode, $!;
    }
}
90+
91+
# Clean every line of a file, regardless of what git knows about it.
#
# Arguments:
#   $file - path of the file to clean
#
# Tabs are replaced by $TAB when the name ends in .pl, .pm, .c, .h or .xs,
# and trailing whitespace is replaced by a single newline on every line.
# The result is written back with write_file().
#
# Dies if the file cannot be opened or closed.
sub clean_new {
    my ($file) = @_;
    print "clean_new($file)\n" if $DEBUG > 1;

    my $clean_tabs = $file =~ /\.(?:p[lm]|[ch]|xs)\z/;
    open my $ifh, "<", $file
        or die "Failed to open '$file' for read: $!";

    my @out_lines;
    while (my $line = <$ifh>) {
        my $before = $line;
        $line =~ s/\t/$TAB/g if $clean_tabs;
        $line =~ s/\s+\z/\n/;

        # Compare the text rather than counting substitutions: the
        # trailing whitespace pattern also matches a lone newline, so a
        # count would flag every line as cleaned.
        print "\tcleaned line $.\n"
            if $DEBUG > 2 and $line ne $before;
        push @out_lines, $line;
    }
    close $ifh
        or die "Failed to close '$file': $!";

    write_file($file, \@out_lines);
}
109+
110+
# Pick the files whose git status matches the requested mode letters.
#
# Arguments:
#   $autodetect - string of status letters to select. undef selects
#                 nothing; any other false value means "MA".
#   $status     - optional hash ref of file name => status code; fetched
#                 with get_status() when not supplied
#
# Returns an array ref of the matching file names, sorted and passed
# through filter_do_not_edit_files().
#
# Dies if $autodetect contains a character outside [ MADRCU].
sub get_modified_files {
    my ($autodetect, $status) = @_;
    return [] unless defined $autodetect;

    $autodetect = "MA" unless $autodetect;
    if ($autodetect =~ /([^ MADRCU])/) {
        die "Unknown mode '$1' in '$autodetect', must be one of [ MADRCU]\n",
            "See git status --help for more information\n";
    }

    if ($DEBUG) {
        print "looking for files with mode [$autodetect]...\n";
        print Dumper($status) if $DEBUG > 1;
    }

    $status = get_status() unless $status;
    my @selected = grep { $status->{$_} =~ m/[$autodetect]/ }
                   sort keys %$status;
    return filter_do_not_edit_files(\@selected);
}
128+
129+
# Remove files that contain the $DO_NOT_EDIT marker from a list.
#
# Arguments:
#   $files - array ref of file names; it is not modified
#
# Returns an array ref holding the names printed by "git grep -L", i.e.
# the files git reports as not containing the marker.
#
# Dies if git cannot be started.
sub filter_do_not_edit_files {
    my ($files) = @_;
    my @pending = @$files;
    my @ret_files;

    # The marker is kept in lower case in this script and upper-cased
    # into $DO_NOT_EDIT, so that this file is not excluded itself.
    #
    # Work in batches of 50 names to keep the command line short.
    while (my @these_files = splice @pending, 0, 50) {
        # List form open: no shell is involved, so file names containing
        # spaces or shell metacharacters are passed through untouched.
        open my $cmd, '-|', 'git', 'grep', '-L', $DO_NOT_EDIT, '--',
            @these_files
            or die "Failed to run git grep: $!";
        while (my $name = <$cmd>) {
            chomp $name;
            push @ret_files, $name;
        }
        # git's exit status is not an error indicator we rely on here,
        # so the result of close is deliberately ignored.
        close $cmd;
    }
    return \@ret_files;
}
144+
145+
# Collect the changed files reported by "git status --porcelain".
#
# Returns a hash ref mapping file name to its two character status code.
# Only entries whose code contains M, A, R or C are kept. For entries of
# the form "old -> new" the name after the arrow is used.
#
# Dies if the pipe cannot be opened or a line of output cannot be parsed.
sub get_status {
    open my $cmd, "git status --porcelain |"
        or die "No status?";

    my %mode_of;
    while (<$cmd>) {
        print if $DEBUG > 2;

        my ($mode, $from, $to) = /(..) (.*?)(?: -> (.*))?$/
            or die "Can't parse: $_";

        next unless $mode =~ /[MARC]/;
        my $name = $to || $from;
        $mode_of{$name} = $mode;
    }
    close $cmd;

    return \%mode_of;
}
162+
163+
# Decide which of the candidate files need cleaning and clean them.
#
# Arguments:
#   $files  - array ref of candidate file names
#   $status - hash ref of file name => status code (see get_status())
#   $skip   - array ref of patterns; a file matching any of them is skipped
#   $full   - when true every remaining file is cleaned in full, otherwise
#             only files that have an entry in $status are cleaned
#
# Anything that is not a regular file is ignored with a message.
sub clean_files {
    my ($files, $status, $skip, $full) = @_;
    my @todo;

    FILE:
    for my $file (@$files) {
        print "\tchecking $file\n" if $DEBUG > 1;

        unless (-f $file) {
            print "ignoring '$file': not a regular file\n";
            next FILE;
        }

        for my $pat (@$skip) {
            next unless $file =~ m/$pat/;
            print "skipping '$file': it matches 'no' pattern $pat\n" if $DEBUG;
            next FILE;
        }

        unless ($full or $status->{$file}) {
            print "leaving '$file': it is unchanged\n" if $DEBUG;
            next FILE;
        }
        push @todo, $file;
    }

    clean_file($_, $full) for @todo;
}
190+
191+
# ---------------------------------------------------------------------
# Main program: parse the options, move to the top of the work tree,
# work out which files to look at and clean them.
# ---------------------------------------------------------------------

my $full = 0;       # if true clean the full file

# $autodetect controls where the file list comes from:
#   '0'   means DWIM (use @ARGV if it has stuff, otherwise autodetect),
#   undef means use @ARGV regardless,
#   ''    or anything else means autodetect regardless.
my $autodetect = 0;

my (@no, $help, $man);
Getopt::Long::Configure("bundling");
GetOptions(
    'n|no=s'     => \@no,
    'v|verbose+' => \($DEBUG),
    'h|help|?'   => \$help,
    'man'        => \$man,
    'f|full'     => \$full,
    'F|no-full'  => sub { undef $full },
    'a|auto:s'   => \$autodetect,
    'A|no-auto'  => sub { undef $autodetect },
) or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;

warn Data::Dumper->Dump([\@ARGV,$autodetect,$full,\@no,$DEBUG],[qw(*ARGV *autodetect *full *no *DEBUG)])
    if $DEBUG>2;
exit(0) if $DEBUG > 9;

# Work from the top level of the work tree. Ask git for it directly
# instead of deriving it from the git directory: in a linked worktree or
# a submodule the git directory does not sit directly below the top level.
# Note that file names given on the command line are used after this
# chdir.
chomp(my $path = `git rev-parse --show-toplevel`);
die "Not a git repo" if !$path;
chdir($path)
    or die "Failed to chdir to '$path': $!";

# Compile each distinct --no pattern once.
my %seen;
@no = map { !$seen{$_}++ ? qr/$_/ : () } @no;

my $status = get_status();
exit(0) if !%$status && !@ARGV;

my $files = (!defined($autodetect) ||           # if autodetect is undef - do NOT use git status
             $autodetect eq '0' && @ARGV)       # or autodetect eq '0' and @ARGV has stuff in it
          ? \@ARGV                              # then use ARGV
          : get_modified_files($autodetect, $status);   # otherwise use git status to find the files

clean_files($files, $status, \@no, $full);
230+
231+
__END__
232+
233+
=head1 NAME
234+
235+
clean-commit - whitespace clean modified files in a git repository
236+
237+
=head1 SYNOPSIS
238+
239+
clean-commit [options] [file ...]
240+
241+
Options:
242+
--no=REGEX ignore anything matching this
243+
-a --auto=MODE use git to find modified files - MODE can be [MADRCU]
244+
-A --no-auto do not use git to find files if the arg list is empty
245+
-f --full clean the full file, not just the changed bits
246+
--help brief help message
247+
--man full documentation
248+
--verbose print debugging information
249+
250+
=head1 OPTIONS
251+
252+
Either processes the provided list of files or if none are provided then
253+
uses C<git status> to find the files. You can use the C<--auto> and
254+
C<--no-auto> to fine-tune this behaviour.
255+
256+
=over 8
257+
258+
=item B<--no=REGEX>
259+
260+
Any file matching this will be ignored. May be used more than once.
261+
REGEX is a perl syntax regular expression.
262+
263+
=item B<-a>
264+
265+
=item B<--auto>
266+
267+
=item B<--auto=MODES>
268+
269+
Use git status to find modified files. Defaults to 'MA', legal values
270+
are as follows (most can be combined).
271+
272+
MODE Meaning
273+
'0' use @ARGV if it's there, otherwise use the default mode ('MA')
274+
'' use default mode 'MA'
275+
' ' unchanged
276+
'M' modified
277+
'A' added
278+
'R' renamed
279+
'C' copied
280+
'U' unmerged
281+
282+
The default behaviour of the tool is C<--auto=0>, which causes the tool
283+
to process any files passed in on the command line, and to otherwise use
284+
C<git status> with the default mode ('MA') to find the files. Any other use
285+
of this option causes the tool to ignore any file on the command line.
286+
The use of the C<--no-auto> option overrides this behaviour the other way
287+
and causes the tool to process only the files passed in, even if that
288+
means doing nothing. If used together the last used wins.
289+
290+
See the documentation for the C<git status> command, and the C<--porcelain> option
291+
for more details on the mode values.
292+
293+
=item B<-A>
294+
295+
=item B<--no-auto>
296+
297+
Do not use C<git status>, only process files passed in on the command line, even if
298+
that means processing nothing. If combined with B<--auto>, whichever is used last wins.
299+
See also the documentation for C<--auto>
300+
301+
=item B<-v>
302+
303+
=item B<--verbose>
304+
305+
Output debugging information. Right now this is not very pretty.
306+
307+
=item B<--help>
308+
309+
Prints a brief help message and exits.
310+
311+
=item B<--man>
312+
313+
Prints the manual page and exits.
314+
315+
=back
316+
317+
=head1 DESCRIPTION
318+
319+
B<clean-commit> will read the given input file(s) and use git to
320+
determine which parts have been modified, and then clean any whitespace
321+
issues in the modified parts. Cleaning that is performed is to eliminate
322+
trailing whitespace and convert tabs to spaces.
323+
324+
The default behaviour is to DWIM, and either process the specified
325+
files, or use C<git status> to find them. The C<--no-auto> option means
326+
the tool will only process the explicitly provided files, even if that
327+
means doing nothing. The C<--auto> option can be used to force C<git
328+
status> to be used to find the files, even if a list of files have been
329+
provided, and can be used to change which types of file modifications
330+
are chosen to be cleaned, for instance C<--auto=MARC> would clean any
331+
modified, added, renamed, or copied files, instead of the normal default
332+
of just cleaning modified and added files.
333+
334+
Files which contain the string 'do not edit this file' in *uppercase*
335+
are excluded from cleanups.
336+
337+
Trailing whitespace is removed from all files, tabs are converted to
338+
spaces in files ending in C<.pl>, C<.pm>, C<.xs>, C<.c> and C<.h>, any
339+
totally blank lines at the end of a file will be removed (either when
340+
the file is new, or when the lines were added to an existing file).
341+
342+
=cut

Porting/exec-bit.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ Porting/checkansi.pl
5050
Porting/checkcfguse.pl
5151
Porting/checkcfgvar.pl
5252
Porting/checkpodencoding.pl
53+
Porting/clean-commit
5354
Porting/cmpVERSION.pl
5455
Porting/config_h.pl
5556
Porting/corecpan.pl

0 commit comments

Comments
 (0)