Perl · demerphq · Feb 19, 2022 · jkeenan · Aug 7, 2022 · Leont
diff --git a/MANIFEST b/MANIFEST
@@ -5403,6 +5403,7 @@ Porting/checkcfgvar.pl		Check that config scripts define all symbols
 Porting/checkpodencoding.pl	Check POD encoding
 Porting/checkURL.pl		Check whether we have working URLs
 Porting/checkVERSION.pl		Check whether we have $VERSIONs
+Porting/clean-commit		Cleanup whitespace issues in a commit using git blame
 Porting/cmpVERSION.pl		Compare whether two trees have changed modules
 Porting/config.sh		Sample config.sh
 Porting/config_H		Sample config.h

diff --git a/Porting/README.pod b/Porting/README.pod
@@ -82,6 +82,19 @@ Checks that all the URLs in the Perl source are valid.
 Used by F<t/porting/cmp_version.t> to ensure changed modules have had their
 versions updated.
 
+=head2 F<clean-commit>
+
+This tool can be used to intelligently clean up whitespace issue in a
+patch. For files already under git control it will only clean lines that
+have been modified as part of the commit. It will not modify generated
+files which have the marker "do not edit this file" (in uppercase) in
+them. It will convert tabs to spaces in files .pl, .pm, .xs, .c and .h
+files, and it will remove trailing whitespace from all file types. It will
+also remove blank lines at the end of a file should the commit add any.
+
+By default it will not modify files not under git control, but if new
+files are `git add`ed then it will clean then entire file.
+
 =head2 F<cmpVERSION.pl>
 
 Compare the current Perl source tree and a given tag for modules that have
@@ -394,4 +407,3 @@ leaks.
 Guide for Vote Administrators for running Steering Council elections.
 
 =cut
-
diff --git a/Porting/clean-commit b/Porting/clean-commit
@@ -0,0 +1,342 @@
+#!/usr/bin/perl
+use Data::Dumper;
+use strict;
+use warnings;
+use Getopt::Long;
+use Pod::Usage;
+
+my $DEBUG=0;
+
+use constant {
+    NULL_SHA1 => ("0" x 40),
+    TAB => " " x 8,
+    DO_NOT_EDIT => uc("do not edit this file"),
+};
+
+my $TAB= TAB; # for regexes
+my $DO_NOT_EDIT= DO_NOT_EDIT; # for interpolation
+
+sub read_blame_file {
+    my ($args, $file, $callback)= @_;
+    print "executing: git blame -p $args $file\n" if $DEBUG > 1;
+    open my $fh,"git blame -p $args $file 2>/dev/null |"
+        or die "Failed to open pipe: $?";
+    my ($line_props, $sha1);
+    my %commit_props;
+    my $read= 0;
+    while (<$fh>) {
+        $read++;
+        if (/^([0-9a-fA-F]{40}) (\d+) (\d+)(?: (\d+))/) {
+            $line_props= { sha1=> ($sha1 = $1), src_line => $2, dst_line => $3, group_size => $4 };
+        } elsif (/^(\S+)(?: (.*))?/) {
+            $commit_props{$sha1}{$1}= $2;
+        } elsif ( s/^\t// ) {
+            $line_props->{text}= $_;
+            $callback->($sha1, $line_props, \%commit_props);
+        }
+    }
+    return $read;
+}
+
+sub clean_file {
+    my $file= shift
+        or die "Must have a file name to clean\n";
+    if (-B $file) {
+        warn "skipping $file as it is binary\n";
+        return;
+    }
+    my $full= shift;
+    print "clean_file($file)\n" if $DEBUG > 1;
+    my $clean_tabs= $file=~/\.(?:p[lm]|[ch]|xs)\z/;
+    my $read;
+    my @out_lines;
+    unless ( $full ) {
+        my $noted;
+        $read= read_blame_file("--since=HEAD", $file, sub {
+            my ($sha1, $line_props, $commit_props)= @_;
+            # the following trickery is so if blame fails for an unchecked in file
+            # we dont print "cleaning using blame" when we will do the full file.
+            print "\tcleaning using blame '$file'\n" unless $noted++;
+            my $line= $line_props->{text};
+            if  ($full or $sha1 eq NULL_SHA1) {
+                my $modified= 0;
+                $modified += $line =~ s/\t/$TAB/g if $clean_tabs and $file=~/\./;
+                $modified += $line =~ s/\s+\z/\n/;
+                print "\tcleaned line $line_props->{dst_line}\n"
+                    if $DEBUG > 2 and $modified;
+            }
+            push @out_lines, $line;
+        });
+    }
+    if ($read) {
+        write_file($file, \@out_lines);
+    } else {
+        print "\tcleaning new file '$file'\n";
+        clean_new($file);
+    }
+}
+
+sub write_file {
+    my ($file, $out_lines)= @_;
+    pop @$out_lines while @$out_lines and $out_lines->[-1] eq "\n";
+    my $mode= (stat $file)[2];
+    open my $ofh, ">", "$file.out"
+        or die "Failed to open '$file.out' for write";
+    print $ofh @$out_lines;
+    close $ofh or die "Failed to close '$file.out':$!";
+    rename "$file.out", $file or die "Failed to rename '$file.out' to '$file':$!";
+    chmod $mode, $file or die sprintf "Failed to chmod '%s' to %3o: %s", $file, $mode, $!;
+}
+
+sub clean_new {
+    my ($file)= @_;
+    print "clean_new($file)\n" if $DEBUG > 1;
+    my $clean_tabs= $file=~/\.(?:p[lm]|[ch]|xs)\z/;
+    open my $ifh, "<", $file
+        or die "Failed to open '$file' for read";
+    my @out_lines;
+    while (<$ifh>) {
+        my $modified= 0;
+        $modified += s/\t/$TAB/g if $clean_tabs;
+        $modified += s/\s+\z/\n/;
+        print "\tcleaned line $.\n"
+            if $DEBUG > 2 and $modified;
+        push @out_lines, $_;
+    }
+    close $ifh or die "Failed to close '$file':$1";
+    write_file($file, \@out_lines);
+}
+
+sub get_modified_files {
+    my ($autodetect,$status)= @_;
+    return [] if !defined $autodetect;
+    $autodetect ||= "MA";
+    if ($autodetect and $autodetect=~/([^ MADRCU])/) {
+        die "Unknown mode '$1' in '$autodetect', must be one of [ MADRCU]\n",
+            "See git status --help for more information\n";
+    }
+    print "looking for files with mode [$autodetect]...\n" if $DEBUG;
+    print Dumper($status) if $DEBUG > 1;
+    $status||= get_status();
+    my @files;
+    foreach my $file (sort keys %$status) {
+        push @files, $file
+            if $status->{$file}=~m/[$autodetect]/;
+    }
+    return filter_do_not_edit_files(\@files);
+}
+
+sub filter_do_not_edit_files {
+    my ($files)= @_;
+    my @ret_files;
+    # we hide this next line from the check by storing
+    # it in lower case and then using uc to fix it for the grep
+    while (@$files) {
+        my @these_files= splice @$files,0,50;
+        open my $cmd, "git grep -L '$DO_NOT_EDIT' @these_files |";
+        while (<$cmd>) {
+            chomp;
+            push @ret_files, $_;
+        }
+    }
+    return \@ret_files;
+}
+
+sub get_status {
+    open my $cmd, "git status --porcelain |"
+        or die "No status?";
+    my %files;
+    while (<$cmd>) {
+        print if $DEBUG > 2;
+        my ($mode,$file1,$file2)= /(..) (.*?)(?: -> (.*))?$/
+            or die "Can't parse: $_";
+        $file2 ||= $file1;
+        if ($mode =~ /[MARC]/) {
+            $files{$file2}=$mode;
+        }
+    }
+    #die Dumper(\%files);
+    close $cmd;
+    return \%files
+}
+
+sub clean_files {
+    my ($files, $status, $skip, $full)=@_;
+    my @todo;
+    FILE:
+    for my $file (@$files) {
+        print "\tchecking $file\n" if $DEBUG > 1;
+        if (!-f $file) {
+            print "ignoring '$file': not a regular file\n";
+            next FILE;
+        }
+        for my $pat (@$skip) {
+            if ($file =~ m/$pat/) {
+                print "skipping '$file': it matches 'no' pattern $pat\n" if $DEBUG;
+                next FILE;
+            }
+        }
+        if ($full or $status->{$file}) {
+            push @todo, $file;
+        } else {
+            print "leaving '$file': it is unchanged\n" if $DEBUG;
+        }
+    }
+    if (@todo) {
+        # might put stuff here
+        clean_file($_,$full) for @todo;
+    }
+}
+
+my $full      = 0; # if true clean the full file
+my $autodetect= 0; # '0' means DWIM (use @ARGV if it has stuff otherwise autodetect) ,
+                   # undef means use @ARGV regardless,
+                   # '' or anything else means autdetect regardless.
+Getopt::Long::Configure("bundling");
+GetOptions(
+    'n|no=s'          => \my @no,
+    'v|verbose+'    => \($DEBUG),
+    'h|help|?'        => \my $help,
+    'man'           => \my $man,
+    'f|full'        => \$full,
+    'F|no-full'     => sub { undef $full},
+    'a|auto:s'      => \$autodetect,
+    'A|no-auto'     => sub { undef $autodetect },
+) or pod2usage(2);
+pod2usage(1) if $help;
+pod2usage(-exitstatus => 0, -verbose => 2) if $man;
+
+warn Data::Dumper->Dump([\@ARGV,$autodetect,$full,\@no,$DEBUG],[qw(*ARGV *autodetect *full *no *DEBUG)])
+    if $DEBUG>2;
+exit(0) if $DEBUG > 9;
+
+chomp(my $path= `git rev-parse --git-dir`);
+die "Not a git repo" if !$path;
+chdir( $path . "/.." )
+    or die "Failed to chdir to '$path/..': $!";
+
+my %seen;
+@no= map { !$seen{$_}++ ? qr/$_/ : () } @no;
+
+my $status= get_status();
+exit(0) if !%$status && !@ARGV;
+
+my $files= (!defined($autodetect) ||           # if autodetect is undef - do NOT use git status
+            $autodetect eq '0' && @ARGV)       # or autodetect eq '0' and @ARGV has stuff in it
+           ? \@ARGV                            # then use ARGV
+           : get_modified_files($autodetect,$status);  # otherwise use git status to find the files
+
+clean_files($files,$status,\@no,$full);
+
+__END__
+
+=head1 NAME
+
+clean-commit - whitespace clean modified files in a git repository
+
+=head1 SYNOPSIS
+
+clean-commit [options] [file ...]
+
+ Options:
+   --no=REGEX        ignore anything matching this
+   -a --auto=MODE    use git to find modified files - MODE can be [MADRCU]
+   -A --no-auto      do not use git to find files if the arg list is empty
+   -f --full         clean the full file, not just the changed bits
+   --help            brief help message
+   --man             full documentation
+   --verbose         print debugging information
+
+=head1 OPTIONS
+
+Either processes the provided list of files or if none are provided then
+uses C<git status> to find the files. You can use the C<--auto> and
+C<--no-auto> to fine-tune this behaviour.
+
+=over 8
+
+=item B<--no=REGEX>
+
+Any file matching this will be ignored. May be used more than once.
+REGEX is a perl syntax regular expression.
+
+=item B<-a>
+
+=item B<--auto>
+
+=item B<--auto=MODES>
+
+Use git status to find modified files. Defaults to 'M', legal values
+are as follows (most can be combined).
+
+    MODE    Meaning
+    '0'     use @ARGV if its there, otherwise use mode ('M')
+    ''      use default mode 'M'
+    ' '     unchanged
+    'M'     modified
+    'A'     added
+    'R'     renamed
+    'C'     copied
+    'U'     unmerged
+
+The default behaviour of the tool is C<--auto=0>, which causes the tool
+to process any files passed in on the command line, and to otherwise use
+C<git status> with the default mode ('M') to find the files. Any other use
+of this option causes the tool to ignore any file on the command line.
+The use of the C<--no-auto> option overrides this behaviour the other way
+and causes the tool to process only the files passed in, even if that
+means doing nothing. If used together the last used wins.
+
+See the documentation for the C<git status> command, and the C<--porcelain> option
+for more details on the mode values.
+
+=item B<-A>
+
+=item B<--no-auto>
+
+Do not use C<git status>, only process files passed in on the command line, even if
+that means processing nothing. If combined with B<--auto> which is used last wins.
+See also the documentation for C<--auto>
+
+=item B<-v>
+
+=item B<--verbose>
+
+Output debugging information. Right now this is not very pretty.
+
+=item B<-help>
+
+Print a brief help message and exits.
+
+=item B<-man>
+
+Prints the manual page and exits.
+
+=back
+
+=head1 DESCRIPTION
+
+B<clean-commit> will read the given input file(s) and use git to
+determine which parts have been modified, and then clean any whitespace
+issues in the modified parts. Cleaning that is performed is to eliminate
+trailing whitespace and convert tabs to spaces.
+
+The default behaviour is to DWIM, and either process the specified
+files, or use C<git status> to find them. The C<--no-auto> option means
+the tool will only process the explicitly provided files, even if that
+means doing nothing. The C<--auto> option can be used to force C<git
+status> to be used to find the files, even if a list of files have been
+provided, and can be used to change which types of file modifications
+are chosen to be cleaned, for instance C<--mode=MARC> would clean any
+modified, added, renamed, or copied files, instead of the normal default
+of just cleaning modified files.
+
+Files which contain the string 'do not edit this file' in *uppercase*
+are excluded from cleanups.
+
+Trailing whitespace is removed from all files, tabs are converted to
+spaces in files ending in C<.pl>, C<.pm>, C<.xs>, C<.c> and C<.h>, any
+totally blank lines at the end of a file will be removed (either when
+the file is new or, when the lines were added to an existing file).
+
+=cut
diff --git a/Porting/exec-bit.txt b/Porting/exec-bit.txt
@@ -50,6 +50,7 @@ Porting/checkansi.pl
 Porting/checkcfguse.pl
 Porting/checkcfgvar.pl
 Porting/checkpodencoding.pl
+Porting/clean-commit
 Porting/cmpVERSION.pl
 Porting/config_h.pl
 Porting/corecpan.pl