-
Notifications
You must be signed in to change notification settings - Fork 603
Add a tool which can clean up whitespace in a commit intelligently #19441
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
demerphq
wants to merge
1
commit into
blead
Choose a base branch
from
yves/clean-commit
base: blead
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,342 @@ | ||
| #!/usr/bin/perl | ||
| use Data::Dumper; | ||
| use strict; | ||
| use warnings; | ||
| use Getopt::Long; | ||
| use Pod::Usage; | ||
|
|
||
| my $DEBUG=0; | ||
|
|
||
| use constant { | ||
| NULL_SHA1 => ("0" x 40), | ||
| TAB => " " x 8, | ||
| DO_NOT_EDIT => uc("do not edit this file"), | ||
| }; | ||
|
|
||
| my $TAB= TAB; # for regexes | ||
| my $DO_NOT_EDIT= DO_NOT_EDIT; # for interpolation | ||
|
|
||
| sub read_blame_file { | ||
| my ($args, $file, $callback)= @_; | ||
| print "executing: git blame -p $args $file\n" if $DEBUG > 1; | ||
| open my $fh,"git blame -p $args $file 2>/dev/null |" | ||
| or die "Failed to open pipe: $?"; | ||
| my ($line_props, $sha1); | ||
| my %commit_props; | ||
| my $read= 0; | ||
| while (<$fh>) { | ||
| $read++; | ||
| if (/^([0-9a-fA-F]{40}) (\d+) (\d+)(?: (\d+))/) { | ||
| $line_props= { sha1=> ($sha1 = $1), src_line => $2, dst_line => $3, group_size => $4 }; | ||
| } elsif (/^(\S+)(?: (.*))?/) { | ||
| $commit_props{$sha1}{$1}= $2; | ||
| } elsif ( s/^\t// ) { | ||
| $line_props->{text}= $_; | ||
| $callback->($sha1, $line_props, \%commit_props); | ||
| } | ||
| } | ||
| return $read; | ||
| } | ||
|
|
||
| sub clean_file { | ||
| my $file= shift | ||
| or die "Must have a file name to clean\n"; | ||
| if (-B $file) { | ||
| warn "skipping $file as it is binary\n"; | ||
| return; | ||
| } | ||
| my $full= shift; | ||
| print "clean_file($file)\n" if $DEBUG > 1; | ||
| my $clean_tabs= $file=~/\.(?:p[lm]|[ch]|xs)\z/; | ||
| my $read; | ||
| my @out_lines; | ||
| unless ( $full ) { | ||
| my $noted; | ||
| $read= read_blame_file("--since=HEAD", $file, sub { | ||
| my ($sha1, $line_props, $commit_props)= @_; | ||
| # the following trickery is so if blame fails for an unchecked in file | ||
| # we dont print "cleaning using blame" when we will do the full file. | ||
| print "\tcleaning using blame '$file'\n" unless $noted++; | ||
| my $line= $line_props->{text}; | ||
| if ($full or $sha1 eq NULL_SHA1) { | ||
| my $modified= 0; | ||
| $modified += $line =~ s/\t/$TAB/g if $clean_tabs and $file=~/\./; | ||
| $modified += $line =~ s/\s+\z/\n/; | ||
| print "\tcleaned line $line_props->{dst_line}\n" | ||
| if $DEBUG > 2 and $modified; | ||
| } | ||
| push @out_lines, $line; | ||
| }); | ||
| } | ||
| if ($read) { | ||
| write_file($file, \@out_lines); | ||
| } else { | ||
| print "\tcleaning new file '$file'\n"; | ||
| clean_new($file); | ||
| } | ||
| } | ||
|
|
||
| sub write_file { | ||
| my ($file, $out_lines)= @_; | ||
| pop @$out_lines while @$out_lines and $out_lines->[-1] eq "\n"; | ||
| my $mode= (stat $file)[2]; | ||
| open my $ofh, ">", "$file.out" | ||
| or die "Failed to open '$file.out' for write"; | ||
| print $ofh @$out_lines; | ||
| close $ofh or die "Failed to close '$file.out':$!"; | ||
| rename "$file.out", $file or die "Failed to rename '$file.out' to '$file':$!"; | ||
| chmod $mode, $file or die sprintf "Failed to chmod '%s' to %3o: %s", $file, $mode, $!; | ||
| } | ||
|
|
||
| sub clean_new { | ||
| my ($file)= @_; | ||
| print "clean_new($file)\n" if $DEBUG > 1; | ||
| my $clean_tabs= $file=~/\.(?:p[lm]|[ch]|xs)\z/; | ||
| open my $ifh, "<", $file | ||
| or die "Failed to open '$file' for read"; | ||
| my @out_lines; | ||
| while (<$ifh>) { | ||
| my $modified= 0; | ||
| $modified += s/\t/$TAB/g if $clean_tabs; | ||
| $modified += s/\s+\z/\n/; | ||
| print "\tcleaned line $.\n" | ||
| if $DEBUG > 2 and $modified; | ||
| push @out_lines, $_; | ||
| } | ||
| close $ifh or die "Failed to close '$file':$1"; | ||
| write_file($file, \@out_lines); | ||
| } | ||
|
|
||
| sub get_modified_files { | ||
| my ($autodetect,$status)= @_; | ||
| return [] if !defined $autodetect; | ||
| $autodetect ||= "MA"; | ||
| if ($autodetect and $autodetect=~/([^ MADRCU])/) { | ||
| die "Unknown mode '$1' in '$autodetect', must be one of [ MADRCU]\n", | ||
| "See git status --help for more information\n"; | ||
| } | ||
| print "looking for files with mode [$autodetect]...\n" if $DEBUG; | ||
| print Dumper($status) if $DEBUG > 1; | ||
| $status||= get_status(); | ||
| my @files; | ||
| foreach my $file (sort keys %$status) { | ||
| push @files, $file | ||
| if $status->{$file}=~m/[$autodetect]/; | ||
| } | ||
| return filter_do_not_edit_files(\@files); | ||
| } | ||
|
|
||
| sub filter_do_not_edit_files { | ||
| my ($files)= @_; | ||
| my @ret_files; | ||
| # we hide this next line from the check by storing | ||
| # it in lower case and then using uc to fix it for the grep | ||
| while (@$files) { | ||
| my @these_files= splice @$files,0,50; | ||
| open my $cmd, "git grep -L '$DO_NOT_EDIT' @these_files |"; | ||
| while (<$cmd>) { | ||
| chomp; | ||
| push @ret_files, $_; | ||
| } | ||
| } | ||
| return \@ret_files; | ||
| } | ||
|
|
||
| sub get_status { | ||
| open my $cmd, "git status --porcelain |" | ||
| or die "No status?"; | ||
| my %files; | ||
| while (<$cmd>) { | ||
| print if $DEBUG > 2; | ||
| my ($mode,$file1,$file2)= /(..) (.*?)(?: -> (.*))?$/ | ||
| or die "Can't parse: $_"; | ||
| $file2 ||= $file1; | ||
| if ($mode =~ /[MARC]/) { | ||
| $files{$file2}=$mode; | ||
| } | ||
| } | ||
| #die Dumper(\%files); | ||
| close $cmd; | ||
| return \%files | ||
| } | ||
|
|
||
| sub clean_files { | ||
| my ($files, $status, $skip, $full)=@_; | ||
| my @todo; | ||
| FILE: | ||
| for my $file (@$files) { | ||
| print "\tchecking $file\n" if $DEBUG > 1; | ||
| if (!-f $file) { | ||
| print "ignoring '$file': not a regular file\n"; | ||
| next FILE; | ||
| } | ||
| for my $pat (@$skip) { | ||
| if ($file =~ m/$pat/) { | ||
| print "skipping '$file': it matches 'no' pattern $pat\n" if $DEBUG; | ||
| next FILE; | ||
| } | ||
| } | ||
| if ($full or $status->{$file}) { | ||
| push @todo, $file; | ||
| } else { | ||
| print "leaving '$file': it is unchanged\n" if $DEBUG; | ||
| } | ||
| } | ||
| if (@todo) { | ||
| # might put stuff here | ||
| clean_file($_,$full) for @todo; | ||
| } | ||
| } | ||
|
|
||
| my $full = 0; # if true clean the full file | ||
| my $autodetect= 0; # '0' means DWIM (use @ARGV if it has stuff otherwise autodetect) , | ||
| # undef means use @ARGV regardless, | ||
| # '' or anything else means autdetect regardless. | ||
| Getopt::Long::Configure("bundling"); | ||
| GetOptions( | ||
| 'n|no=s' => \my @no, | ||
| 'v|verbose+' => \($DEBUG), | ||
| 'h|help|?' => \my $help, | ||
| 'man' => \my $man, | ||
| 'f|full' => \$full, | ||
| 'F|no-full' => sub { undef $full}, | ||
| 'a|auto:s' => \$autodetect, | ||
| 'A|no-auto' => sub { undef $autodetect }, | ||
| ) or pod2usage(2); | ||
| pod2usage(1) if $help; | ||
| pod2usage(-exitstatus => 0, -verbose => 2) if $man; | ||
|
|
||
| warn Data::Dumper->Dump([\@ARGV,$autodetect,$full,\@no,$DEBUG],[qw(*ARGV *autodetect *full *no *DEBUG)]) | ||
| if $DEBUG>2; | ||
| exit(0) if $DEBUG > 9; | ||
|
|
||
| chomp(my $path= `git rev-parse --git-dir`); | ||
| die "Not a git repo" if !$path; | ||
| chdir( $path . "/.." ) | ||
| or die "Failed to chdir to '$path/..': $!"; | ||
|
|
||
| my %seen; | ||
| @no= map { !$seen{$_}++ ? qr/$_/ : () } @no; | ||
|
|
||
| my $status= get_status(); | ||
| exit(0) if !%$status && !@ARGV; | ||
|
|
||
| my $files= (!defined($autodetect) || # if autodetect is undef - do NOT use git status | ||
| $autodetect eq '0' && @ARGV) # or autodetect eq '0' and @ARGV has stuff in it | ||
| ? \@ARGV # then use ARGV | ||
| : get_modified_files($autodetect,$status); # otherwise use git status to find the files | ||
|
|
||
| clean_files($files,$status,\@no,$full); | ||
|
|
||
| __END__ | ||
|
|
||
| =head1 NAME | ||
|
|
||
| clean-commit - whitespace clean modified files in a git repository | ||
|
|
||
| =head1 SYNOPSIS | ||
|
|
||
| clean-commit [options] [file ...] | ||
|
|
||
| Options: | ||
| --no=REGEX ignore anything matching this | ||
| -a --auto=MODE use git to find modified files - MODE can be [MADRCU] | ||
| -A --no-auto do not use git to find files if the arg list is empty | ||
| -f --full clean the full file, not just the changed bits | ||
| --help brief help message | ||
| --man full documentation | ||
| --verbose print debugging information | ||
|
|
||
| =head1 OPTIONS | ||
|
|
||
| Either processes the provided list of files or if none are provided then | ||
| uses C<git status> to find the files. You can use the C<--auto> and | ||
| C<--no-auto> to fine-tune this behaviour. | ||
|
|
||
| =over 8 | ||
|
|
||
| =item B<--no=REGEX> | ||
|
|
||
| Any file matching this will be ignored. May be used more than once. | ||
| REGEX is a perl syntax regular expression. | ||
|
|
||
| =item B<-a> | ||
|
|
||
| =item B<--auto> | ||
|
|
||
| =item B<--auto=MODES> | ||
|
|
||
| Use git status to find modified files. Defaults to 'M', legal values | ||
| are as follows (most can be combined). | ||
|
|
||
| MODE Meaning | ||
| '0' use @ARGV if its there, otherwise use mode ('M') | ||
| '' use default mode 'M' | ||
| ' ' unchanged | ||
| 'M' modified | ||
| 'A' added | ||
| 'R' renamed | ||
| 'C' copied | ||
| 'U' unmerged | ||
|
|
||
| The default behaviour of the tool is C<--auto=0>, which causes the tool | ||
| to process any files passed in on the command line, and to otherwise use | ||
| C<git status> with the default mode ('M') to find the files. Any other use | ||
| of this option causes the tool to ignore any file on the command line. | ||
| The use of the C<--no-auto> option overrides this behaviour the other way | ||
| and causes the tool to process only the files passed in, even if that | ||
| means doing nothing. If used together the last used wins. | ||
|
|
||
| See the documentation for the C<git status> command, and the C<--porcelain> option | ||
| for more details on the mode values. | ||
|
|
||
| =item B<-A> | ||
|
|
||
| =item B<--no-auto> | ||
|
|
||
| Do not use C<git status>, only process files passed in on the command line, even if | ||
| that means processing nothing. If combined with B<--auto> which is used last wins. | ||
| See also the documentation for C<--auto> | ||
|
|
||
| =item B<-v> | ||
|
|
||
| =item B<--verbose> | ||
|
|
||
| Output debugging information. Right now this is not very pretty. | ||
|
|
||
| =item B<-help> | ||
|
|
||
| Print a brief help message and exits. | ||
|
|
||
| =item B<-man> | ||
|
|
||
| Prints the manual page and exits. | ||
|
|
||
| =back | ||
|
|
||
| =head1 DESCRIPTION | ||
|
|
||
| B<clean-commit> will read the given input file(s) and use git to | ||
| determine which parts have been modified, and then clean any whitespace | ||
| issues in the modified parts. Cleaning that is performed is to eliminate | ||
| trailing whitespace and convert tabs to spaces. | ||
|
|
||
| The default behaviour is to DWIM, and either process the specified | ||
| files, or use C<git status> to find them. The C<--no-auto> option means | ||
| the tool will only process the explicitly provided files, even if that | ||
| means doing nothing. The C<--auto> option can be used to force C<git | ||
| status> to be used to find the files, even if a list of files have been | ||
| provided, and can be used to change which types of file modifications | ||
| are chosen to be cleaned, for instance C<--mode=MARC> would clean any | ||
| modified, added, renamed, or copied files, instead of the normal default | ||
| of just cleaning modified files. | ||
|
|
||
| Files which contain the string 'do not edit this file' in *uppercase* | ||
| are excluded from cleanups. | ||
|
|
||
| Trailing whitespace is removed from all files, tabs are converted to | ||
| spaces in files ending in C<.pl>, C<.pm>, C<.xs>, C<.c> and C<.h>, any | ||
| totally blank lines at the end of a file will be removed (either when | ||
| the file is new or, when the lines were added to an existing file). | ||
|
|
||
| =cut | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In most of the Perl code I've seen in the core distribution, 4 spaces is the equivalent of a tab. I would say that it's about 75% 4-spaces, 23% 2-spaces, balance other. So I think setting
TABto 8 is not consistent with our practice. (I'm not claiming we're very consistent about this.) I realize that inpod/perlpod.podthere is a section on "Verbatim Paragraph" that calls for 8-space tabs, but don't see any indication that that applies more generally.Perhaps we could make this configurable with a command-line switch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No it not. The indent was 4 spaces, but a tab was always 8.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not going to get dogmatic about 8-space tab, but can you point us to where that's documented?