From a1751510661e0347815ffe6e1382b5f8be372ea2 Mon Sep 17 00:00:00 2001 From: FPa-riken Date: Tue, 8 Nov 2022 10:11:42 +0900 Subject: [PATCH 1/3] fix identification of valid annex pointer file --- internal/tool/file.go | 56 ++++++++++++++++++++++++++++++++++---- internal/tool/file_test.go | 52 +++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 internal/tool/file_test.go diff --git a/internal/tool/file.go b/internal/tool/file.go index 6d1e3ad7c..f59442be1 100644 --- a/internal/tool/file.go +++ b/internal/tool/file.go @@ -8,6 +8,7 @@ import ( "fmt" "math" "net/http" + "regexp" "strings" ) @@ -27,16 +28,61 @@ func IsTextFile(data []byte) bool { return strings.Contains(http.DetectContentType(data), "text/") } +var RE_ANNEXPOINTERFILE = regexp.MustCompile(`^(/annex/objects/([A-Z][\-_0-9A-Za-z]+)(?:\n|\r|\z))`) + +//reference: https://git-annex.branchable.com/internals/pointer_file/ func IsAnnexedFile(data []byte) bool { - const ANNEXSNIFFSIZE = 5000 - if !(len(data) < ANNEXSNIFFSIZE) { - data = data[:ANNEXSNIFFSIZE] + + const ANNEXPOINTERFILE_MAXSIZE = 32 * 1024 + const ANNEXSNIFFSIZE = 512 + + var dataLen = len(data) + + //The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file. 
+ if dataLen > ANNEXPOINTERFILE_MAXSIZE { + return false + } + + var sniffData []byte + if !(dataLen < ANNEXSNIFFSIZE) { + sniffData = data[:ANNEXSNIFFSIZE] + } else { + sniffData = data } - if strings.Contains(http.DetectContentType(data), "text/") { - return strings.Contains(string(data), "/annex/objects") + + //annex pointer file is a text file + if strings.Contains(http.DetectContentType(sniffData), "text/") { + + //A pointer file starts with "/annex/objects/", which is followed by the key + matchAnnexPointer := RE_ANNEXPOINTERFILE.FindStringSubmatch(string(sniffData)) + + if len(matchAnnexPointer) > 0 { + //var annexKey = matchAnnexPointer[2] + + //git-annex does support pointer files with additional text on subsequent lines. + var hasAdditionalText = len(sniffData) > len(matchAnnexPointer[1]) || dataLen > ANNEXSNIFFSIZE + + if hasAdditionalText { + //every such subsequent line must contain "/annex/" somewhere in it, and end with a newline. + var extraLines = strings.SplitAfter(string(data), "\n")[1:] + + if extraLines[len(extraLines)-1] != "" { + //if last line isn't empty, it means it was missing required newline character + return false + } else { + for _, line := range extraLines[0 : len(extraLines)-1] { + if !strings.Contains(line, "/annex/") { + return false + } + } + } + } + return true + } } return false } + func IsImageFile(data []byte) bool { return strings.Contains(http.DetectContentType(data), "image/") } diff --git a/internal/tool/file_test.go b/internal/tool/file_test.go new file mode 100644 index 000000000..654b023c9 --- /dev/null +++ b/internal/tool/file_test.go @@ -0,0 +1,52 @@ +package tool + +import ( + "strings" + "testing" + + . 
"github.com/smartystreets/goconvey/convey" +) + +func Test_IsValidAnnexPointerFile(t *testing.T) { + Convey("Check if a (file) content is a valid annex pointer file", t, func() { + testCases := []struct { + expect bool + content string + }{ + // valid key and EOF + {true, "/annex/objects/MD0-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c"}, + // not a key pattern + {false, "foo/bar"}, + + // key pattern doesn't start at the beginning of content + {false, " /annex/objects/MD1-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c"}, + // key contain invalid character + {false, "/annex/objects/M+D2-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c"}, + // newline after key (and no more content) + {true, "/annex/objects/MD3-f4d0aaf2b2ac-7a4cf00fbae9158a1b7c\n"}, + // key can contains underscore (depending on backend) + {true, "/annex/objects/SHA4_384-232439cf00fbae9158a1b7c"}, + + // empty additional line + {false, "/annex/objects/MD5-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n\n"}, + + // valid additional line + {true, "/annex/objects/MD6-f4d0aaf2ba4cf00fbae9158a1b7c\n/annex/\n"}, + // empty additional line + {false, "/annex/objects/MD7-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n/annex/\n\n"}, + // additional line not terminated by new line + {false, "/annex/objects/MD8-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n/annex/"}, + + // valid additional lines + {true, "/annex/objects/MD9-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\r /annex/\n /annex/\n/annex/ \n"}, + // many valid additional lines, within the 32kb max file size + {true, "/annex/objects/MD10-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n" + strings.Repeat("/annex/89\n", 31*1024/10)}, + // many valid additional lines, over the 32kb max file size + {false, "/annex/objects/MD11-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n" + strings.Repeat("/annex/89\n", 32*1024/10)}, + } + + for _, tc := range testCases { + So(IsAnnexedFile([]byte(tc.content)), ShouldEqual, tc.expect) + } + }) +} From cbcb02fe9753923849224ab41320da9c4ad1595e Mon Sep 17 
00:00:00 2001 From: FPa-riken Date: Tue, 8 Nov 2022 17:35:38 +0900 Subject: [PATCH 2/3] fix identification of valid symlink target to annex object --- internal/tool/file.go | 62 ++++++++++++++++++++++++-------------- internal/tool/file_test.go | 9 ++++++ 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/internal/tool/file.go b/internal/tool/file.go index f59442be1..635460e41 100644 --- a/internal/tool/file.go +++ b/internal/tool/file.go @@ -28,9 +28,14 @@ func IsTextFile(data []byte) bool { return strings.Contains(http.DetectContentType(data), "text/") } -var RE_ANNEXPOINTERFILE = regexp.MustCompile(`^(/annex/objects/([A-Z][\-_0-9A-Za-z]+)(?:\n|\r|\z))`) +/* +* An annex object can be checked into git in 2 ways: +* 1. As a "pointer file" (structure described here: https://git-annex.branchable.com/internals/pointer_file/ ) +* 2. As a symbolic link pointing to a file in the .git/annex/objects/ directory. + */ +var RE_ANNEXPOINTERFILE = regexp.MustCompile(`^(/annex/objects/([A-Z][\-\._0-9A-Za-z]+)(?:\n|\r|\z))`) +var RE_SYMLINKPOINTATANNEX = regexp.MustCompile(`^.git/annex/objects/.+`) -//reference: https://git-annex.branchable.com/internals/pointer_file/ func IsAnnexedFile(data []byte) bool { const ANNEXPOINTERFILE_MAXSIZE = 32 * 1024 @@ -39,6 +44,7 @@ func IsAnnexedFile(data []byte) bool { var dataLen = len(data) //The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file. + //The maximum size of a symlink target is SYMLINK_MAX (which is filesystem dependent) but typically way smaller than 32kb. 
if dataLen > ANNEXPOINTERFILE_MAXSIZE { return false } @@ -50,34 +56,44 @@ func IsAnnexedFile(data []byte) bool { sniffData = data } - //annex pointer file is a text file + //Annex pointer file/symlink target content is text type if strings.Contains(http.DetectContentType(sniffData), "text/") { - //A pointer file starts with "/annex/objects/", which is followed by the key - matchAnnexPointer := RE_ANNEXPOINTERFILE.FindStringSubmatch(string(sniffData)) + sniffStr := string(sniffData) + //Check if it's a symbolic link pointing to git-annex subdir + matchSymlinkTarget := RE_SYMLINKPOINTATANNEX.FindStringSubmatch(sniffStr) - if len(matchAnnexPointer) > 0 { - //var annexKey = matchAnnexPointer[2] - - //git-annex does support pointer files with additional text on subsequent lines. - var hasAdditionalText = len(sniffData) > len(matchAnnexPointer[1]) || dataLen > ANNEXSNIFFSIZE - - if hasAdditionalText { - //every such subsequent line must contain "/annex/" somewhere in it, and end with a newline. - var extraLines = strings.SplitAfter(string(data), "\n")[1:] - - if extraLines[len(extraLines)-1] != "" { - //if last line isn't empty, it means it was missing required newline character - return false - } else { - for _, line := range extraLines[0 : len(extraLines)-1] { - if !strings.Contains(line, "/annex/") { - return false + if len(matchSymlinkTarget) > 0 { + return true + } else { + //Check if it's a valid pointer file + + //A pointer file starts with "/annex/objects/", which is followed by the key + matchAnnexPointer := RE_ANNEXPOINTERFILE.FindStringSubmatch(sniffStr) + + if len(matchAnnexPointer) > 0 { + //var annexKey = matchAnnexPointer[2] + + //git-annex does support pointer files with additional text on subsequent lines. + var hasAdditionalText = len(sniffData) > len(matchAnnexPointer[1]) || dataLen > ANNEXSNIFFSIZE + + if hasAdditionalText { + //every such subsequent line must contain "/annex/" somewhere in it, and end with a newline. 
+ var extraLines = strings.SplitAfter(string(data), "\n")[1:] + + if extraLines[len(extraLines)-1] != "" { + //if last line isn't empty, it means it was missing required newline character + return false + } else { + for _, line := range extraLines[0 : len(extraLines)-1] { + if !strings.Contains(line, "/annex/") { + return false + } } } } + return true } - return true } } return false diff --git a/internal/tool/file_test.go b/internal/tool/file_test.go index 654b023c9..2fb783f16 100644 --- a/internal/tool/file_test.go +++ b/internal/tool/file_test.go @@ -15,6 +15,9 @@ func Test_IsValidAnnexPointerFile(t *testing.T) { }{ // valid key and EOF {true, "/annex/objects/MD0-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c"}, + {true, "/annex/objects/SHA256E-s31390--f50d7ac4c6b9031379986bc362fcefb65f1e52621ce1708d537e740fefc59cc0.mp3"}, + {true, "/annex/objects/MD5E-s33142576--02b5f38377b5d268384633b3f1154d4e.nii.gz"}, + // not a key pattern {false, "foo/bar"}, @@ -43,6 +46,12 @@ func Test_IsValidAnnexPointerFile(t *testing.T) { {true, "/annex/objects/MD10-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n" + strings.Repeat("/annex/89\n", 31*1024/10)}, // many valid additional lines, over the 32kb max file size {false, "/annex/objects/MD11-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n" + strings.Repeat("/annex/89\n", 32*1024/10)}, + + // valid symlink target + {true, ".git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, + // invalid symlink target + {false, "git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, + {false, ".git/annex/objects/"}, } for _, tc := range testCases { From 62b8d32830dd21eeabd9b48e8a0295fbe5f91784 Mon Sep 17 00:00:00 2001 From: FPa-riken Date: Wed, 9 Nov 2022 09:04:32 +0900 Subject: [PATCH 3/3] fix annexed object valid symlink for file in subdir --- internal/tool/file.go | 9 ++++++--- internal/tool/file_test.go | 3 
+++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/internal/tool/file.go b/internal/tool/file.go index 635460e41..f62df7ae0 100644 --- a/internal/tool/file.go +++ b/internal/tool/file.go @@ -31,10 +31,14 @@ func IsTextFile(data []byte) bool { /* * An annex object can be checked into git in 2 ways: * 1. As a "pointer file" (structure described here: https://git-annex.branchable.com/internals/pointer_file/ ) -* 2. As a symbolic link pointing to a file in the .git/annex/objects/ directory. +* 2. As a symbolic link pointing to a file in the git-annex directory (located in the .git dir at the base of the repository). */ + +//A pointer file starts with "/annex/objects/", which is followed by the key var RE_ANNEXPOINTERFILE = regexp.MustCompile(`^(/annex/objects/([A-Z][\-\._0-9A-Za-z]+)(?:\n|\r|\z))`) -var RE_SYMLINKPOINTATANNEX = regexp.MustCompile(`^.git/annex/objects/.+`) + +//The symbolic target is a relative path pointing to a file under the .git/annex/objects/ dir +var RE_SYMLINKPOINTATANNEX = regexp.MustCompile(`^(?:\.\./)*.git/annex/objects/.+`) func IsAnnexedFile(data []byte) bool { @@ -68,7 +72,6 @@ func IsAnnexedFile(data []byte) bool { } else { //Check if it's a valid pointer file - //A pointer file starts with "/annex/objects/", which is followed by the key matchAnnexPointer := RE_ANNEXPOINTERFILE.FindStringSubmatch(sniffStr) if len(matchAnnexPointer) > 0 { diff --git a/internal/tool/file_test.go b/internal/tool/file_test.go index 2fb783f16..1a2d3dacb 100644 --- a/internal/tool/file_test.go +++ b/internal/tool/file_test.go @@ -52,6 +52,9 @@ func Test_IsValidAnnexPointerFile(t *testing.T) { // invalid symlink target {false, "git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, {false, ".git/annex/objects/"}, + // valid symlink target for files in sub-directory + {true, 
"../.git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, + {true, "../../.git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, } for _, tc := range testCases {