Skip to content

Commit ed143b7

Browse files
authored
fix: sanitize git clone repo input url (#5234)
1 parent 3641c2b commit ed143b7

File tree

3 files changed

+283
-5
lines changed

3 files changed

+283
-5
lines changed

src/sagemaker/git_utils.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,78 @@
1414
from __future__ import absolute_import
1515

1616
import os
17-
from pathlib import Path
17+
import re
1818
import subprocess
1919
import tempfile
2020
import warnings
21+
from pathlib import Path
22+
from urllib.parse import urlparse
23+
2124
import six
2225
from six.moves import urllib
2326

2427

28+
def _sanitize_git_url(repo_url):
29+
"""Sanitize Git repository URL to prevent URL injection attacks.
30+
31+
Args:
32+
repo_url (str): The Git repository URL to sanitize
33+
34+
Returns:
35+
str: The sanitized URL
36+
37+
Raises:
38+
ValueError: If the URL contains suspicious patterns that could indicate injection
39+
"""
40+
at_count = repo_url.count("@")
41+
42+
if repo_url.startswith("git@"):
43+
# git@ format requires exactly one @
44+
if at_count != 1:
45+
raise ValueError("Invalid SSH URL format: git@ URLs must have exactly one @ symbol")
46+
elif repo_url.startswith("ssh://"):
47+
# ssh:// format can have 0 or 1 @ symbols
48+
if at_count > 1:
49+
raise ValueError("Invalid SSH URL format: multiple @ symbols detected")
50+
elif repo_url.startswith("https://") or repo_url.startswith("http://"):
51+
# HTTPS format allows 0 or 1 @ symbols
52+
if at_count > 1:
53+
raise ValueError("Invalid HTTPS URL format: multiple @ symbols detected")
54+
55+
# Check for invalid characters in the URL before parsing
56+
# These characters should not appear in legitimate URLs
57+
invalid_chars = ["<", ">", "[", "]", "{", "}", "\\", "^", "`", "|"]
58+
for char in invalid_chars:
59+
if char in repo_url:
60+
raise ValueError("Invalid characters in hostname")
61+
62+
try:
63+
parsed = urlparse(repo_url)
64+
65+
# Check for suspicious characters in hostname that could indicate injection
66+
if parsed.hostname:
67+
# Check for URL-encoded characters that might be used for obfuscation
68+
suspicious_patterns = ["%25", "%40", "%2F", "%3A"] # encoded %, @, /, :
69+
for pattern in suspicious_patterns:
70+
if pattern in parsed.hostname.lower():
71+
raise ValueError(f"Suspicious URL encoding detected in hostname: {pattern}")
72+
73+
# Validate that the hostname looks legitimate
74+
if not re.match(r"^[a-zA-Z0-9.-]+$", parsed.hostname):
75+
raise ValueError("Invalid characters in hostname")
76+
77+
except Exception as e:
78+
if isinstance(e, ValueError):
79+
raise
80+
raise ValueError(f"Failed to parse URL: {str(e)}")
81+
else:
82+
raise ValueError(
83+
"Unsupported URL scheme: only https://, http://, git@, and ssh:// are allowed"
84+
)
85+
86+
return repo_url
87+
88+
2589
def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None):
2690
"""Git clone repo containing the training code and serving code.
2791
@@ -87,6 +151,10 @@ def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None):
87151
if entry_point is None:
88152
raise ValueError("Please provide an entry point.")
89153
_validate_git_config(git_config)
154+
155+
# SECURITY: Sanitize the repository URL to prevent injection attacks
156+
git_config["repo"] = _sanitize_git_url(git_config["repo"])
157+
90158
dest_dir = tempfile.mkdtemp()
91159
_generate_and_run_clone_command(git_config, dest_dir)
92160

tests/unit/test_estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2794,7 +2794,7 @@ def test_git_support_bad_repo_url_format(sagemaker_session):
27942794
)
27952795
with pytest.raises(ValueError) as error:
27962796
fw.fit()
2797-
assert "Invalid Git url provided." in str(error)
2797+
assert "Unsupported URL scheme" in str(error)
27982798

27992799

28002800
@patch(

tests/unit/test_git_utils.py

Lines changed: 213 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,12 @@
1212
# language governing permissions and limitations under the License.
1313
from __future__ import absolute_import
1414

15-
import pytest
1615
import os
17-
from pathlib import Path
1816
import subprocess
19-
from mock import patch, ANY
17+
from pathlib import Path
18+
19+
import pytest
20+
from mock import ANY, patch
2021

2122
from sagemaker import git_utils
2223

@@ -494,3 +495,212 @@ def test_git_clone_repo_codecommit_https_creds_not_stored_locally(tempdir, mkdte
494495
with pytest.raises(subprocess.CalledProcessError) as error:
495496
git_utils.git_clone_repo(git_config, entry_point)
496497
assert "returned non-zero exit status" in str(error.value)
498+
499+
500+
class TestGitUrlSanitization:
501+
"""Test cases for Git URL sanitization to prevent injection attacks."""
502+
503+
def test_sanitize_git_url_valid_https_urls(self):
504+
"""Test that valid HTTPS URLs pass sanitization."""
505+
valid_urls = [
506+
"https://github.com/user/repo.git",
507+
"https://gitlab.com/user/repo.git",
508+
"https://[email protected]/user/repo.git",
509+
"https://user:[email protected]/user/repo.git",
510+
"http://internal-git.company.com/repo.git",
511+
]
512+
513+
for url in valid_urls:
514+
# Should not raise any exception
515+
result = git_utils._sanitize_git_url(url)
516+
assert result == url
517+
518+
def test_sanitize_git_url_valid_ssh_urls(self):
519+
"""Test that valid SSH URLs pass sanitization."""
520+
valid_urls = [
521+
"[email protected]:user/repo.git",
522+
"[email protected]:user/repo.git",
523+
"ssh://[email protected]/user/repo.git",
524+
"ssh://git-codecommit.us-west-2.amazonaws.com/v1/repos/test-repo/", # 0 @ symbols - valid for ssh://
525+
"[email protected]:repo.git",
526+
]
527+
528+
for url in valid_urls:
529+
# Should not raise any exception
530+
result = git_utils._sanitize_git_url(url)
531+
assert result == url
532+
533+
def test_sanitize_git_url_blocks_multiple_at_https(self):
534+
"""Test that HTTPS URLs with multiple @ symbols are blocked."""
535+
malicious_urls = [
536+
"https://[email protected]@github.com/repo.git",
537+
"https://[email protected]@gitlab.com/user/repo.git",
538+
"https://a@b@[email protected]/repo.git",
539+
"https://user@[email protected]/legit/repo.git",
540+
]
541+
542+
for url in malicious_urls:
543+
with pytest.raises(ValueError) as error:
544+
git_utils._sanitize_git_url(url)
545+
assert "multiple @ symbols detected" in str(error.value)
546+
547+
def test_sanitize_git_url_blocks_multiple_at_ssh(self):
548+
"""Test that SSH URLs with multiple @ symbols are blocked."""
549+
malicious_urls = [
550+
"[email protected]@github.com:repo.git",
551+
"git@[email protected]:user/repo.git",
552+
"ssh://git@[email protected]/repo.git",
553+
"git@a@b@c:repo.git",
554+
]
555+
556+
for url in malicious_urls:
557+
with pytest.raises(ValueError) as error:
558+
git_utils._sanitize_git_url(url)
559+
# git@ URLs should give "exactly one @ symbol" error
560+
# ssh:// URLs should give "multiple @ symbols detected" error
561+
assert any(
562+
phrase in str(error.value)
563+
for phrase in ["multiple @ symbols detected", "exactly one @ symbol"]
564+
)
565+
566+
def test_sanitize_git_url_blocks_invalid_schemes_and_git_at_format(self):
567+
"""Test that invalid schemes and git@ format violations are blocked."""
568+
# Test unsupported schemes
569+
unsupported_scheme_urls = [
570+
"git-github.com:user/repo.git", # Doesn't start with git@, ssh://, http://, https://
571+
]
572+
573+
for url in unsupported_scheme_urls:
574+
with pytest.raises(ValueError) as error:
575+
git_utils._sanitize_git_url(url)
576+
assert "Unsupported URL scheme" in str(error.value)
577+
578+
# Test git@ URLs with wrong @ count
579+
invalid_git_at_urls = [
580+
"[email protected]@evil.com:repo.git", # 2 @ symbols
581+
]
582+
583+
for url in invalid_git_at_urls:
584+
with pytest.raises(ValueError) as error:
585+
git_utils._sanitize_git_url(url)
586+
assert "exactly one @ symbol" in str(error.value)
587+
588+
def test_sanitize_git_url_blocks_url_encoding_obfuscation(self):
589+
"""Test that URL-encoded obfuscation attempts are blocked."""
590+
obfuscated_urls = [
591+
"https://github.com%25evil.com/repo.git",
592+
"https://[email protected]%40attacker.com/repo.git",
593+
"https://github.com%2Fevil.com/repo.git",
594+
"https://github.com%3Aevil.com/repo.git",
595+
]
596+
597+
for url in obfuscated_urls:
598+
with pytest.raises(ValueError) as error:
599+
git_utils._sanitize_git_url(url)
600+
# The error could be either suspicious encoding or invalid characters
601+
assert any(
602+
phrase in str(error.value)
603+
for phrase in ["Suspicious URL encoding detected", "Invalid characters in hostname"]
604+
)
605+
606+
def test_sanitize_git_url_blocks_invalid_hostname_chars(self):
607+
"""Test that hostnames with invalid characters are blocked."""
608+
invalid_urls = [
609+
"https://github<script>.com/repo.git",
610+
"https://github>.com/repo.git",
611+
"https://github[].com/repo.git",
612+
"https://github{}.com/repo.git",
613+
]
614+
615+
for url in invalid_urls:
616+
with pytest.raises(ValueError) as error:
617+
git_utils._sanitize_git_url(url)
618+
# The error could be various types due to URL parsing edge cases
619+
assert any(
620+
phrase in str(error.value)
621+
for phrase in [
622+
"Invalid characters in hostname",
623+
"Failed to parse URL",
624+
"does not appear to be an IPv4 or IPv6 address",
625+
]
626+
)
627+
628+
def test_sanitize_git_url_blocks_unsupported_schemes(self):
629+
"""Test that unsupported URL schemes are blocked."""
630+
unsupported_urls = [
631+
"ftp://github.com/repo.git",
632+
"file:///local/repo.git",
633+
"javascript:alert('xss')",
634+
"data:text/html,<script>alert('xss')</script>",
635+
]
636+
637+
for url in unsupported_urls:
638+
with pytest.raises(ValueError) as error:
639+
git_utils._sanitize_git_url(url)
640+
assert "Unsupported URL scheme" in str(error.value)
641+
642+
def test_git_clone_repo_blocks_malicious_https_url(self):
643+
"""Test that git_clone_repo blocks malicious HTTPS URLs."""
644+
malicious_git_config = {
645+
"repo": "https://[email protected]@github.com/legit/repo.git",
646+
"branch": "main",
647+
}
648+
entry_point = "train.py"
649+
650+
with pytest.raises(ValueError) as error:
651+
git_utils.git_clone_repo(malicious_git_config, entry_point)
652+
assert "multiple @ symbols detected" in str(error.value)
653+
654+
def test_git_clone_repo_blocks_malicious_ssh_url(self):
655+
"""Test that git_clone_repo blocks malicious SSH URLs."""
656+
malicious_git_config = {
657+
"repo": "git@[email protected]:sage-maker/temp-sev2.git",
658+
"branch": "main",
659+
}
660+
entry_point = "train.py"
661+
662+
with pytest.raises(ValueError) as error:
663+
git_utils.git_clone_repo(malicious_git_config, entry_point)
664+
assert "exactly one @ symbol" in str(error.value)
665+
666+
def test_git_clone_repo_blocks_url_encoded_attack(self):
667+
"""Test that git_clone_repo blocks URL-encoded attacks."""
668+
malicious_git_config = {
669+
"repo": "https://github.com%40attacker.com/repo.git",
670+
"branch": "main",
671+
}
672+
entry_point = "train.py"
673+
674+
with pytest.raises(ValueError) as error:
675+
git_utils.git_clone_repo(malicious_git_config, entry_point)
676+
assert "Suspicious URL encoding detected" in str(error.value)
677+
678+
def test_sanitize_git_url_comprehensive_attack_scenarios(self):
679+
attack_scenarios = [
680+
# Original PoC attack
681+
"https://USER@YOUR_NGROK_OR_LOCALHOST/[email protected]%25legit%25repo.git",
682+
# Variations of the attack
683+
"https://user@[email protected]/legit/repo.git",
684+
"[email protected]@github.com:user/repo.git",
685+
"ssh://[email protected]@github.com/repo.git",
686+
# URL encoding variations
687+
"https://github.com%40evil.com/repo.git",
688+
"https://[email protected]%2Fevil.com/repo.git",
689+
]
690+
691+
entry_point = "train.py"
692+
693+
for malicious_url in attack_scenarios:
694+
git_config = {"repo": malicious_url}
695+
with pytest.raises(ValueError) as error:
696+
git_utils.git_clone_repo(git_config, entry_point)
697+
# Should be blocked by sanitization
698+
assert any(
699+
phrase in str(error.value)
700+
for phrase in [
701+
"multiple @ symbols detected",
702+
"exactly one @ symbol",
703+
"Suspicious URL encoding detected",
704+
"Invalid characters in hostname",
705+
]
706+
)

0 commit comments

Comments
 (0)