Skip to content

Commit 892cd4f

Browse files
authored
ensure the path regexes will accept all valid paths (#48686)
Previously, we might try to interpret the arbitrary bytes in a path as UTF-8 and exclude `\n` from matching, causing the regex match to fail or be incomplete in some cases. But those bytes are valid in a path, so we want PCRE2 to treat them as transparent bytes. Accordingly, change `r""a` to specify all of the flags needed to interpret the values simply as ASCII.

Note: this is breaking for anyone who was previously matching a Unicode character via `\u` while also disabling UCP matching of `\w` and `\s`, but that seems an oddly specific combination to need:

    julia> match(r"\u03b1"a, "α")
    ERROR: PCRE compilation error: character code point value in \u.... sequence is too large at offset 6

(This would have previously worked.) Note that explicitly starting the regex with `(*UTF)`, or using a literal α in the regex, continues to work as before.

Note that `s` (DOTALL) is a more efficient matcher (if the pattern contains `.`), as is `a`, so it is often preferable to set both when in doubt: http://man.he.net/man3/pcre2perform

Refs: #48648
1 parent cbbfc68 commit 892cd4f

File tree

11 files changed

+91
-52
lines changed

11 files changed

+91
-52
lines changed

base/binaryplatforms.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -741,10 +741,10 @@ function Base.parse(::Type{Platform}, triplet::String; validate_strict::Bool = f
741741
end
742742
os_version = nothing
743743
if os == "macos"
744-
os_version = extract_os_version("macos", r".*darwin([\d\.]+)")
744+
os_version = extract_os_version("macos", r".*darwin([\d\.]+)"sa)
745745
end
746746
if os == "freebsd"
747-
os_version = extract_os_version("freebsd", r".*freebsd([\d.]+)")
747+
os_version = extract_os_version("freebsd", r".*freebsd([\d.]+)"sa)
748748
end
749749
tags["os_version"] = os_version
750750

@@ -798,13 +798,13 @@ function parse_dl_name_version(path::String, os::String)
798798
local dlregex
799799
if os == "windows"
800800
# On Windows, libraries look like `libnettle-6.dll`
801-
dlregex = r"^(.*?)(?:-((?:[\.\d]+)*))?\.dll$"
801+
dlregex = r"^(.*?)(?:-((?:[\.\d]+)*))?\.dll$"sa
802802
elseif os == "macos"
803803
# On OSX, libraries look like `libnettle.6.3.dylib`
804-
dlregex = r"^(.*?)((?:\.[\d]+)*)\.dylib$"
804+
dlregex = r"^(.*?)((?:\.[\d]+)*)\.dylib$"sa
805805
else
806806
# On Linux and FreeBSD, libraries look like `libnettle.so.6.3.0`
807-
dlregex = r"^(.*?)\.so((?:\.[\d]+)*)$"
807+
dlregex = r"^(.*?)\.so((?:\.[\d]+)*)$"sa
808808
end
809809

810810
m = match(dlregex, basename(path))

base/compiler/ssair/show.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,7 @@ function inline_linfo_printer(code::IRCode)
796796
end
797797
end
798798

799-
_strip_color(s::String) = replace(s, r"\e\[\d+m" => "")
799+
_strip_color(s::String) = replace(s, r"\e\[\d+m"a => "")
800800

801801
function statementidx_lineinfo_printer(f, code::IRCode)
802802
printer = f(code.linetable)

base/deprecated.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ arguments of type `Any`.
4848
4949
To restrict deprecation to a specific signature, annotate the
5050
arguments of `old`. For example,
51-
```jldoctest; filter = r"@ .*"
51+
```jldoctest; filter = r"@ .*"a
5252
julia> new(x::Int) = x;
5353
5454
julia> new(x::Float64) = 2x;

base/libc.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ function strptime(fmt::AbstractString, timestr::AbstractString)
225225
@static if Sys.isapple()
226226
# if we didn't explicitly parse the weekday or year day, use mktime
227227
# to fill them in automatically.
228-
if !occursin(r"([^%]|^)%(a|A|j|w|Ow)", fmt)
228+
if !occursin(r"([^%]|^)%(a|A|j|w|Ow)"a, fmt)
229229
ccall(:mktime, Int, (Ref{TmStruct},), tm)
230230
end
231231
end

base/methodshow.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ function strip_gensym(sym)
77
if sym === :var"#self#" || sym === :var"#unused#"
88
return empty_sym
99
end
10-
return Symbol(replace(String(sym), r"^(.*)#(.*#)?\d+$" => s"\1"))
10+
return Symbol(replace(String(sym), r"^(.*)#(.*#)?\d+$"sa => s"\1"))
1111
end
1212

1313
function argtype_decl(env, n, @nospecialize(sig::DataType), i::Int, nargs, isva::Bool) # -> (argname, argtype)
@@ -364,7 +364,7 @@ function url(m::Method)
364364
(m.file === :null || m.file === :string) && return ""
365365
file = string(m.file)
366366
line = m.line
367-
line <= 0 || occursin(r"In\[[0-9]+\]", file) && return ""
367+
line <= 0 || occursin(r"In\[[0-9]+\]"a, file) && return ""
368368
Sys.iswindows() && (file = replace(file, '\\' => '/'))
369369
libgit2_id = PkgId(UUID((0x76f85450_5226_5b5a,0x8eaa_529ad045b433)), "LibGit2")
370370
if inbase(M)

base/path.jl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,22 @@ export
2020

2121
if Sys.isunix()
2222
const path_separator = "/"
23-
const path_separator_re = r"/+"
24-
const path_directory_re = r"(?:^|/)\.{0,2}$"
25-
const path_dir_splitter = r"^(.*?)(/+)([^/]*)$"
26-
const path_ext_splitter = r"^((?:.*/)?(?:\.|[^/\.])[^/]*?)(\.[^/\.]*|)$"
23+
const path_separator_re = r"/+"sa
24+
const path_directory_re = r"(?:^|/)\.{0,2}$"sa
25+
const path_dir_splitter = r"^(.*?)(/+)([^/]*)$"sa
26+
const path_ext_splitter = r"^((?:.*/)?(?:\.|[^/\.])[^/]*?)(\.[^/\.]*|)$"sa
2727

2828
splitdrive(path::String) = ("",path)
2929
elseif Sys.iswindows()
3030
const path_separator = "\\"
31-
const path_separator_re = r"[/\\]+"
32-
const path_absolute_re = r"^(?:[A-Za-z]+:)?[/\\]"
33-
const path_directory_re = r"(?:^|[/\\])\.{0,2}$"
34-
const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$"
35-
const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$"
31+
const path_separator_re = r"[/\\]+"sa
32+
const path_absolute_re = r"^(?:[A-Za-z]+:)?[/\\]"sa
33+
const path_directory_re = r"(?:^|[/\\])\.{0,2}$"sa
34+
const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$"sa
35+
const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$"sa
3636

3737
function splitdrive(path::String)
38-
m = match(r"^([^\\]+:|\\\\[^\\]+\\[^\\]+|\\\\\?\\UNC\\[^\\]+\\[^\\]+|\\\\\?\\[^\\]+:|)(.*)$"s, path)::AbstractMatch
38+
m = match(r"^([^\\]+:|\\\\[^\\]+\\[^\\]+|\\\\\?\\UNC\\[^\\]+\\[^\\]+|\\\\\?\\[^\\]+:|)(.*)$"sa, path)::AbstractMatch
3939
String(something(m.captures[1])), String(something(m.captures[2]))
4040
end
4141
else

base/regex.jl

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,24 @@ mutable struct Regex <: AbstractPattern
4646
end
4747

4848
function Regex(pattern::AbstractString, flags::AbstractString)
49-
options = DEFAULT_COMPILER_OPTS
49+
compile_options = DEFAULT_COMPILER_OPTS
50+
match_options = DEFAULT_MATCH_OPTS
5051
for f in flags
5152
if f == 'a'
52-
options &= ~PCRE.UCP
53+
# instruct pcre2 to treat the strings as simple bytes (aka "ASCII"), not char encodings
54+
compile_options &= ~PCRE.UCP # user can re-enable with (*UCP)
55+
compile_options &= ~PCRE.UTF # user can re-enable with (*UTF)
56+
compile_options &= ~PCRE.MATCH_INVALID_UTF # this would force on UTF
57+
match_options &= ~PCRE.NO_UTF_CHECK # if the user did force on UTF, we should check it for safety
5358
else
54-
options |= f=='i' ? PCRE.CASELESS :
55-
f=='m' ? PCRE.MULTILINE :
56-
f=='s' ? PCRE.DOTALL :
57-
f=='x' ? PCRE.EXTENDED :
58-
throw(ArgumentError("unknown regex flag: $f"))
59+
compile_options |= f=='i' ? PCRE.CASELESS :
60+
f=='m' ? PCRE.MULTILINE :
61+
f=='s' ? PCRE.DOTALL :
62+
f=='x' ? PCRE.EXTENDED :
63+
throw(ArgumentError("unknown regex flag: $f"))
5964
end
6065
end
61-
Regex(pattern, options, DEFAULT_MATCH_OPTS)
66+
Regex(pattern, compile_options, match_options)
6267
end
6368
Regex(pattern::AbstractString) = Regex(pattern, DEFAULT_COMPILER_OPTS, DEFAULT_MATCH_OPTS)
6469

@@ -96,9 +101,15 @@ listed after the ending quote, to change its behaviour:
96101
- `s` allows the `.` modifier to match newlines.
97102
- `x` enables "comment mode": whitespace is ignored except when escaped with `\\`, and `#`
98103
is treated as starting a comment.
99-
- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`,
100-
`\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option,
101-
these sequences only match ASCII characters.
104+
- `a` enables ASCII mode (disables `UTF` and `UCP` modes). By default `\\B`, `\\b`, `\\D`,
105+
`\\d`, `\\S`, `\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With
106+
this option, these sequences only match ASCII characters. This includes `\\u` also, which
107+
will emit the specified character value directly as a single byte, and not attempt to
108+
encode it into UTF-8. Importantly, this option allows matching against invalid UTF-8
109+
strings, by treating both matcher and target as simple bytes (as if they were ISO/IEC
110+
8859-1 / Latin-1 bytes) instead of as character encodings. In this case, this option is
111+
often combined with `s`. This option can be further refined by starting the pattern with
112+
`(*UCP)` or `(*UTF)`.
102113
103114
See [`Regex`](@ref) if interpolation is needed.
104115
@@ -112,23 +123,38 @@ This regex has the first three flags enabled.
112123
macro r_str(pattern, flags...) Regex(pattern, flags...) end
113124

114125
function show(io::IO, re::Regex)
115-
imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP
126+
imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED
127+
ac = PCRE.UTF|PCRE.MATCH_INVALID_UTF|PCRE.UCP
128+
am = PCRE.NO_UTF_CHECK
116129
opts = re.compile_options
117-
if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa)
130+
mopts = re.match_options
131+
default = ((opts & ~imsx) | ac) == DEFAULT_COMPILER_OPTS
132+
if default
133+
if (opts & ac) == ac
134+
default = mopts == DEFAULT_MATCH_OPTS
135+
elseif (opts & ac) == 0
136+
default = mopts == (DEFAULT_MATCH_OPTS & ~am)
137+
else
138+
default = false
139+
end
140+
end
141+
if default
118142
print(io, "r\"")
119143
escape_raw_string(io, re.pattern)
120144
print(io, "\"")
121-
if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end
122-
if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end
123-
if (opts & PCRE.DOTALL ) != 0; print(io, 's'); end
124-
if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end
125-
if (opts & PCRE.UCP ) == 0; print(io, 'a'); end
145+
if (opts & PCRE.CASELESS ) != 0; print(io, "i"); end
146+
if (opts & PCRE.MULTILINE) != 0; print(io, "m"); end
147+
if (opts & PCRE.DOTALL ) != 0; print(io, "s"); end
148+
if (opts & PCRE.EXTENDED ) != 0; print(io, "x"); end
149+
if (opts & ac ) == 0; print(io, "a"); end
126150
else
127151
print(io, "Regex(")
128152
show(io, re.pattern)
129-
print(io, ',')
153+
print(io, ", ")
130154
show(io, opts)
131-
print(io, ')')
155+
print(io, ", ")
156+
show(io, mopts)
157+
print(io, ")")
132158
end
133159
end
134160

base/set.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ See also: [`AbstractSet`](@ref), [`BitSet`](@ref), [`Dict`](@ref),
1313
[`push!`](@ref), [`empty!`](@ref), [`union!`](@ref), [`in`](@ref), [`isequal`](@ref)
1414
1515
# Examples
16-
```jldoctest filter = r"^\\S.+"
16+
```jldoctest; filter = r"^ '.'"ma
1717
julia> s = Set("aaBca")
1818
Set{Char} with 3 elements:
1919
'a'
@@ -23,9 +23,9 @@ Set{Char} with 3 elements:
2323
julia> push!(s, 'b')
2424
Set{Char} with 4 elements:
2525
'a'
26-
'c'
2726
'b'
2827
'B'
28+
'c'
2929
3030
julia> s = Set([NaN, 0.0, 1.0, 2.0]);
3131

base/shell.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -292,9 +292,9 @@ function shell_escape_csh(io::IO, args::AbstractString...)
292292
first = false
293293
i = 1
294294
while true
295-
for (r,e) = (r"^[A-Za-z0-9/\._-]+\z" => "",
296-
r"^[^']*\z" => "'", r"^[^\$\`\"]*\z" => "\"",
297-
r"^[^']+" => "'", r"^[^\$\`\"]+" => "\"")
295+
for (r,e) = (r"^[A-Za-z0-9/\._-]+\z"sa => "",
296+
r"^[^']*\z"sa => "'", r"^[^\$\`\"]*\z"sa => "\"",
297+
r"^[^']+"sa => "'", r"^[^\$\`\"]+"sa => "\"")
298298
if ((m = match(r, SubString(arg, i))) !== nothing)
299299
write(io, e)
300300
write(io, replace(m.match, '\n' => "\\\n"))
@@ -391,7 +391,7 @@ julia> Base.shell_escape_wincmd("a^\\"^o\\"^u\\"")
391391
"""
392392
function shell_escape_wincmd(io::IO, s::AbstractString)
393393
# https://stackoverflow.com/a/4095133/1990689
394-
occursin(r"[\r\n\0]", s) &&
394+
occursin(r"[\r\n\0]"sa, s) &&
395395
throw(ArgumentError("control character unsupported by CMD.EXE"))
396396
i = 1
397397
len = ncodeunits(s)
@@ -446,7 +446,7 @@ function escape_microsoft_c_args(io::IO, args::AbstractString...)
446446
else
447447
write(io, ' ') # separator
448448
end
449-
if isempty(arg) || occursin(r"[ \t\"]", arg)
449+
if isempty(arg) || occursin(r"[ \t\"]"sa, arg)
450450
# Julia raw strings happen to use the same escaping convention
451451
# as the argv[] parser in Microsoft's C runtime library.
452452
write(io, '"')

test/path.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@
171171
@test string(splitdrive(S(homedir()))...) == homedir()
172172
@test splitdrive("a\nb") == ("", "a\nb")
173173

174+
@test splitdir("a/\xfe/\n/b/c.ext") == ("a/\xfe/\n/b", "c.ext")
175+
@test splitext("a/\xfe/\n/b/c.ext") == ("a/\xfe/\n/b/c", ".ext")
176+
174177
if Sys.iswindows()
175178
@test splitdrive(S("\\\\servername\\hello.world\\filename.ext")) ==
176179
("\\\\servername\\hello.world","\\filename.ext")

test/regex.jl

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@
5959
@test repr(r"\\\"") == raw"r\"\\\\\\\"\""
6060
@test repr(s"\\\"\\") == raw"s\"\\\\\\\"\\\\\""
6161

62+
@test repr(r""a) == "r\"\"a"
63+
@test repr(r""imsxa) == "r\"\"imsxa"
64+
@test repr(Regex("", Base.DEFAULT_COMPILER_OPTS, UInt32(0))) == """Regex("", $(repr(Base.DEFAULT_COMPILER_OPTS)), $(repr(UInt32(0))))"""
65+
@test repr(Regex("", UInt32(0), Base.DEFAULT_MATCH_OPTS)) == """Regex("", $(repr(UInt32(0))), $(repr(Base.DEFAULT_MATCH_OPTS)))"""
66+
6267
# findall
6368
@test findall(r"\w+", "foo bar") == [1:3, 5:7]
6469
@test findall(r"\w+", "foo bar", overlap=true) == [1:3, 2:3, 3:3, 5:7, 6:7, 7:7]
@@ -122,18 +127,24 @@
122127

123128
# Backcapture reference in substitution string
124129
@test replace("abcde", r"(..)(?P<byname>d)" => s"\g<byname>xy\\\1") == "adxy\\bce"
125-
@test_throws ErrorException replace("a", r"(?P<x>)" => s"\g<y>")
130+
@test_throws(ErrorException("Bad replacement string: Group y not found in regex r\"(?P<x>)\""),
131+
replace("a", r"(?P<x>)" => s"\g<y>"))
126132
# test replace with invalid substitution group pattern
127-
@test_throws ErrorException replace("s", r"(?<g1>.)" => s"\gg1>")
133+
@test_throws(ErrorException("Bad replacement string: \\gg1>"),
134+
replace("s", r"(?<g1>.)" => s"\gg1>"))
128135
# test replace with 2-digit substitution group
129136
@test replace(("0" ^ 9) * "1", Regex(("(0)" ^ 9) * "(1)") => s"10th group: \10") == "10th group: 1"
130137

131138
# Proper unicode handling
132139
@test match(r"∀∀", "∀x∀∀∀").match == "∀∀"
133140

134-
# 'a' flag to disable UCP
141+
# 'a' flag to disable UCP and UTF
135142
@test match(r"\w+", "Düsseldorf").match == "Düsseldorf"
136143
@test match(r"\w+"a, "Düsseldorf").match == "D"
144+
@test match(r".+"a, "Düsseldorf").match == "Düsseldorf"
145+
@test match(r".+"a, "\xefsseldorf").match == "\xefsseldorf"
146+
@test_throws(ErrorException("PCRE.exec error: $(Base.PCRE.err_message(Base.PCRE.ERROR_UTF8_ERR6))"),
147+
match(r"(*UTF).+"a, "\xefsseldorf"))
137148

138149
# Regex behaves like a scalar in broadcasting
139150
@test occursin.(r"Hello", ["Hello", "World"]) == [true, false]
@@ -211,8 +222,7 @@
211222
end
212223

213224
# Test that PCRE throws the correct kind of error
214-
# TODO: Uncomment this once the corresponding change has propagated to CI
215-
#@test_throws ErrorException Base.PCRE.info(C_NULL, Base.PCRE.INFO_NAMECOUNT, UInt32)
225+
@test_throws ErrorException("PCRE error: NULL regex object") Base.PCRE.info(C_NULL, Base.PCRE.INFO_NAMECOUNT, UInt32)
216226

217227
# test that we can get the error message of negative error codes
218228
@test Base.PCRE.err_message(Base.PCRE.ERROR_NOMEMORY) isa String

0 commit comments

Comments
 (0)