Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion hilti/runtime/include/types/regexp.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,26 @@ class Pattern {
auto isCaseInsensitive() const { return _case_insensitive; }
auto matchID() const { return _id; }

/** Attempts to compile the pattern, returning an error if that fails. */
Result<Nothing> validate() const;

/**
* Returns the number of capture groups the pattern defines. Returns an
* error result if the number cannot be determined, such as if it cannot be
* compiled due to a syntax error.
*/
Result<uint64_t> numberOfCaptures() const;

void setValue(std::string value) { _value = std::move(value); }
void setCaseInsensitive(bool case_insensitive) { _case_insensitive = case_insensitive; }
void setMatchID(uint64_t id) { _id = id; }

private:
// Checks a pattern for syntactical correctness by trying to compile it.
// Returns an error if compilation fails, or the number of capture groups
// defined by the pattern if successful.
Result<uint64_t> _tryCompile() const;

std::string _value;
bool _case_insensitive;
uint64_t _id;
Expand Down Expand Up @@ -185,7 +200,7 @@ class CompiledRegExp {
};

void _newJrx();
void _compileOne(regexp::Pattern pattern, int idx);
void _compileOne(regexp::Pattern pattern);

regexp::Flags _flags{};
regexp::Patterns _patterns;
Expand Down
12 changes: 12 additions & 0 deletions hilti/runtime/src/tests/regexp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,15 @@ TEST_CASE("caching") {
CHECK_NE(re1a.jrx(), re3.jrx());
CHECK_NE(re1a.jrx(), re4.jrx());
}

TEST_CASE("validation") {
CHECK("abc"_p.validate().hasValue());
CHECK_EQ("abc("_p.validate().error().description(), "error compiling pattern 'abc(': bad pattern: syntax error");
}

TEST_CASE("capture-groups") {
CHECK_EQ("abc"_p.numberOfCaptures(), 0);
CHECK_EQ("a(b)c"_p.numberOfCaptures(), 1);
CHECK_EQ("a((b)c)()"_p.numberOfCaptures(), 3);
CHECK_FALSE(")"_p.numberOfCaptures());
}
34 changes: 31 additions & 3 deletions hilti/runtime/src/types/regexp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <ranges>
#include <utility>

#include <hilti/rt/exception.h>
#include <hilti/rt/global-state.h>
#include <hilti/rt/types/regexp.h>
#include <hilti/rt/util.h>
Expand All @@ -19,6 +20,34 @@ using namespace hilti::rt::bytes;

// #define _DEBUG_MATCHING

Result<uint64_t> regexp::Pattern::_tryCompile() const {
int cflags = (REG_EXTENDED | REG_ANCHOR | REG_LAZY);

if ( _case_insensitive )
cflags |= REG_ICASE;

jrx_regex_t preg;
if ( auto rc = jrx_regcomp(&preg, _value.c_str(), cflags); rc == REG_OK ) {
auto nsub = preg.re_nsub;
jrx_regfree(&preg);
return nsub;
}
else {
static char err[256];
jrx_regerror(rc, &preg, err, sizeof(err));
return result::Error(fmt("error compiling pattern '%s': %s", _value, err));
}
}

Result<Nothing> regexp::Pattern::validate() const {
if ( auto rc = _tryCompile() )
return Nothing();
else
return rc.error();
}

Result<uint64_t> regexp::Pattern::numberOfCaptures() const { return _tryCompile(); }

// Determines which matcher (std vs. min) to use.
static bool _use_std_matcher(jrx_regex_t* jrx, jrx_match_state* ms) {
// Order of the checks is important.
Expand Down Expand Up @@ -233,9 +262,8 @@ regexp::detail::CompiledRegExp::CompiledRegExp(const regexp::Patterns& patterns,
if ( patterns.empty() )
return;

int idx = 0;
for ( const auto& p : patterns )
_compileOne(p, idx++);
_compileOne(p);

jrx_regset_finalize(jrx());
}
Expand All @@ -255,7 +283,7 @@ void regexp::detail::CompiledRegExp::_newJrx() {
jrx_regset_init(_jrx.get(), -1, cflags);
}

void regexp::detail::CompiledRegExp::_compileOne(regexp::Pattern pattern, int idx) {
void regexp::detail::CompiledRegExp::_compileOne(regexp::Pattern pattern) {
const auto& regexp = pattern.value();

int cflags = (pattern.isCaseInsensitive() ? REG_ICASE : 0);
Expand Down
5 changes: 5 additions & 0 deletions hilti/toolchain/src/compiler/validator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,11 @@ struct VisitorPost : visitor::PreOrder, public validator::VisitorMixIn {
void operator()(ctor::Null* n) final {}

void operator()(ctor::RegExp* n) final {
for ( const auto& pattern : n->patterns() ) {
if ( auto rc = pattern.validate(); ! rc )
error(rc.error(), n);
}

if ( n->attributes()->find(hilti::attribute::kind::Anchor) )
// This can end up reporting the same location multiple times,
// which seems fine. Otherwise we'd need to explicitly track what's
Expand Down
48 changes: 43 additions & 5 deletions spicy/toolchain/src/compiler/validator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1237,21 +1237,59 @@ struct VisitorPost : visitor::PreOrder, hilti::validator::VisitorMixIn {
// We check the original type since regexps get parsed as bytes.
if ( n->kind() == hilti::expression::keyword::Kind::Captures ) {
UnqualifiedType* original_type = nullptr;
const type::unit::item::Field* field = nullptr;

// Check type in hook bodies.
if ( auto* hook = n->parent<declaration::Hook>() ) {
auto idx = hook->unitFieldIndex();
auto* field = context()->lookup(idx)->as<type::unit::item::Field>();
field = context()->lookup(idx)->as<type::unit::item::Field>();
original_type = field->originalType()->type();
}

// Captures can also appear in field attributes.
else if ( auto* field = n->parent<type::unit::item::Field>() )
original_type = field->originalType()->type();
else {
// Captures can also appear in field attributes.
field = n->parent<type::unit::item::Field>();

if ( field )
original_type = field->originalType()->type();
}

// In all other cases, or when we are not parsing a regexp raise an error.
if ( ! original_type || ! original_type->isA<hilti::type::RegExp>() )
if ( ! original_type || ! original_type->isA<hilti::type::RegExp>() ) {
error("capture groups can only be used in hooks for fields parsing regexp", n);
return;
}

if ( field ) {
if ( field->attributes()->find(attribute::kind::Nosub) ) {
error("cannot use capture group with &nosub field", n);
return;
}

if ( const auto* regexp = field->ctor()->tryAs<hilti::ctor::RegExp>() ) {
if ( regexp->patterns().size() > 1 ) {
error("cannot use capture groups with regular expression consisting of multiple patterns", n);
return;
}

const auto* index_operator = n->parent()->as<hilti::operator_::vector::IndexNonConst>();
auto index = index_operator->op1()
->as<hilti::expression::Ctor>()
->ctor()
->as<hilti::ctor::UnsignedInteger>()
->value();

assert(! regexp->patterns().empty());
const auto& pattern = regexp->patterns().front();

if ( auto captures = pattern.numberOfCaptures();
captures && index > *captures ) // ignore errors here, will be reported elsewhere
error(fmt("capture group index %" PRIu64
" is too large, the regular expression defines only %" PRIu64 " group%s",
index, *captures, (*captures > 1 ? "s" : "")),
n);
}
}
}
}
};
Expand Down
4 changes: 4 additions & 0 deletions tests/Baseline/hilti.types.regexp.ctor-fail/output
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
[error] <...>/ctor-fail.hlt:8:21-8:26: error compiling pattern 'gh(i': bad pattern: syntax error
[error] <...>/ctor-fail.hlt:11:17-11:22: error compiling pattern 'gh(i': bad pattern: syntax error
[error] hiltic: aborting after errors
9 changes: 9 additions & 0 deletions tests/Baseline/spicy.types.regexp.capture-groups-fail/output
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
[error] <...>/capture-groups-fail.spicy:32:7-32:8: capture groups can only be used in hooks for fields parsing regexp
[error] <...>/capture-groups-fail.spicy:10:15-10:16: capture groups can only be used in hooks for fields parsing regexp
[error] <...>/capture-groups-fail.spicy:14:15-14:16: capture group index 2 is too large, the regular expression defines only 1 group
[error] <...>/capture-groups-fail.spicy:18:15-18:16: cannot use capture groups with regular expression consisting of multiple patterns
[error] <...>/capture-groups-fail.spicy:22:15-22:16: cannot use capture group with &nosub field
[error] <...>/capture-groups-fail.spicy:25:10-25:17: error compiling pattern 'a(b))c': bad pattern: syntax error
[error] <...>/capture-groups-fail.spicy:29:11-29:12: capture groups can only be used in hooks for fields parsing regexp
[error] spicyc: aborting after errors

This file was deleted.

This file was deleted.

14 changes: 14 additions & 0 deletions tests/hilti/types/regexp/ctor-fail.hlt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# @TEST-EXEC-FAIL: hiltic -p %INPUT >output 2>&1
# @TEST-EXEC: btest-diff output
#
# @TEST-DOC: Check that we capture invalid regular expression constructors at compile time.

module Foo {

global regexp re1 = /gh(i/;

function void f() {
local re2 = /gh(i/;
}

}
12 changes: 0 additions & 12 deletions tests/hilti/types/regexp/match-repeats.hlt
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,19 @@ assert |*data.match(/\x00{0,3}/)| == 3;
assert |*data.match(/\x00{0,4}/)| == 4;
assert |*data.match(/\x00{0,5}/)| == 5;

assert-exception data.match(/\x00{1,0}/);
assert |*data.match(/\x00{1,1}/)| == 1;
assert |*data.match(/\x00{1,2}/)| == 2;
assert |*data.match(/\x00{1,3}/)| == 3;
assert |*data.match(/\x00{1,4}/)| == 4;
assert |*data.match(/\x00{1,5}/)| == 5;

assert-exception data.match(/\x00{2,0}/);
assert-exception data.match(/\x00{2,1}/);
assert |*data.match(/\x00{2,2}/)| == 2;
assert |*data.match(/\x00{2,3}/)| == 3;
assert |*data.match(/\x00{2,4}/)| == 4;
assert |*data.match(/\x00{2,5}/)| == 5;

assert-exception data.match(/\x00{4,0}/);
assert-exception data.match(/\x00{4,1}/);
assert-exception data.match(/\x00{4,2}/);
assert-exception data.match(/\x00{4,3}/);
assert |*data.match(/\x00{4,4}/)| == 4;
assert |*data.match(/\x00{4,5}/)| == 5;

assert-exception data.match(/\x00{5,0}/);
assert-exception data.match(/\x00{5,1}/);
assert-exception data.match(/\x00{5,2}/);
assert-exception data.match(/\x00{5,3}/);
assert-exception data.match(/\x00{5,4}/);
assert |*data.match(/\x00{5,5}/)| == 5;
}
32 changes: 32 additions & 0 deletions tests/spicy/types/regexp/capture-groups-fail.spicy
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# @TEST-DOC: Validate that we reject invaliud usage of captures.
#
# @TEST-EXEC-FAIL: spicyc -dj %INPUT >output 2>&1
# @TEST-EXEC: btest-diff output

module test;

public type X = unit {
x: uint8 {
print $1; # not a regexp field
}

y: /a(b)c/ {
print $2; # group does not exist
}

z: /a(b)c/ | /d(e)f/ {
print $2; # capture groups not supported for parallel matching (because it's ill-defined)
}

nosub: /a(b)c/ &nosub {
print $1; # capture groups not available
}

err: /a(b))c/ { } # invalid regexp
};

function fn() {
print $1; # not inside a unit field
}

print $1; # not inside a unit field
18 changes: 0 additions & 18 deletions tests/spicy/types/regexp/capture-without-regexp.spicy

This file was deleted.

28 changes: 0 additions & 28 deletions tests/spicy/types/regexp/parse-ctor-captures-no-sub.spicy

This file was deleted.

Loading