Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial attempt at building up NFAs and DFAs with different code units #7

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions src/main/java/automata/DfaPattern.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package automata;

import automata.codegen.CompiledDfa;
import automata.graph.StandardCodeUnits;
import automata.graph.MatchMode;
import automata.graph.Tdfa;
import automata.graph.Tnfa;
Expand Down Expand Up @@ -121,8 +122,8 @@ public CompiledDfaPattern(
super(pattern);

// NFAs
final Tnfa nfaWithoutWildcard = Tnfa.parse(pattern, flags, true, false);
final Tnfa nfaWithWildcard = Tnfa.parse(pattern, flags, true, true);
final Tnfa nfaWithoutWildcard = Tnfa.parse(pattern, StandardCodeUnits.UTF_16, flags, true, false);
final Tnfa nfaWithWildcard = Tnfa.parse(pattern, StandardCodeUnits.UTF_16, flags, true, true);

// DFAs
final Tdfa matchesDfa = Tdfa.fromTnfa(nfaWithoutWildcard, MatchMode.FULL, optimized);
Expand Down Expand Up @@ -235,8 +236,8 @@ public InterpretableDfaPattern(
super(pattern);
this.printDebugInfo = printDebugInfo;

final Tnfa nfaWithoutWildcard = Tnfa.parse(pattern, flags, true, false);
final Tnfa nfaWithWildcard = Tnfa.parse(pattern, flags, true, true);
final Tnfa nfaWithoutWildcard = Tnfa.parse(pattern, StandardCodeUnits.UTF_16, flags, true, false);
final Tnfa nfaWithWildcard = Tnfa.parse(pattern, StandardCodeUnits.UTF_16, flags, true, true);

this.matchesDfa = Tdfa.fromTnfa(nfaWithoutWildcard, MatchMode.FULL, optimized);
this.lookingAtDfa = Tdfa.fromTnfa(nfaWithoutWildcard, MatchMode.PREFIX, optimized);
Expand Down
144 changes: 50 additions & 94 deletions src/main/java/automata/graph/RegexNfaBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,10 @@
import automata.parser.CodePointSetVisitor;
import automata.util.IntRange;
import automata.util.IntRangeSet;
import java.util.Stack;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.Map;
import java.util.LinkedList;
import java.util.HashMap;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;

/**
* Regex AST visitor which can be used to build up the corresponding NFA.
Expand All @@ -30,6 +27,15 @@ public abstract class RegexNfaBuilder<Q>
extends CodePointSetVisitor
implements RegexVisitor<UnaryOperator<NfaState<Q>>, IntRangeSet> {

/**
* Code units to be used in the NFA.
*/
public final StandardCodeUnits codeUnits;

public RegexNfaBuilder(StandardCodeUnits codeUnits) {
this.codeUnits = codeUnits;
}

/**
* Summon a fresh state identifier.
*
Expand Down Expand Up @@ -87,114 +93,64 @@ public UnaryOperator<NfaState<Q>> visitEpsilon() {
return UnaryOperator.identity();
}

/**
* Break down the input code point set into a mapping of low ranges to high
* ranges.
*
* The values in the output should be disjoint and union out to a subset of
* the high surrogate range. The values in the keys won't necessarily be
* disjoint, but they should all be in the low surrogate range. Since the
* high surrogate range is just {@code 0xD800–0xDBFF} (1024 values), the
* total size of the output map is at most 1024 entries.
*
* @param codePointSet input code point set
* @return mapping from low surrogate ranges to high surrogate ranges
*/
public static Map<IntRangeSet, IntRangeSet> supplementaryCodeUnitRanges(
IntRangeSet codePointSet
) {

// TODO: this doesn't need to be a map since we scan high codepoints in order
// and only ever update the last one
final var supplementaryCodeUnits = new HashMap<Integer, LinkedList<IntRange>>();

for (IntRange range : codePointSet.difference(IntRangeSet.of(CodePoints.BMP_RANGE)).ranges()) {

int rangeStartHi = Character.highSurrogate(range.lowerBound());
int rangeStartLo = Character.lowSurrogate(range.lowerBound());

int rangeEndHi = Character.highSurrogate(range.upperBound());
int rangeEndLo = Character.lowSurrogate(range.upperBound());

if (rangeStartHi == rangeEndHi) {
// Add the _only_ range
supplementaryCodeUnits
.computeIfAbsent(rangeStartHi, k -> new LinkedList<>())
.addLast(IntRange.between(rangeStartLo, rangeEndLo));
} else {
// Add the first range
supplementaryCodeUnits
.computeIfAbsent(rangeStartHi, k -> new LinkedList<>())
.addLast(IntRange.between(rangeStartLo, Character.MAX_LOW_SURROGATE));

// Add the last range
supplementaryCodeUnits
.computeIfAbsent(rangeEndHi, k -> new LinkedList<>())
.addLast(IntRange.between(Character.MIN_LOW_SURROGATE, rangeEndLo));

// Add everything in between
for (int hi = rangeStartHi + 1; hi <= rangeEndHi - 1; hi++) {
supplementaryCodeUnits
.computeIfAbsent(hi, k -> new LinkedList<>())
.addLast(CodePoints.LOW_SURROGATE_RANGE);
}
}
}

return supplementaryCodeUnits
.entrySet()
.stream()
.collect(
Collectors.groupingBy(
e -> new IntRangeSet(e.getValue()),
Collectors.mapping(
e -> IntRangeSet.of(IntRange.single(e.getKey())),
Collectors.collectingAndThen(Collectors.toList(), IntRangeSet::union)
)
)
);
}

public UnaryOperator<NfaState<Q>> visitCharacterClass(IntRangeSet codePointSet) {
if (!codePointSet.difference(IntRangeSet.of(CodePoints.UNICODE_RANGE)).isEmpty()) {
throw new IllegalArgumentException("Codepoints outside the unicode range aren't allowed");
}

/* Code unit transitions corresponding to the basic multilingual plane.
* By definition of the BMP, this means these are exactly one code unit.
*/
final var basicCodeUnits = codePointSet.intersection(IntRangeSet.of(CodePoints.BMP_RANGE));

/* Mapping from the first (high) 16-bit code unit to the range of second
* (low) 16-bit code units. There are `0xDBFF - 0xD800 + 1 = 1024` high
* code points, so this map will have between 0 and 1024 entries.
*/
final var supplementaryCodeUnits = supplementaryCodeUnitRanges(codePointSet);
final var suffixTrie = codeUnits.codeUnitRangeSuffixTrie(codePointSet);

// How many code units wide is the class?
final Optional<Integer> classSize =
supplementaryCodeUnits.isEmpty() ? Optional.of(1) :
basicCodeUnits.isEmpty() ? Optional.of(2) :
Optional.empty();
final Optional<Integer> classSize = suffixTrie.inSetDepth();

// Depth first traversal of the tree
record TraversalEntry<Q>(
TrieSet<IntRangeSet> subTrie,
int distanceToRoot,
IntRangeSet codeUnitToParent,
Q parentNode
) { }

return (NfaState<Q> toState) -> {
final Q to = toState.state();
final Q start = freshState();
if (!basicCodeUnits.isEmpty()) {
addCodeUnitsState(start, basicCodeUnits, to);
final var start = freshState();

// Depth first traversal of the suffix trie
final var toVisit = new Stack<TraversalEntry<Q>>();
for (final var childEntry : suffixTrie.children.entrySet()) {
toVisit.push(new TraversalEntry<>(
childEntry.getValue(),
1,
childEntry.getKey(),
toState.state()
));
}
while (!toVisit.isEmpty()) {
final var entry = toVisit.pop();
final Q thisNode = entry.subTrie.inSet ? start : freshState();

// Visit this node
addCodeUnitsState(thisNode, entry.codeUnitToParent, entry.parentNode);
if (entry.subTrie.inSet && !entry.subTrie.children.isEmpty()) {
throw new IllegalStateException("One code unit sequence cannot be a suffix of another");
}

for (var loAndHigh : supplementaryCodeUnits.entrySet()) {
final Q hiEnd = freshState();
addCodeUnitsState(start, loAndHigh.getValue(), hiEnd);
addCodeUnitsState(hiEnd, loAndHigh.getKey(), to);
// Plan to visit children
for (final var childEntry : entry.subTrie.children.entrySet()) {
toVisit.push(new TraversalEntry<>(
childEntry.getValue(),
entry.distanceToRoot + 1,
childEntry.getKey(),
thisNode
));
}
}

final var fixedGroup = toState
.fixedGroup()
.flatMap(loc -> classSize.map(loc::addDistance));

return new NfaState<Q>(start, toState.insideRepetition(), toState.unavoidable(), fixedGroup);

};
}

Expand Down
180 changes: 180 additions & 0 deletions src/main/java/automata/graph/StandardCodeUnits.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
package automata.graph;

import automata.util.IntRange;
import automata.util.IntRangeSet;
import automata.parser.CodePoints;
import automata.graph.TrieSet;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public enum StandardCodeUnits {

/**
* Codepoints are turned into one or two 16-bit code units.
*
* Java strings internally use this.
*/
UTF_16(IntRange.between(Character.MIN_VALUE, Character.MAX_VALUE)) {

/**
* Break down the input code point set into a mapping of low ranges to high
* ranges.
*
* The values in the output should be disjoint and union out to a subset of
* the high surrogate range. The values in the keys won't necessarily be
* disjoint, but they should all be in the low surrogate range. Since the
* high surrogate range is just {@code 0xD800–0xDBFF} (1024 values), the
* total size of the output map is at most 1024 entries.
*
* @param codePointSet input code point set
* @return mapping from low surrogate ranges to high surrogate ranges
*/
private static Map<IntRangeSet, IntRangeSet> supplementaryCodeUnitRanges(
IntRangeSet codePointSet
) {

// TODO: this doesn't need to be a map since we scan high codepoints in order
// and only ever update the last one
final var supplementaryCodeUnits = new HashMap<Integer, LinkedList<IntRange>>();

for (IntRange range : codePointSet.difference(IntRangeSet.of(CodePoints.BMP_RANGE)).ranges()) {

int rangeStartHi = Character.highSurrogate(range.lowerBound());
int rangeStartLo = Character.lowSurrogate(range.lowerBound());

int rangeEndHi = Character.highSurrogate(range.upperBound());
int rangeEndLo = Character.lowSurrogate(range.upperBound());

if (rangeStartHi == rangeEndHi) {
// Add the _only_ range
supplementaryCodeUnits
.computeIfAbsent(rangeStartHi, k -> new LinkedList<>())
.addLast(IntRange.between(rangeStartLo, rangeEndLo));
} else {
// Add the first range
supplementaryCodeUnits
.computeIfAbsent(rangeStartHi, k -> new LinkedList<>())
.addLast(IntRange.between(rangeStartLo, Character.MAX_LOW_SURROGATE));

// Add the last range
supplementaryCodeUnits
.computeIfAbsent(rangeEndHi, k -> new LinkedList<>())
.addLast(IntRange.between(Character.MIN_LOW_SURROGATE, rangeEndLo));

// Add everything in between
for (int hi = rangeStartHi + 1; hi <= rangeEndHi - 1; hi++) {
supplementaryCodeUnits
.computeIfAbsent(hi, k -> new LinkedList<>())
.addLast(CodePoints.LOW_SURROGATE_RANGE);
}
}
}

return supplementaryCodeUnits
.entrySet()
.stream()
.collect(
Collectors.groupingBy(
e -> new IntRangeSet(e.getValue()),
Collectors.mapping(
e -> IntRangeSet.of(IntRange.single(e.getKey())),
Collectors.collectingAndThen(Collectors.toList(), IntRangeSet::union)
)
)
);
}

@Override
public TrieSet<IntRangeSet> codeUnitRangeSuffixTrie(IntRangeSet codePointSet) {

/* Code unit transitions corresponding to the basic multilingual plane.
* By definition of the BMP, this means these are exactly one code unit.
*/
final var basicCodeUnits = codePointSet.intersection(IntRangeSet.of(CodePoints.BMP_RANGE));

/* Mapping from the first (high) 16-bit code unit to the range of second
* (low) 16-bit code units. There are `0xDBFF - 0xD800 + 1 = 1024` high
* code points, so this map will have between 0 and 1024 entries.
*/
final var supplementaryCodeUnits = supplementaryCodeUnitRanges(codePointSet);

final var output = new TrieSet<IntRangeSet>();
output.add(List.of(basicCodeUnits));
for (final var entry : supplementaryCodeUnits.entrySet()) {
output.add(List.of(entry.getKey(), entry.getValue()));
}
return output;
}
},

/**
* Codepoints are turned into one to four 8-bit code units.
*/
UTF_8(IntRange.between(0, 0xFF)) {

// Ranges of code points taking 1, 2, 3, and 4 code unit
private static final IntRange ONE_BYTE_RANGE = IntRange.between(0, 0x7F);
private static final IntRange TWO_BYTE_RANGE = IntRange.between(0x80, 0x7FF);
private static final IntRange THREE_BYTE_RANGE = IntRange.between(0x800, 0xFFFF);
private static final IntRange FOUR_BYTE_RANGE = IntRange.between(0x10000, 0x10FFFF);

@Override
public TrieSet<IntRangeSet> codeUnitRangeSuffixTrie(IntRangeSet codePointSet) {
throw new AssertionError("unimplemented");
}
},

/**
* Codepoints each map to exactly one 32-bit code unit.
*/
UTF_32(IntRange.between(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT)) {
@Override
public TrieSet<IntRangeSet> codeUnitRangeSuffixTrie(IntRangeSet codePointSet) {
final var output = new TrieSet<IntRangeSet>();
output.add(List.of(codePointSet));
return output;
}
};

/**
* Full range of possible code units.
*/
public final IntRange codeUnitRange;

private StandardCodeUnits(IntRange codeUnitRange) {
this.codeUnitRange = codeUnitRange;
}

/**
* Converts a set of code points into a trie containing reversed code units.
*
* <p>The output trie must obey a handful of invariants:
*
* <ul>
* <li>
* The children of any node in the trie should have disjoint subsets of
* the valid code unit range.
*
* As a consequence, any code point {@code c} with code unit
* representation {@code u0, u1, ... un} can traced to a node in the trie
* by walking the suffix tree from the root using the reversed code units
* {@code un, ... u1, u0}. A code point is in the trie set if this process
* ends at a node which has {@code inSet = true}.
*
* <li>
* A code point is in the input set if and only if it is in the output set
* (using the definition of "in" from the previous invariant).
* </ul>
*
* <p>This is the information needed to efficiently create a small DFA for the
* range of code points.
*
* @param codePointRange range of code points to include in the trie
* @return trie ranges of reversed code units
*/
public abstract TrieSet<IntRangeSet> codeUnitRangeSuffixTrie(IntRangeSet codePointSet);

}
Loading