Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/libraries/System.Memory/tests/Span/StringSearchValues.cs
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,17 @@ public static void IndexOfAny_InvalidUtf16()
// These should hit the Aho-Corasick implementation
[InlineData("a", "b")]
[InlineData("ab", "c")]
// Two-string specialization with same length values
[InlineData("foo", "bar")]
[InlineData("ab", "cd")]
[InlineData("hello", "world")]
[InlineData("test1", "test2")]
// Two-string specialization with same first character
[InlineData("abc", "axy")]
[InlineData("hello", "happy")]
// Two-string specialization with different first characters
[InlineData("abc", "xyz")]
[InlineData("foo", "baz")]
// Simple Teddy cases
[InlineData("abc", "cde")]
[InlineData("abc", "cd")]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesPackedThreeChars.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesThreeChars.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesFallback.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\TwoStringSearchValuesThreeChars.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValues.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesBase.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesAhoCorasick.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,36 @@ public static void GetSingleStringMultiCharacterOffsets(string value, bool ignor
}
}

/// <summary>
/// Picks the offset of the second anchor character for one value of a two-string search.
/// The first character is always used as an anchor, so only one additional offset is needed per string.
/// This overload places no cap on the offset: the whole value is eligible.
/// </summary>
public static int GetSecondCharacterOffset(string value, bool ignoreCase)
{
    // Forward to the bounded overload, using the value's own length as the bound.
    return GetSecondCharacterOffset(value, ignoreCase, maxOffset: value.Length);
}

/// <summary>
/// Picks the offset of the second anchor character for a value of a two-string search,
/// considering only the first <paramref name="maxOffset"/> characters of <paramref name="value"/>.
/// Bounding the offset lets values of different lengths share the same offset bounds in the inner loop.
/// </summary>
/// <param name="value">The value to pick an anchor from. Must have more than one character.</param>
/// <param name="ignoreCase">Whether the search ignores case. If so, the first character must be ASCII.</param>
/// <param name="maxOffset">Exclusive upper bound for the returned offset; must be in [2, value.Length].</param>
/// <returns>An offset smaller than <paramref name="maxOffset"/>.</returns>
public static int GetSecondCharacterOffset(string value, bool ignoreCase, int maxOffset)
{
    Debug.Assert(value.Length > 1);
    Debug.Assert(maxOffset >= 2 && maxOffset <= value.Length);
    Debug.Assert(!ignoreCase || char.IsAscii(value[0]));

    ReadOnlySpan<char> candidates = value.AsSpan(0, maxOffset);
    int lowestFrequencyIndex = IndexOfAsciiCharWithLowestFrequency(candidates, ignoreCase);

    if (lowestFrequencyIndex >= 0)
    {
        return lowestFrequencyIndex;
    }

    // Fewer than 2 ASCII chars within the allowed range.
    // Callers searching case-insensitively are expected to have ruled this out beforehand.
    Debug.Assert(!ignoreCase);

    // There is no frequency table for non-ASCII characters, so settle for the last position in range.
    return maxOffset - 1;
}

private static int IndexOfAsciiCharWithLowestFrequency(ReadOnlySpan<char> span, bool ignoreCase, int excludeIndex = -1)
{
float minFrequency = float.MaxValue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ private static SearchValues<string> CreateFromNormalizedValues(
return CreateForSingleValue(values[0], uniqueValues, ignoreCase, allAscii, asciiLettersOnly);
}

if (values.Length == 2 &&
Vector128.IsHardwareAccelerated &&
TryCreateForTwoValues(values, uniqueValues, ignoreCase, allAscii, asciiLettersOnly, nonAsciiAffectedByCaseConversion) is { } twoValuesSearchValues)
{
return twoValuesSearchValues;
}

if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) &&
TryGetTeddyAcceleratedValues(values, uniqueValues, ignoreCase, allAscii, asciiLettersOnly, nonAsciiAffectedByCaseConversion, minLength) is { } searchValues)
{
Expand Down Expand Up @@ -407,6 +414,83 @@ private static SearchValues<string> CreateForSingleValue(
: new SingleStringSearchValuesFallback<SearchValues.FalseConst>(value, uniqueValues);
}

/// <summary>
/// Attempts to create the two-string specialization for the given pair of values.
/// Each value contributes two anchor characters: its first character and one more at an offset
/// chosen via <c>CharacterFrequencyHelper.GetSecondCharacterOffset</c>.
/// </summary>
/// <param name="values">Exactly two values.</param>
/// <param name="uniqueValues">The set of unique values, passed through to the created instance.</param>
/// <param name="ignoreCase">Whether the search ignores case.</param>
/// <param name="allAscii">Whether all values consist solely of ASCII characters.</param>
/// <param name="asciiLettersOnly">Whether all values consist solely of ASCII letters.</param>
/// <param name="nonAsciiAffectedByCaseConversion">Whether the values contain non-ASCII characters that case conversion affects.</param>
/// <returns>
/// The specialized instance, or <see langword="null"/> when the values are not eligible
/// (a value is shorter than 2 characters, or a case-insensitive search lacks ASCII anchors).
/// </returns>
private static SearchValues<string>? TryCreateForTwoValues(
    ReadOnlySpan<string> values,
    HashSet<string> uniqueValues,
    bool ignoreCase,
    bool allAscii,
    bool asciiLettersOnly,
    bool nonAsciiAffectedByCaseConversion)
{
    Debug.Assert(values.Length == 2);

    string value0 = values[0];
    string value1 = values[1];

    // Both values must have at least 2 characters: the first anchor plus one more.
    if (value0.Length < 2 || value1.Length < 2)
    {
        return null;
    }

    // The second-anchor offsets are constrained to the shorter value's length so that both
    // can be used with the same bounds in the vectorized inner loop.
    int minLength = Math.Min(value0.Length, value1.Length);

    // For case-insensitive search, all anchor characters must be ASCII.
    // Verify this before asking for the offsets, and over the same [0, minLength) window the offsets
    // are picked from. Otherwise GetSecondCharacterOffset would be called in violation of its
    // Debug.Assert preconditions (non-ASCII first character, or no ASCII candidate in range).
    // This must not depend on nonAsciiAffectedByCaseConversion: values may contain non-ASCII
    // characters that are unaffected by case conversion but still unusable as anchors.
    if (ignoreCase &&
        !(HasAsciiAnchorCandidates(value0, minLength) && HasAsciiAnchorCandidates(value1, minLength)))
    {
        return null;
    }

    int v0Ch2Offset = CharacterFrequencyHelper.GetSecondCharacterOffset(value0, ignoreCase, minLength);
    int v1Ch2Offset = CharacterFrequencyHelper.GetSecondCharacterOffset(value1, ignoreCase, minLength);

    if (!ignoreCase)
    {
        return new TwoStringSearchValuesThreeChars<CaseSensitive>(uniqueValues, value0, value1, v0Ch2Offset, v1Ch2Offset);
    }

    // Runtime guard on the chosen offsets (the first characters were verified above):
    // the case-insensitive implementations must never be handed a non-ASCII anchor.
    if (!char.IsAscii(value0[v0Ch2Offset]) || !char.IsAscii(value1[v1Ch2Offset]))
    {
        return null;
    }

    if (asciiLettersOnly)
    {
        return new TwoStringSearchValuesThreeChars<CaseInsensitiveAsciiLetters>(uniqueValues, value0, value1, v0Ch2Offset, v1Ch2Offset);
    }

    if (!allAscii && nonAsciiAffectedByCaseConversion)
    {
        return new TwoStringSearchValuesThreeChars<CaseInsensitiveUnicode>(uniqueValues, value0, value1, v0Ch2Offset, v1Ch2Offset);
    }

    // Either everything is ASCII, or the non-ASCII characters present are unaffected by case conversion.
    return new TwoStringSearchValuesThreeChars<CaseInsensitiveAscii>(uniqueValues, value0, value1, v0Ch2Offset, v1Ch2Offset);

    // True when the first character is ASCII and at least one more ASCII character exists in [1, length).
    static bool HasAsciiAnchorCandidates(string value, int length) =>
        char.IsAscii(value[0]) &&
        value.AsSpan(1, length - 1).ContainsAnyInRange((char)0, (char)127);
}

private static SearchValues<string>? TryCreateSingleValuesThreeChars<TValueLength>(
string value,
HashSet<string>? uniqueValues,
Expand Down
Loading
Loading