Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 54 additions & 6 deletions src/libraries/System.Memory/tests/Span/StringSearchValues.cs
Original file line number Diff line number Diff line change
Expand Up @@ -357,9 +357,61 @@ public static void IndexOfAny_InvalidUtf16()
// These should hit the Aho-Corasick implementation
[InlineData("a", "b")]
[InlineData("ab", "c")]
// Simple Teddy cases
// Two-string specialization with same length values (short strings)
[InlineData("ab", "cd")]
[InlineData("ab", "xy")]
// Two-string specialization with same length values (medium strings)
[InlineData("foo", "bar")]
[InlineData("hello", "world")]
[InlineData("test1", "test2")]
// Two-string specialization with same length values (longer strings for vector testing)
[InlineData("abcdefghij", "klmnopqrst")]
[InlineData("abcdefghijklmno", "123456789abcdef")]
[InlineData("helloworld1234", "goodbyeworld56")]
// Two-string specialization with same first character
[InlineData("abc", "axy")]
[InlineData("hello", "happy")]
[InlineData("ab", "ac")]
[InlineData("test", "toad")]
// Two-string specialization with different first characters
[InlineData("abc", "xyz")]
[InlineData("foo", "baz")]
[InlineData("abc", "cde")]
[InlineData("abc", "cd")]
// Two-string specialization with different length values (short/medium)
[InlineData("ab", "abc")]
[InlineData("ab", "abcd")]
[InlineData("foo", "foobar")]
[InlineData("hello", "hi")]
[InlineData("test", "testing123")]
[InlineData("xy", "xyz123")]
[InlineData("abcdefgh", "ab")]
// Two-string specialization with different length values (longer strings)
[InlineData("ab", "abcdefghijklmnop")]
[InlineData("abcdefghijklmnop", "xy")]
[InlineData("hello", "helloworld12345")]
// Two-string specialization with special characters
[InlineData("ab", "!@")]
[InlineData("a!", "b@")]
[InlineData("foo!", "bar?")]
[InlineData("test%", "data#")]
// Two-string specialization with numbers
[InlineData("12", "34")]
[InlineData("123", "456")]
[InlineData("abc123", "def456")]
[InlineData("12ab", "34cd")]
// Two-string specialization with mixed case (will be tested case-sensitive and case-insensitive)
[InlineData("Ab", "Cd")]
[InlineData("Hello", "World")]
[InlineData("ABC", "XYZ")]
// Two-string specialization edge cases
[InlineData("ab", "bc")]
[InlineData("ab", "c!")]
[InlineData("abc", "bc")]
// Simple Teddy cases (3+ strings to ensure Teddy is used, not two-string specialization)
[InlineData("abc", "cde", "efg")]
[InlineData("abc", "cd", "ef")]
[InlineData("ab", "cd", "ef")]
// Teddy where all starting chars are letters, but not all other characters are
[InlineData("ab", "de%", "ghi", "jkl!")]
[InlineData("abc", "def%", "ghi", "jkl!")]
Expand All @@ -370,13 +422,9 @@ public static void IndexOfAny_InvalidUtf16()
[InlineData("12", "45b", "789")]
[InlineData("123", "456", "789")]
[InlineData("123", "456a", "789b")]
// We'll expand these values to all case permutations
[InlineData("ab", "bc")]
[InlineData("ab", "c!")]
// These will be expanded to all case permutations (3+ strings to test Teddy)
[InlineData("ab", "c!", "!%")]
// These won't be expanded as they would produce more than 8 permutations
[InlineData("ab", "bc", "c!")]
[InlineData("abc", "bc")]
// Rabin-Karp where one of the values is longer than what the implementation can match (17)
[InlineData("abc", "a012345678012345678")]
// Rabin-Karp where all of the values are longer than what the implementation can match (17)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesPackedThreeChars.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesThreeChars.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesFallback.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\TwoStringSearchValuesPackedThreeChars.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValues.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesBase.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesAhoCorasick.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,43 @@ public static void GetSingleStringMultiCharacterOffsets(string value, bool ignor
}
}

/// <summary>
/// Picks the offset of the second anchor character for a two-string search, considering the whole of <paramref name="value"/>.
/// The first character is always used as an anchor, so only one additional offset is needed per string.
/// </summary>
/// <param name="value">The string to pick the second anchor character from.</param>
/// <param name="ignoreCase">Whether the search is case-insensitive.</param>
/// <returns>The offset of the second anchor character within <paramref name="value"/>.</returns>
public static int GetSecondCharacterOffset(string value, bool ignoreCase)
{
    // No extra constraint: the candidate range extends to the end of the value.
    return GetSecondCharacterOffset(value, ignoreCase, maxOffset: value.Length);
}

/// <summary>
/// Picks the offset of the second anchor character for a two-string search where the values differ in length.
/// The result is always below <paramref name="maxOffset"/>, which lets the inner loop use the same offset bounds for both values.
/// </summary>
/// <param name="value">The string to search for anchor characters.</param>
/// <param name="ignoreCase">Whether the search is case-insensitive.</param>
/// <param name="maxOffset">
/// The exclusive upper bound for the search range (must be at least 2 and at most the length of <paramref name="value"/>).
/// When maxOffset is 2, only position 1 is considered for the second anchor since position 0 is always the first anchor.
/// </param>
/// <returns>
/// The offset reported by <c>IndexOfAsciiCharWithLowestFrequency</c> for the first <paramref name="maxOffset"/> characters,
/// or <c>maxOffset - 1</c> when that helper reports no candidate (a negative index).
/// </returns>
public static int GetSecondCharacterOffset(string value, bool ignoreCase, int maxOffset)
{
    Debug.Assert(value.Length > 1);
    // At least one position (index 1) must be available for the second anchor.
    Debug.Assert(maxOffset >= 2 && maxOffset <= value.Length);
    Debug.Assert(!ignoreCase || char.IsAscii(value[0]));

    ReadOnlySpan<char> candidates = value.AsSpan(0, maxOffset);
    int lowestFrequencyOffset = IndexOfAsciiCharWithLowestFrequency(candidates, ignoreCase);

    if (lowestFrequencyOffset >= 0)
    {
        return lowestFrequencyOffset;
    }

    // Fewer than 2 ASCII chars are present within maxOffset; this is only expected for case-sensitive searches.
    Debug.Assert(!ignoreCase);

    // There is no frequency table for non-ASCII characters, so settle for the last position in range.
    return maxOffset - 1;
}

private static int IndexOfAsciiCharWithLowestFrequency(ReadOnlySpan<char> span, bool ignoreCase, int excludeIndex = -1)
{
float minFrequency = float.MaxValue;
Expand Down Expand Up @@ -123,5 +160,66 @@ private static int IndexOfAsciiCharWithLowestFrequency(ReadOnlySpan<char> span,

return minIndex;
}

/// <summary>
/// For a two-string search, selects a single second-character offset shared by both values:
/// the offset whose characters have the lowest combined frequency across <paramref name="value0"/> and <paramref name="value1"/>.
/// Sharing one offset reduces the number of vector loads in the inner loop from 3 to 2.
/// </summary>
/// <param name="value0">First search string.</param>
/// <param name="value1">Second search string.</param>
/// <param name="ignoreCase">Whether the search is case-insensitive.</param>
/// <param name="maxOffset">The exclusive upper bound for the offset (typically min(value0.Length, value1.Length)).</param>
/// <returns>
/// The offset (1 to maxOffset-1) with the lowest combined frequency, or <c>maxOffset - 1</c>
/// when no offset has an ASCII character in both values.
/// </returns>
public static int GetSharedSecondCharacterOffset(string value0, string value1, bool ignoreCase, int maxOffset)
{
    Debug.Assert(value0.Length > 1);
    Debug.Assert(value1.Length > 1);
    Debug.Assert(maxOffset >= 2 && maxOffset <= Math.Min(value0.Length, value1.Length));

    // Fallback when no offset qualifies: the last offset in range, i.e. the one furthest from position 0.
    int selectedOffset = maxOffset - 1;
    float lowestScore = float.MaxValue;

    // Offset 0 is skipped because the first character is always the first anchor.
    for (int offset = 1; offset < maxOffset; offset++)
    {
        char first = value0[offset];
        char second = value1[offset];

        // Frequencies are only available for ASCII, so both characters at this offset must be ASCII.
        if (char.IsAscii(first) && char.IsAscii(second))
        {
            float firstScore = AsciiFrequency[first];
            float secondScore = AsciiFrequency[second];

            if (ignoreCase)
            {
                // Also account for the character with the 0x20 bit flipped.
                firstScore += AsciiFrequency[first ^ 0x20];
                secondScore += AsciiFrequency[second ^ 0x20];
            }

            if (offset <= 2)
            {
                // Offsets close to the start of the value are penalized.
                firstScore *= 1.5f;
                secondScore *= 1.5f;
            }

            float score = firstScore + secondScore;

            // Strict comparison: on a tie the earlier offset is kept.
            if (score < lowestScore)
            {
                lowestScore = score;
                selectedOffset = offset;
            }
        }
    }

    return selectedOffset;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ private static SearchValues<string> CreateFromNormalizedValues(
return CreateForSingleValue(values[0], uniqueValues, ignoreCase, allAscii, asciiLettersOnly);
}

if (values.Length == 2 &&
Vector128.IsHardwareAccelerated &&
TryCreateForTwoValues(values, uniqueValues, ignoreCase, allAscii, asciiLettersOnly, nonAsciiAffectedByCaseConversion) is { } twoValuesSearchValues)
{
return twoValuesSearchValues;
}

if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) &&
TryGetTeddyAcceleratedValues(values, uniqueValues, ignoreCase, allAscii, asciiLettersOnly, nonAsciiAffectedByCaseConversion, minLength) is { } searchValues)
{
Expand Down Expand Up @@ -407,6 +414,100 @@ private static SearchValues<string> CreateForSingleValue(
: new SingleStringSearchValuesFallback<SearchValues.FalseConst>(value, uniqueValues);
}

/// <summary>
/// Attempts to create a <c>TwoStringSearchValuesPackedThreeChars</c> instance specialized for exactly two values.
/// </summary>
/// <param name="values">The two values to search for.</param>
/// <param name="uniqueValues">The set of unique values, passed through to the created instance.</param>
/// <param name="ignoreCase">Whether the search is case-insensitive.</param>
/// <param name="allAscii">Flag from value analysis; selects <c>CaseInsensitiveAscii</c> for case-insensitive searches.</param>
/// <param name="asciiLettersOnly">Flag from value analysis; selects <c>CaseInsensitiveAsciiLetters</c> for case-insensitive searches.</param>
/// <param name="nonAsciiAffectedByCaseConversion">Flag from value analysis; requires ASCII anchors and selects <c>CaseInsensitiveUnicode</c> when the values are not all ASCII.</param>
/// <returns>The specialized instance, or <see langword="null"/> when the values or the hardware don't meet its requirements.</returns>
private static SearchValues<string>? TryCreateForTwoValues(
    ReadOnlySpan<string> values,
    HashSet<string> uniqueValues,
    bool ignoreCase,
    bool allAscii,
    bool asciiLettersOnly,
    bool nonAsciiAffectedByCaseConversion)
{
    Debug.Assert(values.Length == 2);

    // The packed implementation needs SSE2 or ARM64 AdvSimd.
    if (!(Sse2.IsSupported || AdvSimd.Arm64.IsSupported))
    {
        return null;
    }

    string first = values[0];
    string second = values[1];

    // Each value needs room for two anchor characters.
    if (first.Length < 2 || second.Length < 2)
    {
        return null;
    }

    // The shared offset must be valid for the shorter value so that the
    // vectorized inner loop can use the same bounds for both values.
    int sharedLength = Math.Min(first.Length, second.Length);

    if (nonAsciiAffectedByCaseConversion)
    {
        // Both values need an ASCII first character and at least one more ASCII character
        // within the shared range to serve as anchors.
        bool hasAsciiAnchors =
            char.IsAscii(first[0]) &&
            char.IsAscii(second[0]) &&
            first.AsSpan(1, sharedLength - 1).ContainsAnyInRange((char)0, (char)127) &&
            second.AsSpan(1, sharedLength - 1).ContainsAnyInRange((char)0, (char)127);

        if (!hasAsciiAnchors)
        {
            return null;
        }
    }

    // A single offset shared by both values reduces vector loads in the inner loop from 3 to 2.
    int ch2Offset = CharacterFrequencyHelper.GetSharedSecondCharacterOffset(first, second, ignoreCase, sharedLength);

    char firstAnchor0 = first[0];
    char firstAnchor1 = first[ch2Offset];
    char secondAnchor0 = second[0];
    char secondAnchor1 = second[ch2Offset];

    // Every anchor character must be usable by the packed implementation.
    bool anchorsPackable =
        CanUsePackedImpl(firstAnchor0) &&
        CanUsePackedImpl(firstAnchor1) &&
        CanUsePackedImpl(secondAnchor0) &&
        CanUsePackedImpl(secondAnchor1);

    if (!anchorsPackable)
    {
        return null;
    }

    if (!ignoreCase)
    {
        return new TwoStringSearchValuesPackedThreeChars<CaseSensitive>(uniqueValues, first, second, ch2Offset);
    }

    // Case-insensitive searches additionally require every anchor character to be ASCII.
    bool anchorsAscii =
        char.IsAscii(firstAnchor0) &&
        char.IsAscii(firstAnchor1) &&
        char.IsAscii(secondAnchor0) &&
        char.IsAscii(secondAnchor1);

    if (!anchorsAscii)
    {
        return null;
    }

    if (asciiLettersOnly)
    {
        return new TwoStringSearchValuesPackedThreeChars<CaseInsensitiveAsciiLetters>(uniqueValues, first, second, ch2Offset);
    }

    // CaseInsensitiveUnicode is only selected when the values are not all ASCII and
    // nonAsciiAffectedByCaseConversion is set; every remaining case uses CaseInsensitiveAscii.
    if (!allAscii && nonAsciiAffectedByCaseConversion)
    {
        return new TwoStringSearchValuesPackedThreeChars<CaseInsensitiveUnicode>(uniqueValues, first, second, ch2Offset);
    }

    return new TwoStringSearchValuesPackedThreeChars<CaseInsensitiveAscii>(uniqueValues, first, second, ch2Offset);
}

// Determines whether a character can be used as an anchor by the packed implementations.
// Unlike with PackedSpanHelpers (Sse2 only), this approach is also used on ARM64.
// PackUnsignedSaturate is used on X86 and UnzipEven on ARM, so the set of allowed characters
// differs slightly (\0 and \xFF can't be used on X86).
private static bool CanUsePackedImpl(char c)
{
    if (PackedSpanHelpers.PackedIndexOfIsSupported)
    {
        return PackedSpanHelpers.CanUsePackedIndexOf(c);
    }

    // Without PackedSpanHelpers support, only ARM64 qualifies, and the character must fit in a byte.
    return AdvSimd.Arm64.IsSupported && c <= byte.MaxValue;
}

private static SearchValues<string>? TryCreateSingleValuesThreeChars<TValueLength>(
string value,
HashSet<string>? uniqueValues,
Expand Down Expand Up @@ -454,12 +555,6 @@ private static SearchValues<string> CreateSingleValuesThreeChars<TValueLength, T
}

return new SingleStringSearchValuesThreeChars<TValueLength, TCaseSensitivity>(uniqueValues, value, ch2Offset, ch3Offset);

// Unlike with PackedSpanHelpers (Sse2 only), we are also using this approach on ARM64.
// We use PackUnsignedSaturate on X86 and UnzipEven on ARM, so the set of allowed characters differs slightly (we can't use it for \0 and \xFF on X86).
static bool CanUsePackedImpl(char c) =>
PackedSpanHelpers.PackedIndexOfIsSupported ? PackedSpanHelpers.CanUsePackedIndexOf(c) :
(AdvSimd.Arm64.IsSupported && c <= byte.MaxValue);
}

private static void AnalyzeValues(
Expand Down
Loading
Loading