diff --git a/src/module-elasticsuite-thesaurus/Config/ThesaurusStemmingConfig.php b/src/module-elasticsuite-thesaurus/Config/ThesaurusStemmingConfig.php new file mode 100644 index 000000000..4fab70c04 --- /dev/null +++ b/src/module-elasticsuite-thesaurus/Config/ThesaurusStemmingConfig.php @@ -0,0 +1,62 @@ + + * @copyright 2024 Smile + * @license Open Software License ("OSL") v. 3.0 + */ + +namespace Smile\ElasticsuiteThesaurus\Config; + +use Magento\Framework\App\Config\ScopeConfigInterface; +use Magento\Store\Model\ScopeInterface; +use Smile\ElasticsuiteCore\Api\Search\Request\ContainerConfigurationInterface; + +/** + * Thesaurus stemming configuration helper. + * + * @category Smile + * @package Smile\ElasticsuiteThesaurus + * @author Pierre Gauthier + */ +class ThesaurusStemmingConfig +{ + /** @var string */ + const THESAURUS_ANALYSIS_USE_STEMMING_XML_PATH = 'smile_elasticsuite_thesaurus_settings/analysis/use_stemming'; + + /** + * @var ScopeConfigInterface + */ + private $scopeConfig; + + /** + * Constructor. + * + * @param ScopeConfigInterface $scopeConfig Scope config interface. + */ + public function __construct(ScopeConfigInterface $scopeConfig) + { + $this->scopeConfig = $scopeConfig; + } + + /** + * Returns true if stemming should be used for synonyms and expansions matching. + * + * @param integer $storeId Store id. + * + * @return bool + */ + public function useStemming($storeId): bool + { + return $this->scopeConfig->isSetFlag( + self::THESAURUS_ANALYSIS_USE_STEMMING_XML_PATH, + ScopeInterface::SCOPE_STORE, + $storeId + ); + } +} diff --git a/src/module-elasticsuite-thesaurus/Helper/Text.php b/src/module-elasticsuite-thesaurus/Helper/Text.php new file mode 100644 index 000000000..f222c2ae1 --- /dev/null +++ b/src/module-elasticsuite-thesaurus/Helper/Text.php @@ -0,0 +1,64 @@ + + * @copyright 2025 Smile + * @license Open Software License ("OSL") v. 3.0 + */ + +namespace Smile\ElasticsuiteThesaurus\Helper; + +/** + * Text manipulation helper. 
+ * + * @category Smile + * @package Smile\ElasticsuiteThesaurus + * @author Pierre Gauthier + */ +class Text +{ + /** + * Partial implementation of a multi-byte aware version of substr_replace. + * Required because the tokens offsets used as for parameters start and length + * are expressed as a number of (UTF-8) characters, independently of the number of bytes. + * Does not accept arrays as first and second parameters. + * Source: https://github.com/fluxbb/utf8/blob/master/functions/substr_replace.php + * Alternative: https://gist.github.com/bantya/563d7d070c286ba1b5a83b9036f0561a + * + * @param string $string Input string + * @param string $replacement Replacement string + * @param mixed $start Start offset + * @param mixed $length Length of replacement + * + * @return mixed + */ + public function mbSubstrReplace($string, $replacement, $start, $length = null) + { + preg_match_all('/./us', $string, $stringChars); + preg_match_all('/./us', $replacement, $replacementChars); + $length = is_int($length) ? $length : mb_strlen($string); + array_splice($stringChars[0], $start, $length, $replacementChars[0]); + + return implode($stringChars[0]); + } + + /** + * Count the number of words in a given text. + * + * @param string $text The input text. 
+ * + * @return int + */ + public function mbWordCount(string $text): int + { + preg_match_all('/[\p{L}\p{N}\']+/u', $text, $matches); + + return count($matches[0]); + } +} diff --git a/src/module-elasticsuite-thesaurus/Model/Index.php b/src/module-elasticsuite-thesaurus/Model/Index.php index 78b79fefd..64c01d0bc 100644 --- a/src/module-elasticsuite-thesaurus/Model/Index.php +++ b/src/module-elasticsuite-thesaurus/Model/Index.php @@ -14,14 +14,16 @@ namespace Smile\ElasticsuiteThesaurus\Model; -use Smile\ElasticsuiteCore\Helper\IndexSettings as IndexSettingsHelper; use Smile\ElasticsuiteCore\Api\Client\ClientInterface; use Smile\ElasticsuiteCore\Api\Search\Request\ContainerConfigurationInterface; -use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfigFactory; -use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfig; -use Smile\ElasticsuiteThesaurus\Config\ThesaurusCacheConfig; -use Smile\ElasticsuiteThesaurus\Api\Data\ThesaurusInterface; use Smile\ElasticsuiteCore\Helper\Cache as CacheHelper; +use Smile\ElasticsuiteCore\Helper\IndexSettings as IndexSettingsHelper; +use Smile\ElasticsuiteThesaurus\Api\Data\ThesaurusInterface; +use Smile\ElasticsuiteThesaurus\Config\ThesaurusCacheConfig; +use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfig; +use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfigFactory; +use Smile\ElasticsuiteThesaurus\Config\ThesaurusStemmingConfig; +use Smile\ElasticsuiteThesaurus\Helper\Text as TextHelper; /** * Thesaurus index. @@ -72,27 +74,43 @@ class Index */ private $thesaurusCacheConfig; + /** + * @var TextHelper + */ + private $textHelper; + + /** + * @var ThesaurusStemmingConfig + */ + private $stemmingConfig; + /** * Constructor. * - * @param ClientInterface $client ES client. - * @param IndexSettingsHelper $indexSettingsHelper Index Settings Helper. - * @param CacheHelper $cacheHelper ES caching helper. - * @param ThesaurusConfigFactory $thesaurusConfigFactory Thesaurus configuration factory. 
- * @param ThesaurusCacheConfig $thesaurusCacheConfig Thesaurus cache configuration helper. + * @param ClientInterface $client ES client. + * @param IndexSettingsHelper $indexSettingsHelper Index Settings Helper. + * @param CacheHelper $cacheHelper ES caching helper. + * @param ThesaurusConfigFactory $thesaurusConfigFactory Thesaurus configuration factory. + * @param ThesaurusCacheConfig $thesaurusCacheConfig Thesaurus cache configuration helper. + * @param TextHelper $textHelper Text manipulation helper. + * @param ThesaurusStemmingConfig $stemmingConfig Stemming configuration. */ public function __construct( ClientInterface $client, IndexSettingsHelper $indexSettingsHelper, CacheHelper $cacheHelper, ThesaurusConfigFactory $thesaurusConfigFactory, - ThesaurusCacheConfig $thesaurusCacheConfig + ThesaurusCacheConfig $thesaurusCacheConfig, + TextHelper $textHelper, + ThesaurusStemmingConfig $stemmingConfig ) { $this->client = $client; $this->indexSettingsHelper = $indexSettingsHelper; $this->thesaurusConfigFactory = $thesaurusConfigFactory; $this->cacheHelper = $cacheHelper; $this->thesaurusCacheConfig = $thesaurusCacheConfig; + $this->textHelper = $textHelper; + $this->stemmingConfig = $stemmingConfig; } /** @@ -275,29 +293,47 @@ private function getSynonymRewrites($storeId, $queryText, $type, $maxRewrites) */ private function getQueryCombinations($storeId, $queryText) { - if (str_word_count($queryText) < 2) { + // No need to compute variations of shingles with a one-word-query but we need to extract the stem if enabled. + if (!$this->stemmingConfig->useStemming($storeId) && $this->textHelper->mbWordCount($queryText) < 2) { return [$queryText]; // No need to compute variations of shingles with a one-word-query. } - // Generate the shingles. - // If we analyze "long sleeve dress", we'll obtain "long_sleeve", and "sleeve_dress". - // We'll also obtain the position (start_offset and end_offset) of those shingles in the original string. 
$indexName = $this->getIndexAlias($storeId); + + // Get the stem of the search term try { $analysis = $this->client->analyze( - ['index' => $indexName, 'body' => ['text' => $queryText, 'analyzer' => 'shingles']] + ['index' => $indexName, 'body' => ['text' => $queryText, 'analyzer' => 'clean']] ); } catch (\Exception $e) { $analysis = ['tokens' => []]; } + $tokens = $analysis['tokens'] ?? []; + + // Generate the shingles (multi-word queries only: a one-word query has no shingle to detect). + // If we analyze "long sleeve dress", we'll obtain "long_sleeve", and "sleeve_dress". + // If stemming is enabled, analysis of "dresses" will give "dress" + // and analysis of "long sleeves" will give "long_sleev" + // We'll also obtain the position (start_offset and end_offset) of those shingles in the original string. + if ($this->textHelper->mbWordCount($queryText) >= 2) { + try { + $analysis = $this->client->analyze( + ['index' => $indexName, 'body' => ['text' => $queryText, 'analyzer' => 'shingles']] + ); + } catch (\Exception $e) { + $analysis = ['tokens' => []]; + } + + $tokens = array_merge($tokens, $analysis['tokens'] ?? []); + } // Get all variations of the query text by injecting the shingles inside. // $tokens = ['long sleeve dress', 'long_sleeve dress', 'long sleeve_dress'];. $queries[] = $queryText; - foreach ($analysis['tokens'] ?? 
[] as $token) { + foreach ($tokens as $token) { $startOffset = $token['start_offset']; $length = $token['end_offset'] - $token['start_offset']; - $rewrittenQueryText = $this->mbSubstrReplace($queryText, $token['token'], $startOffset, $length); + $rewrittenQueryText = $this->textHelper->mbSubstrReplace($queryText, $token['token'], $startOffset, $length); $queries[] = $rewrittenQueryText; } $queries = array_unique($queries); @@ -327,7 +363,7 @@ private function combineSynonyms($queryText, $synonymByPositions, $maxRewrites, foreach ($currentPositionSynonyms as $synonym) { $startOffset = $synonym['start_offset'] + $offset; $length = $synonym['end_offset'] - $synonym['start_offset']; - $rewrittenQueryText = $this->mbSubstrReplace($queryText, $synonym['token'], $startOffset, $length); + $rewrittenQueryText = $this->textHelper->mbSubstrReplace($queryText, $synonym['token'], $startOffset, $length); $newOffset = mb_strlen($rewrittenQueryText) - mb_strlen($queryText) + $offset; $combinations[$rewrittenQueryText] = $substitutions + 1; @@ -367,29 +403,4 @@ private function getWeightedRewrites($queryRewrites, $divider, $baseWeight = 1) return array_map($mapper, $queryRewrites); } - - /** - * Partial implementation of a multi-byte aware version of substr_replace. - * Required because the tokens offsets used as for parameters start and length - * are expressed as a number of (UTF-8) characters, independently of the number of bytes. - * Does not accept arrays as first and second parameters. 
- * Source: https://github.com/fluxbb/utf8/blob/master/functions/substr_replace.php - * Alternative: https://gist.github.com/bantya/563d7d070c286ba1b5a83b9036f0561a - * - * @param string $string Input string - * @param string $replacement Replacement string - * @param mixed $start Start offset - * @param mixed $length Length of replacement - * - * @return mixed - */ - private function mbSubstrReplace($string, $replacement, $start, $length = null) - { - preg_match_all('/./us', $string, $stringChars); - preg_match_all('/./us', $replacement, $replacementChars); - $length = is_int($length) ? $length : mb_strlen($string); - array_splice($stringChars[0], $start, $length, $replacementChars[0]); - - return implode($stringChars[0]); - } } diff --git a/src/module-elasticsuite-thesaurus/Model/Indexer/IndexHandler.php b/src/module-elasticsuite-thesaurus/Model/Indexer/IndexHandler.php index efb56ed76..6c52edd59 100644 --- a/src/module-elasticsuite-thesaurus/Model/Indexer/IndexHandler.php +++ b/src/module-elasticsuite-thesaurus/Model/Indexer/IndexHandler.php @@ -14,13 +14,13 @@ namespace Smile\ElasticsuiteThesaurus\Model\Indexer; -use Magento\Framework\App\Config\ScopeConfigInterface; -use Magento\Store\Model\ScopeInterface; use Smile\ElasticsuiteCore\Api\Client\ClientInterface; -use Smile\ElasticsuiteCore\Helper\IndexSettings as IndexSettingsHelper; -use Smile\ElasticsuiteCore\Helper\Cache as CacheHelper; use Smile\ElasticsuiteCore\Api\Index\IndexOperationInterface; use Smile\ElasticsuiteCore\Api\Index\IndexSettingsInterface; +use Smile\ElasticsuiteCore\Helper\Cache as CacheHelper; +use Smile\ElasticsuiteCore\Helper\IndexSettings as IndexSettingsHelper; +use Smile\ElasticsuiteThesaurus\Config\ThesaurusStemmingConfig; +use Smile\ElasticsuiteThesaurus\Helper\Text as TextHelper; use Smile\ElasticsuiteThesaurus\Model\Index as ThesaurusIndex; /** @@ -32,11 +32,6 @@ */ class IndexHandler { - /** - * @var string - */ - const THESAURUS_ANALYSIS_USE_STEMMING_XML_PATH = 
'smile_elasticsuite_thesaurus_settings/analysis/use_stemming'; - /** * @var string */ @@ -78,15 +73,20 @@ class IndexHandler private $indexSettings; /** - * @var ScopeConfigInterface + * @var ThesaurusStemmingConfig */ - private $scopeConfig; + private $stemmingConfig; /** * @var CacheHelper */ private $cacheHelper; + /** + * @var TextHelper + */ + private $textHelper; + /** * Constructor. * @@ -94,23 +94,26 @@ class IndexHandler * @param IndexOperationInterface $indexManager ES index management tool * @param IndexSettingsHelper $indexSettingsHelper Index settings helper. * @param IndexSettingsInterface $indexSettings Index settings provider. - * @param ScopeConfigInterface $scopeConfig Scope config interface. + * @param ThesaurusStemmingConfig $stemmingConfig Stemming configuration. * @param CacheHelper $cacheHelper ES caching helper. + * @param TextHelper $textHelper Text manipulation helper. */ public function __construct( ClientInterface $client, IndexOperationInterface $indexManager, IndexSettingsHelper $indexSettingsHelper, IndexSettingsInterface $indexSettings, - ScopeConfigInterface $scopeConfig, - CacheHelper $cacheHelper + ThesaurusStemmingConfig $stemmingConfig, + CacheHelper $cacheHelper, + TextHelper $textHelper ) { $this->client = $client; $this->indexSettingsHelper = $indexSettingsHelper; $this->indexManager = $indexManager; $this->indexSettings = $indexSettings; - $this->scopeConfig = $scopeConfig; + $this->stemmingConfig = $stemmingConfig; $this->cacheHelper = $cacheHelper; + $this->textHelper = $textHelper; } /** @@ -174,37 +177,22 @@ private function getIndexSettings($storeId, $synonyms, $expansions) ]; $stemmingFilters = []; - if ($this->useStemming($storeId)) { + if ($this->stemmingConfig->useStemming($storeId)) { $stemmingFilters = $this->getStemmingTokenFilters($storeId); if (!empty($stemmingFilters)) { $settings['analysis']['filter'] = array_merge($settings['analysis']['filter'], $stemmingFilters); } } + $settings = 
$this->addCleanAnalyzer($settings, array_keys($stemmingFilters)); $settings = $this->addShinglesAnalyzer($settings, array_keys($stemmingFilters)); - $settings = $this->addAnalyzerSettings($settings, 'synonym', $synonyms, array_keys($stemmingFilters)); - $settings = $this->addAnalyzerSettings($settings, 'expansion', $expansions, array_keys($stemmingFilters)); + $settings = $this->addAnalyzerSettings($settings, 'synonym', $synonyms, $stemmingFilters); + $settings = $this->addAnalyzerSettings($settings, 'expansion', $expansions, $stemmingFilters); return $settings; } - /** - * Returns true if stemming should be used for synonyms and expansions matching. - * - * @param integer $storeId Store id. - * - * @return bool - */ - private function useStemming($storeId): bool - { - return $this->scopeConfig->isSetFlag( - self::THESAURUS_ANALYSIS_USE_STEMMING_XML_PATH, - ScopeInterface::SCOPE_STORE, - $storeId - ); - } - /** * Retrieves the stemming related token filters defined in the standard analysis configuration for the store. * @@ -233,26 +221,51 @@ private function getStemmingTokenFilters($storeId): array return $tokenFilters; } + /** + * Append the analyzer dedicated to match clean token to the existing settings. + * + * @param array $settings Original settings. + * @param array $stemmingFilters Stemming token filters to add in the analysis chain. + * + * @return array + */ + private function addCleanAnalyzer($settings, $stemmingFilters) + { + $settings['analysis']['analyzer']['clean'] = [ + 'tokenizer' => 'whitespace', + 'filter' => ['lowercase', 'asciifolding'], + ]; + + if (!empty($stemmingFilters)) { + $settings['analysis']['analyzer']['clean']['filter'] = array_merge( + $settings['analysis']['analyzer']['clean']['filter'], + $stemmingFilters + ); + } + + return $settings; + } + /** * Append the analyzer dedicated to match and detect multi-words synonyms ("A B,C" or "A-B,C") * to the existing settings. * - * @param array $settings Original settings. 
- * @param array $preFilters Token filters to add in the analysis chain _before_ the shingles detection. + * @param array $settings Original settings. + * @param array $stemmingFilters Stemming token filters to add in the analysis chain _before_ the shingles detection. * * @return array */ - private function addShinglesAnalyzer($settings, $preFilters) + private function addShinglesAnalyzer($settings, $stemmingFilters) { $settings['analysis']['analyzer']['shingles'] = [ 'tokenizer' => 'whitespace', 'filter' => ['lowercase', 'asciifolding'], ]; - if (!empty($preFilters)) { + if (!empty($stemmingFilters)) { $settings['analysis']['analyzer']['shingles']['filter'] = array_merge( $settings['analysis']['analyzer']['shingles']['filter'], - $preFilters + $stemmingFilters ); } @@ -264,29 +277,22 @@ private function addShinglesAnalyzer($settings, $preFilters) /** * Append an analyzer for a thesaurus to existing settings. * - * @param array $settings Original settings. - * @param string $type Thesaurus type. - * @param string[] $values Thesaurus entries in Lucene format. - * @param array $preFilters Token filters to add in the analysis chain _before_ the synonym/expansion filter. + * @param array $settings Original settings. + * @param string $type Thesaurus type. + * @param string[] $values Thesaurus entries in Lucene format. + * @param array $stemmingFilters Stemming token filters to add in the analysis chain _before_ the synonym/expansion filter. 
* * @return array */ - private function addAnalyzerSettings($settings, $type, $values, $preFilters) + private function addAnalyzerSettings($settings, $type, $values, $stemmingFilters) { $settings['analysis']['analyzer'][$type] = [ 'tokenizer' => 'whitespace', 'filter' => ['lowercase', 'asciifolding'], ]; - if (!empty($preFilters)) { - $settings['analysis']['analyzer'][$type]['filter'] = array_merge( - $settings['analysis']['analyzer'][$type]['filter'], - $preFilters - ); - } - if (!empty($values)) { - $values = $this->prepareSynonymFilterData($values); + $values = $this->prepareSynonymFilterData($values, $stemmingFilters); $settings['analysis']['filter'][$type] = ['type' => 'synonym', 'synonyms' => $values, 'lenient' => true]; $settings['analysis']['analyzer'][$type]['filter'][] = $type; } @@ -300,17 +306,72 @@ private function addAnalyzerSettings($settings, $type, $values, $preFilters) /** * Prepare the thesaurus data to be saved. * Spaces and hyphens are replaced with "_" into multiwords expression (ex foo bar => foo_bar). + * Applies stemming filters if provided. * - * @param string[] $rows Original thesaurus text rows. + * @param string[] $rows Original thesaurus text rows. + * @param array $stemmingFilters Stemming token filters to apply before synonym processing. * * @return string[] */ - private function prepareSynonymFilterData($rows) + private function prepareSynonymFilterData($rows, $stemmingFilters) { - $rowMapper = function ($row) { + if (empty($rows)) { + return $rows; + } + + return array_map(function ($row) use ($stemmingFilters) { + if (!empty($stemmingFilters)) { + $row = $this->extractStemsFromThesaurusRow($row, $stemmingFilters); + } + return preg_replace('/([^\s-])[\s-]+(?=[^\s-])/u', '\1_', $row); - }; + }, $rows); + } + + /** + * Analyze an entire thesaurus row using Elasticsearch analyze API to extract stems. + * + * @param string $row Row to analyze. + * @param array $stemmingFilters Stemming token filters configuration. 
+ * + * @return string Analyzed row with stemmed terms. + */ + private function extractStemsFromThesaurusRow($row, $stemmingFilters) + { + // Build an analysis query with stemming filters; separators are blanked with same-length padding so token offsets still map onto $row. + $analyzeParams = [ + 'body' => [ + 'tokenizer' => 'whitespace', + 'filter' => array_merge(['lowercase', 'asciifolding'], array_values($stemmingFilters)), + 'char_filter' => [], + 'text' => str_replace(',', ' ', str_replace('=>', '  ', $row)), + ], + ]; + + $response = $this->client->analyze($analyzeParams); + + // Replace original terms with their stemmed counterparts in the thesaurus row. + if (!empty($response['tokens'])) { + $rewrittenRow = $row; + $offset = 0; + + foreach ($response['tokens'] as $token) { + $startOffset = $token['start_offset']; + $length = $token['end_offset'] - $token['start_offset']; + + $rewrittenRow = $this->textHelper->mbSubstrReplace( + $rewrittenRow, + $token['token'], + $startOffset + $offset, + $length + ); + + $offset += mb_strlen($token['token']) - $length; + } + + return $rewrittenRow; + } - return array_map($rowMapper, $rows); + return $row; } } diff --git a/src/module-elasticsuite-thesaurus/Test/Unit/Model/IndexTest.php b/src/module-elasticsuite-thesaurus/Test/Unit/Model/IndexTest.php index 6c7f408b5..4c890f52a 100644 --- a/src/module-elasticsuite-thesaurus/Test/Unit/Model/IndexTest.php +++ b/src/module-elasticsuite-thesaurus/Test/Unit/Model/IndexTest.php @@ -24,6 +24,8 @@ use Smile\ElasticsuiteThesaurus\Config\ThesaurusCacheConfig; use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfig; use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfigFactory; +use Smile\ElasticsuiteThesaurus\Config\ThesaurusStemmingConfig; +use Smile\ElasticsuiteThesaurus\Helper\Text as TextHelper; use Smile\ElasticsuiteThesaurus\Model\Index as ThesaurusIndex; /** @@ -102,12 +104,19 @@ public function testCacheUsageNoRewrites( $containerConfig->method('getStoreId')->willReturn($storeId); 
$containerConfig->method('getName')->willReturn($containerName); + $stemmingConfig = $this->getMockBuilder(ThesaurusStemmingConfig::class) + ->disableOriginalConstructor() + ->getMock(); + $stemmingConfig->method('useStemming')->willReturn(true); + $thesaurusIndex = new ThesaurusIndex( $clientMock, $indexSettingsHelperMock, $cacheHelperMock, $thesaurusConfigFactoryMock, - $thesaurusCacheConfigMock + $thesaurusCacheConfigMock, + new TextHelper(), + $stemmingConfig ); $cacheKey = implode('|', [$indexAlias, $containerName, $queryText]); @@ -141,13 +150,13 @@ public function noRewriteDataProvider() 0, [], [], ], ['foo', true, 10, false, 10, 2, - 1, [[]], [], + 2, [[]], [], ], ['foo', true, 10, true, 10, 2, - 2, [[], []], [], + 4, [[], []], [], ], ['foo', true, 10, true, 10, 2, - 2, [[], []], [], false, + 4, [[], []], [], false, ], ]; } @@ -229,12 +238,19 @@ function ($mapItem) use ($indexAlias) { $containerConfig->method('getStoreId')->willReturn($storeId); $containerConfig->method('getName')->willReturn($containerName); + $stemmingConfig = $this->getMockBuilder(ThesaurusStemmingConfig::class) + ->disableOriginalConstructor() + ->getMock(); + $stemmingConfig->method('useStemming')->willReturn(true); + $thesaurusIndex = new ThesaurusIndex( $clientMock, $indexSettingsHelperMock, $cacheHelperMock, $thesaurusConfigFactoryMock, - $thesaurusCacheConfigMock + $thesaurusCacheConfigMock, + new TextHelper(), + $stemmingConfig ); $cacheKey = implode('|', array_merge([$indexAlias, $containerName], [$queryText])); @@ -270,8 +286,19 @@ public function singleLevelRewritesDataProvider() ], // Only synonyms enabled. Simulating 'foo,bar,baz'. ['foo', true, 10, false, 10, 2, - 1, + 2, [ + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], [ 'tokens' => [ [ @@ -299,9 +326,23 @@ public function singleLevelRewritesDataProvider() // Only synonyms enabled. Simulating 'foo,bar,baz'. 
// Same test as before, but client->analyze returns expressed as a mapping. ['foo', true, 10, false, 10, 2, - 1, + 2, [ 'map' => [ + [ + ['text' => 'foo', 'analyzer' => 'clean'], + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], + ], [ ['text' => 'foo', 'analyzer' => 'synonym'], [ @@ -332,8 +373,22 @@ public function singleLevelRewritesDataProvider() ], // Only expansions enabled. Simulating 'foo => bar,baz'. ['foo', false, 10, true, 10, 2, - 1, + 2, [ + [ + ['text' => 'foo', 'analyzer' => 'clean'], + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], + ], [ 'tokens' => [ [ @@ -358,10 +413,23 @@ public function singleLevelRewritesDataProvider() 'baz' => 0.1, ], ], - // Only expansions enabled. Simulating 'foo => bar,baz'. - ['foo', false, 10, true, 10, 2, - 1, + // Both synonyms and expansions enabled. Simulating 'foo,bar,baz' and 'bar => pub,cafe'. + ['foo', true, 10, true, 10, 2, + 8, [ + // Stem call for 'foo'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], + // Synonyms call for 'foo'. [ 'tokens' => [ [ @@ -380,38 +448,33 @@ public function singleLevelRewritesDataProvider() ], ], ], - ], - [ - 'bar' => 0.1, - 'baz' => 0.1, - ], - ], - // Both synonyms and expansions enabled. Simulating 'foo,bar,baz' and 'bar => pub,cafe'. - ['foo', true, 10, true, 10, 2, - 4, - [ - // Synonyms call for 'foo'. + // Stem call for 'foo'. [ 'tokens' => [ [ - 'type' => 'SYNONYM', - 'token' => 'bar', + 'type' => '', + 'token' => 'foo', 'start_offset' => 0, 'end_offset' => 3, 'position' => 0, ], + ], + ], + // Expansions call. + // No expansion for 'foo'. + ['tokens' => []], + // Stem call for 'bar'. 
+ [ + 'tokens' => [ [ - 'type' => 'SYNONYM', - 'token' => 'baz', + 'type' => '', + 'token' => 'bar', 'start_offset' => 0, 'end_offset' => 3, 'position' => 0, ], ], ], - // Expansions call. - // No expansion for 'foo'. - ['tokens' => []], // Expansion for 'bar'. [ 'tokens' => [ @@ -431,6 +494,18 @@ public function singleLevelRewritesDataProvider() ], ], ], + // Stem call for 'baz'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'baz', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // No expansion for 'baz'. ['tokens' => []], ], @@ -445,8 +520,20 @@ public function singleLevelRewritesDataProvider() // Both synonyms and expansions enabled. Simulating 'foo,bar,baz' and 'bar => pub,cafe'. // No cache storage allowed. ['foo', true, 10, true, 10, 2, - 4, + 8, [ + // Stem call for 'foo'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // Synonyms call for 'foo'. [ 'tokens' => [ @@ -466,9 +553,33 @@ public function singleLevelRewritesDataProvider() ], ], ], + // Stem call for 'foo'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // Expansions call. // No expansion for 'foo'. ['tokens' => []], + // Stem call for 'bar'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'bar', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // Expansion for 'bar'. [ 'tokens' => [ @@ -488,6 +599,18 @@ public function singleLevelRewritesDataProvider() ], ], ], + // Stem call for 'baz'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'baz', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // No expansion for 'baz'. 
['tokens' => []], ], @@ -734,12 +857,19 @@ function ($mapItem) use ($indexAlias) { $containerConfig->method('getStoreId')->willReturn($storeId); $containerConfig->method('getName')->willReturn($containerName); + $stemmingConfig = $this->getMockBuilder(ThesaurusStemmingConfig::class) + ->disableOriginalConstructor() + ->getMock(); + $stemmingConfig->method('useStemming')->willReturn(true); + $thesaurusIndex = new ThesaurusIndex( $clientMock, $indexSettingsHelperMock, $cacheHelperMock, $thesaurusConfigFactoryMock, - $thesaurusCacheConfigMock + $thesaurusCacheConfigMock, + new TextHelper(), + $stemmingConfig ); $cacheKey = implode('|', array_merge([$indexAlias, $containerName], [$queryText])); @@ -755,6 +885,8 @@ function ($mapItem) use ($indexAlias) { $rewrites = $thesaurusIndex->getQueryRewrites($containerConfig, $queryText); $this->assertEquals($expectedRewrites, $rewrites); + + $this->assertTrue(true); } /** @@ -1332,8 +1464,20 @@ public function multiLevelRewritesDataProvider() ], // Only synonyms enabled. Simulating 'foo,bar,baz' and 'bar,pub,cafe' and 'bar,pipe,tube'. ['foo', true, 10, false, 10, 2, - 1, + 2, [ + // Stemm call for 'foo'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], [ 'tokens' => [ [ @@ -1368,8 +1512,20 @@ public function multiLevelRewritesDataProvider() ], // Only expansions enabled. Simulating 'foo => bar,baz' and 'bar => pub,cafe' and 'bar => pipe,tube'. ['foo', false, 10, true, 10, 2, - 1, + 2, [ + // Stemm call for 'foo'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], [ 'tokens' => [ [ @@ -1406,8 +1562,20 @@ public function multiLevelRewritesDataProvider() // Simulating 'foo,bar,baz' and 'bar,pub,cafe' and 'bar,pipe,tube'. // and 'foo => bar,baz' and 'bar => pub,cafe' and 'bar => pipe,tube'. 
['foo', true, 10, true, 100, 2, - 4, + 8, [ + // Stemm call for 'foo'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // Synonyms call for 'foo'. [ 'tokens' => [ @@ -1435,6 +1603,18 @@ public function multiLevelRewritesDataProvider() ], ], ], + // Stemm call for 'foo'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'foo', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // Expansions call. // No expansion for 'foo'. [ @@ -1463,6 +1643,18 @@ public function multiLevelRewritesDataProvider() ], ], ], + // Stemm call for 'bar'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'bar', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // Expansion for 'bar'. [ 'tokens' => [ @@ -1544,6 +1736,18 @@ public function multiLevelRewritesDataProvider() ], ], ], + // Stemm call for 'baz'. + [ + 'tokens' => [ + [ + 'type' => '', + 'token' => 'baz', + 'start_offset' => 0, + 'end_offset' => 3, + 'position' => 0, + ], + ], + ], // No expansion for 'baz'. ['tokens' => []], ], @@ -1561,7 +1765,7 @@ public function multiLevelRewritesDataProvider() // Simulating 'foo,bar' and 'foobar,foo bar', 'bar,pipe,tube' and 'bar => pub,cafe'. // Carefull, the client is also called in getQueryCombinations. ['foo bar', true, 10, true, 10, 2, - 28, + 29, $cyclingMappingResults, [ // Synonyms only for 'foo (bar)'. @@ -1606,7 +1810,7 @@ public function multiLevelRewritesDataProvider() // Simulating 'foo,bar' and 'foobar,foo bar', 'bar,pipe,tube' and 'bar => pub,cafe'. // Carefull, the client is also called in getQueryCombinations. ['foo bar', true, 10, true, 10, 1, - 19, + 20, $cyclingMappingResults, [ // Synonyms only for 'foo (bar)'. 
@@ -1703,12 +1907,19 @@ public function testAnalyzeFailure( $containerConfig->method('getStoreId')->willReturn($storeId); $containerConfig->method('getName')->willReturn($containerName); + $stemmingConfig = $this->getMockBuilder(ThesaurusStemmingConfig::class) + ->disableOriginalConstructor() + ->getMock(); + $stemmingConfig->method('useStemming')->willReturn(true); + $thesaurusIndex = new ThesaurusIndex( $clientMock, $indexSettingsHelperMock, $cacheHelperMock, $thesaurusConfigFactoryMock, - $thesaurusCacheConfigMock + $thesaurusCacheConfigMock, + new TextHelper(), + $stemmingConfig ); $cacheKey = implode('|', array_merge([$indexAlias, $containerName], [$queryText])); @@ -1742,11 +1953,11 @@ public function withAnalysisFailureDataProvider() // Both synonyms and expansions disabled. ['foo', false, 10, false, 10, 2, 0, []], // Only synonyms enabled. - ['foo', true, 10, false, 10, 2, 1, []], + ['foo', true, 10, false, 10, 2, 2, []], // Only expansions enabled. - ['foo', false, 10, true, 10, 2, 1, []], + ['foo', false, 10, true, 10, 2, 2, []], // Both synonyms and expansions enabled. - ['foo', true, 10, true, 10, 2, 2, []], + ['foo', true, 10, true, 10, 2, 4, []], // Both synonyms and expansions enabled, multi-words search. // Careful, the client is also called in getQueryCombinations. ['foo bar', true, 10, true, 10, 2, 4, []],