Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?php
/**
* DISCLAIMER
*
* Do not edit or add to this file if you wish to upgrade this module to newer versions in the future.
*
* @category Smile
* @package Smile\ElasticsuiteThesaurus
* @author Pierre Gauthier <pigau@smile.fr>
* @copyright 2024 Smile
* @license Open Software License ("OSL") v. 3.0
*/

namespace Smile\ElasticsuiteThesaurus\Config;

use Magento\Framework\App\Config\ScopeConfigInterface;
use Magento\Store\Model\ScopeInterface;
use Smile\ElasticsuiteCore\Api\Search\Request\ContainerConfigurationInterface;

/**
* Thesaurus stemming configuration helper.
*
* @category Smile
* @package Smile\ElasticsuiteThesaurus
* @author Pierre Gauthier <pigau@smile.fr>
*/
class ThesaurusStemmingConfig
{
/** @var string */
const THESAURUS_ANALYSIS_USE_STEMMING_XML_PATH = 'smile_elasticsuite_thesaurus_settings/analysis/use_stemming';

/**
* @var ScopeConfigInterface
*/
private $scopeConfig;

/**
* Constructor.
*
* @param ScopeConfigInterface $scopeConfig Scope config interface.
*/
public function __construct(ScopeConfigInterface $scopeConfig)
{
$this->scopeConfig = $scopeConfig;
}

/**
* Returns true if stemming should be used for synonyms and expansions matching.
*
* @param integer $storeId Store id.
*
* @return bool
*/
public function useStemming($storeId): bool
{
return $this->scopeConfig->isSetFlag(
self::THESAURUS_ANALYSIS_USE_STEMMING_XML_PATH,
ScopeInterface::SCOPE_STORE,
$storeId
);
}
}
64 changes: 64 additions & 0 deletions src/module-elasticsuite-thesaurus/Helper/Text.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?php
/**
* DISCLAIMER
*
* Do not edit or add to this file if you wish to upgrade Smile ElasticSuite to newer
* versions in the future.
*
* @category Smile
* @package Smile\ElasticsuiteThesaurus
* @author Pierre Gauthier <pigau@smile.fr>
* @copyright 2025 Smile
* @license Open Software License ("OSL") v. 3.0
*/

namespace Smile\ElasticsuiteThesaurus\Helper;

/**
* Text manipulation helper.
*
* @category Smile
* @package Smile\ElasticsuiteThesaurus
* @author Pierre Gauthier <pigau@smile.fr>
*/
class Text
{
/**
* Partial implementation of a multi-byte aware version of substr_replace.
* Required because the tokens offsets used as for parameters start and length
* are expressed as a number of (UTF-8) characters, independently of the number of bytes.
* Does not accept arrays as first and second parameters.
* Source: https://github.com/fluxbb/utf8/blob/master/functions/substr_replace.php
* Alternative: https://gist.github.com/bantya/563d7d070c286ba1b5a83b9036f0561a
*
* @param string $string Input string
* @param string $replacement Replacement string
* @param mixed $start Start offset
* @param mixed $length Length of replacement
*
* @return mixed
*/
public function mbSubstrReplace($string, $replacement, $start, $length = null)
{
preg_match_all('/./us', $string, $stringChars);
preg_match_all('/./us', $replacement, $replacementChars);
$length = is_int($length) ? $length : mb_strlen($string);
array_splice($stringChars[0], $start, $length, $replacementChars[0]);

return implode($stringChars[0]);
}

/**
* Count the number of words in a given text.
*
* @param string $text The input text.
*
* @return int
*/
public function mbWordCount(string $text): int
{
preg_match_all('/[\p{L}\p{N}\']+/u', $text, $matches);

return count($matches[0]);
}
}
99 changes: 55 additions & 44 deletions src/module-elasticsuite-thesaurus/Model/Index.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,16 @@

namespace Smile\ElasticsuiteThesaurus\Model;

use Smile\ElasticsuiteCore\Helper\IndexSettings as IndexSettingsHelper;
use Smile\ElasticsuiteCore\Api\Client\ClientInterface;
use Smile\ElasticsuiteCore\Api\Search\Request\ContainerConfigurationInterface;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfigFactory;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfig;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusCacheConfig;
use Smile\ElasticsuiteThesaurus\Api\Data\ThesaurusInterface;
use Smile\ElasticsuiteCore\Helper\Cache as CacheHelper;
use Smile\ElasticsuiteCore\Helper\IndexSettings as IndexSettingsHelper;
use Smile\ElasticsuiteThesaurus\Api\Data\ThesaurusInterface;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusCacheConfig;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfig;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfigFactory;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusStemmingConfig;
use Smile\ElasticsuiteThesaurus\Helper\Text as TextHelper;

/**
* Thesaurus index.
Expand Down Expand Up @@ -72,27 +74,43 @@ class Index
*/
private $thesaurusCacheConfig;

/**
* @var TextHelper
*/
private $textHelper;

/**
* @var ThesaurusStemmingConfig
*/
private $stemmingConfig;

/**
* Constructor.
*
* @param ClientInterface $client ES client.
* @param IndexSettingsHelper $indexSettingsHelper Index Settings Helper.
* @param CacheHelper $cacheHelper ES caching helper.
* @param ThesaurusConfigFactory $thesaurusConfigFactory Thesaurus configuration factory.
* @param ThesaurusCacheConfig $thesaurusCacheConfig Thesaurus cache configuration helper.
* @param ClientInterface $client ES client.
* @param IndexSettingsHelper $indexSettingsHelper Index Settings Helper.
* @param CacheHelper $cacheHelper ES caching helper.
* @param ThesaurusConfigFactory $thesaurusConfigFactory Thesaurus configuration factory.
* @param ThesaurusCacheConfig $thesaurusCacheConfig Thesaurus cache configuration helper.
* @param TextHelper $textHelper Text manipulation helper.
* @param ThesaurusStemmingConfig $stemmingConfig Stemming configuration.
*/
public function __construct(
ClientInterface $client,
IndexSettingsHelper $indexSettingsHelper,
CacheHelper $cacheHelper,
ThesaurusConfigFactory $thesaurusConfigFactory,
ThesaurusCacheConfig $thesaurusCacheConfig
ThesaurusCacheConfig $thesaurusCacheConfig,
TextHelper $textHelper,
ThesaurusStemmingConfig $stemmingConfig
) {
$this->client = $client;
$this->indexSettingsHelper = $indexSettingsHelper;
$this->thesaurusConfigFactory = $thesaurusConfigFactory;
$this->cacheHelper = $cacheHelper;
$this->thesaurusCacheConfig = $thesaurusCacheConfig;
$this->textHelper = $textHelper;
$this->stemmingConfig = $stemmingConfig;
}

/**
Expand Down Expand Up @@ -275,29 +293,47 @@ private function getSynonymRewrites($storeId, $queryText, $type, $maxRewrites)
*/
private function getQueryCombinations($storeId, $queryText)
{
if (str_word_count($queryText) < 2) {
// No need to compute variations of shingles with a one-word-query but we need to extract the stem if enabled.
if (!$this->stemmingConfig->useStemming($storeId) && $this->textHelper->mbWordCount($queryText) < 2) {
return [$queryText]; // No need to compute variations of shingles with a one-word-query.
}

// Generate the shingles.
// If we analyze "long sleeve dress", we'll obtain "long_sleeve", and "sleeve_dress".
// We'll also obtain the position (start_offset and end_offset) of those shingles in the original string.
$indexName = $this->getIndexAlias($storeId);

// Get the stem of the search term
try {
$analysis = $this->client->analyze(
['index' => $indexName, 'body' => ['text' => $queryText, 'analyzer' => 'shingles']]
['index' => $indexName, 'body' => ['text' => $queryText, 'analyzer' => 'clean']]
);
} catch (\Exception $e) {
$analysis = ['tokens' => []];
}
$tokens = $analysis['tokens'] ?? [];

// Generate the shingles.
// If we analyze "long sleeve dress", we'll obtain "long_sleeve", and "sleeve_dress".
// If stemming is enabled, analysis of "dresses" will give "dress"
// and analysis of "long sleeves" will give "long_sleev"
// We'll also obtain the position (start_offset and end_offset) of those shingles in the original string.
if ($this->textHelper->mbWordCount($queryText) < 2) {
try {
$analysis = $this->client->analyze(
['index' => $indexName, 'body' => ['text' => $queryText, 'analyzer' => 'shingles']]
);
} catch (\Exception $e) {
$analysis = ['tokens' => []];
}

$tokens = array_merge($tokens, $analysis['tokens'] ?? []);
}

// Get all variations of the query text by injecting the shingles inside.
// $tokens = ['long sleeve dress', 'long_sleeve dress', 'long sleeve_dress'];.
$queries[] = $queryText;
foreach ($analysis['tokens'] ?? [] as $token) {
foreach ($tokens as $token) {
$startOffset = $token['start_offset'];
$length = $token['end_offset'] - $token['start_offset'];
$rewrittenQueryText = $this->mbSubstrReplace($queryText, $token['token'], $startOffset, $length);
$rewrittenQueryText = $this->textHelper->mbSubstrReplace($queryText, $token['token'], $startOffset, $length);
$queries[] = $rewrittenQueryText;
}
$queries = array_unique($queries);
Expand Down Expand Up @@ -327,7 +363,7 @@ private function combineSynonyms($queryText, $synonymByPositions, $maxRewrites,
foreach ($currentPositionSynonyms as $synonym) {
$startOffset = $synonym['start_offset'] + $offset;
$length = $synonym['end_offset'] - $synonym['start_offset'];
$rewrittenQueryText = $this->mbSubstrReplace($queryText, $synonym['token'], $startOffset, $length);
$rewrittenQueryText = $this->textHelper->mbSubstrReplace($queryText, $synonym['token'], $startOffset, $length);
$newOffset = mb_strlen($rewrittenQueryText) - mb_strlen($queryText) + $offset;
$combinations[$rewrittenQueryText] = $substitutions + 1;

Expand Down Expand Up @@ -367,29 +403,4 @@ private function getWeightedRewrites($queryRewrites, $divider, $baseWeight = 1)

return array_map($mapper, $queryRewrites);
}

/**
* Partial implementation of a multi-byte aware version of substr_replace.
* Required because the tokens offsets used as for parameters start and length
* are expressed as a number of (UTF-8) characters, independently of the number of bytes.
* Does not accept arrays as first and second parameters.
* Source: https://github.com/fluxbb/utf8/blob/master/functions/substr_replace.php
* Alternative: https://gist.github.com/bantya/563d7d070c286ba1b5a83b9036f0561a
*
* @param string $string Input string
* @param string $replacement Replacement string
* @param mixed $start Start offset
* @param mixed $length Length of replacement
*
* @return mixed
*/
private function mbSubstrReplace($string, $replacement, $start, $length = null)
{
preg_match_all('/./us', $string, $stringChars);
preg_match_all('/./us', $replacement, $replacementChars);
$length = is_int($length) ? $length : mb_strlen($string);
array_splice($stringChars[0], $start, $length, $replacementChars[0]);

return implode($stringChars[0]);
}
}
Loading
Loading