Skip to content

Commit 6c7c7e3

Browse files
committed
Add PostgresHybridStore for hybrid search with RRF
Combines pgvector semantic search with PostgreSQL Full-Text Search using Reciprocal Rank Fusion (RRF), following Supabase approach. Features: - Configurable semantic/keyword ratio (0.0 to 1.0) - RRF fusion with customizable k parameter - Multilingual FTS support (default: 'simple') - Optional relevance filtering with defaultMaxScore - All pgvector distance metrics supported
1 parent 5242431 commit 6c7c7e3

File tree

2 files changed

+847
-0
lines changed

2 files changed

+847
-0
lines changed
Lines changed: 348 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,348 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
 * (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\AI\Store\Bridge\Postgres;
13+
14+
use Symfony\AI\Platform\Vector\Vector;
15+
use Symfony\AI\Platform\Vector\VectorInterface;
16+
use Symfony\AI\Store\Document\Metadata;
17+
use Symfony\AI\Store\Document\VectorDocument;
18+
use Symfony\AI\Store\Exception\InvalidArgumentException;
19+
use Symfony\AI\Store\ManagedStoreInterface;
20+
use Symfony\AI\Store\StoreInterface;
21+
use Symfony\Component\Uid\Uuid;
22+
23+
/**
24+
* Hybrid Search Store for PostgreSQL/Supabase
25+
* Combines pgvector (semantic) + PostgreSQL Full-Text Search (ts_rank_cd) using RRF.
26+
*
27+
* Uses Reciprocal Rank Fusion (RRF) to combine vector similarity and full-text search,
28+
* following the same approach as Supabase hybrid search implementation.
29+
*
30+
* Requirements:
31+
* - PostgreSQL with pgvector extension
32+
* - A 'content' text field for full-text search
33+
*
34+
* @see https://supabase.com/docs/guides/ai/hybrid-search
35+
*
36+
* @author Ahmed EBEN HASSINE <ahmedbhs123@æmail.com>
37+
*/
38+
final readonly class PostgresHybridStore implements ManagedStoreInterface, StoreInterface
39+
{
40+
/**
 * @param \PDO       $connection       Connection every statement of this store is sent through
 * @param string     $tableName        Table holding the documents (interpolated into the SQL as-is)
 * @param string     $vectorFieldName  Column storing the embedding
 * @param string     $contentFieldName Text column the generated tsvector column is derived from
 * @param float      $semanticRatio    Default balance between vector and keyword search, from 0.0 to 1.0
 *                                     - 0.0: keyword search only (FTS)
 *                                     - 0.5: both rankings weighted equally
 *                                     - 1.0: vector search only (default)
 * @param Distance   $distance         Distance metric whose comparison operator is used in vector queries
 * @param string     $language         PostgreSQL text search configuration (default: 'simple')
 *                                     - 'simple': no stemming, suited to multilingual content
 *                                     - 'english', 'french', 'spanish', etc.: language-specific stemming/stopwords
 * @param int        $rrfK             Constant added to each rank in the RRF formula 1 / (k + rank) (default: 60)
 *                                     Larger values flatten the difference between neighbouring ranks
 * @param float|null $defaultMaxScore  Maximum vector distance used when a query passes no "maxScore" option
 *                                     (default: null = no filter). Example: 0.8 keeps documents whose distance is <= 0.8.
 *                                     NOTE(review): query() adds this filter in every search mode,
 *                                     not only for pure vector search - confirm that is intended.
 *
 * @throws InvalidArgumentException When the semantic ratio is outside the 0.0 to 1.0 range
 */
public function __construct(
    private \PDO $connection,
    private string $tableName,
    private string $vectorFieldName = 'embedding',
    private string $contentFieldName = 'content',
    private float $semanticRatio = 1.0,
    private Distance $distance = Distance::L2,
    private string $language = 'simple',
    private int $rrfK = 60,
    private ?float $defaultMaxScore = null,
) {
    $ratioOutOfRange = $semanticRatio > 1.0 || $semanticRatio < 0.0;

    if ($ratioOutOfRange) {
        $message = \sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio);

        throw new InvalidArgumentException($message);
    }
}
73+
74+
/**
 * Creates the pgvector extension, the documents table and its vector and full-text indexes.
 *
 * Recognised options: "vector_type" (default "vector"), "vector_size" (default 1536),
 * "index_method" (default "ivfflat") and "index_opclass" (default "vector_cosine_ops").
 *
 * NOTE(review): the default operator class is cosine while the constructor's default
 * distance is Distance::L2 - confirm the two defaults are meant to differ.
 */
public function setup(array $options = []): void
{
    $vectorType = $options['vector_type'] ?? 'vector';
    $vectorSize = $options['vector_size'] ?? 1536;
    $indexMethod = $options['index_method'] ?? 'ivfflat';
    $indexOpclass = $options['index_opclass'] ?? 'vector_cosine_ops';

    $statements = [
        // pgvector must be available before a vector column can be declared
        'CREATE EXTENSION IF NOT EXISTS vector',

        // Documents table; content_tsv is generated from the content column by PostgreSQL itself
        \sprintf(
            'CREATE TABLE IF NOT EXISTS %s (
                id UUID PRIMARY KEY,
                metadata JSONB,
                %s TEXT NOT NULL,
                %s %s(%d) NOT NULL,
                content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'%s\', %s)) STORED
            )',
            $this->tableName,
            $this->contentFieldName,
            $this->vectorFieldName,
            $vectorType,
            $vectorSize,
            $this->language,
            $this->contentFieldName,
        ),

        // Index on the embedding column
        \sprintf(
            'CREATE INDEX IF NOT EXISTS %s_%s_idx ON %s USING %s (%s %s)',
            $this->tableName,
            $this->vectorFieldName,
            $this->tableName,
            $indexMethod,
            $this->vectorFieldName,
            $indexOpclass,
        ),

        // GIN index backing the full-text search on content_tsv
        \sprintf(
            'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)',
            $this->tableName,
            $this->tableName,
        ),
    ];

    foreach ($statements as $statement) {
        $this->connection->exec($statement);
    }
}
121+
122+
/**
 * Drops the documents table if it exists.
 */
public function drop(): void
{
    $sql = 'DROP TABLE IF EXISTS '.$this->tableName;

    $this->connection->exec($sql);
}
126+
127+
/**
 * Inserts the given documents, overwriting metadata, content and vector of rows whose id already exists.
 *
 * The content column receives the metadata text, or an empty string when the metadata has none.
 */
public function add(VectorDocument ...$documents): void
{
    $table = $this->tableName;
    $contentColumn = $this->contentFieldName;
    $vectorColumn = $this->vectorFieldName;

    // One prepared upsert, executed once per document
    $upsert = $this->connection->prepare(
        "INSERT INTO {$table} (id, metadata, {$contentColumn}, {$vectorColumn})
        VALUES (:id, :metadata, :content, :vector)
        ON CONFLICT (id) DO UPDATE SET
        metadata = EXCLUDED.metadata,
        {$contentColumn} = EXCLUDED.{$contentColumn},
        {$vectorColumn} = EXCLUDED.{$vectorColumn}"
    );

    foreach ($documents as $document) {
        $upsert->execute([
            'id' => $document->id->toRfc4122(),
            'metadata' => json_encode($document->metadata->getArrayCopy(), \JSON_THROW_ON_ERROR),
            'content' => $document->metadata->getText() ?? '',
            'vector' => $this->toPgvector($document->vector),
        ]);
    }
}
154+
155+
/**
 * Searches the store by vector similarity, full-text search, or an RRF fusion of both.
 *
 * The strategy depends on the semantic ratio and on the "q" option:
 *  - no query text, or a ratio of 1.0: vector search only (score = distance, ascending)
 *  - a ratio of 0.0 with query text: full-text search only (score = 1 / (1 + ts_rank_cd), ascending)
 *  - anything else: hybrid search (score = weighted RRF, descending)
 *
 * When a maximum score is set (option "maxScore" or the configured default), rows whose
 * vector distance exceeds it are filtered out in every strategy.
 *
 * @param array{
 *     q?: string,
 *     semanticRatio?: float,
 *     limit?: int,
 *     where?: string,
 *     params?: array,
 *     maxScore?: float
 * } $options
 *
 * @return VectorDocument[]
 *
 * @throws InvalidArgumentException When the semantic ratio is outside the 0.0 to 1.0 range
 */
public function query(Vector $vector, array $options = []): array
{
    $semanticRatio = $options['semanticRatio'] ?? $this->semanticRatio;

    // Options are untyped: normalise integers and numeric strings to float so the
    // strict comparisons against 0.0 and 1.0 below behave as intended.
    if (is_numeric($semanticRatio)) {
        $semanticRatio = (float) $semanticRatio;
    }

    if ($semanticRatio < 0.0 || $semanticRatio > 1.0) {
        throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio));
    }

    // Compare against '' rather than using empty(), which would discard the search term "0"
    $queryText = (string) ($options['q'] ?? '');
    $limit = $options['limit'] ?? 5;

    // Decide the strategy first so the bound parameters match the SQL that is generated
    $vectorOnly = '' === $queryText || 1.0 === $semanticRatio;
    $ftsOnly = !$vectorOnly && 0.0 === $semanticRatio;

    // Use maxScore from options, or defaultMaxScore if configured
    $maxScore = $options['maxScore'] ?? $this->defaultMaxScore;

    $where = [];
    $params = [];

    // :embedding appears in the vector and hybrid queries and in the maxScore filter.
    // It must not be bound for a plain full-text query, which does not reference it.
    if (!$ftsOnly || null !== $maxScore) {
        $params['embedding'] = $this->toPgvector($vector);
    }

    if (null !== $maxScore) {
        $where[] = \sprintf('(%s %s :embedding) <= :maxScore', $this->vectorFieldName, $this->distance->getComparisonSign());
        $params['maxScore'] = $maxScore;
    }

    if ($options['where'] ?? false) {
        $where[] = '('.$options['where'].')';
    }

    $whereClause = $where ? 'WHERE '.implode(' AND ', $where) : '';

    if ($vectorOnly) {
        $sql = $this->buildVectorOnlyQuery($whereClause, $limit);
    } else {
        $params['query'] = $queryText;
        $sql = $ftsOnly
            ? $this->buildFtsOnlyQuery($whereClause, $limit)
            : $this->buildHybridQuery($whereClause, $limit, $semanticRatio);
    }

    $statement = $this->connection->prepare($sql);
    // Caller-supplied params come last so they can feed placeholders used in the "where" option
    $statement->execute([...$params, ...($options['params'] ?? [])]);

    $documents = [];
    foreach ($statement->fetchAll(\PDO::FETCH_ASSOC) as $result) {
        $documents[] = new VectorDocument(
            id: Uuid::fromString($result['id']),
            vector: new Vector($this->fromPgvector($result['embedding'])),
            metadata: new Metadata(json_decode($result['metadata'] ?? '{}', true, 512, \JSON_THROW_ON_ERROR)),
            score: $result['score'],
        );
    }

    return $documents;
}
234+
235+
/**
 * Builds the SQL for a pure vector search.
 *
 * The score is the distance between the stored vector and :embedding, computed with the
 * operator of the configured distance metric; rows are returned by ascending score.
 *
 * @param string $whereClause Complete "WHERE ..." clause, or an empty string for no filter
 */
private function buildVectorOnlyQuery(string $whereClause, int $limit): string
{
    $column = $this->vectorFieldName;
    $operator = $this->distance->getComparisonSign();
    $table = $this->tableName;

    return <<<SQL
        SELECT id, {$column} AS embedding, metadata, ({$column} {$operator} :embedding) AS score
        FROM {$table}
        {$whereClause}
        ORDER BY score ASC
        LIMIT {$limit}
        SQL;
}
252+
253+
/**
 * Builds the SQL for a pure full-text search.
 *
 * Only rows whose content_tsv matches :query are returned. The score is
 * 1 / (1 + ts_rank_cd), so a better text match yields a lower score and rows are
 * returned by ascending score.
 *
 * @param string $whereClause Complete "WHERE ..." clause, or an empty string for no filter
 */
private function buildFtsOnlyQuery(string $whereClause, int $limit): string
{
    // The match filter always applies so that non-matching documents are never returned
    $conditions = [\sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language)];

    if ('' !== $whereClause) {
        // Strip only the leading keyword: replacing every "WHERE " would also rewrite
        // subqueries contained in a caller-supplied filter.
        $conditions[] = str_starts_with($whereClause, 'WHERE ') ? substr($whereClause, 6) : $whereClause;
    }

    $whereClause = 'WHERE '.implode(' AND ', $conditions);

    return \sprintf(<<<SQL
        SELECT id, %s AS embedding, metadata,
            (1.0 / (1.0 + ts_rank_cd(content_tsv, websearch_to_tsquery('%s', :query)))) AS score
        FROM %s
        %s
        ORDER BY score ASC
        LIMIT %d
        SQL,
        $this->vectorFieldName,
        $this->language,
        $this->tableName,
        $whereClause,
        $limit,
    );
}
280+
281+
/**
 * Builds the SQL for a hybrid search fusing vector and full-text rankings with RRF.
 *
 * Two CTEs rank the filtered rows with ROW_NUMBER(): one by vector distance to :embedding,
 * one by ts_rank_cd against :query (matching rows only). The final score is
 *
 *     semanticRatio / (k + vector_rank) + (1 - semanticRatio) / (k + fts_rank)
 *
 * where a missing rank contributes 0. Unlike the vector-only and FTS-only queries,
 * a HIGHER score is better here and rows are returned by descending score.
 *
 * @param string $whereClause   Complete "WHERE ..." clause, or an empty string for no filter
 * @param float  $semanticRatio Weight of the vector ranking; the full-text ranking gets 1 - ratio
 */
private function buildHybridQuery(string $whereClause, int $limit, float $semanticRatio): string
{
    // The full-text CTE uses the same filter plus the match condition
    $ftsConditions = [\sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language)];

    if ('' !== $whereClause) {
        // Strip only the leading keyword: replacing every "WHERE " would also rewrite
        // subqueries contained in a caller-supplied filter.
        $ftsConditions[] = str_starts_with($whereClause, 'WHERE ') ? substr($whereClause, 6) : $whereClause;
    }

    $ftsWhereClause = 'WHERE '.implode(' AND ', $ftsConditions);

    // %F (not %f) formats the weights without regard to the current locale, so the
    // decimal separator is always "." and the generated SQL stays valid.
    return \sprintf(<<<SQL
        WITH vector_scores AS (
            SELECT id, %s AS embedding, metadata,
                ROW_NUMBER() OVER (ORDER BY %s %s :embedding) AS rank_ix
            FROM %s
            %s
        ),
        fts_scores AS (
            SELECT id,
                ROW_NUMBER() OVER (ORDER BY ts_rank_cd(content_tsv, websearch_to_tsquery('%s', :query)) DESC) AS rank_ix
            FROM %s
            %s
        )
        SELECT v.id, v.embedding, v.metadata,
            (
                COALESCE(1.0 / (%d + v.rank_ix), 0.0) * %F +
                COALESCE(1.0 / (%d + f.rank_ix), 0.0) * %F
            ) AS score
        FROM vector_scores v
        FULL OUTER JOIN fts_scores f ON v.id = f.id
        WHERE v.id IS NOT NULL OR f.id IS NOT NULL
        ORDER BY score DESC
        LIMIT %d
        SQL,
        $this->vectorFieldName,
        $this->vectorFieldName,
        $this->distance->getComparisonSign(),
        $this->tableName,
        $whereClause,
        $this->language,
        $this->tableName,
        $ftsWhereClause,
        $this->rrfK,
        $semanticRatio,
        $this->rrfK,
        1.0 - $semanticRatio,
        $limit,
    );
}
335+
336+
/**
 * Serialises a vector as a bracketed, comma-separated list, e.g. "[0.1,0.2,0.3]".
 */
private function toPgvector(VectorInterface $vector): string
{
    $components = $vector->getData();

    return \sprintf('[%s]', implode(',', $components));
}
340+
341+
/**
 * Parses the bracketed list returned for a vector column by decoding it as JSON.
 *
 * @return float[]
 *
 * @throws \JsonException When the value is not valid JSON
 */
private function fromPgvector(string $vector): array
{
    $components = json_decode($vector, associative: true, flags: \JSON_THROW_ON_ERROR);

    return $components;
}
348+
}

0 commit comments

Comments
 (0)