Skip to content

Commit 0f014e2

Browse files
authored
feat: implement document loader & transformer for store indexing (#343)
1 parent 2f25a00 commit 0f014e2

19 files changed

+679
-109
lines changed

examples/store/document-splitting.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
use PhpLlm\LlmChain\Store\Document\Loader\TextFileLoader;
6+
use PhpLlm\LlmChain\Store\Document\Transformer\TextSplitTransformer;
7+
8+
// Both the Composer autoloader and the text fixture are addressed relative to the project root.
$projectRoot = dirname(__DIR__, 2);

require_once $projectRoot.'/vendor/autoload.php';

$source = $projectRoot.'/tests/Fixture/lorem.txt';

// Step 1: load the fixture file as a stream of documents.
$loader = new TextFileLoader();
$loadedDocuments = $loader($source);

// Step 2: hand the loaded documents to the split transformer.
$splitter = new TextSplitTransformer();
$splitDocuments = $splitter($loadedDocuments);

// Step 3: collect the resulting iterable into an array so it can be dumped at once.
$documents = iterator_to_array($splitDocuments);

dump($documents);
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
use PhpLlm\LlmChain\Platform\Bridge\OpenAI\Embeddings;
6+
use PhpLlm\LlmChain\Platform\Bridge\OpenAI\PlatformFactory;
7+
use PhpLlm\LlmChain\Store\Document\TextDocument;
8+
use PhpLlm\LlmChain\Store\Document\VectorDocument;
9+
use PhpLlm\LlmChain\Store\Document\Vectorizer;
10+
use Symfony\Component\Dotenv\Dotenv;
11+
use Symfony\Component\Uid\Uuid;
12+
13+
// The autoloader and the .env file both live in the project root.
$projectRoot = dirname(__DIR__, 2);

require_once $projectRoot.'/vendor/autoload.php';
(new Dotenv())->loadEnv($projectRoot.'/.env');

// Bail out early when no (or an empty) API key is configured.
$apiKey = $_ENV['OPENAI_API_KEY'] ?? null;
if (empty($apiKey)) {
    echo 'Please set the OPENAI_API_KEY environment variable.'.\PHP_EOL;
    exit(1);
}

$platform = PlatformFactory::create($apiKey);
$embeddings = new Embeddings(Embeddings::TEXT_3_LARGE);

// Wrap each sample text in a TextDocument with a freshly generated UUID.
$sampleTexts = [
    'Hello World',
    'Lorem ipsum dolor sit amet',
    'PHP Hypertext Preprocessor',
];
$textDocuments = array_map(
    static fn (string $text): TextDocument => new TextDocument(Uuid::v4(), $text),
    $sampleTexts,
);

$vectorizer = new Vectorizer($platform, $embeddings);
$vectorDocuments = $vectorizer->vectorizeDocuments($textDocuments);

// Show the dimensions reported by each resulting vector.
$dimensions = array_map(
    static fn (VectorDocument $document) => $document->vector->getDimensions(),
    $vectorDocuments,
);

dump($dimensions);

examples/store/mariadb-similarity-search-gemini.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
use PhpLlm\LlmChain\Store\Bridge\MariaDB\Store;
1616
use PhpLlm\LlmChain\Store\Document\Metadata;
1717
use PhpLlm\LlmChain\Store\Document\TextDocument;
18+
use PhpLlm\LlmChain\Store\Document\Vectorizer;
1819
use PhpLlm\LlmChain\Store\Indexer;
1920
use Symfony\Component\Dotenv\Dotenv;
2021
use Symfony\Component\Uid\Uuid;
@@ -57,7 +58,8 @@
5758
// create embeddings for documents
5859
$platform = PlatformFactory::create($_ENV['GOOGLE_API_KEY']);
5960
$embeddings = new Embeddings(options: ['dimensions' => 768, 'task_type' => TaskType::SemanticSimilarity]);
60-
$indexer = new Indexer($platform, $embeddings, $store);
61+
$vectorizer = new Vectorizer($platform, $embeddings);
62+
$indexer = new Indexer($vectorizer, $store);
6163
$indexer->index($documents);
6264

6365
$model = new Gemini(Gemini::GEMINI_2_FLASH_LITE);

examples/store/mariadb-similarity-search.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
use PhpLlm\LlmChain\Store\Bridge\MariaDB\Store;
1515
use PhpLlm\LlmChain\Store\Document\Metadata;
1616
use PhpLlm\LlmChain\Store\Document\TextDocument;
17+
use PhpLlm\LlmChain\Store\Document\Vectorizer;
1718
use PhpLlm\LlmChain\Store\Indexer;
1819
use Symfony\Component\Dotenv\Dotenv;
1920
use Symfony\Component\Uid\Uuid;
@@ -55,7 +56,8 @@
5556

5657
// create embeddings for documents
5758
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
58-
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
59+
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
60+
$indexer = new Indexer($vectorizer, $store);
5961
$indexer->index($documents);
6062

6163
$model = new GPT(GPT::GPT_4O_MINI);

examples/store/mongodb-similarity-search.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
use PhpLlm\LlmChain\Store\Bridge\MongoDB\Store;
1414
use PhpLlm\LlmChain\Store\Document\Metadata;
1515
use PhpLlm\LlmChain\Store\Document\TextDocument;
16+
use PhpLlm\LlmChain\Store\Document\Vectorizer;
1617
use PhpLlm\LlmChain\Store\Indexer;
1718
use Symfony\Component\Dotenv\Dotenv;
1819
use Symfony\Component\Uid\Uuid;
@@ -52,7 +53,8 @@
5253

5354
// create embeddings for documents
5455
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
55-
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
56+
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
57+
$indexer = new Indexer($vectorizer, $store);
5658
$indexer->index($documents);
5759

5860
// initialize the index

examples/store/pinecone-similarity-search.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
use PhpLlm\LlmChain\Store\Bridge\Pinecone\Store;
1313
use PhpLlm\LlmChain\Store\Document\Metadata;
1414
use PhpLlm\LlmChain\Store\Document\TextDocument;
15+
use PhpLlm\LlmChain\Store\Document\Vectorizer;
1516
use PhpLlm\LlmChain\Store\Indexer;
1617
use Probots\Pinecone\Pinecone;
1718
use Symfony\Component\Dotenv\Dotenv;
@@ -46,7 +47,8 @@
4647

4748
// create embeddings for documents
4849
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
49-
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
50+
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
51+
$indexer = new Indexer($vectorizer, $store);
5052
$indexer->index($documents);
5153

5254
$model = new GPT(GPT::GPT_4O_MINI);
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document\Loader;
6+
7+
use PhpLlm\LlmChain\Store\Document\LoaderInterface;
8+
use PhpLlm\LlmChain\Store\Document\Metadata;
9+
use PhpLlm\LlmChain\Store\Document\TextDocument;
10+
use PhpLlm\LlmChain\Store\Exception\RuntimeException;
11+
use Symfony\Component\Uid\Uuid;
12+
13+
/**
14+
* @author Christopher Hertel <[email protected]>
15+
*/
16+
final readonly class TextFileLoader implements LoaderInterface
{
    /**
     * Reads one text file and emits its trimmed content as a single document.
     *
     * The body contains a `yield`, so this method is a generator: none of the code below (including
     * the existence check) runs before the returned iterable is iterated for the first time.
     *
     * @param string               $source  path of the file to read
     * @param array<string, mixed> $options accepted for interface compatibility; not read by this loader
     *
     * @return iterable<TextDocument> exactly one document, carrying the path under the "source" metadata key
     *
     * @throws RuntimeException when $source is not a regular file or its content cannot be read
     */
    public function __invoke(string $source, array $options = []): iterable
    {
        $isRegularFile = is_file($source);
        if (false === $isRegularFile) {
            throw new RuntimeException(\sprintf('File "%s" does not exist.', $source));
        }

        $rawContent = file_get_contents($source);
        if (!\is_string($rawContent)) {
            // file_get_contents() signals failure by returning false.
            throw new RuntimeException(\sprintf('Unable to read file "%s"', $source));
        }

        $id = Uuid::v4();
        $text = trim($rawContent);
        $metadata = new Metadata(['source' => $source]);

        yield new TextDocument($id, $text, $metadata);
    }
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document;
6+
7+
/**
8+
* @author Christopher Hertel <[email protected]>
9+
*/
10+
interface LoaderInterface
{
    /**
     * Loads the documents found at the given source.
     *
     * @param string               $source  what to load from; its meaning is loader specific,
     *                                      e.g. a file path, a folder, or a URL
     * @param array<string, mixed> $options loader specific options that control the loading process
     *
     * @return iterable<TextDocument> the TextDocuments loaded from the source
     */
    public function __invoke(string $source, array $options = []): iterable;
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document\Transformer;
6+
7+
use PhpLlm\LlmChain\Store\Document\TransformerInterface;
8+
9+
/**
 * Applies a fixed sequence of transformers one after another, feeding the output of each
 * transformer into the next one.
 */
final readonly class ChainTransformer implements TransformerInterface
{
    /**
     * @var list<TransformerInterface>
     */
    private array $transformers;

    /**
     * @param iterable<TransformerInterface> $transformers transformers in the order they should run
     */
    public function __construct(iterable $transformers)
    {
        // Keys are discarded on purpose: a Traversable may repeat keys (e.g. a generator that uses
        // "yield from"), and preserving them would silently overwrite earlier transformers.
        $this->transformers = $transformers instanceof \Traversable
            ? iterator_to_array($transformers, false)
            : array_values($transformers);
    }

    /**
     * Runs the documents through every transformer of the chain, in order.
     *
     * @param iterable             $documents documents handed to the first transformer
     * @param array<string, mixed> $options   passed unchanged to every transformer
     *
     * @return iterable result of the last transformer, or $documents itself when the chain is empty
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        foreach ($this->transformers as $transformer) {
            $documents = $transformer($documents, $options);
        }

        return $documents;
    }
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document\Transformer;
6+
7+
use PhpLlm\LlmChain\Store\Document\TransformerInterface;
8+
use Symfony\Component\Clock\ClockInterface;
9+
10+
/**
11+
* This transformer splits the batch of documents into chunks and delays in-between with x seconds, which is useful
12+
* when indexing a lot of documents and facing API rate limits.
13+
*
14+
* @author Christopher Hertel <[email protected]>
15+
*/
16+
final readonly class ChunkDelayTransformer implements TransformerInterface
{
    public const OPTION_CHUNK_SIZE = 'chunk_size';
    public const OPTION_DELAY = 'delay';

    public function __construct(
        private ClockInterface $clock,
    ) {
    }

    /**
     * Passes the documents through unchanged and pauses via the injected clock every time a full
     * chunk of documents has been yielded.
     *
     * This is a generator: documents are yielded one by one, and the pause happens when the consumer
     * asks for the next document after a chunk has been completed.
     *
     * @param iterable                                $documents documents to pass through
     * @param array{chunk_size?: int, delay?: int} $options   "chunk_size" is the number of documents
     *                                                        per chunk (default 50), "delay" is the
     *                                                        value handed to the clock's sleep()
     *                                                        after each chunk (default 10); a delay
     *                                                        of 0 disables the pause
     *
     * @return iterable the same documents, in the same order
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 50;
        $delay = $options[self::OPTION_DELAY] ?? 10;

        $yieldedInChunk = 0;
        foreach ($documents as $document) {
            yield $document;
            ++$yieldedInChunk;

            if ($chunkSize !== $yieldedInChunk) {
                continue;
            }

            // The chunk is complete: restart the count so the delay is applied after every chunk,
            // not only after the first one.
            $yieldedInChunk = 0;

            if (0 !== $delay) {
                $this->clock->sleep($delay);
            }
        }
    }
}

0 commit comments

Comments
 (0)