Skip to content

Commit 8180418

Browse files
committed
extend data loader
1 parent d1b5b28 commit 8180418

File tree

8 files changed

+116
-28
lines changed

8 files changed

+116
-28
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?php
2+
3+
namespace NeuronAI\Exceptions;
4+
5+
/**
 * Exception type for failures raised by the data readers.
 *
 * In this change set it is thrown by PdfReader when the pdftotext binary
 * cannot be found or when the given PDF file is not readable.
 */
class DataReaderException extends NeuronException
6+
{
7+
8+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<?php
2+
3+
namespace NeuronAI\Providers\Embeddings;
4+
5+
use NeuronAI\RAG\Document;
6+
7+
abstract class AbstractEmbeddingProvider implements EmbeddingsProviderInterface
{
    /**
     * Embed a batch of documents by delegating each one to embedDocument().
     *
     * Keys and order of the input array are kept; every value is replaced by
     * whatever embedDocument() returns for it.
     *
     * @param array $documents Documents to embed.
     * @return array The same keys, mapped to the embedded documents.
     */
    public function embedDocuments(array $documents): array
    {
        // array_map() with a single input array preserves the original keys.
        return array_map(
            fn ($item) => $this->embedDocument($item),
            $documents
        );
    }
}

src/Providers/Embeddings/EmbeddingsProviderInterface.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,6 @@ interface EmbeddingsProviderInterface
1212
public function embedText(string $text): array;
1313

1414
public function embedDocument(Document $document): Document;
15+
16+
public function embedDocuments(array $documents): array;
1517
}

src/Providers/Embeddings/VoyageEmbeddingProvider.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
use NeuronAI\RAG\Document;
66
use GuzzleHttp\Client;
77

8-
class VoyageEmbeddingProvider implements EmbeddingsProviderInterface
8+
class VoyageEmbeddingProvider extends AbstractEmbeddingProvider
99
{
1010
protected Client $client;
1111

src/RAG/DataLoader/FileDataLoader.php

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,51 @@
66

77
class FileDataLoader implements DataLoaderInterface
88
{
9-
public function __construct(
10-
protected string $filePath,
11-
protected array $extensions = [],
12-
) {}
9+
/**
10+
* @var array<string, ReaderInterface>
11+
*/
12+
protected array $readers = [];
13+
14+
/**
 * @param string $path    Path stored on the loader as the promoted `$path` property.
 * @param array  $readers Initial readers, handed over to setReaders().
 */
public function __construct(
    protected string $path,
    array $readers = []
) {
    $this->setReaders($readers);
}
18+
19+
/**
 * Static factory: forwards every argument unchanged to the constructor
 * of the class it is called on.
 */
public static function file(...$args): static
{
    $loader = new static(...$args);

    return $loader;
}
23+
24+
/**
 * Register the reader to use for files with the given extension.
 *
 * The extension is stored lowercased and without a leading dot, because
 * getContentFromFile() looks readers up by the lowercased extension that
 * pathinfo() returns; a key such as "PDF" or ".pdf" would otherwise never match.
 *
 * @param string          $fileExtension File extension, e.g. "pdf".
 * @param ReaderInterface $reader        Reader handling that extension.
 * @return self The loader itself, for chaining.
 */
public function addReader(string $fileExtension, ReaderInterface $reader): self
{
    $normalizedExtension = \strtolower(\ltrim($fileExtension, '.'));

    $this->readers[$normalizedExtension] = $reader;

    return $this;
}
29+
30+
/**
 * Replace the whole reader map.
 *
 * Keys are stored lowercased and without a leading dot, because
 * getContentFromFile() looks readers up by the lowercased extension that
 * pathinfo() returns; a key such as "PDF" or ".pdf" would otherwise never match.
 *
 * @param array $readers Readers keyed by file extension.
 * @return FileDataLoader
 */
public function setReaders(array $readers): self
{
    $normalized = [];

    foreach ($readers as $fileExtension => $reader) {
        $normalized[\strtolower(\ltrim((string) $fileExtension, '.'))] = $reader;
    }

    $this->readers = $normalized;

    return $this;
}
1339

1440
public function getDocuments(): array
{
    $target = $this->path;

    // Nothing exists at the configured path: there is nothing to load.
    if (file_exists($target) === false) {
        return [];
    }

    // A directory is delegated to getDocumentsFromDirectory().
    if (is_dir($target)) {
        return $this->getDocumentsFromDirectory($target);
    }

    // A single file: any failure while reading it or building the document
    // is swallowed on purpose and reported as "no documents".
    try {
        $content = $this->getContentFromFile($target);
        $document = $this->getDocument($content, $target);
    } catch (\Throwable $exception) {
        return [];
    }

    return [$document];
}
@@ -67,10 +93,12 @@ protected function getContentFromFile(string $path): string|false
6793
{
6894
$fileExtension = strtolower(pathinfo($path, PATHINFO_EXTENSION));
6995

70-
return match ($fileExtension) {
71-
'pdf' => PdfParser::getText($path),
72-
default => file_get_contents($path)
73-
};
96+
if (\array_key_exists($fileExtension, $this->readers)) {
97+
$reader = $this->readers[$fileExtension];
98+
return $reader::getText($path);
99+
}
100+
101+
return TextFileReader::getText($path);
74102
}
75103

76104

src/RAG/DataLoader/PdfParser.php renamed to src/RAG/DataLoader/PdfReader.php

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
namespace NeuronAI\RAG\DataLoader;
44

55
use Closure;
6+
use NeuronAI\Exceptions\DataReaderException;
67
use Symfony\Component\Process\Exception\ProcessFailedException;
78
use Symfony\Component\Process\Process;
89

@@ -11,7 +12,7 @@
1112
*
1213
* https://en.wikipedia.org/wiki/Pdftotext
1314
*/
14-
class PdfParser
15+
class PdfReader implements ReaderInterface
1516
{
1617
protected string $pdf;
1718

@@ -28,6 +29,12 @@ public function __construct(?string $binPath = null)
2829
$this->binPath = $binPath ?? $this->findPdfToText();
2930
}
3031

32+
/**
 * Set the path of the binary that text() runs as the first element of
 * its process command.
 *
 * @param string $binPath Path to the executable.
 * @return self The reader itself, for chaining.
 */
public function setBinPath(string $binPath): self
{
    $this->binPath = $binPath;

    return $this;
}
37+
3138
protected function findPdfToText(): string
3239
{
3340
$commonPaths = [
@@ -44,13 +51,13 @@ protected function findPdfToText(): string
4451
}
4552
}
4653

47-
throw new \LogicException("The pdftotext binary was not found or is not executable.");
54+
throw new DataReaderException("The pdftotext binary was not found or is not executable.");
4855
}
4956

5057
public function setPdf(string $pdf): self
5158
{
5259
if (!is_readable($pdf)) {
53-
throw new \Exception("Could not read `{$pdf}`");
60+
throw new DataReaderException("Could not read `{$pdf}`");
5461
}
5562

5663
$this->pdf = $pdf;
@@ -97,11 +104,10 @@ public function setTimeout($timeout): self
97104
return $this;
98105
}
99106

100-
public function text(?Closure $callback = null): string
107+
public function text(): string
101108
{
102109
$process = new Process(array_merge([$this->binPath], $this->options, [$this->pdf, '-']));
103110
$process->setTimeout($this->timeout);
104-
$process = $callback ? $callback($process) : $process;
105111
$process->run();
106112
if (!$process->isSuccessful()) {
107113
throw new ProcessFailedException($process);
@@ -114,16 +120,23 @@ public function text(?Closure $callback = null): string
114120
* @throws \Exception
115121
*/
116122
public static function getText(
    string $filePath,
    array $options = []
): string {
    // Recognised $options keys:
    //   'binPath' => path to the pdftotext binary (skips auto-detection)
    //   'options' => value forwarded to setOptions()
    //   'timeout' => value forwarded to setTimeout()

    // Hand binPath to the constructor: with null it auto-detects the binary
    // and throws when none is found, which must not happen when the caller
    // supplied an explicit path.
    $instance = new static($options['binPath'] ?? null);

    if (\array_key_exists('options', $options)) {
        $instance->setOptions($options['options']);
    }

    if (\array_key_exists('timeout', $options)) {
        $instance->setTimeout($options['timeout']);
    }

    // setPdf() must run before text(): text() reads $this->pdf, which has no
    // default value and is only ever assigned by setPdf().
    return $instance
        ->setPdf($filePath)
        ->text();
}
129142
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?php
2+
3+
namespace NeuronAI\RAG\DataLoader;
4+
5+
/**
 * Contract for classes that return the content of a file as a string.
 *
 * Implemented in this change set by PdfReader and TextFileReader.
 */
interface ReaderInterface
6+
{
7+
/**
 * Return the content of the given file as a string.
 *
 * @param string $filePath Path of the file to read.
 * @param array  $options  Implementation-specific settings; PdfReader reads the
 *                         keys "binPath", "options" and "timeout", TextFileReader
 *                         ignores the argument.
 * @return string The file content.
 */
public static function getText(string $filePath, array $options = []): string;
8+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<?php
2+
3+
namespace NeuronAI\RAG\DataLoader;
4+
5+
/**
 * Reader that returns the raw content of a file as-is.
 */
class TextFileReader implements ReaderInterface
{
    /**
     * Read the whole file into a string.
     *
     * @param string $filePath Path of the file to read.
     * @param array  $options  Unused by this reader.
     * @return string The raw file content.
     *
     * @throws \NeuronAI\Exceptions\DataReaderException When the file cannot be read.
     */
    public static function getText(string $filePath, array $options = []): string
    {
        $content = file_get_contents($filePath);

        // file_get_contents() signals failure with false, which does not
        // satisfy the declared string return type; report it explicitly.
        if ($content === false) {
            throw new \NeuronAI\Exceptions\DataReaderException("Could not read `{$filePath}`");
        }

        return $content;
    }
}

0 commit comments

Comments
 (0)