33namespace NeuronAI \RAG \DataLoader ;
44
55use Closure ;
6+ use NeuronAI \Exceptions \DataReaderException ;
67use Symfony \Component \Process \Exception \ProcessFailedException ;
78use Symfony \Component \Process \Process ;
89
1112 *
1213 * https://en.wikipedia.org/wiki/Pdftotext
1314 */
14- class PdfParser
15+ class PdfReader implements ReaderInterface
1516{
1617 protected string $ pdf ;
1718
@@ -28,6 +29,12 @@ public function __construct(?string $binPath = null)
2829 $ this ->binPath = $ binPath ?? $ this ->findPdfToText ();
2930 }
3031
/**
 * Override the path of the executable this reader runs.
 *
 * The value is stored as given; this setter performs no existence or
 * permission check on it.
 *
 * @param string $binPath Path to use as the command when extracting text.
 * @return self The same instance, so calls can be chained.
 */
public function setBinPath(string $binPath): self
{
    $this->binPath = $binPath;

    return $this;
}
37+
3138 protected function findPdfToText (): string
3239 {
3340 $ commonPaths = [
@@ -44,13 +51,13 @@ protected function findPdfToText(): string
4451 }
4552 }
4653
47- throw new \ LogicException ("The pdftotext binary was not found or is not executable. " );
54+ throw new DataReaderException ("The pdftotext binary was not found or is not executable. " );
4855 }
4956
5057 public function setPdf (string $ pdf ): self
5158 {
5259 if (!is_readable ($ pdf )) {
53- throw new \ Exception ("Could not read ` {$ pdf }` " );
60+ throw new DataReaderException ("Could not read ` {$ pdf }` " );
5461 }
5562
5663 $ this ->pdf = $ pdf ;
@@ -97,11 +104,10 @@ public function setTimeout($timeout): self
97104 return $ this ;
98105 }
99106
100- public function text (? Closure $ callback = null ): string
107+ public function text (): string
101108 {
102109 $ process = new Process (array_merge ([$ this ->binPath ], $ this ->options , [$ this ->pdf , '- ' ]));
103110 $ process ->setTimeout ($ this ->timeout );
104- $ process = $ callback ? $ callback ($ process ) : $ process ;
105111 $ process ->run ();
106112 if (!$ process ->isSuccessful ()) {
107113 throw new ProcessFailedException ($ process );
@@ -114,16 +120,23 @@ public function text(?Closure $callback = null): string
114120 * @throws \Exception
115121 */
public static function getText(
    string $filePath,
    array $options = []
): string {
    // Recognised $options keys (all optional):
    //   'binPath' => path of the pdftotext executable to run
    //   'options' => value forwarded to setOptions()
    //   'timeout' => value forwarded to setTimeout()
    //
    // An explicit binary path goes straight to the constructor: the
    // constructor only searches the common install locations (and throws when
    // nothing is found) if it receives null, so a caller-supplied path must
    // not be applied after that search has already failed.
    $instance = new static($options['binPath'] ?? null);

    if (\array_key_exists('options', $options)) {
        $instance->setOptions($options['options']);
    }

    if (\array_key_exists('timeout', $options)) {
        $instance->setTimeout($options['timeout']);
    }

    // The file must be registered before text() runs: text() builds its
    // command line from $this->pdf, a typed property with no default value.
    // setPdf() also rejects unreadable paths with a DataReaderException.
    return $instance->setPdf($filePath)->text();
}
129142}
0 commit comments