Skip to content

Commit 703e50f

Browse files
committed
feat: add methods in library to accept custom input, either as a parser or as a custom stream
1 parent 722e656 commit 703e50f

4 files changed

Lines changed: 95 additions & 13 deletions

File tree

TextExtraction/TableExtraction.cpp

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,18 +95,21 @@ EStatusCode TableExtraction::ExtractTablePlacements(PDFParser* inParser, long in
9595

9696
static const string scEmpty = "";
9797

98-
EStatusCode TableExtraction::ExtractTables(const std::string& inFilePath, long inStartPage, long inEndPage) {
99-
EStatusCode status = eSuccess;
100-
InputFile sourceFile;
101-
102-
LatestWarnings.clear();
103-
LatestError.code = eErrorNone;
104-
LatestError.description = scEmpty;
105-
98+
void TableExtraction::ClearState() {
10699
textsForPages.clear();
107100
tableLinesForPages.clear();
108101
tablesForPages.clear();
109102
mediaBoxesForPages.clear();
103+
LatestWarnings.clear();
104+
LatestError.code = eErrorNone;
105+
LatestError.description = scEmpty;
106+
}
107+
108+
EStatusCode TableExtraction::ExtractTables(const std::string& inFilePath, long inStartPage, long inEndPage) {
109+
EStatusCode status = eSuccess;
110+
InputFile sourceFile;
111+
112+
ClearState();
110113

111114
do {
112115
status = sourceFile.OpenFile(inFilePath);
@@ -136,6 +139,43 @@ EStatusCode TableExtraction::ExtractTables(const std::string& inFilePath, long i
136139
return status;
137140
}
138141

142+
// Extracts tables using a parser supplied by the caller.
//
// inParser    - parser forwarded as-is to ExtractTablePlacements.
//               NOTE(review): presumably must already have started parsing a
//               document - confirm against callers.
// inStartPage - forwarded unchanged to ExtractTablePlacements.
// inEndPage   - forwarded unchanged to ExtractTablePlacements.
//
// Returns the status reported by ExtractTablePlacements. ComposeTables runs
// only when that status is eSuccess.
PDFHummus::EStatusCode TableExtraction::ExtractTables(PDFParser* inParser, long inStartPage, long inEndPage) {
    // Start from a clean slate so results of an earlier run do not leak in.
    ClearState();

    const PDFHummus::EStatusCode placementStatus = ExtractTablePlacements(inParser, inStartPage, inEndPage);
    if(placementStatus != eSuccess)
        return placementStatus;

    ComposeTables();
    return placementStatus;
}
152+
153+
// Extracts tables from a PDF read through a caller-supplied stream.
//
// inStream    - stream handed to PDFParser::StartPDFParsing.
// inStartPage - forwarded unchanged to ExtractTablePlacements.
// inEndPage   - forwarded unchanged to ExtractTablePlacements.
//
// Returns eSuccess when parsing and placement extraction both succeed,
// otherwise the failing status of whichever step failed. When parsing fails,
// LatestError is set to eErrorInternalPDFWriter / "Failed to parse file".
PDFHummus::EStatusCode TableExtraction::ExtractTables(IByteReaderWithPosition* inStream, long inStartPage, long inEndPage) {
    // Start from a clean slate so results of an earlier run do not leak in.
    ClearState();

    // The parser lives only for the duration of this call.
    PDFParser parser;

    const EStatusCode parsingStatus = parser.StartPDFParsing(inStream);
    if(parsingStatus != eSuccess)
    {
        LatestError.code = eErrorInternalPDFWriter;
        LatestError.description = string("Failed to parse file");
        return parsingStatus;
    }

    const EStatusCode placementStatus = ExtractTablePlacements(&parser, inStartPage, inEndPage);
    if(placementStatus != eSuccess)
        return placementStatus;

    // Tables are composed while the parser is still alive, as before.
    ComposeTables();
    return placementStatus;
}
177+
178+
139179

140180
void TableExtraction::ComposeTables() {
141181
TableComposer tableComposer;

TextExtraction/TableExtraction.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "ErrorsAndWarnings.h"
1717

1818
class PDFParser;
19+
class IByteReaderWithPosition;
1920

2021
#include <sstream>
2122
#include <string>
@@ -33,6 +34,8 @@ class TableExtraction : public ITextInterpreterHandler, IGraphicContentInterpret
3334
virtual ~TableExtraction();
3435

3536
PDFHummus::EStatusCode ExtractTables(const std::string& inFilePath, long inStartPage=0, long inEndPage=-1);
37+
PDFHummus::EStatusCode ExtractTables(PDFParser* inParser, long inStartPage=0, long inEndPage=-1);
38+
PDFHummus::EStatusCode ExtractTables(IByteReaderWithPosition* inStream, long inStartPage=0, long inEndPage=-1);
3639

3740
ExtractionError LatestError;
3841
ExtractionWarningList LatestWarnings;
@@ -65,5 +68,6 @@ class TableExtraction : public ITextInterpreterHandler, IGraphicContentInterpret
6568

6669
PDFHummus::EStatusCode ExtractTablePlacements(PDFParser* inParser, long inStartPage, long inEndPage);
6770
void ComposeTables();
71+
void ClearState();
6872

6973
};

TextExtraction/TextExtraction.cpp

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,15 +80,18 @@ EStatusCode TextExtraction::ExtractTextPlacements(PDFParser* inParser, long inSt
8080

8181
static const string scEmpty = "";
8282

83-
EStatusCode TextExtraction::ExtractText(const std::string& inFilePath, long inStartPage, long inEndPage) {
84-
EStatusCode status = eSuccess;
85-
InputFile sourceFile;
86-
83+
void TextExtraction::ClearState() {
84+
textsForPages.clear();
8785
LatestWarnings.clear();
8886
LatestError.code = eErrorNone;
8987
LatestError.description = scEmpty;
88+
}
9089

91-
textsForPages.clear();
90+
EStatusCode TextExtraction::ExtractText(const std::string& inFilePath, long inStartPage, long inEndPage) {
91+
EStatusCode status = eSuccess;
92+
InputFile sourceFile;
93+
94+
ClearState();
9295

9396
do {
9497
status = sourceFile.OpenFile(inFilePath);
@@ -117,6 +120,37 @@ EStatusCode TextExtraction::ExtractText(const std::string& inFilePath, long inSt
117120
return status;
118121
}
119122

123+
// Extracts text using a parser supplied by the caller.
//
// inParser    - parser forwarded as-is to ExtractTextPlacements.
//               NOTE(review): presumably must already have started parsing a
//               document - confirm against callers.
// inStartPage - forwarded unchanged to ExtractTextPlacements.
// inEndPage   - forwarded unchanged to ExtractTextPlacements.
//
// Returns the status reported by ExtractTextPlacements.
PDFHummus::EStatusCode TextExtraction::ExtractText(PDFParser* inParser, long inStartPage, long inEndPage) {
    // Start from a clean slate so results of an earlier run do not leak in.
    ClearState();

    const PDFHummus::EStatusCode placementStatus = ExtractTextPlacements(inParser, inStartPage, inEndPage);
    return placementStatus;
}
128+
129+
// Extracts text from a PDF read through a caller-supplied stream.
//
// inStream    - stream handed to PDFParser::StartPDFParsing.
// inStartPage - forwarded unchanged to ExtractTextPlacements.
// inEndPage   - forwarded unchanged to ExtractTextPlacements.
//
// Returns eSuccess when parsing and placement extraction both succeed,
// otherwise the failing status of whichever step failed. When parsing fails,
// LatestError is set to eErrorInternalPDFWriter / "Failed to parse file".
PDFHummus::EStatusCode TextExtraction::ExtractText(IByteReaderWithPosition* inStream, long inStartPage, long inEndPage) {
    EStatusCode status = eSuccess;

    // Start from a clean slate so results of an earlier run do not leak in.
    ClearState();

    // Unlike the file-path overload, nothing is opened here: the caller
    // provides the stream, so no InputFile object is needed.
    do {
        PDFParser parser;
        status = parser.StartPDFParsing(inStream);
        if(status != eSuccess)
        {
            LatestError.code = eErrorInternalPDFWriter;
            LatestError.description = string("Failed to parse file");
            break;
        }

        // Last step of the single-pass block; its status is returned as-is.
        status = ExtractTextPlacements(&parser, inStartPage, inEndPage);
    } while(false);

    return status;
}
153+
120154
static const string scCRLN = "\r\n";
121155

122156
std::string TextExtraction::GetResultsAsText(int bidiFlag, TextComposer::ESpacing spacingFlag) {

TextExtraction/TextExtraction.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "ErrorsAndWarnings.h"
1212

1313
class PDFParser;
14+
class IByteReaderWithPosition;
1415

1516
#include <sstream>
1617
#include <string>
@@ -27,6 +28,8 @@ class TextExtraction : public ITextInterpreterHandler, IGraphicContentInterprete
2728
virtual ~TextExtraction();
2829

2930
PDFHummus::EStatusCode ExtractText(const std::string& inFilePath, long inStartPage=0, long inEndPage=-1);
31+
PDFHummus::EStatusCode ExtractText(PDFParser* inParser, long inStartPage=0, long inEndPage=-1);
32+
PDFHummus::EStatusCode ExtractText(IByteReaderWithPosition* inStream, long inStartPage=0, long inEndPage=-1);
3033

3134
ExtractionError LatestError;
3235
ExtractionWarningList LatestWarnings;
@@ -55,4 +58,5 @@ class TextExtraction : public ITextInterpreterHandler, IGraphicContentInterprete
5558
double currentPageScopeBox[4];
5659

5760
PDFHummus::EStatusCode ExtractTextPlacements(PDFParser* inParser, long inStartPage, long inEndPage);
61+
void ClearState();
5862
};

0 commit comments

Comments
 (0)