@@ -36,10 +36,6 @@ class LineBuilder(BaseBuilder):
3636 "The batch size to use for the ocr error detection model." ,
3737 "Default is None, which will use the default batch size for the model." ,
3838 ] = None
39- enable_table_ocr : Annotated [
40- bool ,
41- "Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running." ,
42- ] = False
4339 layout_coverage_min_lines : Annotated [
4440 int ,
4541 "The minimum number of PdfProvider lines that must be covered by the layout model" ,
@@ -54,17 +50,10 @@ class LineBuilder(BaseBuilder):
5450 float ,
5551 "If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not." ,
5652 ] = 0.85
57- provider_line_detected_line_min_overlap_pct : Annotated [
58- float ,
59- "The percentage of a provider line that has to be covered by a detected line" ,
60- ] = 0.1
6153 provider_line_provider_line_min_overlap_pct : Annotated [
6254 float ,
6355 "The percentage of a provider line that has to be covered by a detected line" ,
64- ] = 0.1
65- line_vertical_merge_threshold : Annotated [
66- int , "The maximum pixel distance between y1s for two lines to be merged"
67- ] = 8
56+ ] = 0.15
6857 excluded_for_coverage : Annotated [
6958 Tuple [BlockTypes ],
7059 "A list of block types to exclude from the layout coverage check." ,
@@ -86,6 +75,10 @@ class LineBuilder(BaseBuilder):
8675 bool ,
8776 "Disable tqdm progress bars." ,
8877 ] = False
78+ disable_ocr : Annotated [
79+ bool ,
80+ "Disable OCR for the document. This will only use the lines from the provider." ,
81+ ] = False
8982 keep_chars : Annotated [bool , "Keep individual characters." ] = False
9083
9184 def __init__ (
@@ -169,6 +162,9 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
169162 ), # Ensure provider lines don't overflow the page or intersect
170163 ]
171164 )
165+ if self .disable_ocr :
166+ provider_lines_good = True
167+
172168 layout_good .append (provider_lines_good )
173169
174170 run_detection = [not good for good in layout_good ]
@@ -191,12 +187,12 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
191187 )
192188
193189 # Setup detection results
190+ detection_boxes = []
194191 if detection_result :
195192 detection_boxes = [
196193 PolygonBox (polygon = box .polygon ) for box in detection_result .bboxes
197194 ]
198- else :
199- detection_boxes = []
195+
200196 detection_boxes = sort_text_lines (detection_boxes )
201197
202198 if provider_lines_good :
@@ -257,6 +253,7 @@ def check_line_overlaps(
257253 provider_bboxes = [line .line .polygon .bbox for line in provider_lines ]
258254 # Add a small margin to account for minor overflows
259255 page_bbox = document_page .polygon .expand (5 , 5 ).bbox
256+
260257 for bbox in provider_bboxes :
261258 if bbox [0 ] < page_bbox [0 ]:
262259 return False
@@ -275,7 +272,7 @@ def check_line_overlaps(
275272 )
276273
277274 # There should be one intersection with itself
278- if intersect_counts > 1 :
275+ if intersect_counts > 2 :
279276 return False
280277
281278 return True
0 commit comments