Skip to content

Commit 574391d

Browse files
authored
Merge pull request #80 from RapidAI/sup_char_rec
Sup char rec
2 parents 9178a2c + 2519d98 commit 574391d

File tree

6 files changed

+89
-48
lines changed

6 files changed

+89
-48
lines changed

README.md

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,16 @@
1515
</div>
1616

1717
### 最近更新
18-
- **2024.10.22**
19-
- 补充复杂背景多表格检测提取方案[RapidTableDet](https://github.com/RapidAI/RapidTableDetection)
2018
- **2024.11.12**
21-
- 抽离模型识别和处理过程核心阈值,方便大家进行微调适配自己的场景[微调入参参考](#核心参数)
19+
- 抽离模型识别和处理过程核心阈值,方便大家进行微调适配自己的场景[输入参数](#核心参数)
2220
- **2024.11.16**
23-
- 补充文档扭曲矫正方案,可作为前置处理 [文档扭曲变形修正](https://github.com/Joker1212/RapidUnWrap)
21+
- 补充文档扭曲矫正方案,可作为前置处理 [RapidUnwrap](https://github.com/Joker1212/RapidUnWrap)
22+
- **2024.11.22**
23+
- 支持单字符匹配方案,需要RapidOCR>=1.4.0
2424

2525
### 简介
2626
💖该仓库是用来对文档中表格做结构化识别的推理库,包括来自阿里读光有线和无线表格识别模型,llaipython(微信)贡献的有线表格模型,网易Qanything内置表格分类模型等。\
27-
[快速开始](#安装) [模型评测](#指标结果) [使用建议](#使用建议) [文档扭曲变形修正](https://github.com/Joker1212/RapidUnWrap) [表格旋转及透视修正](#表格旋转及透视修正) [微调入参参考](#核心参数) [常见问题](#FAQ) [更新计划](#更新计划)
27+
[快速开始](#安装) [模型评测](#指标结果) [使用建议](#使用建议) [单字匹配](#单字ocr匹配) [文档扭曲修正](https://github.com/Joker1212/RapidUnWrap) [表格旋转及透视修正](#表格旋转及透视修正) [输入参数](#核心参数) [常见问题](#faq) [更新计划](#更新计划)
2828
#### 特点
2929

3030
**** 采用ONNXRuntime作为推理引擎,cpu下单图推理1-7s
@@ -70,7 +70,6 @@
7070
wired_table_rec_v2(有线表格精度最高): 通用场景有线表格(论文,杂志,期刊, 收据,单据,账单)
7171

7272
paddlex-SLANet-plus(综合精度最高): 文档场景表格(论文,杂志,期刊中的表格)
73-
[微调入参参考](#核心参数)
7473

7574
### 安装
7675

@@ -106,8 +105,7 @@ print(f"elasp: {elasp}")
106105
# 使用其他ocr模型
107106
#ocr_engine =RapidOCR(det_model_dir="xxx/det_server_infer.onnx",rec_model_dir="xxx/rec_server_infer.onnx")
108107
#ocr_res, _ = ocr_engine(img_path)
109-
#html, elasp, polygons, logic_points, ocr_res = table_engine(img_path, ocr_result=ocr_res)
110-
108+
#html, elasp, polygons, logic_points, ocr_res = table_engine(img_path, ocr_result=ocr_res)
111109
# output_dir = f'outputs'
112110
# complete_html = format_html(html)
113111
# os.makedirs(os.path.dirname(f"{output_dir}/table.html"), exist_ok=True)
@@ -121,6 +119,17 @@ print(f"elasp: {elasp}")
121119
# plot_rec_box(img_path, f"{output_dir}/ocr_box.jpg", ocr_res)
122120
```
123121

122+
#### 单字ocr匹配
123+
```python
124+
# 将单字box转换为行识别同样的结构)
125+
from rapidocr_onnxruntime import RapidOCR
126+
from wired_table_rec.utils_table_recover import trans_char_ocr_res
127+
img_path = "tests/test_files/wired/table4.jpg"
128+
ocr_engine =RapidOCR()
129+
ocr_res, _ = ocr_engine(img_path, return_word_box=True)
130+
ocr_res = trans_char_ocr_res(ocr_res)
131+
```
132+
124133
#### 表格旋转及透视修正
125134
##### 1.简单背景,小角度场景
126135
```python
@@ -165,19 +174,17 @@ for i, res in enumerate(result):
165174
```python
166175
wired_table_rec = WiredTableRecognition()
167176
html, elasp, polygons, logic_points, ocr_res = wired_table_rec(
168-
img_path,
177+
img, # 图片 Union[str, np.ndarray, bytes, Path, PIL.Image.Image]
178+
ocr_result, # 输入rapidOCR识别结果,不传默认使用内部rapidocr模型
169179
version="v2", #默认使用v2线框模型,切换阿里读光模型可改为v1
170-
morph_close=True, # 是否进行形态学操作,辅助找到更多线框,默认为True
171-
more_h_lines=True, # 是否基于线框检测结果进行更多水平线检查,辅助找到更小线框, 默认为True
172-
h_lines_threshold = 100, # 必须开启more_h_lines, 连接横线检测像素阈值,小于该值会生成新横线,默认为100
173-
more_v_lines=True, # 是否基于线框检测结果进行更多垂直线检查,辅助找到更小线框, 默认为True
174-
v_lines_threshold = 15, # 必须开启more_v_lines, 连接竖线检测像素阈值,小于该值会生成新竖线,默认为15
175-
extend_line=True, # 是否基于线框检测结果进行线段延长,辅助找到更多线框, 默认为True
180+
enhance_box_line=True, # 识别框切割增强(关闭避免多余切割,开启减少漏切割),默认为True
176181
need_ocr=True, # 是否进行OCR识别, 默认为True
177182
rec_again=True,# 是否针对未识别到文字的表格框,进行单独截取再识别,默认为True
178183
)
179184
lineless_table_rec = LinelessTableRecognition()
180185
html, elasp, polygons, logic_points, ocr_res = lineless_table_rec(
186+
img, # 图片 Union[str, np.ndarray, bytes, Path, PIL.Image.Image]
187+
ocr_result, # 输入rapidOCR识别结果,不传默认使用内部rapidocr模型
181188
need_ocr=True, # 是否进行OCR识别, 默认为True
182189
rec_again=True,# 是否针对未识别到文字的表格框,进行单独截取再识别,默认为True
183190
)

README_en.md

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,16 @@
1313
</div>
1414

1515
### Recent Updates
16-
- **2024.10.22**
17-
- Added the complex background multi-table detection and extraction solution [RapidTableDet](https://github.com/RapidAI/RapidTableDetection).
18-
1916
- **2024.11.12**
2017
- Extracted model recognition and processing core thresholds for easier fine-tuning according to specific scenarios. See [Core Parameters](#core-parameters).
2118
- **2024.11.16**
22-
- Added document distortion correction solution, which can be used as a pre-processing step [Document Distortion Correction](https://github.com/Joker1212/RapidUnWrap)
19+
- Added document distortion correction solution, which can be used as a pre-processing step [RapidUnWrap](https://github.com/Joker1212/RapidUnWrap)
20+
- **2024.11.22**
21+
- Added single-character OCR matching support; requires RapidOCR>=1.4.0
2322
### Introduction
2423
💖 This repository serves as an inference library for structured recognition of tables within documents, including models for wired and wireless table recognition from Alibaba DulaLight, a wired table model from llaipython (WeChat), and a built-in table classification model from NetEase Qanything.
2524

26-
[Quick Start](#installation) [Model Evaluation](#evaluation-results) [Usage Recommendations](#usage-recommendations) [Document Distortion Correction](https://github.com/Joker1212/RapidUnWrap) [Table Rotation & Perspective Correction](#table-rotation-and-perspective-correction) [Fine-tuning Input Parameters Reference](#core-parameters) [Frequently Asked Questions](#faqs) [Update Plan](#update-plan)
25+
[Quick Start](#installation) [Model Evaluation](#evaluation-results) [Char Rec](#single-character-ocr-matching) [Usage Recommendations](#usage-recommendations) [Document Distortion Correction](https://github.com/Joker1212/RapidUnWrap) [Table Rotation & Perspective Correction](#table-rotation-and-perspective-correction) [Input Parameters](#core-parameters) [Frequently Asked Questions](#faq) [Update Plan](#update-plan)
2726
#### Features
2827

2928
**Fast:** Uses ONNXRuntime as the inference engine, achieving 1-7 seconds per image on CPU.
@@ -71,7 +70,7 @@ Surya-Tabled uses its built-in OCR module, which is a row-column recognition mod
7170
### Usage Recommendations
7271
wired_table_rec_v2 (highest precision for wired tables): General scenes for wired tables (papers, magazines, journals, receipts, invoices, bills)
7372

74-
paddlex-SLANet-plus (highest overall precision): Document scene tables (tables in papers, magazines, and journals) [Fine-tuning Input Parameters Reference](#core-parameters)
73+
paddlex-SLANet-plus (highest overall precision): Document scene tables (tables in papers, magazines, and journals)
7574

7675
### Installation
7776

@@ -121,6 +120,16 @@ print(f"elasp: {elasp}")
121120
# Visualize OCR recognition boxes
122121
# plot_rec_box(img_path, f"{output_dir}/ocr_box.jpg", ocr_res)
123122
```
123+
#### Single Character OCR Matching
124+
```python
125+
# Convert single character boxes to the same structure as line recognition
126+
from rapidocr_onnxruntime import RapidOCR
127+
from wired_table_rec.utils_table_recover import trans_char_ocr_res
128+
img_path = "tests/test_files/wired/table4.jpg"
129+
ocr_engine =RapidOCR()
130+
ocr_res, _ = ocr_engine(img_path, return_word_box=True)
131+
ocr_res = trans_char_ocr_res(ocr_res)
132+
```
124133

125134
#### Table Rotation and Perspective Correction
126135
##### 1. Simple Background, Small Angle Scene
@@ -166,21 +175,19 @@ for i, res in enumerate(result):
166175
```python
167176
wired_table_rec = WiredTableRecognition()
168177
html, elasp, polygons, logic_points, ocr_res = wired_table_rec(
169-
img_path,
170-
version="v2", # Default to use v2 line model, switch to Alibaba ReadLight model by changing to v1
171-
morph_close=True,# Whether to perform morphological operations to find more lines, default is True
172-
more_h_lines=True, # Whether to check for more horizontal lines based on line detection results to find smaller lines, default is True
173-
h_lines_threshold = 100, # Must enable more_h_lines, threshold for connecting horizontal line detection pixels, new horizontal lines will be generated if below this value, default is 100
174-
more_v_lines=True, # Whether to check for more vertical lines based on line detection results to find smaller lines, default is True
175-
v_lines_threshold = 15, # Must enable more_v_lines, threshold for connecting vertical line detection pixels, new vertical lines will be generated if below this value, default is 15
176-
extend_line=True, # Whether to extend line segments based on line detection results to find more lines, default is True
177-
need_ocr=True, # Whether to perform OCR recognition, default is True
178-
rec_again=True,# Whether to re-recognize table boxes that were not recognized, default is True
178+
img, # Image Union[str, np.ndarray, bytes, Path, PIL.Image.Image]
179+
ocr_result, # Input rapidOCR recognition result, use internal rapidocr model by default if not provided
180+
version="v2", # Default to using v2 line model, switch to AliDamo model by changing to v1
181+
enhance_box_line=True, # Enhance box line find (turn off to avoid excessive cutting, turn on to reduce missed cuts), default is True
182+
need_ocr=True, # Whether to perform OCR recognition, default is True
183+
rec_again=True, # Whether to re-recognize table boxes without detected text by cropping them separately, default is True
179184
)
180185
lineless_table_rec = LinelessTableRecognition()
181186
html, elasp, polygons, logic_points, ocr_res = lineless_table_rec(
182-
need_ocr=True, # Whether to perform OCR recognition, default is True
183-
rec_again=True, # Whether to re-recognize table boxes that were not recognized, default is True
187+
img, # Image Union[str, np.ndarray, bytes, Path, PIL.Image.Image]
188+
ocr_result, # Input rapidOCR recognition result, use internal rapidocr model by default if not provided
189+
need_ocr=True, # Whether to perform OCR recognition, default is True
190+
rec_again=True, # Whether to re-recognize table boxes without detected text by cropping them separately, default is True
184191
)
185192
```
186193

lineless_table_rec/utils_table_recover.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ def gather_ocr_list_by_row(ocr_list: List[Any], thehold: float = 0.2) -> List[An
289289
cur[0], next[0], axis="y", threhold=thehold
290290
)
291291
if c_idx:
292-
dis = max(next_box[0] - cur_box[0], 0)
292+
dis = max(next_box[0] - cur_box[2], 0)
293293
blank_str = int(dis / threshold) * " "
294294
cur[1] = cur[1] + blank_str + next[1]
295295
xmin = min(cur_box[0], next_box[0])
@@ -605,6 +605,19 @@ def format_html(html):
605605
"""
606606

607607

608+
def trans_char_ocr_res(ocr_res):
609+
word_result = []
610+
for res in ocr_res:
611+
score = res[2]
612+
for word_box, word in zip(res[3], res[4]):
613+
word_res = []
614+
word_res.append(word_box)
615+
word_res.append(word)
616+
word_res.append(score)
617+
word_result.append(word_res)
618+
return word_result
619+
620+
608621
def get_rotate_crop_image(img: np.ndarray, points: np.ndarray) -> np.ndarray:
609622
img_crop_width = int(
610623
max(

tests/test_wired_table_rec.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,17 +68,17 @@ def test_input_normal(img_path, gt_td_nums, gt2):
6868
@pytest.mark.parametrize(
6969
"img_path, gt_td_nums",
7070
[
71-
("wired_big_box.png", 70),
71+
("wired_big_box.png", 44),
7272
],
7373
)
74-
def test_input_normal(img_path, gt_td_nums):
74+
def test_enhance_box_line(img_path, gt_td_nums):
7575
img_path = test_file_dir / img_path
7676

7777
ocr_result, _ = ocr_engine(img_path)
78-
table_str, *_ = table_recog(str(img_path), ocr_result)
78+
table_str, *_ = table_recog(str(img_path), ocr_result, enhance_box_line=False)
7979
td_nums = get_td_nums(table_str)
8080

81-
assert td_nums >= gt_td_nums
81+
assert td_nums <= gt_td_nums
8282

8383

8484
@pytest.mark.parametrize(
@@ -285,7 +285,7 @@ def test_plot_html_table(logi_points, cell_box_map, expected_html):
285285
@pytest.mark.parametrize(
286286
"img_path, gt_td_nums, gt2",
287287
[
288-
("table_recognition.jpg", 35, "d colsp"),
288+
("table_recognition.jpg", 20, "d colsp"),
289289
],
290290
)
291291
def test_no_rec_again(img_path, gt_td_nums, gt2):

wired_table_rec/table_line_rec_plus.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,17 +73,18 @@ def postprocess(self, img, pred, **kwargs):
7373
h_lines_threshold = kwargs.get("h_lines_threshold", 100) if kwargs else 100
7474
v_lines_threshold = kwargs.get("v_lines_threshold", 15) if kwargs else 15
7575
angle = kwargs.get("angle", 50) if kwargs else 50
76+
enhance_box_line = kwargs.get("enhance_box_line") if kwargs else True
7677
morph_close = (
77-
kwargs.get("morph_close", True) if kwargs else True
78+
kwargs.get("morph_close", enhance_box_line) if kwargs else enhance_box_line
7879
) # 是否进行闭合运算以找到更多小的框
7980
more_h_lines = (
80-
kwargs.get("more_h_lines", True) if kwargs else True
81+
kwargs.get("more_h_lines", enhance_box_line) if kwargs else enhance_box_line
8182
) # 是否调整以找到更多的横线
8283
more_v_lines = (
83-
kwargs.get("more_v_lines", True) if kwargs else True
84+
kwargs.get("more_v_lines", enhance_box_line) if kwargs else enhance_box_line
8485
) # 是否调整以找到更多的横线
8586
extend_line = (
86-
kwargs.get("extend_line", True) if kwargs else True
87+
kwargs.get("extend_line", enhance_box_line) if kwargs else enhance_box_line
8788
) # 是否进行线段延长使得端点连接
8889

8990
ori_shape = img.shape

wired_table_rec/utils_table_recover.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ def plot_rec_box_with_logic_info(img_path, output_path, logic_points, sorted_pol
262262
y1 = round(y1)
263263
cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255), 1)
264264
# 增大字体大小和线宽
265-
font_scale = 0.7 # 原先是0.5
265+
font_scale = 0.9 # 原先是0.5
266266
thickness = 1 # 原先是1
267267
logic_point = logic_points[idx]
268268
cv2.putText(
@@ -288,6 +288,19 @@ def plot_rec_box_with_logic_info(img_path, output_path, logic_points, sorted_pol
288288
cv2.imwrite(output_path, img)
289289

290290

291+
def trans_char_ocr_res(ocr_res):
292+
word_result = []
293+
for res in ocr_res:
294+
score = res[2]
295+
for word_box, word in zip(res[3], res[4]):
296+
word_res = []
297+
word_res.append(word_box)
298+
word_res.append(word)
299+
word_res.append(score)
300+
word_result.append(word_res)
301+
return word_result
302+
303+
291304
def plot_rec_box(img_path, output_path, sorted_polygons):
292305
"""
293306
:param img_path
@@ -309,13 +322,13 @@ def plot_rec_box(img_path, output_path, sorted_polygons):
309322
y1 = round(y1)
310323
cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255), 1)
311324
# 增大字体大小和线宽
312-
font_scale = 1.0 # 原先是0.5
313-
thickness = 2 # 原先是1
325+
font_scale = 0.9 # 原先是0.5
326+
thickness = 1 # 原先是1
314327

315328
cv2.putText(
316329
img,
317330
str(idx),
318-
(x1, y1),
331+
(x0 + 5, y0 + 5),
319332
cv2.FONT_HERSHEY_PLAIN,
320333
font_scale,
321334
(0, 0, 255),
@@ -392,7 +405,7 @@ def gather_ocr_list_by_row(ocr_list: List[Any], threhold: float = 0.2) -> List[A
392405
cur[0], next[0], axis="y", threhold=threhold
393406
)
394407
if c_idx:
395-
dis = max(next_box[0] - cur_box[0], 0)
408+
dis = max(next_box[0] - cur_box[2], 0)
396409
blank_str = int(dis / threshold) * " "
397410
cur[1] = cur[1] + blank_str + next[1]
398411
xmin = min(cur_box[0], next_box[0])

0 commit comments

Comments
 (0)