Skip to content

Commit fe60246

Browse files
authored
Merge pull request #70 from RapidAI/add_param_for_ocr
Add param for ocr
2 parents eaaf4d3 + 1dbfec3 commit fe60246

File tree

8 files changed

+205
-35
lines changed

8 files changed

+205
-35
lines changed

README.md

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@
1313
</div>
1414

1515
### 最近更新
16-
- **2024.10.13**
17-
- 补充最新paddlex-SLANet-plus 测评结果(已集成模型到[RapidTable](https://github.com/RapidAI/RapidTable)仓库)
1816
- **2024.10.22**
1917
- 补充复杂背景多表格检测提取方案[RapidTableDet](https://github.com/RapidAI/RapidTableDetection)
2018
- **2024.10.29**
2119
- 使用yolo11重新训练表格分类器,修正wired_table_rec v2逻辑坐标还原错误,并更新测评
20+
- **2024.11.12**
21+
- 抽离模型识别和处理过程核心阈值,方便大家进行微调适配自己的场景[微调入参参考](#核心参数)
2222

2323
### 简介
24-
💖该仓库是用来对文档中表格做结构化识别的推理库,包括来自阿里读光有线和无线表格识别模型,llaipython(微信)贡献的有线表格模型,网易Qanything内置表格分类模型等。
25-
24+
💖该仓库是用来对文档中表格做结构化识别的推理库,包括来自阿里读光有线和无线表格识别模型,llaipython(微信)贡献的有线表格模型,网易Qanything内置表格分类模型等。\
25+
[快速开始](#安装) [模型评测](#指标结果) [使用建议](#使用建议) [表格旋转及透视修正](#表格旋转及透视修正) [微调入参参考](#核心参数) [常见问题](#FAQ) [更新计划](#更新计划)
2626
#### 特点
2727

2828
- 采用ONNXRuntime作为推理引擎,cpu下单图推理1-7s
@@ -68,6 +68,7 @@
6868
wired_table_rec_v2(有线表格精度最高): 通用场景有线表格(论文,杂志,期刊, 收据,单据,账单)
6969

7070
paddlex-SLANet-plus(综合精度最高): 文档场景表格(论文,杂志,期刊中的表格)
71+
[微调入参参考](#核心参数)
7172

7273
### 安装
7374

@@ -158,7 +159,30 @@ for i, res in enumerate(result):
158159
# cv2.imwrite(f"{out_dir}/{file_name}-visualize.jpg", img)
159160
```
160161

161-
## FAQ (Frequently Asked Questions)
162+
### 核心参数
163+
```python
164+
wired_table_rec = WiredTableRecognition()
165+
html, elasp, polygons, logic_points, ocr_res = wired_table_rec(
166+
img_path,
167+
version="v2", #默认使用v2线框模型,切换阿里读光模型可改为v1
168+
morph_close=True, # 是否进行形态学操作,辅助找到更多线框,默认为True
169+
more_h_lines=True, # 是否基于线框检测结果进行更多水平线检查,辅助找到更小线框, 默认为True
170+
h_lines_threshold = 100, # 必须开启more_h_lines, 连接横线检测像素阈值,小于该值会生成新横线,默认为100
171+
more_v_lines=True, # 是否基于线框检测结果进行更多垂直线检查,辅助找到更小线框, 默认为True
172+
v_lines_threshold = 15, # 必须开启more_v_lines, 连接竖线检测像素阈值,小于该值会生成新竖线,默认为15
173+
extend_line=True, # 是否基于线框检测结果进行线段延长,辅助找到更多线框, 默认为True
174+
need_ocr=True, # 是否进行OCR识别, 默认为True
175+
rec_again=True,# 是否针对未识别到文字的表格框,进行单独截取再识别,默认为True
176+
)
177+
lineless_table_rec = LinelessTableRecognition()
178+
html, elasp, polygons, logic_points, ocr_res = lineless_table_rec(
179+
need_ocr=True, # 是否进行OCR识别, 默认为True
180+
rec_again=True,# 是否针对未识别到文字的表格框,进行单独截取再识别,默认为True
181+
)
182+
```
183+
184+
185+
## FAQ
162186
1. **问:识别框丢失了内部文字信息**
163187
- 答:默认使用的rapidocr小模型,如果需要更高精度的效果,可以从 [模型列表](https://rapidai.github.io/RapidOCRDocs/model_list/#_1)
164188
下载更高精度的ocr模型,在执行时传入ocr_result即可
@@ -168,7 +192,7 @@ for i, res in enumerate(result):
168192
主要耗时在ocr阶段,可以参考 [rapidocr_paddle](https://rapidai.github.io/RapidOCRDocs/install_usage/rapidocr_paddle/usage/#_3)
169193
加速ocr识别过程
170194

171-
### TODO List
195+
### 更新计划
172196

173197
- [x] 图片小角度偏移修正方法补充
174198
- [x] 增加数据集数量,增加更多评测对比

lineless_table_rec/main.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,23 +51,37 @@ def __call__(
5151
self,
5252
content: InputType,
5353
ocr_result: Optional[List[Union[List[List[float]], str, str]]] = None,
54+
**kwargs
5455
):
5556
ss = time.perf_counter()
57+
rec_again = True
58+
need_ocr = True
59+
if kwargs:
60+
rec_again = kwargs.get("rec_again", True)
61+
need_ocr = kwargs.get("need_ocr", True)
5662
img = self.load_img(content)
57-
if self.ocr is None and ocr_result is None:
58-
raise ValueError(
59-
"One of two conditions must be met: ocr_result is not empty, or rapidocr_onnxruntime is installed."
60-
)
61-
if ocr_result is None:
62-
ocr_result, _ = self.ocr(img)
6363
input_info = self.preprocess(img)
6464
try:
6565
polygons, slct_logi = self.infer(input_info)
6666
logi_points = self.filter_logi_points(slct_logi)
67+
if not need_ocr:
68+
sorted_polygons, idx_list = sorted_ocr_boxes(
69+
[box_4_2_poly_to_box_4_1(box) for box in polygons]
70+
)
71+
return (
72+
"",
73+
time.perf_counter() - ss,
74+
sorted_polygons,
75+
logi_points[idx_list],
76+
[],
77+
)
78+
79+
if ocr_result is None and need_ocr:
80+
ocr_result, _ = self.ocr(img)
6781
# ocr 结果匹配
6882
cell_box_det_map, no_match_ocr_det = match_ocr_cell(ocr_result, polygons)
6983
# 如果有识别框没有ocr结果,直接进行rec补充
70-
cell_box_det_map = self.re_rec(img, polygons, cell_box_det_map)
84+
cell_box_det_map = self.re_rec(img, polygons, cell_box_det_map, rec_again)
7185
# 转换为中间格式,修正识别框坐标,将物理识别框,逻辑识别框,ocr识别框整合为dict,方便后续处理
7286
t_rec_ocr_list = self.transform_res(cell_box_det_map, polygons, logi_points)
7387
# 拆分包含和重叠的识别框
@@ -81,7 +95,6 @@ def __call__(
8195
]
8296
# 生成行列对应的二维表格, 合并同行同列识别框中的的ocr识别框
8397
t_rec_ocr_list, grid = self.handle_overlap_row_col(t_rec_ocr_list)
84-
# todo 根据grid 及 not_match_orc_boxes,尝试将ocr识别填入单行单列中
8598
# 将同一个识别框中的ocr结果排序并同行合并
8699
t_rec_ocr_list = self.sort_and_gather_ocr_res(t_rec_ocr_list)
87100
# 渲染为html
@@ -192,11 +205,11 @@ def infer(self, input_content: Dict[str, Any]) -> Tuple[np.ndarray, np.ndarray]:
192205
def sort_and_gather_ocr_res(self, res):
193206
for i, dict_res in enumerate(res):
194207
_, sorted_idx = sorted_ocr_boxes(
195-
[ocr_det[0] for ocr_det in dict_res["t_ocr_res"]], threhold=0.5
208+
[ocr_det[0] for ocr_det in dict_res["t_ocr_res"]], threhold=0.3
196209
)
197210
dict_res["t_ocr_res"] = [dict_res["t_ocr_res"][i] for i in sorted_idx]
198211
dict_res["t_ocr_res"] = gather_ocr_list_by_row(
199-
dict_res["t_ocr_res"], thehold=0.5
212+
dict_res["t_ocr_res"], thehold=0.3
200213
)
201214
return res
202215

@@ -263,12 +276,17 @@ def re_rec(
263276
img: np.ndarray,
264277
sorted_polygons: np.ndarray,
265278
cell_box_map: Dict[int, List[str]],
279+
rec_again=True,
266280
) -> Dict[int, List[any]]:
267281
"""找到poly对应为空的框,尝试将直接将poly框直接送到识别中"""
268282
#
269283
for i in range(sorted_polygons.shape[0]):
270284
if cell_box_map.get(i):
271285
continue
286+
if not rec_again:
287+
box = sorted_polygons[i]
288+
cell_box_map[i] = [[box, "", 1]]
289+
continue
272290
crop_img = get_rotate_crop_image(img, sorted_polygons[i])
273291
pad_img = cv2.copyMakeBorder(
274292
crop_img, 5, 5, 100, 100, cv2.BORDER_CONSTANT, value=(255, 255, 255)
175 KB
Loading

tests/test_lineless_table_rec.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,3 +244,39 @@ def test_plot_html_table(logi_points, cell_box_map, expected_html):
244244
assert (
245245
html_output == expected_html
246246
), f"Expected HTML does not match. Got: {html_output}"
247+
248+
249+
@pytest.mark.parametrize(
250+
"img_path, table_str_len, td_nums",
251+
[
252+
("table.jpg", 2870, 160),
253+
],
254+
)
255+
def test_no_rec_again(img_path, table_str_len, td_nums):
256+
img_path = test_file_dir / img_path
257+
img = cv2.imread(str(img_path))
258+
259+
table_str, *_ = table_recog(img, rec_again=False)
260+
261+
assert len(table_str) >= table_str_len
262+
assert table_str.count("td") == td_nums
263+
264+
265+
@pytest.mark.parametrize(
266+
"img_path, html_output, points_len",
267+
[
268+
("table.jpg", "", 77),
269+
("lineless_table_recognition.jpg", "", 51),
270+
],
271+
)
272+
def test_no_ocr(img_path, html_output, points_len):
273+
img_path = test_file_dir / img_path
274+
275+
html, elasp, polygons, logic_points, ocr_res = table_recog(
276+
str(img_path), need_ocr=False
277+
)
278+
assert len(ocr_res) == 0
279+
assert len(polygons) > points_len
280+
assert len(logic_points) > points_len
281+
assert len(polygons) == len(logic_points)
282+
assert html == html_output

tests/test_wired_table_rec.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,22 @@ def test_input_normal(img_path, gt_td_nums, gt2):
6565
assert td_nums >= gt_td_nums
6666

6767

68+
@pytest.mark.parametrize(
69+
"img_path, gt_td_nums",
70+
[
71+
("wired_big_box.png", 70),
72+
],
73+
)
74+
def test_input_normal(img_path, gt_td_nums):
75+
img_path = test_file_dir / img_path
76+
77+
ocr_result, _ = ocr_engine(img_path)
78+
table_str, *_ = table_recog(str(img_path), ocr_result)
79+
td_nums = get_td_nums(table_str)
80+
81+
assert td_nums >= gt_td_nums
82+
83+
6884
@pytest.mark.parametrize(
6985
"box1, box2, threshold, expected",
7086
[
@@ -264,3 +280,40 @@ def test_plot_html_table(logi_points, cell_box_map, expected_html):
264280
assert (
265281
html_output == expected_html
266282
), f"Expected HTML does not match. Got: {html_output}"
283+
284+
285+
@pytest.mark.parametrize(
286+
"img_path, gt_td_nums, gt2",
287+
[
288+
("table_recognition.jpg", 35, "d colsp"),
289+
],
290+
)
291+
def test_no_rec_again(img_path, gt_td_nums, gt2):
292+
img_path = test_file_dir / img_path
293+
294+
ocr_result, _ = ocr_engine(img_path)
295+
table_str, *_ = table_recog(str(img_path), ocr_result, rec_again=False)
296+
td_nums = get_td_nums(table_str)
297+
298+
assert td_nums >= gt_td_nums
299+
300+
301+
@pytest.mark.parametrize(
302+
"img_path, html_output, points_len",
303+
[
304+
("table2.jpg", "", 20),
305+
("row_span.png", "", 14),
306+
],
307+
)
308+
def test_no_ocr(img_path, html_output, points_len):
309+
img_path = test_file_dir / img_path
310+
311+
ocr_result, _ = ocr_engine(img_path)
312+
html, elasp, polygons, logic_points, ocr_res = table_recog(
313+
str(img_path), ocr_result, need_ocr=False
314+
)
315+
assert len(ocr_res) == 0
316+
assert len(polygons) > points_len
317+
assert len(logic_points) > points_len
318+
assert len(polygons) == len(logic_points)
319+
assert html == html_output

wired_table_rec/main.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,21 @@ def __call__(
5050
self,
5151
img: InputType,
5252
ocr_result: Optional[List[Union[List[List[float]], str, str]]] = None,
53+
**kwargs,
5354
) -> Tuple[str, float, Any, Any, Any]:
5455
if self.ocr is None and ocr_result is None:
5556
raise ValueError(
5657
"One of two conditions must be met: ocr_result is not empty, or rapidocr_onnxruntime is installed."
5758
)
5859

5960
s = time.perf_counter()
60-
61+
rec_again = True
62+
need_ocr = True
63+
if kwargs:
64+
rec_again = kwargs.get("rec_again", True)
65+
need_ocr = kwargs.get("need_ocr", True)
6166
img = self.load_img(img)
62-
polygons = self.table_line_rec(img)
67+
polygons = self.table_line_rec(img, **kwargs)
6368
if polygons is None:
6469
logging.warning("polygons is None.")
6570
return "", 0.0, None, None, None
@@ -71,12 +76,22 @@ def __call__(
7176
polygons[:, 3, :].copy(),
7277
polygons[:, 1, :].copy(),
7378
)
74-
if ocr_result is None:
79+
if not need_ocr:
80+
sorted_polygons, idx_list = sorted_ocr_boxes(
81+
[box_4_2_poly_to_box_4_1(box) for box in polygons]
82+
)
83+
return (
84+
"",
85+
time.perf_counter() - s,
86+
sorted_polygons,
87+
logi_points[idx_list],
88+
[],
89+
)
90+
if ocr_result is None and need_ocr:
7591
ocr_result, _ = self.ocr(img)
7692
cell_box_det_map, not_match_orc_boxes = match_ocr_cell(ocr_result, polygons)
7793
# 如果有识别框没有ocr结果,直接进行rec补充
78-
# cell_box_det_map = self.re_rec_high_precise(img, polygons, cell_box_det_map)
79-
cell_box_det_map = self.re_rec(img, polygons, cell_box_det_map)
94+
cell_box_det_map = self.re_rec(img, polygons, cell_box_det_map, rec_again)
8095
# 转换为中间格式,修正识别框坐标,将物理识别框,逻辑识别框,ocr识别框整合为dict,方便后续处理
8196
t_rec_ocr_list = self.transform_res(cell_box_det_map, polygons, logi_points)
8297
# 将每个单元格中的ocr识别结果排序和同行合并,输出的html能完整保留文字的换行格式
@@ -139,11 +154,11 @@ def transform_res(
139154
def sort_and_gather_ocr_res(self, res):
140155
for i, dict_res in enumerate(res):
141156
_, sorted_idx = sorted_ocr_boxes(
142-
[ocr_det[0] for ocr_det in dict_res["t_ocr_res"]], threhold=0.5
157+
[ocr_det[0] for ocr_det in dict_res["t_ocr_res"]], threhold=0.3
143158
)
144159
dict_res["t_ocr_res"] = [dict_res["t_ocr_res"][i] for i in sorted_idx]
145160
dict_res["t_ocr_res"] = gather_ocr_list_by_row(
146-
dict_res["t_ocr_res"], threhold=0.5
161+
dict_res["t_ocr_res"], threhold=0.3
147162
)
148163
return res
149164

@@ -152,12 +167,16 @@ def re_rec(
152167
img: np.ndarray,
153168
sorted_polygons: np.ndarray,
154169
cell_box_map: Dict[int, List[str]],
170+
rec_again=True,
155171
) -> Dict[int, List[Any]]:
156172
"""找到poly对应为空的框,尝试将直接将poly框直接送到识别中"""
157-
#
158173
for i in range(sorted_polygons.shape[0]):
159174
if cell_box_map.get(i):
160175
continue
176+
if not rec_again:
177+
box = sorted_polygons[i]
178+
cell_box_map[i] = [[box, "", 1]]
179+
continue
161180
crop_img = get_rotate_crop_image(img, sorted_polygons[i])
162181
pad_img = cv2.copyMakeBorder(
163182
crop_img, 5, 5, 100, 100, cv2.BORDER_CONSTANT, value=(255, 255, 255)

wired_table_rec/table_line_rec.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def __init__(self, model_path: Optional[str] = None):
3636

3737
self.session = OrtInferSession(model_path)
3838

39-
def __call__(self, img: np.ndarray) -> Optional[np.ndarray]:
39+
def __call__(self, img: np.ndarray, **kwargs) -> Optional[np.ndarray]:
4040
img_info = self.preprocess(img)
4141
pred = self.infer(img_info)
4242
polygons = self.postprocess(pred)

0 commit comments

Comments
 (0)