
Commit

Merge pull request #57 from kotaro-kinoshita/feature/implement-lite-model

implement lite mode
kotaro-kinoshita authored Dec 15, 2024
2 parents 71c85bc + 573c69b commit 513a7b3
Showing 17 changed files with 197 additions and 98 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -54,13 +54,14 @@ pip install yomitoku
## 🚀 Usage

```
yomitoku ${path_data} -f md -o results -v --figure
yomitoku ${path_data} -f md -o results -v --figure --lite
```

- `${path_data}`: Specify a directory containing the images to be analyzed, or the path to an image file directly. If a directory is specified, images in its subdirectories are also processed.
- `-f`, `--format`: Specify the output file format. Supported formats are json, csv, html, and md.
- `-o`, `--outdir`: Specify the name of the output directory. If it does not exist, it will be created.
- `-v`, `--vis`: If specified, outputs visualized images of the analysis results.
- `-l`, `--lite`: If specified, inference is performed using a lightweight model. This enables fast inference even on a CPU (see the example after this list).
- `-d`, `--device`: Specify the device for running the model. If a GPU is unavailable, inference will be executed on the CPU. (Default: cuda)
- `--ignore_line_break`: Ignores line breaks in the image and concatenates sentences within a paragraph. (Default: respects line breaks as they appear in the image.)
- `--figure_letter`: Exports characters contained within detected figures and tables to the output file.
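For example, a lite-mode run on the CPU could look like this (the input path is a placeholder):

```
yomitoku sample.pdf -f md -o results -v --figure --lite -d cpu
```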
1 change: 1 addition & 0 deletions README_EN.md
@@ -63,6 +63,7 @@ yomitoku ${path_data} -f md -o results -v --figure
- `-f`, `--format`: Specify the output file format. Supported formats are json, csv, html, and md.
- `-o`, `--outdir`: Specify the name of the output directory. If it does not exist, it will be created.
- `-v`, `--vis`: If specified, outputs visualized images of the analysis results.
- `-l`, `--lite`: If specified, inference is performed using a lightweight model. This enables fast inference even on a CPU.
- `-d`, `--device`: Specify the device for running the model. If a GPU is unavailable, inference will be executed on the CPU. (Default: cuda)
- `--ignore_line_break`: Ignores line breaks in the image and concatenates sentences within a paragraph. (Default: respects line breaks as they appear in the image.)
- `--figure_letter`: Exports characters contained within detected figures and tables to the output file.
69 changes: 67 additions & 2 deletions configs/table_structure_recognitizer.yaml
@@ -1,2 +1,67 @@
hf_hub_repo: "KotaroKinoshita/yomitoku-table-structure-recognizer-rtdtrv2-open-beta"
thresh_score: 0.6
hf_hub_repo: KotaroKinoshita/yomitoku-table-structure-recognizer-rtdtrv2-open-beta
thresh_score: 0.4
data:
  img_size:
    - 640
    - 640
PResNet:
  depth: 50
  variant: d
  freeze_at: 0
  return_idx:
    - 1
    - 2
    - 3
  num_stages: 4
  freeze_norm: true
HybridEncoder:
  in_channels:
    - 512
    - 1024
    - 2048
  feat_strides:
    - 8
    - 16
    - 32
  hidden_dim: 256
  use_encoder_idx:
    - 2
  num_encoder_layers: 1
  nhead: 8
  dim_feedforward: 1024
  dropout: 0.0
  enc_act: gelu
  expansion: 1.0
  depth_mult: 1
  act: silu
RTDETRTransformerv2:
  num_classes: 3
  feat_channels:
    - 256
    - 256
    - 256
  feat_strides:
    - 8
    - 16
    - 32
  hidden_dim: 256
  num_levels: 3
  num_layers: 6
  num_queries: 300
  num_denoising: 100
  label_noise_ratio: 0.5
  box_noise_scale: 1.0
  eval_spatial_size:
    - 640
    - 640
  eval_idx: -1
  num_points:
    - 4
    - 4
    - 4
  cross_attn_method: default
  query_select_method: default
category:
  - row
  - col
  - span
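The table-structure config now spells out the full RT-DETRv2 architecture and lowers the score threshold. As a quick way to inspect the result, the YAML can be read with OmegaConf; a minimal sketch, assuming OmegaConf is installed and the path is relative to the repo root:

```python
# Minimal sketch: load and inspect the updated table-structure config.
# Assumes OmegaConf; the relative path is an assumption about the working directory.
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/table_structure_recognitizer.yaml")
print(cfg.thresh_score)    # 0.4 (lowered from 0.6 in this commit)
print(list(cfg.category))  # ['row', 'col', 'span']
```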
26 changes: 22 additions & 4 deletions configs/text_recognizer.yaml
@@ -1,14 +1,32 @@
hf_hub_repo: KotaroKinoshita/yomitoku-text-recognizer-parseq-open-beta
charset: /home/kinoshita/Projects/know-how/yomitoku/src/yomitoku/resource/charset.txt
num_tokens: 7312
max_label_length: 100
decode_ar: 1
refine_iters: 1
data:
  num_workers: 4
  batch_size: 128
  img_size:
    - 32
    - 800
encoder:
  patch_size:
    - 8
    - 8
  num_heads: 8
  embed_dim: 512
  mlp_ratio: 4
  depth: 12
decoder:
  embed_dim: 512
  num_heads: 8
  mlp_ratio: 4
  depth: 1
visualize:
  font: resource/MPLUS1p-Medium.ttf
  font: /home/kinoshita/Projects/know-how/yomitoku/src/yomitoku/resource/MPLUS1p-Medium.ttf
  color:
    - 0
    - 0
    - 255
    - 0
    - 0
    - 255
  font_size: 18
1 change: 1 addition & 0 deletions docs/usage.en.md
@@ -12,6 +12,7 @@ yomitoku ${path_data} -f md -o results -v
- `-f`, `--format`: Specify the output file format. Supported formats are json, csv, html, and md.
- `-o`, `--outdir`: Specify the name of the output directory. If it does not exist, it will be created.
- `-v`, `--vis`: If specified, outputs visualized images of the analysis results.
- `-l`, `--lite`: If specified, inference is performed using a lightweight model. This enables fast inference even on a CPU.
- `-d`, `--device`: Specify the device for running the model. If a GPU is unavailable, inference will be executed on the CPU. (Default: cuda)
- `--ignore_line_break`: Ignores line breaks in the image and concatenates sentences within a paragraph. (Default: respects line breaks as they appear in the image.)
- `--figure_letter`: Exports characters contained within detected figures and tables to the output file.
1 change: 1 addition & 0 deletions docs/usage.ja.md
@@ -10,6 +10,7 @@ yomitoku ${path_data} -f md -o results -v

- `${path_data}`: Specify a directory containing the images to be analyzed, or the path to an image file directly. If a directory is specified, images in its subdirectories are also processed. Supported input file formats are jpeg, png, bmp, tiff, and pdf.
- `-f`: Specify the output file format. Supported formats are json, csv, html, and md.
- `-l`: If specified, inference is performed using a lightweight model. This enables fast inference even on a CPU.
- `-o`: Specify the name of the output directory. If it does not exist, it will be created.
- `-v`: If specified, outputs visualized images of the analysis results.
- `-d`: Specify the device for running the model. If a GPU is unavailable, inference will be executed on the CPU. (Default: cuda)
2 changes: 1 addition & 1 deletion scripts/register_hugging_face_hub.py
@@ -14,7 +14,7 @@ def get_module(module_name):
        return module

    elif module_name == "text_recognizer":
        module = TextRecognizer(from_pretrained=False)
        module = TextRecognizer(from_pretrained=False, model_name="parseq-small")
        return module

    elif module_name == "layout_parser":
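With the lite model registered, constructing the recognizer directly might look like the sketch below; `model_name` and `from_pretrained` appear in the diff above, while the top-level import path and the `device` keyword are assumptions:

```python
# Sketch: building the lite text recognizer directly.
# model_name="parseq-small" is taken from the diff; device is an assumed kwarg.
from yomitoku import TextRecognizer

recognizer = TextRecognizer(model_name="parseq-small", device="cpu")
```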
9 changes: 9 additions & 0 deletions src/yomitoku/cli/main.py
@@ -104,6 +104,12 @@ def main():
        default="results",
        help="output directory",
    )
    parser.add_argument(
        "-l",
        "--lite",
        action="store_true",
        help="if set, use lite model",
    )
    parser.add_argument(
        "-d",
        "--device",
@@ -197,6 +203,9 @@ def main():
        },
    }

    if args.lite:
        configs["ocr"]["text_recognizer"]["model_name"] = "parseq-small"

    analyzer = DocumentAnalyzer(
        configs=configs,
        visualize=args.vis,
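The same switch is available programmatically: overriding `model_name` in the `configs` dict selects the lite recognizer, mirroring the CLI code above. A minimal sketch (whether a partial configs dict is merged with the defaults is an assumption):

```python
# Minimal sketch: the programmatic equivalent of passing --lite.
# The nested override mirrors the CLI snippet above; treating a partial
# configs dict as acceptable input is an assumption.
from yomitoku import DocumentAnalyzer

configs = {"ocr": {"text_recognizer": {"model_name": "parseq-small"}}}
analyzer = DocumentAnalyzer(configs=configs, visualize=False, device="cpu")
```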
2 changes: 2 additions & 0 deletions src/yomitoku/configs/__init__.py
@@ -4,10 +4,12 @@
)
from .cfg_text_detector_dbnet import TextDetectorDBNetConfig
from .cfg_text_recognizer_parseq import TextRecognizerPARSeqConfig
from .cfg_text_recognizer_parseq_small import TextRecognizerPARSeqSmallConfig

__all__ = [
    "TextDetectorDBNetConfig",
    "TextRecognizerPARSeqConfig",
    "LayoutParserRTDETRv2Config",
    "TableStructureRecognizerRTDETRv2Config",
    "TextRecognizerPARSeqSmallConfig",
]
51 changes: 51 additions & 0 deletions src/yomitoku/configs/cfg_text_recognizer_parseq_small.py
@@ -0,0 +1,51 @@
from dataclasses import dataclass, field
from typing import List

from ..constants import ROOT_DIR


@dataclass
class Data:
    num_workers: int = 4
    batch_size: int = 128
    img_size: List[int] = field(default_factory=lambda: [32, 800])


@dataclass
class Encoder:
    patch_size: List[int] = field(default_factory=lambda: [16, 16])
    num_heads: int = 8
    embed_dim: int = 384
    mlp_ratio: int = 4
    depth: int = 9


@dataclass
class Decoder:
    embed_dim: int = 384
    num_heads: int = 8
    mlp_ratio: int = 4
    depth: int = 1


@dataclass
class Visualize:
    font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
    color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
    font_size: int = 18


@dataclass
class TextRecognizerPARSeqSmallConfig:
    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta"
    charset: str = str(ROOT_DIR + "/resource/charset.txt")
    num_tokens: int = 7312
    max_label_length: int = 100
    decode_ar: int = 1
    refine_iters: int = 1

    data: Data = field(default_factory=Data)
    encoder: Encoder = field(default_factory=Encoder)
    decoder: Decoder = field(default_factory=Decoder)

    visualize: Visualize = field(default_factory=Visualize)
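Compared with the full PARSeq config above (`embed_dim` 512, `depth` 12, 8×8 patches), the small variant shrinks the ViT encoder. A quick check of the new defaults, assuming the package is importable:

```python
# Sketch: inspect the lite config's defaults, which shrink the ViT encoder
# relative to the full PARSeq config (512-dim, depth 12, 8x8 patches).
from yomitoku.configs import TextRecognizerPARSeqSmallConfig

cfg = TextRecognizerPARSeqSmallConfig()
print(cfg.encoder.embed_dim, cfg.encoder.depth, cfg.encoder.patch_size)
# -> 384 9 [16, 16]
print(cfg.hf_hub_repo)
# -> KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta
```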
20 changes: 5 additions & 15 deletions src/yomitoku/models/layers/rtdetr_backbone.py
@@ -59,9 +59,7 @@ def forward(self, x):
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(
        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
    ):
    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
        super().__init__()

        self.shortcut = shortcut
Expand Down Expand Up @@ -100,9 +98,7 @@ def forward(self, x):
class BottleNeck(nn.Module):
    expansion = 4

    def __init__(
        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
    ):
    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
        super().__init__()

        if variant == "a":
@@ -125,17 +121,13 @@ def __init__(
                        ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
                        (
                            "conv",
                            ConvNormLayer(
                                ch_in, ch_out * self.expansion, 1, 1
                            ),
                            ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
                        ),
                    ]
                )
            )
        else:
            self.short = ConvNormLayer(
                ch_in, ch_out * self.expansion, 1, stride
            )
            self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)

        self.act = nn.Identity() if act is None else get_activation(act)

@@ -156,9 +148,7 @@ def forward(self, x):


class Blocks(nn.Module):
    def __init__(
        self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
    ):
    def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
        super().__init__()

        self.blocks = nn.ModuleList()
24 changes: 6 additions & 18 deletions src/yomitoku/models/layers/rtdetr_hybrid_encoder.py
@@ -252,9 +252,7 @@ def __init__(
        for in_channel in in_channels:
            if version == "v1":
                proj = nn.Sequential(
                    nn.Conv2d(
                        in_channel, hidden_dim, kernel_size=1, bias=False
                    ),
                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
                    nn.BatchNorm2d(hidden_dim),
                )
            elif version == "v2":
@@ -290,9 +288,7 @@ def __init__(

        self.encoder = nn.ModuleList(
            [
                TransformerEncoder(
                    copy.deepcopy(encoder_layer), num_encoder_layers
                )
                TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
                for _ in range(len(use_encoder_idx))
            ]
        )
@@ -347,9 +343,7 @@ def _reset_parameters(self):
            # self.register_buffer(f'pos_embed{idx}', pos_embed)

    @staticmethod
    def build_2d_sincos_position_embedding(
        w, h, embed_dim=256, temperature=10000.0
    ):
    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
        """ """
        grid_w = torch.arange(int(w), dtype=torch.float32)
        grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -387,9 +381,7 @@ def forward(self, feats):
                    src_flatten.device
                )

            memory: torch.Tensor = self.encoder[i](
                src_flatten, pos_embed=pos_embed
            )
            memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
            proj_feats[enc_ind] = (
                memory.permute(0, 2, 1)
                .reshape(-1, self.hidden_dim, h, w)
@@ -401,13 +393,9 @@ def forward(self, feats):
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_heigh = inner_outs[0]
            feat_low = proj_feats[idx - 1]
            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
                feat_heigh
            )
            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
            inner_outs[0] = feat_heigh
            upsample_feat = F.interpolate(
                feat_heigh, scale_factor=2.0, mode="nearest"
            )
            upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                torch.concat([upsample_feat, feat_low], dim=1)
            )
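The hunks above are mostly line-wrapping changes from the formatter; the touched `build_2d_sincos_position_embedding` staticmethod computes the standard 2D sine-cosine positional embedding. For reference, a self-contained sketch in the usual RT-DETR style (not necessarily byte-identical to this file):

```python
import torch


def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
    # Standard 2D sin-cos positional embedding (RT-DETR reference style).
    grid_w = torch.arange(int(w), dtype=torch.float32)
    grid_h = torch.arange(int(h), dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    assert embed_dim % 4 == 0, "embed_dim must be divisible by 4"
    pos_dim = embed_dim // 4
    omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
    omega = 1.0 / (temperature**omega)
    out_w = grid_w.flatten()[..., None] @ omega[None]
    out_h = grid_h.flatten()[..., None] @ omega[None]
    # Shape: [1, w*h, embed_dim], ready to add to flattened feature tokens.
    return torch.concat(
        [out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1
    )[None, :, :]
```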
