merge main

kotaro-kinoshita · Feb 21, 2025 · 2eb078d · 2eb078d
2 parents 23d097c + 378f422
commit 2eb078d
Show file tree

Hide file tree

Showing 9 changed files with 52 additions and 31 deletions.
diff --git a/README.md b/README.md
@@ -48,12 +48,6 @@ Markdown でエクスポートした結果に関してはリポジトリ内の[s
 pip install yomitoku
 ```
 
-onnxruntime の実行に GPU を使用する場合
-
-```
-pip install yomitoku[gpu]
-```
-
 - pytorch はご自身の CUDA のバージョンにあったものをインストールしてください。デフォルトでは CUDA12.4 以上に対応したものがインストールされます。
 - pytorch は 2.5 以上のバージョンに対応しています。その関係で CUDA11.8 以上のバージョンが必要になります。対応できない場合は、リポジトリ内の Dockerfile を利用してください。
 

diff --git a/README_EN.md b/README_EN.md
@@ -48,12 +48,6 @@ Source of the image: Created by processing content from “Reiwa 6 Edition Infor
 pip install yomitoku
 ```
 
-Using GPU with onnxruntime
-
-```
-pip install yomitoku[gpu]
-```
-
 - Please install the version of PyTorch that matches your CUDA version. By default, a version compatible with CUDA 12.4 or higher will be installed.
 - PyTorch versions 2.5 and above are supported. As a result, CUDA version 11.8 or higher is required. If this is not feasible, please use the Dockerfile provided in the repository.
 

diff --git a/docs/cli.en.md b/docs/cli.en.md
@@ -93,3 +93,18 @@ Specify the path to the config files for each module as follows:
 ```
 yomitoku ${path_data} --td_cfg ${path_yaml}
 ```
+
+## Do not include metadata in the output file
+
+You can exclude metadata such as headers and footers from the output file.
+
+```
+yomitoku ${path_data} --ignore_meta
+```
+
+## Combine multiple pages
+
+If the PDF contains multiple pages, you can export them as a single file.
+
+```
+yomitoku ${path_data} -f md --combine
+```
diff --git a/docs/cli.ja.md b/docs/cli.ja.md
@@ -89,3 +89,19 @@ yomitoku ${path_data} --encoding utf-8-sig
 ```
 yomitoku ${path_data} --td_cfg ${path_yaml}
 ```
+
+## メタ情報を出力ファイルに加えない
+
+ヘッダーやフッター等のメタデータを出力ファイルに加えないようにすることができます。
+
+```
+yomitoku ${path_data} --ignore_meta
+```
+
+## 複数ページを統合する
+
+PDFに複数ページが含まれる場合に複数ページを一つのファイルにまとめてエクスポートできます。
+
+```
+yomitoku ${path_data} -f md --combine
+```
diff --git a/docs/installation.en.md b/docs/installation.en.md
@@ -9,12 +9,6 @@ This package requires Python 3.10 or later and PyTorch 2.5 or later for executio
 pip install yomitoku
 ```
 
-Using GPU with onnxruntime
-```bash
-pip install yomitoku[gpu]
-```
-
-
 ## Using uv
 This repository uses the package management tool [uv](https://docs.astral.sh/uv/). After installing uv, clone the repository and execute the following commands:
 

diff --git a/docs/installation.ja.md b/docs/installation.ja.md
@@ -8,11 +8,6 @@
 pip install yomitoku
 ```
 
-onnxruntimeの実行にGPUを使用する場合
-```bash
-pip install yomitoku[gpu]
-```
-
 ## uv でのインストール
 
 本リポジトリはパッケージ管理ツールに [uv](https://docs.astral.sh/uv/) を使用しています。uv をインストール後、リポジトリをクローンし、以下のコマンドを実行してください。

diff --git a/pyproject.toml b/pyproject.toml
@@ -32,11 +32,6 @@ dependencies = [
     "onnxruntime>=1.20.1",
 ]
 
-[project.optional-dependencies]
-gpu = [
-    "onnxruntime-gpu>=1.20.1",
-]
-
 [tool.uv-dynamic-versioning]
 vcs = "git"
 style = "semver"

diff --git a/src/yomitoku/cli/main.py b/src/yomitoku/cli/main.py
@@ -298,6 +298,11 @@ def main():
         action="store_true",
         help="if set, merge all pages in the output",
     )
+    parser.add_argument(
+        "--ignore_meta",
+        action="store_true",
+        help="if set, ignore meta information(header, footer) in the output",
+    )
 
     args = parser.parse_args()
 
@@ -350,6 +355,7 @@ def main():
         configs=configs,
         visualize=args.vis,
         device=args.device,
+        ignore_meta=args.ignore_meta,
     )
 
     os.makedirs(args.outdir, exist_ok=True)

diff --git a/src/yomitoku/document_analyzer.py b/src/yomitoku/document_analyzer.py
@@ -322,7 +322,13 @@ def _split_text_across_cells(results_det, results_layout):
 
 
 class DocumentAnalyzer:
-    def __init__(self, configs={}, device="cuda", visualize=False):
+    def __init__(
+        self,
+        configs={},
+        device="cuda",
+        visualize=False,
+        ignore_meta=False,
+    ):
         default_configs = {
             "ocr": {
                 "text_detector": {
@@ -365,6 +371,8 @@ def __init__(self, configs={}, device="cuda", visualize=False):
         )
         self.visualize = visualize
 
+        self.ignore_meta = ignore_meta
+
     def aggregate(self, ocr_res, layout_res):
         paragraphs = []
         check_list = [False] * len(ocr_res.words)
@@ -425,11 +433,15 @@ def aggregate(self, ocr_res, layout_res):
         page_direction = judge_page_direction(paragraphs)
 
         headers = [
-            paragraph for paragraph in paragraphs if paragraph.role == "page_header"
+            paragraph
+            for paragraph in paragraphs
+            if paragraph.role == "page_header" and not self.ignore_meta
         ]
 
         footers = [
-            paragraph for paragraph in paragraphs if paragraph.role == "page_footer"
+            paragraph
+            for paragraph in paragraphs
+            if paragraph.role == "page_footer" and not self.ignore_meta
         ]
 
         page_contents = [