Skip to content

Commit

Permalink
Merge pull request #77 from kotaro-kinoshita/feature/merge-multipage-…
Browse files Browse the repository at this point in the history
…ducument

Feature/merge multipage ducument
  • Loading branch information
kotaro-kinoshita authored Feb 21, 2025
2 parents 3797f43 + 2eb078d commit a4029bf
Show file tree
Hide file tree
Showing 11 changed files with 212 additions and 83 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ yomitoku ${path_data} -f md -o results -v --figure --lite
- `--figure_letter` 検出した図表に含まれる文字も出力ファイルにエクスポートします。
- `--figure` 検出した図、画像を出力ファイルにエクスポートします。
- `--encoding` エクスポートする出力ファイルの文字エンコーディングを指定します。サポートされていない文字コードが含まれる場合は、その文字を無視します。(utf-8, utf-8-sig, shift-jis, enc-jp, cp932)
- `--combine` PDFを入力に与えたときに、複数ページが含まれる場合に、それらの予測結果を一つのファイルに統合してエクスポートします。
- `--ignore_meta` 文章のheader, footerなどの文字情報を出力ファイルに含めません。

その他のオプションに関しては、ヘルプを参照

Expand Down
2 changes: 2 additions & 0 deletions README_EN.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ yomitoku ${path_data} -f md -o results -v --figure --lite
- `--figure_letter`: Exports characters contained within detected figures and tables to the output file.
- `--figure`: Exports detected figures and images to the output file
- `--encoding` Specifies the character encoding for the output file to be exported. If unsupported characters are included, they will be ignored. (utf-8, utf-8-sig, shift-jis, enc-jp, cp932)
- `--combine` When a PDF is provided as input and contains multiple pages, this option combines their prediction results into a single file for export.
- `--ignore_meta` Excludes text information such as headers and footers from the output file.

For other options, please refer to the help documentation.

Expand Down
2 changes: 1 addition & 1 deletion src/yomitoku/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class Config:
validate_assignment = True

def to_json(self, out_path: str, **kwargs):
export_json(self, out_path, **kwargs)
return export_json(self, out_path, **kwargs)


class BaseModule:
Expand Down
118 changes: 110 additions & 8 deletions src/yomitoku/cli/main.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,65 @@
import argparse
import os
import torch
import time
from pathlib import Path

import cv2
import time
import torch

from ..constants import SUPPORT_OUTPUT_FORMAT
from ..data.functions import load_image, load_pdf
from ..document_analyzer import DocumentAnalyzer
from ..utils.logger import set_logger

from ..export import save_csv, save_html, save_json, save_markdown

logger = set_logger(__name__, "INFO")


def merge_all_pages(results):
    """Combine per-page export results into one payload.

    Args:
        results: Iterable of dicts, each with a ``"format"`` key
            (``"json"``, ``"csv"``, ``"html"`` or ``"md"``) and a ``"data"``
            key holding that page's exported content.

    Returns:
        ``None`` when ``results`` is empty (or contains only unrecognised
        formats). Otherwise:

        * ``"json"``: a list with one entry per page.
        * ``"csv"``: a single flat list with every page's elements in order.
        * ``"html"`` / ``"md"``: the page strings joined with ``"\\n"``.
    """
    out = None
    for result in results:
        # Named ``fmt`` so the builtin ``format`` is not shadowed.
        fmt = result["format"]
        data = result["data"]

        if fmt == "json":
            if out is None:
                out = []
            out.append(data)

        elif fmt == "csv":
            # Accumulate into a fresh list: reusing the first page's list as
            # the accumulator would mutate the caller's per-page data.
            if out is None:
                out = []
            out.extend(data)

        elif fmt in ("html", "md"):
            out = data if out is None else out + "\n" + data

    return out


def save_merged_file(out_path, args, out):
    """Write merged output with the saver that matches ``args.format``.

    Args:
        out_path: Destination file path.
        args: Object exposing ``format`` and ``encoding`` attributes.
        out: Merged payload to hand to the saver.

    Does nothing when ``args.format`` is not one of ``"json"``, ``"csv"``,
    ``"html"`` or ``"md"``.
    """
    fmt = args.format

    if fmt == "json":
        save_json(out_path, args.encoding, out)
        return

    if fmt == "csv":
        save_csv(out_path, args.encoding, out)
        return

    if fmt == "html":
        save_html(out_path, args.encoding, out)
        return

    if fmt == "md":
        save_markdown(out_path, args.encoding, out)


def validate_encoding(encoding):
if encoding not in [
"utf-8",
Expand All @@ -32,8 +78,9 @@ def process_single_file(args, analyzer, path, format):
else:
imgs = [load_image(path)]

results = []
for page, img in enumerate(imgs):
results, ocr, layout = analyzer(img)
result, ocr, layout = analyzer(img)
dirname = path.parent.name
filename = path.stem

Expand All @@ -56,25 +103,47 @@ def process_single_file(args, analyzer, path, format):
out_path = os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.{format}")

if format == "json":
results.to_json(
json = result.to_json(
out_path,
ignore_line_break=args.ignore_line_break,
encoding=args.encoding,
img=img,
export_figure=args.figure,
figure_dir=args.figure_dir,
)

results.append(
{
"format": format,
"data": json,
}
)

if not args.combine:
save_json(out_path, args.encoding, json)

elif format == "csv":
results.to_csv(
csv = result.to_csv(
out_path,
ignore_line_break=args.ignore_line_break,
encoding=args.encoding,
img=img,
export_figure=args.figure,
figure_dir=args.figure_dir,
)

results.append(
{
"format": format,
"data": csv,
}
)

if not args.combine:
save_csv(out_path, args.encoding, csv)

elif format == "html":
results.to_html(
html = result.to_html(
out_path,
ignore_line_break=args.ignore_line_break,
img=img,
Expand All @@ -84,8 +153,19 @@ def process_single_file(args, analyzer, path, format):
figure_dir=args.figure_dir,
encoding=args.encoding,
)

results.append(
{
"format": format,
"data": html,
}
)

if not args.combine:
save_html(out_path, args.encoding, html)

elif format == "md":
results.to_markdown(
md = result.to_markdown(
out_path,
ignore_line_break=args.ignore_line_break,
img=img,
Expand All @@ -96,7 +176,24 @@ def process_single_file(args, analyzer, path, format):
encoding=args.encoding,
)

logger.info(f"Output file: {out_path}")
results.append(
{
"format": format,
"data": md,
}
)

if not args.combine:
save_markdown(out_path, args.encoding, md)

out = merge_all_pages(results)
if args.combine:
out_path = os.path.join(args.outdir, f"{dirname}_{filename}.{format}")
save_merged_file(
out_path,
args,
out,
)


def main():
Expand Down Expand Up @@ -196,6 +293,11 @@ def main():
default="utf-8",
help="Specifies the character encoding for the output file to be exported. If unsupported characters are included, they will be ignored.",
)
parser.add_argument(
"--combine",
action="store_true",
help="if set, merge all pages in the output",
)
parser.add_argument(
"--ignore_meta",
action="store_true",
Expand Down
22 changes: 8 additions & 14 deletions src/yomitoku/document_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,19 @@
from typing import List, Union

import numpy as np

from pydantic import conlist

from yomitoku.text_detector import TextDetector
from yomitoku.text_recognizer import TextRecognizer

from .base import BaseSchema
from .export import export_csv, export_html, export_markdown
from .layout_analyzer import LayoutAnalyzer
from .ocr import OCRSchema, WordPrediction, ocr_aggregate
from .reading_order import prediction_reading_order
from .table_structure_recognizer import TableStructureRecognizerSchema
from .utils.misc import (
is_contained,
quad_to_xyxy,
calc_overlap_ratio,
)
from .utils.visualizer import reading_order_visualizer
from yomitoku.text_detector import TextDetector
from yomitoku.text_recognizer import TextRecognizer

from .utils.visualizer import det_visualizer
from .utils.misc import calc_overlap_ratio, is_contained, quad_to_xyxy
from .utils.visualizer import det_visualizer, reading_order_visualizer


class ParagraphSchema(BaseSchema):
Expand All @@ -47,13 +41,13 @@ class DocumentAnalyzerSchema(BaseSchema):
figures: List[FigureSchema]

def to_html(self, out_path: str, **kwargs):
export_html(self, out_path, **kwargs)
return export_html(self, out_path, **kwargs)

def to_markdown(self, out_path: str, **kwargs):
export_markdown(self, out_path, **kwargs)
return export_markdown(self, out_path, **kwargs)

def to_csv(self, out_path: str, **kwargs):
export_csv(self, out_path, **kwargs)
return export_csv(self, out_path, **kwargs)


def combine_flags(flag1, flag2):
Expand Down
19 changes: 14 additions & 5 deletions src/yomitoku/export/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
from .export_csv import export_csv
from .export_html import export_html
from .export_json import export_json
from .export_markdown import export_markdown
from .export_csv import export_csv, save_csv
from .export_html import export_html, save_html
from .export_json import export_json, save_json
from .export_markdown import export_markdown, save_markdown

__all__ = ["export_html", "export_markdown", "export_csv", "export_json"]
__all__ = [
"export_html",
"export_markdown",
"export_csv",
"export_json",
"save_html",
"save_markdown",
"save_csv",
"save_json",
]
6 changes: 5 additions & 1 deletion src/yomitoku/export/export_csv.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import csv
import cv2
import os

import cv2


def table_to_csv(table, ignore_line_break):
num_rows = table.n_row
Expand Down Expand Up @@ -98,7 +99,10 @@ def export_csv(
)

elements = sorted(elements, key=lambda x: x["order"])
return elements


def save_csv(out_path, encoding, elements):
with open(out_path, "w", newline="", encoding=encoding, errors="ignore") as f:
writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
for element in elements:
Expand Down
12 changes: 7 additions & 5 deletions src/yomitoku/export/export_html.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import re
import os
import cv2

import re
from html import escape

import cv2
from lxml import etree, html


Expand Down Expand Up @@ -182,10 +181,13 @@ def export_html(
elements = sorted(elements, key=lambda x: x["order"])

html_string = "".join([element["html"] for element in elements])
html_string = add_html_tag(html_string)
# html_string = add_html_tag(html_string)

parsed_html = html.fromstring(html_string)
formatted_html = etree.tostring(parsed_html, pretty_print=True, encoding="unicode")
return formatted_html


def save_html(out_path, encoding, html):
with open(out_path, "w", encoding=encoding, errors="ignore") as f:
f.write(formatted_html)
f.write(html)
8 changes: 6 additions & 2 deletions src/yomitoku/export/export_json.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import os

import cv2
import os


def paragraph_to_json(paragraph, ignore_line_break):
Expand Down Expand Up @@ -63,9 +63,13 @@ def export_json(
figure_dir=figure_dir,
)

return inputs.model_dump()


def save_json(out_path, encoding, data):
with open(out_path, "w", encoding=encoding, errors="ignore") as f:
json.dump(
inputs.model_dump(),
data,
f,
ensure_ascii=False,
indent=4,
Expand Down
7 changes: 6 additions & 1 deletion src/yomitoku/export/export_markdown.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import re

import cv2
import os


def escape_markdown_special_chars(text):
Expand Down Expand Up @@ -144,5 +145,9 @@ def export_markdown(
elements = sorted(elements, key=lambda x: x["order"])
markdown = "\n".join([element["md"] for element in elements])

return markdown


def save_markdown(out_path, encoding, markdown):
    """Write a markdown string to ``out_path``.

    Args:
        out_path: Destination file path; the file is created or truncated.
        encoding: Text encoding used for the file.
        markdown: Markdown text to write.

    The file is opened with ``errors="ignore"``.
    """
    with open(
        out_path,
        mode="w",
        encoding=encoding,
        errors="ignore",
    ) as md_file:
        md_file.write(markdown)
Loading

0 comments on commit a4029bf

Please sign in to comment.