diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.gitignore b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.gitignore new file mode 100644 index 00000000..b089da4a --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.gitignore @@ -0,0 +1,13 @@ +# python generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# venv +.venv + +# bunkai +bunkai_model/ diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.python-version b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.python-version new file mode 100644 index 00000000..d9506ceb --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.python-version @@ -0,0 +1 @@ +3.12.5 diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md new file mode 100644 index 00000000..aa940fe5 --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md @@ -0,0 +1,31 @@ +# ja-warp-pdf + +Preprocess text extracted from PDFs provided by WARP. + +## Environment + +- Python 3.12.5 + +## Installation + +Use [rye](https://rye.astral.sh/) to install the dependencies. + +```bash +RUSTFLAGS="-A invalid_reference_casting" rye sync +``` + +Then download the Bunkai sentence splitter model. + +```bash +rye run bunkai --model bunkai_model --setup +``` + +## Usage + +### Conversion + +This process converts text to remove unnecessary characters. 
+ +```bash +rye run python scripts/convert.py --input-file --output-file +``` diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/examples/example.jsonl b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/examples/example.jsonl new file mode 100644 index 00000000..bdf53e5e --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/examples/example.jsonl @@ -0,0 +1,5 @@ +{"docId": "example0", "text": "本勉強会では、自然言語処理および計算機システムの研究者が集まり大規模言語モデルの研究開発について定期的に情報共有を行っています。"} +{"docId": "example1", "text": "本勉強会では、自然言語処理および\n計算機システムの研究者が集まり\n大規模言語モデルの研究開発について\n定期的に情報共有を行っています。"} +{"docId": "example2", "text": "本勉強会では、自然言\n語処理および計算機シ\nステムの研究者が集ま\nり大規模言語モデルの\n研究開発について定期\n的に情報共有を行って\nいます。"} +{"docId": "example3", "text": "本\n勉\n強\n会\nで\nは\n、\n自\n然\n言\n語\n処\n理\nお\nよ\nび\n計\n算\n機\nシ\nス\nテ\nム\nの\n研\n究\n者\nが\n集\nま\nり\n大\n規\n模\n言\n語\nモ\nデ\nル\nの\n研\n究\n開\n発\nに\nつ\nい\nて\n定\n期\n的\nに\n情\n報\n共\n有\nを\n行\nっ\nて\nい\nま\nす\n。"} +{"docId": "example4", "text": "本 勉 強 会 で は 、 自 然 言 語 処 理 お よ び 計 算 機 シ ス テ ム の 研 究 者 が 集 ま り 大 規 模 言 語 モ デ ル の 研 究 開 発 に つ い て 定 期 的 に 情 報 共 有 を 行 っ て い ま す 。"} diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml new file mode 100644 index 00000000..0434d064 --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "ja-warp-pdf" +version = "0.1.0" +description = "Add your description here" +authors = [ + { name = "Hirokazu Kiyomaru", email = "h.kiyomaru@gmail.com" } +] +dependencies = [ + "bunkai[lb] @ git+https://github.com/hkiyomaru/bunkai@feature/sequential-prediction", + "transformers==4.33.3", + "pytest>=8.3.4", +] +readme = "README.md" +requires-python = ">= 3.8" + +[tool.rye] +managed = true +virtual = true +dev-dependencies = [] diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock new file mode 100644 index 00000000..9bf31530 --- /dev/null +++ 
b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock @@ -0,0 +1,132 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +bunkai @ git+https://github.com/hkiyomaru/bunkai@12a2dfa9eb47e203f4135edac3087a2809d92ca7 +certifi==2024.12.14 + # via requests +charset-normalizer==3.4.1 + # via requests +dataclasses-json==0.6.7 + # via bunkai +emoji==2.14.0 + # via bunkai +emojis==0.7.0 + # via bunkai +filelock==3.16.1 + # via huggingface-hub + # via torch + # via transformers + # via triton +fsspec==2024.12.0 + # via huggingface-hub + # via torch +huggingface-hub==0.27.1 + # via transformers +idna==3.10 + # via requests +iniconfig==2.0.0 + # via pytest +janome==0.5.0 + # via bunkai +jinja2==3.1.5 + # via torch +markupsafe==3.0.2 + # via jinja2 +marshmallow==3.25.1 + # via dataclasses-json +more-itertools==10.6.0 + # via bunkai +mpmath==1.3.0 + # via sympy +mypy-extensions==1.0.0 + # via typing-inspect +networkx==3.4.2 + # via torch +numpy==2.2.1 + # via bunkai + # via transformers +nvidia-cublas-cu12==12.4.5.8 + # via nvidia-cudnn-cu12 + # via nvidia-cusolver-cu12 + # via torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via nvidia-cusolver-cu12 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via nvidia-cusolver-cu12 + # via nvidia-cusparse-cu12 + # via torch +nvidia-nvtx-cu12==12.4.127 + # via torch +packaging==24.2 + # via huggingface-hub + # via marshmallow + # via pytest + # via transformers +pluggy==1.5.0 + # via 
pytest +pytest==8.3.4 +pyyaml==6.0.2 + # via huggingface-hub + # via transformers +regex==2024.11.6 + # via bunkai + # via transformers +requests==2.32.3 + # via bunkai + # via huggingface-hub + # via transformers +safetensors==0.5.2 + # via transformers +setuptools==75.8.0 + # via torch +spans==1.1.1 + # via bunkai +sympy==1.13.1 + # via torch +tokenizers==0.13.3 + # via transformers +toml==0.10.2 + # via bunkai +torch==2.5.1 + # via bunkai +tqdm==4.67.1 + # via bunkai + # via huggingface-hub + # via transformers +transformers==4.33.3 + # via bunkai +triton==3.1.0 + # via torch +typing-extensions==4.12.2 + # via huggingface-hub + # via torch + # via typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json +urllib3==2.3.0 + # via requests diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock new file mode 100644 index 00000000..9bf31530 --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock @@ -0,0 +1,132 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +bunkai @ git+https://github.com/hkiyomaru/bunkai@12a2dfa9eb47e203f4135edac3087a2809d92ca7 +certifi==2024.12.14 + # via requests +charset-normalizer==3.4.1 + # via requests +dataclasses-json==0.6.7 + # via bunkai +emoji==2.14.0 + # via bunkai +emojis==0.7.0 + # via bunkai +filelock==3.16.1 + # via huggingface-hub + # via torch + # via transformers + # via triton +fsspec==2024.12.0 + # via huggingface-hub + # via torch +huggingface-hub==0.27.1 + # via transformers +idna==3.10 + # via requests +iniconfig==2.0.0 + # via pytest +janome==0.5.0 + # via bunkai +jinja2==3.1.5 + # via torch +markupsafe==3.0.2 + # via jinja2 +marshmallow==3.25.1 + # via dataclasses-json +more-itertools==10.6.0 + # via bunkai +mpmath==1.3.0 + # via 
sympy +mypy-extensions==1.0.0 + # via typing-inspect +networkx==3.4.2 + # via torch +numpy==2.2.1 + # via bunkai + # via transformers +nvidia-cublas-cu12==12.4.5.8 + # via nvidia-cudnn-cu12 + # via nvidia-cusolver-cu12 + # via torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via nvidia-cusolver-cu12 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via nvidia-cusolver-cu12 + # via nvidia-cusparse-cu12 + # via torch +nvidia-nvtx-cu12==12.4.127 + # via torch +packaging==24.2 + # via huggingface-hub + # via marshmallow + # via pytest + # via transformers +pluggy==1.5.0 + # via pytest +pytest==8.3.4 +pyyaml==6.0.2 + # via huggingface-hub + # via transformers +regex==2024.11.6 + # via bunkai + # via transformers +requests==2.32.3 + # via bunkai + # via huggingface-hub + # via transformers +safetensors==0.5.2 + # via transformers +setuptools==75.8.0 + # via torch +spans==1.1.1 + # via bunkai +sympy==1.13.1 + # via torch +tokenizers==0.13.3 + # via transformers +toml==0.10.2 + # via bunkai +torch==2.5.1 + # via bunkai +tqdm==4.67.1 + # via bunkai + # via huggingface-hub + # via transformers +transformers==4.33.3 + # via bunkai +triton==3.1.0 + # via torch +typing-extensions==4.12.2 + # via huggingface-hub + # via torch + # via typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json +urllib3==2.3.0 + # via requests diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py new file mode 100644 index 00000000..3644eb0c --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -0,0 +1,241 @@ +"""Remove intra-sentence line 
def split_text_by_newline(text: str, window: int = 5) -> list[str]:
    """Split text into chunks so that:

    - Concatenating all chunks gives the original text.
    - Each newline character has at least `window` characters before and
      after it, except in the first and last chunks.

    This isolates short spans around line breaks so that the (slow)
    sentence splitter only needs to look at text near a newline.

    Args:
        text (str): Input text.
        window (int): Minimum number of characters to keep on each side of
            every newline. Defaults to 5.

    Returns:
        list[str]: List of chunks.

    Example:
        >>> split_text_by_newline("Hello World\n")
        ["Hello ", "World\n"]
        >>> split_text_by_newline("Hello\nWorld")
        ["Hello\nWorld"]
        >>> split_text_by_newline("Hello\nWorld\n")
        ["Hello\nWorld\n"]
        >>> split_text_by_newline("Hello\nWorld\nHello\nWorld\n")
        ["Hello\nWorld\nHello\nWorld\n"]
        >>> split_text_by_newline("CONTEXT|Hello\nWorld\nHello\nWorld|CONTEXT")
        ["CONTEXT|", "Hello\nWorld\nHello\nWorld", "|CONTEXT"]
    """
    # Fast path: nothing to isolate when there is no newline at all.
    if "\n" not in text:
        return [text]

    chunks: list[str] = []
    chunk: str = ""
    newline_pos: int = -1  # index of the most recent newline, -1 if none pending
    for i, char in enumerate(text):
        if char == "\n":
            newline_pos = i
            # Cut off the newline-free prefix, keeping `window` chars of
            # left context in the current chunk.
            if len(chunk) > window and "\n" not in chunk:
                chunks.append(chunk[:-window])
                chunk = chunk[-window:]
        # Close the chunk once `window` characters of right context have
        # accumulated after the last newline.
        if newline_pos != -1 and i - newline_pos == window + 1:
            chunks.append(chunk)
            chunk = ""
            newline_pos = -1
        chunk += char
    if chunk:
        chunks.append(chunk)
    return chunks
+ """ + if not text: + return [""] + return [s.replace("▁", "\n") for s in senter(text.replace("\n", "▁"))] + + +def remove_intra_sentence_line_breaks(text: str) -> str: + """Remove intra-sentence line breaks. + + Args: + text (str): Input text. + + Returns: + str: Processed text. + """ + if all(char == "\n" for char in text): + return text + num_leading_newlines = len(text) - len(text.lstrip("\n")) + num_trailing_newlines = len(text) - len(text.rstrip("\n")) + return ( + "\n" * num_leading_newlines + + text.replace("\n", "") + + "\n" * num_trailing_newlines + ) + + +def process_line(line: str) -> str: + """Process line. + + Args: + line (str): Input line. + + Returns: + str: Processed line. + """ + dat = json.loads(line) + + text: str = dat["text"] + new_text: str = "" + for chunk in split_text_by_newline(text, window=5): + # Skip sentence splitting by bunkai if there is no line break + # as it aims to remove intra-sentence line breaks + if "\n" not in chunk: + new_text += chunk + continue + + # Skip long chunks as they are usually so noisy that + # bunkai will not work well + if len(chunk) > 20: + new_text += chunk + continue + + for sent in split_text_by_bunkai(chunk): + new_text += remove_intra_sentence_line_breaks(sent) + + assert text.replace("\n", "") == new_text.replace("\n", "") + + dat["text"] = new_text + + return json.dumps(dat, ensure_ascii=False) + "\n" + + +def process_lines(lines: list[str]) -> str: + """Process lines. + + Args: + lines (list[str]): Input lines. + + Returns: + str: Processed lines. + """ + ret: str = "" + for line in lines: + try: + ret += process_line(line) + except Exception as e: + logger.error(f"Error: {e}") + ret += line + return ret + + +def buffered_read( + file: TextIO, + processed_ids: set[str], + buffer_size: int = 32, +) -> Iterator[list[str]]: + """Buffered read. + + Args: + file: File object. + processed_ids: Processed IDs. + buffer_size: Buffer size. + + Yields: + str: Line. 
+ """ + lines: list[str] = [] + for line in file: + dat = json.loads(line) + if dat["docId"] in processed_ids: + continue + lines.append(line) + if len(lines) == buffer_size: + yield lines + lines = [] + yield lines + + +def main() -> None: + """Main function.""" + parser = argparse.ArgumentParser("Remove intra-sentence line breaks from text.") + parser.add_argument("--input-file", type=str, required=True, help="Input file.") + parser.add_argument("--output-file", type=str, required=True, help="Output file.") + parser.add_argument( + "--num-workers", type=int, default=-1, help="Number of workers." + ) + parser.add_argument("--buffer-size", type=int, default=32, help="Buffer size.") + parser.add_argument( + "--overwrite", action="store_true", help="Overwrite output file." + ) + args = parser.parse_args() + + num_workers = args.num_workers if args.num_workers != -1 else os.cpu_count() + + os.makedirs(os.path.dirname(args.output_file), exist_ok=True) + + # Create an empty file if overwrite is True + if args.overwrite: + with open(args.output_file, "wt", encoding="utf-8"): + pass + + # Get processed lines if overwrite is False + processed_ids: set[str] = set() + if not args.overwrite and os.path.exists(args.output_file): + with open(args.output_file, "rt", encoding="utf-8") as fin: + for line in fin: + dat = json.loads(line) + processed_ids.add(dat["docId"]) + + with ( + open(args.input_file, "rt", encoding="utf-8") as fin, + open(args.output_file, "at", encoding="utf-8") as fout, + ): + with concurrent.futures.ProcessPoolExecutor(num_workers) as executor: + futures = [] + for lines in buffered_read( + fin, + processed_ids=processed_ids, + buffer_size=args.buffer_size, + ): + futures.append(executor.submit(process_lines, lines)) + for future in tqdm.tqdm( + concurrent.futures.as_completed(futures), + total=len(futures), + ): + fout.write(future.result()) + fout.flush() + + +if __name__ == "__main__": + logging.basicConfig( + format="%(asctime)s - %(levelname)s - 
def get_line_break_or_white_space_ratio(text: str) -> float:
    """Get the ratio of line breaks and white spaces in the text.

    Args:
        text: The text to analyze.

    Returns:
        The ratio of line breaks and white spaces in the text
        (0.0 for an empty string).
    """
    if text == "":
        return 0.0
    return (text.count("\n") + text.count(" ")) / len(text)


def get_short_line_ratio(text: str, threshold: int = 5) -> float:
    """Get the ratio of characters that belong to short lines.

    Args:
        text: The text to analyze.
        threshold: Maximum length (spaces excluded) for a line to count
            as short.

    Returns:
        The ratio of characters in short lines to characters in all
        non-blank lines (0.0 if the text has no non-blank lines).
    """
    if text == "":
        return 0.0
    lines = [line for line in text.split("\n") if line.strip()]
    if len(lines) == 0:
        return 0.0
    # Spaces are ignored when testing shortness, but the ratio is computed
    # over the original line lengths.
    short_lines = [line for line in lines if len(line.replace(" ", "")) <= threshold]
    return sum(map(len, short_lines)) / sum(map(len, lines))


def process_line(
    line: str,
    line_break_or_white_space_ratio_threshold: float = 0.2,
    short_line_ratio_threshold: float = 0.1,
) -> str:
    """Process a line in the input file.

    Args:
        line: A line in the input file.
        line_break_or_white_space_ratio_threshold: The threshold of the ratio of line breaks and white spaces.
        short_line_ratio_threshold: The threshold of the ratio of short lines.

    Returns:
        The line with quality metrics added under "meta", newline-terminated,
        or "" if the document exceeds either quality threshold. Unparsable
        lines are returned unchanged.
    """
    try:
        row = json.loads(line)
    except Exception as e:
        # Use the module logger (not the root logger) for consistency with
        # the rest of this file; lazy %-formatting avoids building the
        # message unless it is emitted.
        logger.error("Error: %s", e)
        return line
    text = row["text"]
    orig_meta = row.get("meta", {})
    row["meta"] = {
        "line_break_or_white_space_ratio": get_line_break_or_white_space_ratio(text),
        "short_line_ratio": get_short_line_ratio(text),
    }
    if orig_meta:
        # Preserve any pre-existing metadata under a nested key.
        row["meta"]["meta"] = orig_meta
    if (
        row["meta"]["line_break_or_white_space_ratio"]
        > line_break_or_white_space_ratio_threshold
    ):
        return ""
    if row["meta"]["short_line_ratio"] > short_line_ratio_threshold:
        return ""
    return json.dumps(row, ensure_ascii=False) + "\n"


def process_lines(lines: list[str]) -> str:
    """Process lines.

    Args:
        lines (list[str]): Input lines.

    Returns:
        str: Processed lines, concatenated (filtered lines contribute "").
    """
    ret: str = ""
    for line in lines:
        ret += process_line(line)
    return ret
+ """ + lines: list[str] = [] + for line in file: + lines.append(line) + if len(lines) == buffer_size: + yield lines + lines = [] + yield lines + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--input-file", + type=str, + required=True, + help="Path to the input file.", + ) + parser.add_argument( + "--output-file", + type=str, + required=True, + help="Path to the output file.", + ) + parser.add_argument( + "--num-workers", + type=int, + default=1, + help="Number of workers for multiprocessing.", + ) + parser.add_argument("--buffer-size", type=int, default=256, help="Buffer size.") + args = parser.parse_args() + + num_workers = args.num_workers if args.num_workers != -1 else os.cpu_count() + + os.makedirs(os.path.dirname(args.output_file), exist_ok=True) + + with ( + open(args.input_file, "rt", encoding="utf-8") as fin, + open(args.output_file, "wt", encoding="utf-8") as fout, + ): + with concurrent.futures.ProcessPoolExecutor(num_workers) as executor: + futures = [] + for lines in buffered_read(fin, buffer_size=args.buffer_size): + futures.append(executor.submit(process_lines, lines)) + for future in tqdm.tqdm( + concurrent.futures.as_completed(futures), + total=len(futures), + ): + fout.write(future.result()) + fout.flush() + + +if __name__ == "__main__": + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(message)s", + ) + main() diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py new file mode 100644 index 00000000..704eecbc --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py @@ -0,0 +1,134 @@ +import argparse +import logging +from pathlib import Path +import subprocess +from queue import Queue +from time import sleep + + +logger = logging.getLogger(__name__) + +here = Path(__file__).parent + +script_paths = { + "convert": here / "convert.py", + "filter": here / "filter.py", +} + +interpreter_path = here.parent 
/ ".venv" / "bin" / "python" + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "MODE", + type=str, + choices=["test", "convert", "filter"], + help="Mode to run", + ) + parser.add_argument( + "--input-dir", + type=str, + required=True, + help="Path to the input file", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + help="Path to the output file", + ) + parser.add_argument( + "--hosts", + type=str, + required=True, + help="Path to the hosts file", + ) + args = parser.parse_args() + + script_path = script_paths[args.MODE] + + input_dir = Path(args.input_dir) + output_dir = Path(args.output_dir) + + queue = Queue() + for input_path in input_dir.glob("**/*.jsonl"): + output_path = output_dir / input_path.relative_to(input_dir) + command = [ + str(interpreter_path), + str(script_path), + "--input-file", + str(input_path), + "--output-file", + str(output_path), + ] + queue.put(command) + + waiting = Queue() + with open(args.hosts) as f: + for line in f: + host = line.strip() + proc = subprocess.Popen( + [ + "ssh", + "-i", + "~/.ssh/id_ed_25519", + "-o", + "StrictHostKeyChecking=no", + host, + "true", + ], + stderr=subprocess.PIPE, + ) + if proc.wait() != 0: + logger.error(f"Host {host} is not available") + continue + logger.info(f"Host {host} is available") + waiting.put(line.strip()) + + logger.info(f"Available hosts: {waiting.qsize()}") + + running = [] + while not queue.empty() or len(running) > 0: + while not waiting.empty() and not queue.empty(): + host = waiting.get() + command = queue.get() + logger.info(f"Running {command} on {host}") + proc = subprocess.Popen( + [ + "ssh", + "-i", + "~/.ssh/id_ed_25519", + "-o", + "StrictHostKeyChecking=no", + host, + ] + + command, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + running.append((host, proc)) + + for host, proc in running: + if proc.poll() is not None: + if proc.returncode != 0: + error_message = proc.stderr.read().decode("utf-8") + 
logger.error(f"Failed {proc.args} on {host}: {error_message}") + queue.put(proc.args[6:]) + else: + logger.info(f"Finished {proc.args} on {host}") + running.remove((host, proc)) + waiting.put(host) + break + + sleep(1) + + logger.info("All done") + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + ) + main() diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py new file mode 100644 index 00000000..a9f8ab3a --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py @@ -0,0 +1,82 @@ +import pytest + +from convert import ( + split_text_by_newline, + split_text_by_bunkai, + remove_intra_sentence_line_breaks, +) + + +@pytest.mark.parametrize( + "text, expected", + [ + ("Hello World\n", ["Hello ", "World\n"]), + ("Hello\nWorld", ["Hello\nWorld"]), + ("Hello\nWorld\n", ["Hello\nWorld\n"]), + ("Hello\nWorld\nHello\nWorld\n", ["Hello\nWorld\nHello\nWorld\n"]), + ( + "CONTEXT|Hello\nWorld\nHello\nWorld|CONTEXT", + ["CONTEXT|", "Hello\nWorld\nHello\nWorld", "|CONTEXT"], + ), + ], +) +def test_split_text_by_newline(text: str, expected: list[str]) -> None: + assert split_text_by_newline(text) == expected + + +@pytest.mark.parametrize( + "text, expected", + [ + ( + "こういう\n日本語の文章は\nよくあります", + ["こういう\n日本語の文章は\nよくあります"], + ), + ( + "改行が文区切りです\nこういう日本語の文章はよくあります", + ["改行が文区切りです\n", "こういう日本語の文章はよくあります"], + ), + ( + "改行が文区切り\nです\nこういう日本語\nの文章はよくあ\nります\n", + ["改行が文区切り\nです\n", "こういう日本語\nの文章はよくあ\nります\n"], + ), + ( + "\n", + ["\n"], + ), + ], +) +def test_split_text_by_bunkai(text: str, expected: list[str]) -> None: + assert split_text_by_bunkai(text) == expected + + +@pytest.mark.parametrize( + "text, expected", + [ + ( + "こういう\n日本語の文章は\nよくあります", + "こういう日本語の文章はよくあります", + ), + ( + "\n\nこういう\n日本語の文章は\nよくあります\n\n", + "\n\nこういう日本語の文章はよくあります\n\n", + ), + ( + "\nこういう\n日本語の文章は\nよくあります\n", + 
"\nこういう日本語の文章はよくあります\n", + ), + ( + "\n\n\nこういう\n日本語の文章は\nよくあります\n", + "\n\n\nこういう日本語の文章はよくあります\n", + ), + ( + "\n", + "\n", + ), + ( + "\n\n", + "\n\n", + ), + ], +) +def test_remove_intra_sentence_line_breaks(text: str, expected: str) -> None: + assert remove_intra_sentence_line_breaks(text) == expected