diff --git a/.circleci/check_head.py b/.circleci/check_head.py new file mode 100644 index 0000000..895f5e9 --- /dev/null +++ b/.circleci/check_head.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +import codecs +import collections +import sys +import typing + + +def main(): + line2fnames: typing.DefaultDict[str, typing.List[str]] = collections.defaultdict(list) + for fname in sys.argv: + with codecs.open(fname, 'r', 'utf-8') as inf: + line = inf.readline()[:-1] + line2fnames[line].append(fname) + common_line = '' + common_num = 0 + for line, fnames in line2fnames.items(): + if len(fnames) > common_num: + common_num = len(fnames) + common_line = line + if len(line2fnames) != 1: + print(f'Common ({common_num}): {common_line}') + print('Others:') + for line, fnames in line2fnames.items(): + if line != common_line: + print(f'\t{line}: \t{fnames}') + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..7d405fa --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,71 @@ +--- +version: 2.1 + +jobs: + lint_with_node: + docker: + - image: circleci/node:12.20.0 + working_directory: ~/app_lintwithnode + steps: + - checkout + - restore_cache: + keys: + - cache-{{ .Environment.CACHE_VERSION }}-lint_with_node + + - run: + name: npm module install + command: | + make setup_node_module + - save_cache: + key: cache-{{ .Environment.CACHE_VERSION }}-lint_with_node + paths: + - ./node_modules + + - run: + name: markdownlint + command: | + make lint_markdown + + build: + docker: + - image: circleci/python:3.7.9 + working_directory: ~ + steps: + - checkout + + - restore_cache: + key: deps-{{ checksum "poetry.lock" }}\ + -{{ .Environment.CACHE_VERSION }} + - run: + name: Install dependencies + command: | + make -j $(nproc) dev_setup + - run: + name: setup-cc + command: | + poetry run make setup-cc + + - save_cache: + key: deps-{{ checksum "poetry.lock" }}\ + -{{ .Environment.CACHE_VERSION }} + paths: + - ~/.cache/pypoetry/virtualenvs + - ~/.local/bin-cc + - run: + name: Lint + command: | + poetry run make -j $(nproc) lint + - run: + name: Test Python codes + command: | + poetry run make -j $(nproc) test-cc + + - store_artifacts: + path: htmlcov + +workflows: + version: 2 + build: + jobs: + - lint_with_node + - build diff --git a/.circleci/show_method_names.py b/.circleci/show_method_names.py new file mode 100644 index 0000000..2365ae0 --- /dev/null +++ b/.circleci/show_method_names.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +import ast +import typing + + +def get_method_names(tree_object): + result: typing.Dict[int, typing.Tuple[str, int]] = {} + searched_line_no = 0 + for node in ast.iter_child_nodes(tree_object): + if isinstance(node, ast.FunctionDef): + result[node.lineno] = (node.name, node.lineno) + + for child in ast.iter_child_nodes(node): + get_method_names(child) + + if hasattr(node, 'lineno'): + if node.lineno > searched_line_no: + searched_line_no = node.lineno + else: + t = result.get(node.lineno) + if t is not None: + result[node.lineno] = (t[0], searched_line_no) + + return result + + +if __name__ == '__main__': + import codecs + import os + import sys + with codecs.open(sys.argv[1], 'r', 'utf-8') as f: + try: + source = f.read() + except UnicodeDecodeError: + raise Exception(sys.argv[1]) + + tree = ast.parse(source, os.path.basename(sys.argv[1])) + import re + pattern = re.compile(r'split|divide|文分割') + + for line_no, v in get_method_names(tree).items(): + if len(pattern.findall(v[0])) > 0: + raise 
Exception(f'method name violation {v[0]} at file-name = {sys.argv[1]}') diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..bbe7045 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,10 @@ + +[run] +omit = + */site-packages/* + */distutils/* + tests/* + +[report] +exclude_lines = + if __name__ == .__main__.: diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..cfc4ffc --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,9 @@ +--- +version: 2 +updates: + - package-ecosystem: pip + directory: "/" + schedule: + interval: daily + open-pull-requests-limit: 10 + versioning-strategy: lockfile-only diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..27df48d --- /dev/null +++ b/.gitignore @@ -0,0 +1,154 @@ +python_env +out +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# static files generated from Django application using `collectstatic` +media +static + +.idea/ +.DS_Store +resource/ +runs/ +work_scripts/ +models/ +data +OLD diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..68c11e0 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +known_third_party= diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..4b10703 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,7 @@ +{ + "MD007": { + "indent": 4 + }, + "line-length": false, + "no-inline-html": false +} diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..eeef336 --- /dev/null +++ b/Makefile @@ -0,0 +1,21 @@ + +all: lint lint_markdown test + +include mks/lint.mk +include mks/lbd.mk +include mks/vanilla.mk + + +.PHONY: all setup \ + flake8 autopep8 mypy isort jsonlint yamllint\ + terms_check_path term_check_method term_check_file_content\ + check_firstline \ + lint \ + _run_isort _coverage\ + test test-coverage setup-cc test-cc\ + setup_node_module lint_markdown circleci_local + +.DELETE_ON_ERROR: + +circleci_local: + circleci local execute diff --git a/README.md b/README.md new file mode 100644 index 0000000..53d4d78 --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +# Bunkai + +[![PyPI version](https://badge.fury.io/py/bunkai.svg)](https://badge.fury.io/py/bunkai) +[![Python Versions](https://img.shields.io/pypi/pyversions/bunkai.svg)](https://pypi.org/project/bunkai/) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +[![CircleCI](https://circleci.com/gh/megagonlabs/bunkai.svg?style=svg&circle-token=c555b8070630dfe98f0406a3892fc228b2370951)](https://app.circleci.com/pipelines/github/megagonlabs/bunkai) +[![Maintainability](https://api.codeclimate.com/v1/badges/640b02fa0164c131da10/maintainability)](https://codeclimate.com/github/megagonlabs/bunkai/maintainability) +[![Test Coverage](https://api.codeclimate.com/v1/badges/640b02fa0164c131da10/test_coverage)](https://codeclimate.com/github/megagonlabs/bunkai/test_coverage) +[![markdownlint](https://img.shields.io/badge/markdown-lint-lightgrey)](https://github.com/markdownlint/markdownlint) +[![jsonlint](https://img.shields.io/badge/json-lint-lightgrey)](https://github.com/dmeranda/demjson) +[![yamllint](https://img.shields.io/badge/yaml-lint-lightgrey)](https://github.com/adrienverge/yamllint) + +Bunkai is a sentence boundary (SB) disambiguation tool for Japanese. + +## Quick Start + +```console +$ pip install bunkai +$ echo -e '宿を予約しました♪!まだ2ヶ月も先だけど。早すぎかな(笑)楽しみです★\n2文書目の先頭行です。▁改行はU+2581で表現します。' \ + | bunkai +│宿を予約しました♪!│まだ2ヶ月も先だけど。│早すぎかな(笑)│楽しみです★ +│2文書目の先頭行です。│▁改行はU+2581で表現します。 +``` + +Feed a document as one line by using ``▁`` (U+2581) for line breaks. +The output shows sentence boundaries with ``│`` (U+2502). + +If you want to disambiguate sentence boundaries for line breaks, please add a `--model` option with the path to the model. +Trained models are available [here](https://github.com/megagonlabs/bunkai/releases). + +```console +$ echo -e "文の途中で改行を▁入れる文章ってありますよね▁それも対象です。" | bunkai --model /path/to/model +文の途中で改行を▁入れる文章ってありますよね▁│それも対象です。 +``` + +For more information, see [examples](example) or [documents](docs). + +## References + +- Yuta Hayashibe and Kensuke Mitsuzawa. + Sentence Boundary Detection on Line Breaks in Japanese. + Proceedings of The 6th Workshop on Noisy User-generated Text (W-NUT 2020), pp.71-75. + November 2020. 
+ [[PDF]](https://www.aclweb.org/anthology/2020.wnut-1.10.pdf) + [[bib]](https://www.aclweb.org/anthology/2020.wnut-1.10.bib) + +## License + +Apache License 2.0 diff --git a/bunkai/__init__.py b/bunkai/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/algorithm/__init__.py b/bunkai/algorithm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/algorithm/bunkai_sbd/__init__.py b/bunkai/algorithm/bunkai_sbd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/algorithm/bunkai_sbd/annotator/__init__.py b/bunkai/algorithm/bunkai_sbd/annotator/__init__.py new file mode 100644 index 0000000..3b0cdd2 --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/__init__.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +from bunkai.algorithm.bunkai_sbd.annotator.basic_annotator import BasicRule +from bunkai.algorithm.bunkai_sbd.annotator.dot_exception_annotator import \ + DotExceptionAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.emoji_annotator import \ + EmojiAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.emotion_expression_annotator import \ + EmotionExpressionAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.facemark_detector import \ + FaceMarkDetector +from bunkai.algorithm.bunkai_sbd.annotator.indirect_quote_exception_annotator import \ + IndirectQuoteExceptionAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.linebreak_annotator import \ + LinebreakAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.linebreak_force_annotator import \ + LinebreakForceAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.morph_annotator import \ + MorphAnnotatorJanome +from bunkai.algorithm.bunkai_sbd.annotator.number_exception_annotator import \ + NumberExceptionAnnotator + +__all__ = ['BasicRule', 'EmotionExpressionAnnotator', 'FaceMarkDetector', + 'IndirectQuoteExceptionAnnotator', 'MorphAnnotatorJanome', + 'LinebreakAnnotator', 'LinebreakForceAnnotator', 'NumberExceptionAnnotator', + 'DotExceptionAnnotator', + 'EmojiAnnotator'] diff --git a/bunkai/algorithm/bunkai_sbd/annotator/basic_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/basic_annotator.py new file mode 100644 index 0000000..4f6208c --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/basic_annotator.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import re + +from bunkai.algorithm.bunkai_sbd.annotator import constant +from bunkai.base.annotation import Annotations, SpanAnnotation +from bunkai.base.annotator import Annotator + +RE_SENT_SPLIT = re.compile("[" + constant.PUNCTUATIONS + r"]+\s*") + + +class BasicRule(Annotator): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + + def annotate(self, + original_text: str, + spans: Annotations) -> Annotations: + reg_points = RE_SENT_SPLIT.finditer(original_text) + + __return = [SpanAnnotation(rule_name=self.rule_name, + start_index=r_obj.regs[0][0], + end_index=r_obj.regs[0][1], + split_string_type=BasicRule.__name__, + split_string_value=original_text[r_obj.regs[0][0]:r_obj.regs[0][1]]) + for r_obj in reg_points] + # filter out strings between face marks + spans = self.add_forward_rule(__return, spans) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/constant.py b/bunkai/algorithm/bunkai_sbd/annotator/constant.py new file mode 100644 index 0000000..cb427ef --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/constant.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +import typing + +PUNCTUATIONS: str = "。!?.!?." 
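+# Sentence-ending punctuation in both full-width and ASCII forms.
+# BasicRule compiles this string into its RE_SENT_SPLIT character class, and
+# EmotionExpressionAnnotator allows one of these marks after a run of emotion symbols.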
+EMOTION_SYMBOLS: str = '…\u2605\u2606\u266a' +EMOTION_CHARACTERS: typing.List[str] = list('笑泣汗涙怒嬉爆驚喜悲謎恥焦苦照憂') +EMOTION_EXPRESSIONS: typing.List[str] = '笑い|わら|泣き|怒り|照れ'.split('|') +FACE_SYMBOL_PREFIX_SUFFIX: str = '[!-/:-@[-`{-~!-/:-@[-`{-~\u00A1-\u0FFF\u1000-\u2FFF\u4E00艸]' +ALPHABETS_REGEXP: str = 'a-zA-Z0-9a-zA-Z0-9' +FACE_SYMBOL1_REGEXP: str = "[" + ALPHABETS_REGEXP + "!-/:-@[-`{-~!-/:-@[-`{-~a-zA-Z0-9\u00A1-\u0FFF\u1000-\u2FFF\u4E00艸]" +FACE_SYMBOL2_REGEXP: str = "[!-/:-@[-`{-~!-/:-@[-`{-~\u00A1-\u0FFF\u1000-\u2FFF\u4E00艸]" +FACE_EXPRESSION_REGEXP: str = \ + FACE_SYMBOL_PREFIX_SUFFIX \ + + r'*[(\(]' \ + + FACE_SYMBOL1_REGEXP + '*' \ + + FACE_SYMBOL2_REGEXP + '+' \ + + FACE_SYMBOL1_REGEXP + '*' \ + + r'[)\)]' \ + + FACE_SYMBOL_PREFIX_SUFFIX + '*' + + +NUMBER_WORD_REGEXP: str = r'[nNnN][oOoO]' + +LAYER_NAME_FIRST = 'first' diff --git a/bunkai/algorithm/bunkai_sbd/annotator/dot_exception_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/dot_exception_annotator.py new file mode 100644 index 0000000..3f64ed8 --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/dot_exception_annotator.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +import re + +from bunkai.base.annotation import Annotations +from bunkai.base.annotator import AnnotationFilter + +NumericExpression = re.compile(r'[〇一二三四五六七八九十百千万億兆京\d]+') +MailaddressCharacter = re.compile(r'[a-zA-Z0-9]') + + +class DotExceptionAnnotator(AnnotationFilter): + + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + + @staticmethod + def is_exception_numeric(original_text: str, index: int) -> bool: + """ + Ignore dot between numbers. + + Eg: 和室3.5畳 / 1.5リットル以上のペットボトル. + """ + if index <= 0: + return False + if index + 1 >= len(original_text): + return False + if original_text[index] != '.' and original_text[index] != '.': + return False + if not NumericExpression.match(original_text[index - 1]): + return False + if not NumericExpression.match(original_text[index + 1]): + return False + return True + + @staticmethod + def is_exception_mailaddress(original_text: str, index: int) -> bool: + if index <= 0: + return False + if index + 1 >= len(original_text): + return False + if original_text[index] != '.' 
and original_text[index] != '.': + return False + if not MailaddressCharacter.match(original_text[index - 1]): + return False + if not MailaddressCharacter.match(original_text[index + 1]): + return False + return True + + def annotate(self, original_text: str, + spans: Annotations) -> Annotations: + __return_span_ann = [] + for __s in spans.get_final_layer(): + if self.is_exception_numeric(original_text, __s.start_index): + continue + elif self.is_exception_mailaddress(original_text, __s.start_index): + continue + __return_span_ann.append(__s) + + spans.add_annotation_layer(self.rule_name, __return_span_ann) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/emoji_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/emoji_annotator.py new file mode 100644 index 0000000..27aed89 --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/emoji_annotator.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +import dataclasses +import typing + +import emoji +import emojis + +from bunkai.base.annotation import Annotations, SpanAnnotation +from bunkai.base.annotator import Annotator + +EMOJI_UNICODE_ENGLISH = emoji.UNICODE_EMOJI['en'] + + +"""This module detects Emoji""" + +# You could set any emoji-category that functions as an end-of-sentence +DEFAULT_TARGET_EMOJI_CATEGORY = ('Smileys & Emotion', 'Symbols') + + +@dataclasses.dataclass +class EmojiText(object): + start_index: int + end_index: int + category: typing.List[typing.Optional[str]] + + def get_span(self) -> int: + return self.end_index - self.start_index + + def check_emoji_category(self, category: typing.Iterable[str]) -> bool: + """Be True if the given category-name exists else False.""" + if len(set(category) & set(self.category)) > 0: + return True + else: + return False + + +class EmojiAnnotator(Annotator): + def __init__(self, default_target_emoji_category: typing.Tuple[str, ...] = DEFAULT_TARGET_EMOJI_CATEGORY): + super().__init__(rule_name=self.__class__.__name__) + self.default_target_emoji_category = default_target_emoji_category + + @staticmethod + def get_emoji_info(emoji_character: str) -> typing.Optional[str]: + """Get emoji info. return a name of a category.""" + info = emojis.db.get_emoji_by_code(emoji_character) + try: + if info is None: + info = emojis.db.get_emoji_by_code(f'{emoji_character}\ufe0f') + return info.category + else: + return info.category + except AttributeError: + return None + + def __find_emoji(self, text: str) -> typing.List[EmojiText]: + """:return: spans of emoji index. 
[[start-index, end-index]].""" + __i: int = 0 + emoji_spans = [] + emoji_categories = [] + is_emoji_span: bool = False + span_emoji_start: int = 0 + for __i in range(len(text)): + __emoji: str = text[__i] + if __emoji in EMOJI_UNICODE_ENGLISH: + if is_emoji_span and __i + 1 <= len(text) and text[__i + 1] not in EMOJI_UNICODE_ENGLISH: + # 絵文字の範囲終了 + is_emoji_span = False + emoji_categories.append(self.get_emoji_info(text[__i])) + emoji_spans.append(EmojiText(span_emoji_start, __i + 1, emoji_categories)) + emoji_categories = [] + elif is_emoji_span is False and __i + 2 <= len(text) and text[__i + 1] not in EMOJI_UNICODE_ENGLISH: + # 絵文字が1文字で終了の場合 + emoji_spans.append(EmojiText(__i - 1, __i + 1, [self.get_emoji_info(text[__i])])) + elif is_emoji_span is False and __i + 1 == len(text): + # テキストの末尾が絵文字の場合 + emoji_spans.append(EmojiText(__i - 1, __i + 1, [self.get_emoji_info(text[__i])])) + elif is_emoji_span is False: + # 絵文字範囲のスタート + is_emoji_span = True + span_emoji_start = __i + emoji_categories.append(self.get_emoji_info(text[__i])) + elif is_emoji_span is True: + # 絵文字範囲の途中 + emoji_categories.append(self.get_emoji_info(text[__i])) + return emoji_spans + + def annotate(self, + original_text: str, + spans: Annotations, + emoji_threshold: int = 1) -> Annotations: + __emoji_spans = self.__find_emoji(original_text) + target_emoji = [e_span for e_span in __emoji_spans if e_span.get_span() >= emoji_threshold] + target_emoji = [e for e in target_emoji if e.check_emoji_category(self.default_target_emoji_category)] + + __return = [SpanAnnotation(rule_name=self.rule_name, + start_index=emoji.start_index, + end_index=emoji.end_index, + split_string_type=EmojiAnnotator.__name__, + split_string_value=original_text[emoji.start_index:emoji.end_index], + args={'emoji': emoji}) + for emoji in target_emoji] + spans = self.add_forward_rule(__return, spans) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/emotion_expression_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/emotion_expression_annotator.py new file mode 100644 index 0000000..328f976 --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/emotion_expression_annotator.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +import re + +from bunkai.algorithm.bunkai_sbd.annotator import constant +from bunkai.base.annotation import Annotations, SpanAnnotation +from bunkai.base.annotator import Annotator + +PARENT_EMOTION = '|'.join(constant.EMOTION_CHARACTERS + constant.EMOTION_EXPRESSIONS) +RE_PARENT_EMOTION = re.compile(r'[(\(](' + PARENT_EMOTION + r')[\))]' + + '|' + f'[{constant.EMOTION_SYMBOLS}]+[{constant.PUNCTUATIONS}]?') + + +class EmotionExpressionAnnotator(Annotator): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + + def annotate(self, + original_text: str, + spans: Annotations) -> Annotations: + reg_points = RE_PARENT_EMOTION.finditer(original_text) + + __return = [SpanAnnotation(rule_name=self.rule_name, + start_index=r_obj.regs[0][0], + end_index=r_obj.regs[0][1], + split_string_type=EmotionExpressionAnnotator.__name__, + split_string_value=original_text[r_obj.regs[0][0]:r_obj.regs[0][1]]) + for r_obj in reg_points] + spans = self.add_forward_rule(__return, spans) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/facemark_detector.py b/bunkai/algorithm/bunkai_sbd/annotator/facemark_detector.py new file mode 100644 index 0000000..ef6df3e --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/facemark_detector.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +import re +from typing 
import List, Optional + +from bunkai.algorithm.bunkai_sbd.annotator import constant +from bunkai.base.annotation import SpanAnnotation +from bunkai.base.annotator import Annotations, Annotator + +RE_FACEMARK = re.compile(constant.FACE_EXPRESSION_REGEXP) + + +class FaceMarkDetector(Annotator): + def __init__(self, *, path_model: Optional[str] = None): + super().__init__(FaceMarkDetector.__name__) + + @staticmethod + def __find_facemark(text: str) -> List[SpanAnnotation]: + """""" + __spans = [] + for match_obj in RE_FACEMARK.finditer(text): + ann = SpanAnnotation( + rule_name=FaceMarkDetector.__name__, + start_index=match_obj.regs[0][0], + end_index=match_obj.regs[0][1], + split_string_type='facemark', + split_string_value=text[match_obj.regs[0][0]: match_obj.regs[0][1]]) + __spans.append(ann) + return __spans + + def annotate(self, original_text: str, + spans: Annotations) -> Annotations: + span_ann = self.__find_facemark(original_text) + spans = self.add_forward_rule(span_ann, spans) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/indirect_quote_exception_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/indirect_quote_exception_annotator.py new file mode 100644 index 0000000..65f2d6a --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/indirect_quote_exception_annotator.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +import dataclasses +from typing import Dict, List, Tuple + +from bunkai.algorithm.bunkai_sbd.annotator.basic_annotator import BasicRule +from bunkai.algorithm.bunkai_sbd.annotator.constant import LAYER_NAME_FIRST +from bunkai.algorithm.bunkai_sbd.annotator.emoji_annotator import \ + EmojiAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.emotion_expression_annotator import \ + EmotionExpressionAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.facemark_detector import \ + FaceMarkDetector +from bunkai.algorithm.bunkai_sbd.annotator.linebreak_annotator import \ + LinebreakAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.morph_annotator import \ + MorphAnnotatorJanome +from bunkai.base.annotation import Annotations, SpanAnnotation, TokenResult +from bunkai.base.annotator import AnnotationFilter + +DEFAULT_RULE_TARGET = (LAYER_NAME_FIRST, + BasicRule.__name__, + LinebreakAnnotator.__name__, + EmojiAnnotator.__name__, EmotionExpressionAnnotator.__name__, + FaceMarkDetector.__name__) + + +@dataclasses.dataclass +class RuleObject(object): + size_n: int + rule_target_morpheme: str + rule_word_surface: List[str] + is_valid: bool = True + + def is_rule_valid(self, + sb_candidate_morpheme: str, + current_target_index: int, + index2token_obj: Dict[int, TokenResult] + ): + if self.rule_target_morpheme != '*' and self.rule_target_morpheme != sb_candidate_morpheme: + return False + + range_check = current_target_index + self.size_n + i_rule_morpheme: int = 0 + for __check in range(current_target_index, range_check): + if self.rule_word_surface[i_rule_morpheme] != index2token_obj[__check].word_surface: + return False + i_rule_morpheme += 1 + return True + + +MORPHEMES_AFTER_CANDIDATE = [ + RuleObject(size_n=1, rule_target_morpheme='*', rule_word_surface=['て']), + RuleObject(size_n=1, rule_target_morpheme='*', rule_word_surface=['の']), + RuleObject(size_n=1, rule_target_morpheme='*', rule_word_surface=['と']), + RuleObject(size_n=1, rule_target_morpheme='*', rule_word_surface=['って']), + RuleObject(size_n=1, rule_target_morpheme='*', rule_word_surface=['という']), + RuleObject(size_n=1, rule_target_morpheme='*', rule_word_surface=['に']), + RuleObject(size_n=1, 
rule_target_morpheme='*', rule_word_surface=['など']), + RuleObject(size_n=2, rule_target_morpheme='*', rule_word_surface=['くらい', 'の']), + RuleObject(size_n=2, rule_target_morpheme='*', rule_word_surface=['くらい', 'です']), + RuleObject(size_n=2, rule_target_morpheme='*', rule_word_surface=['くらい', 'でし']), + RuleObject(size_n=2, rule_target_morpheme='*', rule_word_surface=['も', 'あり']), + RuleObject(size_n=2, rule_target_morpheme='*', rule_word_surface=['ほど', 'でし']), +] + + +class IndirectQuoteExceptionAnnotator(AnnotationFilter): + def __init__(self, rule_targets: Tuple[str, ...] = DEFAULT_RULE_TARGET): + super().__init__(rule_name=self.__class__.__name__) + self.rule_targets = rule_targets + + @staticmethod + def is_exception_particle(original_text: str, + start_index: int, + end_index: int, + index2token_obj: Dict[int, TokenResult]) -> bool: + """ + 形態素解析の結果、基本分割文字列の後ろが助詞だった場合は 分割を行わない. + + ただし、助詞のすべてのケースで分割するわけではない。ルールを参照して、分割可否を決定する. + + 例: 合宿免許? の若者さん達でしょうか / スタッフ? と話し込み. + + :return: True: not SB False: SB. + """ + __next_end_index = end_index + # __next_end_indexが最後の文字の場合 + if __next_end_index == len(original_text): + return False + # 特殊なルール end_indexの次の文字が改行記号である場合: 改行記号の後の形態素を判定基準にする。 + if original_text[__next_end_index] == '\n': + while original_text[__next_end_index] == '\n' and __next_end_index + 1 < len(original_text): + __next_end_index += 1 + + if __next_end_index not in index2token_obj: + return False + else: + if any([rule_object.is_rule_valid(sb_candidate_morpheme=original_text[start_index:end_index], + current_target_index=__next_end_index, + index2token_obj=index2token_obj) + for rule_object in MORPHEMES_AFTER_CANDIDATE]): + return True + else: + return False + + def __generate(self, anns: List[SpanAnnotation]) -> Dict[int, TokenResult]: + index2tokens = {} + __start_index = 0 + __tokenizer_anns = [ + ann for ann in anns if ann.rule_name == MorphAnnotatorJanome.__name__] + __processed = [] + for ann in __tokenizer_anns: + t_obj = ann.args['token'] # type: ignore + if t_obj in __processed: + continue + __length = len(t_obj.word_surface) + for __i in range(__start_index, __start_index + __length): + index2tokens[__i] = t_obj + __start_index += __length + __processed.append(t_obj) + else: + return index2tokens + + def annotate(self, original_text: str, + spans: Annotations) -> Annotations: + index2token_obj = self.__generate(list( + spans.get_annotation_layer(MorphAnnotatorJanome.__name__))) + + __return_span_ann = [] + for target_rule_name in self.rule_targets: + if target_rule_name == LinebreakAnnotator.__name__ and target_rule_name not in spans.name2order: + continue + for __s in spans.get_annotation_layer(target_rule_name): + if self.is_exception_particle(original_text, + __s.start_index, + __s.end_index, + index2token_obj=index2token_obj): + continue + else: + __return_span_ann.append(__s) + + spans.add_annotation_layer(self.rule_name, self.unify_span_annotations(__return_span_ann)) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/linebreak_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/linebreak_annotator.py new file mode 100644 index 0000000..7448f57 --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/linebreak_annotator.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +import pathlib +import typing + +from bunkai.algorithm.bunkai_sbd.annotator.morph_annotator import \ + MorphAnnotatorJanome +from bunkai.algorithm.lbd.predict import Predictor +from bunkai.base.annotation import SpanAnnotation +from bunkai.base.annotator import Annotations, 
Annotator +from bunkai.constant import METACHAR_LINE_BREAK + + +class LinebreakAnnotator(Annotator): + def __init__(self, *, path_model: str): + super().__init__(LinebreakAnnotator.__name__) + self.linebreak_detector = Predictor(modelpath=pathlib.Path(path_model)) + + @staticmethod + def generate_sentence_structure(annotation_object: Annotations, + *, + splitter: str = '\n', + attribute_name: str = 'MorphAnnotatorJanome' + ) -> typing.List[str]: + input_tokens: typing.List[str] = \ + [span_obj.args['token'].word_surface for span_obj in annotation_object.get_annotation_layer(attribute_name) + if span_obj.args is not None] + sentence_tokens: typing.List[str] = [] + __sentence: str + for tokens in input_tokens: + if tokens == splitter: + sentence_tokens.append(METACHAR_LINE_BREAK) + else: + sentence_tokens.append(tokens) + + return sentence_tokens + + @staticmethod + def merge_preceding_eos(text: str, spans: typing.List[SpanAnnotation]) -> typing.List[SpanAnnotation]: + # TODO: Make two annotators? + current_index: int = 0 + processed_spans = [] + sorted_spans = list(sorted(spans, key=lambda s: s.start_index)) + while True: + if current_index + 1 >= len(sorted_spans): + break + span_ann = sorted_spans[current_index] + if span_ann.end_index == sorted_spans[current_index + 1].start_index and \ + sorted_spans[current_index + 1].rule_name == LinebreakAnnotator.__name__: + processed_spans.append(SpanAnnotation( + rule_name=LinebreakAnnotator.__name__, + start_index=span_ann.start_index, + end_index=sorted_spans[current_index + 1].end_index, + split_string_type='linebreak', + split_string_value=text[span_ann.start_index: sorted_spans[current_index + 1].end_index], + )) + current_index += 2 + else: + processed_spans.append(span_ann) + current_index += 1 + return processed_spans + + def annotate(self, + original_text: str, + spans: Annotations) -> Annotations: + """Tokenize済み結果をデータ加工する。Predictorが求める形式にする.""" + sub_texts = self.generate_sentence_structure(spans) + # tokenizerを更新する。すでにTokenize済みの結果を利用する。 + # self.linebreak_detector.reset_tokenizer(word_tokenizer_type='pre_tokenize', sentence2tokens=sentence2tokens) + __result = list(self.linebreak_detector.predict([sub_texts])) + + new_spans = spans.get_final_layer() + morpheme_sequence = list(spans.get_annotation_layer(MorphAnnotatorJanome.__name__)) + if len(__result) > 0: + # result: typing.List[TokenIndex] = __result[0] # type: ignore + for result in __result: + for predicted_index in result: + char_index_start = morpheme_sequence[predicted_index].start_index + char_index_end = morpheme_sequence[predicted_index].end_index + ann = SpanAnnotation( + rule_name=LinebreakAnnotator.__name__, + start_index=char_index_start, + end_index=char_index_end, + split_string_type='linebreak', + split_string_value=original_text[char_index_start: char_index_end]) + new_spans.append(ann) + merged_spans = self.merge_preceding_eos(original_text, new_spans) + spans.add_annotation_layer(self.rule_name, merged_spans) + else: + spans.add_annotation_layer(self.rule_name, new_spans) + + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/linebreak_force_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/linebreak_force_annotator.py new file mode 100644 index 0000000..c760fcd --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/linebreak_force_annotator.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +import re + +from bunkai.base.annotation import Annotations, SpanAnnotation +from bunkai.base.annotator import Annotator + +RE_LBS = 
re.compile(r'[\n\s]*\n[\n\s]*') + + +class LinebreakForceAnnotator(Annotator): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + + def annotate(self, original_text: str, spans: Annotations) -> Annotations: + s2regs = {} + for r_obj in RE_LBS.finditer(original_text): + s: int = r_obj.regs[0][0] + s2regs[s] = r_obj.regs[0] + + __return_span_ann = [] + + def _add(ro): + __return_span_ann.append(SpanAnnotation( + rule_name=self.rule_name, + start_index=s, + end_index=ro[1], + split_string_type='linebreak', + split_string_value=original_text[ro[0]:ro[1]] + )) + + for __s in spans.get_final_layer(): + ro = s2regs.get(__s.end_index) + if ro is None: + __return_span_ann.append(__s) + else: + _add(ro) + del s2regs[__s.end_index] + + for ro in s2regs.values(): + _add(ro) + + spans.add_annotation_layer(self.rule_name, __return_span_ann) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/morph_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/morph_annotator.py new file mode 100644 index 0000000..cdcdbce --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/morph_annotator.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +from typing import List + +from janome.tokenizer import Tokenizer + +from bunkai.base.annotation import Annotations, SpanAnnotation, TokenResult +from bunkai.base.annotator import Annotator + + +class MorphAnnotatorJanome(Annotator): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + self.tokenizer = Tokenizer() + + def __generate(self, text: str) -> List[SpanAnnotation]: + tokenizer_result = self.tokenizer.tokenize(text) + span_ann = [] + __start_index = 0 + for t_obj in tokenizer_result: + __pos = t_obj.part_of_speech.split(',') + __length = len(t_obj.surface) + token = TokenResult( + node_obj=t_obj, + tuple_pos=__pos, + word_stem=t_obj.base_form, + word_surface=t_obj.surface) + span_ann.append(SpanAnnotation( + rule_name=self.rule_name, + start_index=__start_index, + end_index=__start_index + __length, + split_string_type='janome', + split_string_value='token', + args={'token': token})) + __start_index += __length + else: + if __start_index < len(text) and text[__start_index: len(text)] == '\n': + # 末尾が改行のケースで改行記号を手動で追加する。 + token = TokenResult( + node_obj=None, + tuple_pos=('記号', '空白', '*', '*'), + word_stem='\n', + word_surface='\n') + span_ann.append(SpanAnnotation( + rule_name=self.rule_name, + start_index=__start_index, + end_index=len(text), + split_string_type='janome', + split_string_value='token', + args={'token': token})) + + return span_ann + + def annotate(self, original_text: str, spans: Annotations) -> Annotations: + anns = self.__generate(original_text) + spans.add_annotation_layer(self.rule_name, anns + list(spans.flatten())) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/annotator/number_exception_annotator.py b/bunkai/algorithm/bunkai_sbd/annotator/number_exception_annotator.py new file mode 100644 index 0000000..924f52f --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/annotator/number_exception_annotator.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +import re + +from bunkai.algorithm.bunkai_sbd.annotator import constant +from bunkai.base.annotation import Annotations +from bunkai.base.annotator import AnnotationFilter + +RE_NUMBER_WORD = re.compile(constant.NUMBER_WORD_REGEXP) + + +class NumberExceptionAnnotator(AnnotationFilter): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + + @staticmethod + def is_exception_no(original_text: str, start_index: int, 
end_index: int) -> bool: + """ + .の前にNoがあり、かつ後ろが数字で合った場合には分割を行わない. + + 例: おすすめ度No.1 / ROOM No.411. + """ + if original_text[start_index:end_index] != '.' and original_text[start_index:end_index] != '.': + return False + + if RE_NUMBER_WORD.match(original_text[start_index - 2:start_index]) and \ + re.match(r'\d', original_text[end_index]): + return True + return False + + def annotate(self, original_text: str, + spans: Annotations) -> Annotations: + __return_span_ann = [] + for __s in spans.get_final_layer(): + if self.is_exception_no(original_text, __s.start_index, __s.end_index): + continue + else: + __return_span_ann.append(__s) + spans.add_annotation_layer(self.rule_name, __return_span_ann) + return spans diff --git a/bunkai/algorithm/bunkai_sbd/bunkai_sbd.py b/bunkai/algorithm/bunkai_sbd/bunkai_sbd.py new file mode 100644 index 0000000..24cd39e --- /dev/null +++ b/bunkai/algorithm/bunkai_sbd/bunkai_sbd.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +from typing import Iterator, List, Optional + +from bunkai.algorithm.bunkai_sbd.annotator import ( + BasicRule, DotExceptionAnnotator, EmojiAnnotator, + EmotionExpressionAnnotator, FaceMarkDetector, + IndirectQuoteExceptionAnnotator, LinebreakAnnotator, + LinebreakForceAnnotator, MorphAnnotatorJanome, NumberExceptionAnnotator) +from bunkai.algorithm.bunkai_sbd.annotator.constant import LAYER_NAME_FIRST +from bunkai.base.annotation import Annotations, SpanAnnotation +from bunkai.base.annotator import (AnnotatorPipeline, RuleOrderException, + SentenceBoundaryDisambiguator) + + +class BunkaiPipeline(AnnotatorPipeline): + def check(self) -> bool: + """ + Check following rule-order. + + 1. MorphAnnotatorJanome must be before ExceptionParticle + + 2. LinebreakFilter must be in the last position + + 3. Facemark detector should be at first position + + """ + class_name2order = {process.rule_name: __i for __i, process in enumerate(self.pipeline)} + if class_name2order[BasicRule.__name__] < class_name2order[FaceMarkDetector.__name__]: + raise RuleOrderException(f'{FaceMarkDetector.__name__} should be at first position.') + if class_name2order[BasicRule.__name__] < class_name2order[EmotionExpressionAnnotator.__name__]: + raise RuleOrderException(f'{EmotionExpressionAnnotator.__name__} should be before {BasicRule.__name__}.') + if class_name2order[MorphAnnotatorJanome.__name__] > class_name2order[IndirectQuoteExceptionAnnotator.__name__]: + raise RuleOrderException(f'{MorphAnnotatorJanome.__name__} must be' + f' before {IndirectQuoteExceptionAnnotator.__name__}') + if LinebreakAnnotator.__name__ in class_name2order: + if class_name2order[MorphAnnotatorJanome.__name__] > class_name2order[LinebreakAnnotator.__name__]: + raise RuleOrderException(f'{MorphAnnotatorJanome.__name__} must be' + f' before {LinebreakAnnotator.__name__}') + if class_name2order[LinebreakAnnotator.__name__] > class_name2order[IndirectQuoteExceptionAnnotator.__name__]: + raise RuleOrderException(f'{LinebreakAnnotator.__name__} must be' + f' before {IndirectQuoteExceptionAnnotator.__name__}') + return True + + +class BunkaiSentenceBoundaryDisambiguation(SentenceBoundaryDisambiguator): + def __init__(self, *, path_model: Optional[str]): + morph_annotator = MorphAnnotatorJanome() + + _annotators = [ + FaceMarkDetector(), + EmotionExpressionAnnotator(), + EmojiAnnotator(), + BasicRule(), + morph_annotator, + IndirectQuoteExceptionAnnotator(), + DotExceptionAnnotator(), + NumberExceptionAnnotator(), + ] + + if path_model is None: + _annotators.append(LinebreakForceAnnotator()) + else: + 
_idxs = [i for i, ann in enumerate(_annotators) if ann.rule_name == MorphAnnotatorJanome.__name__] + assert len(_idxs) > 0, f'{MorphAnnotatorJanome.__name__} does not exist in a pipeline' + _annotators.insert(_idxs[0] + 1, LinebreakAnnotator(path_model=path_model)) + + self.pipeline = BunkaiPipeline(_annotators) + super().__init__() + + def _eos(self, text: str) -> Annotations: + annotations = Annotations() + annotations.add_annotation_layer(LAYER_NAME_FIRST, [SpanAnnotation(rule_name=LAYER_NAME_FIRST, + start_index=len(text) - 1, + end_index=len(text), + split_string_type=None, + split_string_value=None)]) + for rule_obj in self.pipeline: + rule_obj.annotate(text, annotations) + return annotations + + def find_eos(self, text: str) -> List[int]: + annotations = self._eos(text) + end_index = list(sorted( + list(set([s_a.end_index for s_a in annotations.get_final_layer()])))) + return end_index + + def __call__(self, text: str) -> Iterator[str]: + annotations = self._eos(text) + end_index = sorted( + list(set([s_a.end_index for s_a in annotations.get_final_layer()]))) + __start_index = 0 + __end_index = 0 + for e_i in end_index: + part_sentences = text[__start_index:e_i] + __start_index = e_i + __end_index = e_i + yield part_sentences + + if __end_index < len(text): + part_sentences = text[__end_index:] + yield part_sentences diff --git a/bunkai/algorithm/lbd/__init__.py b/bunkai/algorithm/lbd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/algorithm/lbd/corpus.py b/bunkai/algorithm/lbd/corpus.py new file mode 100644 index 0000000..5bc0e11 --- /dev/null +++ b/bunkai/algorithm/lbd/corpus.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 + +import argparse +import random +import re +import sys +import typing +from pathlib import Path + +from bunkai.algorithm.lbd.custom_tokenizers import JanomeTokenizer +from bunkai.base.annotation import Tokens +from bunkai.constant import METACHAR_LINE_BREAK, METACHAR_SENTENCE_BOUNDARY + +REGEXP_LB_SPAN: str = f'[\\s{METACHAR_SENTENCE_BOUNDARY}{METACHAR_LINE_BREAK}]*' + \ + f'{METACHAR_LINE_BREAK}[\\s{METACHAR_LINE_BREAK}{METACHAR_SENTENCE_BOUNDARY}]*' +RE_LB_SPAN = re.compile(REGEXP_LB_SPAN) + +LABEL_OTHER: str = 'O' +LABEL_SEP: str = 'LB_SEP' +LABEL_NSEP: str = 'LB_NS' +LABELS: typing.List[str] = [LABEL_OTHER, LABEL_SEP, LABEL_NSEP] + + +def annotation2spans(sentence: str) -> Tokens: + """ + Cut a sentence into list of sentence. + + Cation: Return value is Tokens, but this is not actual token. Set of text-segments. 
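+    Text segments are labelled LABEL_OTHER; line-break segments are labelled
+    LABEL_SEP when they carry a sentence boundary and LABEL_NSEP otherwise.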
+ """ + prev: int = 0 + tokens = Tokens() + for match in RE_LB_SPAN.finditer(sentence): + myspan = sentence[prev:match.start()].replace(METACHAR_SENTENCE_BOUNDARY, '') + if len(myspan) > 0: + tokens.spans.append(myspan) + tokens.labels.append(LABEL_OTHER) + + myspan = match.group() + prev = match.end() + tokens.spans.append(myspan.replace(METACHAR_SENTENCE_BOUNDARY, '')) + if METACHAR_SENTENCE_BOUNDARY in myspan: + tokens.labels.append(LABEL_SEP) + else: + tokens.labels.append(LABEL_NSEP) + + lastspan = sentence[prev:] + if len(lastspan) != 0: + tokens.spans.append(lastspan.replace(METACHAR_SENTENCE_BOUNDARY, '')) + tokens.labels.append(LABEL_OTHER) + + assert sentence.replace(METACHAR_SENTENCE_BOUNDARY, '') == ''.join(tokens.spans) + return tokens + + +def _fix_tokens(mytxt: str, ts: typing.List[str]) -> typing.List[str]: + # restore blanks erased by the tokeniser + ret: typing.List[str] = [] + index: int = 0 + prev: int = 0 + for _token in ts: + index += mytxt[index:].index(_token) + if index != prev: + ret.append(mytxt[prev:index]) + ret.append(_token) + index += len(_token) + prev = index + lasttk = mytxt[index:] + if len(lasttk) > 0: + ret.append(lasttk) + return ret + + +def spans2tokens(tokenizer: JanomeTokenizer, tokens: Tokens) -> Tokens: + assert len(tokens.spans) == len(tokens.labels) + new_tokens = Tokens() + for mytxt, label in zip(tokens.spans, tokens.labels): + if label == LABEL_OTHER: + ts = tokenizer.tokenize(mytxt) + ts = _fix_tokens(mytxt, ts) + new_tokens.spans += ts + new_tokens.labels += [LABEL_OTHER] * len(ts) + else: # Line break + new_tokens.spans.append(mytxt) + new_tokens.labels.append(label) + + return new_tokens + + +def convert(inpath: typing.IO, + tokenizer: JanomeTokenizer, + remove_trailing_lb: bool = True,) -> typing.Iterator[Tokens]: + with inpath as inf: + for lid, line in enumerate(inf): + sentence: str = line[:-1] + if METACHAR_LINE_BREAK not in sentence: + continue + + tokens = annotation2spans(sentence) + if remove_trailing_lb and tokens.labels[-1] != LABEL_OTHER: + tokens.spans = tokens.spans[:-1] + tokens.labels = tokens.labels[:-1] + if len(tokens.labels) == 1: + continue + new_tokens = spans2tokens(tokenizer, tokens) + new_tokens.meta['lid'] = lid + assert len(new_tokens.labels) == len(new_tokens.spans) + assert sentence.replace(METACHAR_SENTENCE_BOUNDARY, '').startswith(''.join(tokens.spans)) + yield new_tokens + + +def corpus_separate( + lines: typing.List[str], + ratio_train: float, + ratio_dev: float) -> typing.Dict[str, typing.List[str]]: + rets: typing.Dict[str, typing.List[str]] = {} + + index_train = int(len(lines) * ratio_train) + seq_train_set = lines[:index_train] + seq_dev = seq_train_set[:int(len(seq_train_set) * ratio_dev)] + seq_train = seq_train_set[int(len(seq_train_set) * ratio_dev):] + seq_test = lines[index_train:] + + rets['train'] = seq_train + rets['test'] = seq_test + rets['dev'] = seq_dev + return rets + + +def pseudo_linebreak(text: str, rnd: random.Random, *, ratio_lb: float = 0.5) -> str: + rets = [] + text = re.sub(r'。+', METACHAR_LINE_BREAK + METACHAR_SENTENCE_BOUNDARY, text) + for char in text: + if char == '、' and rnd.random() < ratio_lb: + rets.append(METACHAR_LINE_BREAK) + else: + rets.append(char) + return ''.join(rets) + + +def get_opts() -> argparse.Namespace: + oparser = argparse.ArgumentParser() + oparser.add_argument("--input", "-i", type=argparse.FileType("r"), default=sys.stdin) + oparser.add_argument("--output", "-o", type=Path, required=True) + oparser.add_argument("--split", action="store_true") + 
oparser.add_argument("--pseudo", action="store_true") + oparser.add_argument("--train", type=float, default=0.8) + oparser.add_argument("--dev", type=float, default=0.0) + oparser.add_argument("--seed", type=int, default=12345) + oparser.add_argument("--base", default="cl-tohoku/bert-base-japanese-whole-word-masking") + return oparser.parse_args() + + +def main() -> None: + opts = get_opts() + + if opts.split: + lines = opts.input.readlines() + random.Random(opts.seed).shuffle(lines) + name2lines = corpus_separate(lines, ratio_train=opts.train, ratio_dev=opts.dev) + opts.output.mkdir(exist_ok=True, parents=True) + + ppmap = {LABEL_SEP: METACHAR_SENTENCE_BOUNDARY} + for name, lines in name2lines.items(): + opath = opts.output.joinpath(f'{name}.jsonl') + otxtpath = opts.output.joinpath(f'{name}.txt') + with opath.open('w') as f,\ + otxtpath.open('w') as tf: + for line in lines: + f.write(line) + ts = Tokens.from_json(line) + tf.write(f'{ts.pretty(ppmap)}\n') + elif opts.pseudo: + rnd = random.Random(opts.seed) + with opts.output.open('w') as outf: + for line in opts.input: + line = line[:-1].replace(METACHAR_SENTENCE_BOUNDARY, '').replace(METACHAR_LINE_BREAK, '') + line2 = pseudo_linebreak(line, rnd) + outf.write(f'{line2}\n') + else: + opts.output.parent.mkdir(exist_ok=True, parents=True) + tokenizer = JanomeTokenizer(normalize_text=False) + with opts.output.open('w') as f: + for tokens in convert(inpath=opts.input, tokenizer=tokenizer): + f.write(f'{tokens.to_json(ensure_ascii=False, sort_keys=True)}\n') + + +if __name__ == '__main__': + main() diff --git a/bunkai/algorithm/lbd/custom_tokenizers.py b/bunkai/algorithm/lbd/custom_tokenizers.py new file mode 100644 index 0000000..86cafe4 --- /dev/null +++ b/bunkai/algorithm/lbd/custom_tokenizers.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 + +import collections +import logging +import os +import typing +import unicodedata + +from janome.tokenizer import Tokenizer +from transformers.file_utils import cached_path +from transformers.models.bert.tokenization_bert import (BertTokenizer, + WordpieceTokenizer, + load_vocab) + +import bunkai.constant + +"""MecabではなくJanomeに合わせて作成されたTokenizer +The original source code is from cl-tohoku/bert-japanese. +https://github.com/cl-tohoku/bert-japanese/blob/master/tokenization.py +The original source code is under Apache-2.0 License. 
+""" + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "cl-tohoku/bert-base-japanese": + "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese/vocab.txt", + "cl-tohoku/bert-base-japanese-whole-word-masking": + "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking/vocab.txt", + "cl-tohoku/bert-base-japanese-char": + "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char/vocab.txt", + "cl-tohoku/bert-base-japanese-char-whole-word-masking": + "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "cl-tohoku/bert-base-japanese": 512, + "cl-tohoku/bert-base-japanese-whole-word-masking": 512, + "cl-tohoku/bert-base-japanese-char": 512, + "cl-tohoku/bert-base-japanese-char-whole-word-masking": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "cl-tohoku/bert-base-japanese": { + "do_lower_case": False, + "word_tokenizer_type": "janome", + "subword_tokenizer_type": "wordpiece", + }, + "cl-tohoku/bert-base-japanese-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "janome", + "subword_tokenizer_type": "wordpiece", + }, + "cl-tohoku/bert-base-japanese-char": { + "do_lower_case": False, + "word_tokenizer_type": "janome", + "subword_tokenizer_type": "character", + }, + "cl-tohoku/bert-base-japanese-char-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "janome", + "subword_tokenizer_type": "character", + }, +} + + +class JanomeTokenizer(object): + """Runs basic tokenization with Janome morphological parser.""" + + def __init__(self, *, do_lower_case=False, never_split=None, normalize_text=True): + """ + Construct a JanomeTokenizer. + + :arg do_lower_case: (`optional`) boolean (default True) + Whether to lower case the input. + :arg never_split: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + :arg normalize_text: (`optional`) boolean (default True) + Whether to apply unicode normalization to text before tokenization. 
+ """ + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + self.janome_tokenizer = Tokenizer() + + def tokenize(self, text: str, *, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = self.janome_tokenizer.tokenize(text) + __tokens = [] + last_index = 0 + for t in tokens: + token = t.surface + token_start = text.index(token, last_index) + if last_index != token_start: + __tokens.append(text[last_index: token_start]) + + if self.do_lower_case and token not in never_split: + token = token.lower() + __tokens.append(token.lower()) + else: + __tokens.append(token) + last_index = token_start + len(token) + + if len(text) != last_index: + __tokens.append(text[last_index:]) + + assert text == ''.join(__tokens), f"[{text}] != [{''.join(__tokens)}]" + return __tokens + + +class CharacterTokenizer(object): + """Runs Character tokenziation.""" + + def __init__(self, vocab, unk_token, normalize_text=True): + self.vocab = vocab + self.unk_token = unk_token + self.normalize_text = normalize_text + + def tokenize(self, text): + """ + Tokenize a piece of text into characters. + + For example: + input = "apple" + output = ["a", "p", "p", "l", "e"] + :arg text: A single token or whitespace separated tokens. + This should have already been passed through `BasicTokenizer`. + :return: A list of characters. + """ + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + output_tokens = [] + for char in text: + if char not in self.vocab: + output_tokens.append(self.unk_token) + continue + + output_tokens.append(char) + + return output_tokens + + +class JanomeSubwordsTokenizer(BertTokenizer): + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + *, + subword_tokenizer_type="wordpiece", + do_subword_tokenize: bool = True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + """ + Construct a MecabBertTokenizer. + + :arg vocab_file: Path to a one-wordpiece-per-line vocabulary file. + :arg do_lower_case: (`optional`) boolean (default True) + Whether to lower case the input. + Only has an effect when do_basic_tokenize=True. + :arg do_word_tokenize: (`optional`) boolean (default True) Whether to do word tokenization. + :arg do_subword_tokenize: (`optional`) boolean (default True) Whether to do subword tokenization. + :arg word_tokenizer_type: (`optional`) string (default "basic") Type of word tokenizer. basic / janome / pre_tokenize + :arg subword_tokenizer_type: (`optional`) string (default "wordpiece") Type of subword tokenizer. + :arg cls_token: No description. + """ + super(BertTokenizer, self).__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + if vocab_file in PRETRAINED_VOCAB_FILES_MAP['vocab_file']: + self.vocab = load_vocab( + cached_path( + PRETRAINED_VOCAB_FILES_MAP['vocab_file'][vocab_file], + ) + ) + elif not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + else: + self.vocab = load_vocab(vocab_file) + + # add new vocab + self.add_tokens([' ', bunkai.constant.METACHAR_LINE_BREAK]) + + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + + self.do_word_tokenize = False + self.do_subword_tokenize = True + if do_subword_tokenize: + if subword_tokenizer_type == "wordpiece": + self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + elif subword_tokenizer_type == "character": + self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token) + else: + raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) + + self.janome_tokenizer = JanomeTokenizer() + + def tokenize(self, text: typing.Union[str, typing.List[str]]) -> typing.List[str]: + if isinstance(text, str): + morphemes = self.janome_tokenizer.tokenize(text) + elif isinstance(text, list) and all([isinstance(t, str) for t in text]): + morphemes = text + else: + raise Exception(f'Invalid input-type {text}') + + if self.do_subword_tokenize: + split_tokens = [] + for token in morphemes: + sts = [sub_token for sub_token in self.subword_tokenizer.tokenize(token)] + if len(sts) == 0: + split_tokens.append(token) + else: + split_tokens += sts + else: + split_tokens = morphemes + + return split_tokens diff --git a/bunkai/algorithm/lbd/predict.py b/bunkai/algorithm/lbd/predict.py new file mode 100644 index 0000000..2c30511 --- /dev/null +++ b/bunkai/algorithm/lbd/predict.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 + +import argparse +import sys +import typing +from pathlib import Path + +import numpy as np +import torch +from more_itertools import chunked +from transformers import AutoModelForTokenClassification + +import bunkai.constant +from bunkai.algorithm.lbd.corpus import (LABEL_OTHER, LABEL_SEP, + annotation2spans) +from bunkai.algorithm.lbd.custom_tokenizers import (VOCAB_FILES_NAMES, + JanomeSubwordsTokenizer, + JanomeTokenizer) +from bunkai.algorithm.lbd.train import BunkaiConfig, MyDataset +from bunkai.base.annotation import Tokens +from bunkai.third.utils_ner import InputExample, get_labels + +StringMorphemeInputType = typing.List[typing.List[str]] # (batch-size * variable-length of sentence * tokens) + + +class Predictor(object): + def __init__(self, modelpath: Path) -> None: + """ + Use JanomeTokenizer by default if the input is Document(String). + + If the input is Morpheme(String), Tokenizers are not called. + + :param subword_tokenizer_type: pre_tokenize, janome, basic + """ + self.model = AutoModelForTokenClassification.from_pretrained(str(modelpath)) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = self.model.to(self.device) + + self.labels = get_labels(str(modelpath.joinpath('labels.txt'))) + self.label_map: typing.Dict[int, str] = {i: label for i, label in enumerate(self.labels)} + + with modelpath.joinpath('bunkai.json').open() as bcf: + self.bc = BunkaiConfig.from_json(bcf.read()) + # use janome tokenizer or tokenizer based on a vocab-file. 
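+ # vocab.txt is expected next to the model weights; JanomeSubwordsTokenizer can
+ # also resolve the known cl-tohoku model names to their hosted vocab files.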
+ self.path_tokenizer_model: str = str(Path(modelpath).joinpath(VOCAB_FILES_NAMES['vocab_file'])) + self.tokenizer = JanomeSubwordsTokenizer(self.path_tokenizer_model) + + # hotfix + if self.model.base_model_prefix == 'distilbert' \ + and 'token_type_ids' in self.tokenizer.model_input_names: + self.tokenizer.model_input_names.remove('token_type_ids') + + def _split_long_text(self, tokens: typing.List[str])\ + -> typing.Tuple[typing.List[typing.List[str]], typing.List[typing.List[int]]]: + """ + Split documents(tokens) into sub-documents(tokens). + + This is because Bert has the maximum token-length of the input. + """ + # tokenized_spans_list is a temporary stack which holds subword-token and underbar. + # That is because underbar is replaced into UNK if underbar is put into a subword-tokeniser. + tokenized_spans_list: typing.List[typing.List[str]] = [] + tmp_stack: typing.List[str] = [] + for __t in tokens: + if __t == bunkai.constant.METACHAR_LINE_BREAK: + if len(tmp_stack) > 0: + tokenized_spans_list.append(tmp_stack) + tokenized_spans_list.append([bunkai.constant.METACHAR_LINE_BREAK]) + tmp_stack = [] + else: + # sub-word tokenize + tmp_stack.append(__t) + if len(tmp_stack) > 0: + tokenized_spans_list.append(tmp_stack) + + # run subword-tokeniser + processed_tokens: typing.List[typing.List[str]] = [[]] + processed_num_sws: typing.List[typing.List[int]] = [[]] + current_count: int = 0 + tokenized_span: typing.List[str] + for tokenized_span in tokenized_spans_list: + for word in tokenized_span: + subwords = self.tokenizer.tokenize(word) + num_subwords = len(subwords) + current_count += num_subwords + if current_count >= self.bc.max_seq_length: + processed_tokens.append([]) + processed_num_sws.append([]) + current_count = num_subwords + assert current_count < self.bc.max_seq_length + processed_tokens[-1].append(word) + processed_num_sws[-1].append(num_subwords) + return processed_tokens, processed_num_sws + + def predict(self, documents_morphemes: StringMorphemeInputType) -> typing.List[typing.Set[int]]: + """ + Run prediction on incoming inputs. Inputs are 2 dims array with [[sentence]]. + + :param spans_list: 2 dims list (batch-size * variable-length of sentence) or + [['ラウンジ', 'も', '気軽', 'に', '利用', 'でき', '、', '申し分', 'ない', 'です', '。', '▁', '']]. + """ + examples = [] + + # Note: gave up to separate this process in MyDataset because _split_long_text calls a tokenizer. + num_subwords = [] + for d_id, spans in enumerate(documents_morphemes): + sentences_within_length, _num_subwords = self._split_long_text(spans) + num_subwords += _num_subwords + for local_s_id, tokenized_spans in enumerate(sentences_within_length): + examples.append(InputExample(guid=f'{d_id}-{local_s_id}', + words=tokenized_spans, + labels=[LABEL_OTHER] * len(tokenized_spans), + is_document_first=bool(local_s_id == 0))) + ds = MyDataset(examples, self.labels, self.bc.max_seq_length, self.tokenizer, False) + + kwargs = { + 'input_ids': torch.tensor([f.input_ids for f in ds.features], device=self.device), + 'attention_mask': torch.tensor([f.attention_mask for f in ds.features], device=self.device), + } + if self.model.base_model_prefix == 'bert': # hotfix + kwargs['token_type_ids'] = torch.tensor([f.token_type_ids for f in ds.features], device=self.device) + + predictions = self.model(**kwargs).logits.to('cpu').detach().numpy() + + assert isinstance(predictions, np.ndarray), \ + f"Unexpected error. A value type of a model prediction is {type(predictions)}. 
expect = numpy.ndarray" + assert len(predictions.shape) == 3, \ + f"Unexpected error. A value tensor of a model prediction is {len(predictions.shape)} tensor. " \ + f"expect = 3rd tensor." + + out: typing.List[typing.Set[int]] = [] + word_idx_offset = 0 + for idx, (example, pred) in enumerate(zip(examples, predictions)): + sw_idx = 0 + if example.is_document_first: + word_idx_offset = 0 # reset + out.append(set()) + else: + word_idx_offset += len(examples[idx - 1].words) + + for word_idx, word in enumerate(example.words): + num_sw: int = num_subwords[idx][word_idx] + while num_sw > 0: + sw_idx += 1 + label_high_prob = int(np.argmax(pred[sw_idx])) + if self.label_map[label_high_prob] == LABEL_SEP: + out[-1].add(word_idx + word_idx_offset) + num_sw -= 1 + + return out + + +def get_opts() -> argparse.Namespace: + oparser = argparse.ArgumentParser() + oparser.add_argument("--input", "-i", type=argparse.FileType('r'), required=False, default=sys.stdin) + oparser.add_argument("--output", "-o", type=argparse.FileType('w'), required=False, default=sys.stdout) + oparser.add_argument("--model", "-m", type=Path, required=True) + oparser.add_argument("--batch", "-b", type=int, default=1, help='Number of documents to feed a batch') + return oparser.parse_args() + + +def generate_initial_annotation_obj(input_stream: typing.Iterator[str]) -> typing.Iterator[typing.List[str]]: + tokenizer = JanomeTokenizer() + for document in input_stream: + # document: a text = document + assert bunkai.constant.METACHAR_SENTENCE_BOUNDARY not in document + document_spans: Tokens = annotation2spans(document[:-1]) + document_tokens: typing.List[str] = [] + for fragment in document_spans.spans: + if bunkai.constant.METACHAR_LINE_BREAK in fragment: + document_tokens.append(fragment) + else: + tokens = tokenizer.tokenize(fragment) + document_tokens += tokens + + assert ''.join(document_tokens) == "".join(document_spans.spans) + yield document_tokens + + +def main() -> None: + + opts = get_opts() + pdt = Predictor(opts.model) + + with opts.input as inf, \ + opts.output as outf: + for one_batch in chunked(generate_initial_annotation_obj(inf), n=opts.batch): + for did, token_ids_seps in enumerate(pdt.predict(one_batch)): + for tid, token in enumerate(one_batch[did]): + outf.write(token) + if tid in token_ids_seps: + outf.write(bunkai.constant.METACHAR_SENTENCE_BOUNDARY) + else: + outf.write('\n') + + +if __name__ == '__main__': + main() diff --git a/bunkai/algorithm/lbd/train.py b/bunkai/algorithm/lbd/train.py new file mode 100644 index 0000000..49a439c --- /dev/null +++ b/bunkai/algorithm/lbd/train.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 + +import argparse +import dataclasses +import json +import sys +import typing +from pathlib import Path + +from dataclasses_json import DataClassJsonMixin +from torch import nn +from torch.utils.data.dataset import Dataset + +import bunkai.constant +import bunkai.third.run_ner +from bunkai.algorithm.lbd.corpus import LABEL_NSEP, LABEL_OTHER, LABEL_SEP +from bunkai.base.annotation import Tokens +from bunkai.third.utils_ner import InputFeatures, convert_examples_to_features + + +def convert(datapath: Path, data_dir: Path, name: str): + train_data_path = data_dir.joinpath(f'{name}.txt') + with datapath.open() as inf,\ + train_data_path.open('w') as outf: + outf.write('-DOCSTART-\n') + for line in inf: + tokens = Tokens.from_json(line) + for idx, span in enumerate(tokens.spans): + outf.write(f'{span} {tokens.labels[idx]}\n') + outf.write('\n') + + +@dataclasses.dataclass +class 
BunkaiConfig(DataClassJsonMixin): + max_seq_length: int + base_model: str + + +def prepare_config(train_path: Path, + dev_path: Path, + modelpath: Path, + max_seq_length: int, + base_model: str, + num_train_epochs: int) -> Path: + json_config = { + "data_dir": "", + "labels": "", + "output_dir": "", + "model_name_or_path": base_model, + "max_seq_length": max_seq_length, + "num_train_epochs": num_train_epochs, + "per_device_train_batch_size": 32, + "save_steps": 750, + "seed": 1, + "do_train": True, + "do_eval": False, + "do_predict": False, + "overwrite_output_dir": True, + "overwrite_cache": True + } + + modelpath.mkdir(exist_ok=True, parents=True) + path_config_json = modelpath.joinpath('config.json') + + data_dir = modelpath.joinpath('data') + data_dir.mkdir(exist_ok=True, parents=True) + + out_dir = modelpath.joinpath('out') + out_dir.mkdir(exist_ok=True, parents=True) + + label_path = out_dir.joinpath('labels.txt') + with out_dir.joinpath('bunkai.json').open('w') as f: + json.dump(BunkaiConfig( + base_model=base_model, + max_seq_length=max_seq_length, + ).to_dict(), f, indent=4, sort_keys=True) + + json_config['data_dir'] = str(data_dir.absolute()) + json_config['labels'] = str(label_path.absolute()) + json_config['output_dir'] = str(out_dir.absolute()) + + with path_config_json.open('w') as f: + json.dump(json_config, f, indent=4, sort_keys=True) + with label_path.open('w') as f: + f.write(f'{LABEL_OTHER}\n{LABEL_SEP}\n{LABEL_NSEP}\n') + + convert(train_path, data_dir, 'train') + if dev_path is not None: + convert(dev_path, data_dir, 'dev') + return path_config_json + + +def train(train_path: Path, dev_path: Path, modelpath: Path, max_seq_length: int, + base_model: str, num_train_epochs: int): + path_config_json = prepare_config(train_path, dev_path, modelpath, max_seq_length, + base_model, num_train_epochs) + sys.argv = ['', str(path_config_json)] + bunkai.third.run_ner.main() + + +class MyDataset(Dataset): + features: typing.List[InputFeatures] + pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] + + def __init__(self, + examples, + labels, + max_seq_length: int, + tokenizer, + is_xlnet: bool): + self.features = convert_examples_to_features( + examples, + labels, + max_seq_length, + tokenizer, + cls_token_at_end=is_xlnet, + # xlnet has a cls token at the end + cls_token=tokenizer.cls_token, + cls_token_segment_id=2 if is_xlnet else 0, + sep_token=tokenizer.sep_token, + pad_on_left=bool(tokenizer.padding_side == "left"), + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, + pad_token_label_id=self.pad_token_label_id, + ) + + +def get_opts() -> argparse.Namespace: + oparser = argparse.ArgumentParser() + oparser.add_argument("--input", "-i", type=Path, required=True) + oparser.add_argument("--dev", type=Path) + oparser.add_argument("--model", "-m", type=Path, required=True) + oparser.add_argument("--seq", default=320, type=int) + oparser.add_argument("--epoch", default=30, type=int) + oparser.add_argument("--base", default="bandainamco-mirai/distilbert-base-japanese", + choices=['cl-tohoku/bert-base-japanese-whole-word-masking', + 'bandainamco-mirai/distilbert-base-japanese'], + ) + return oparser.parse_args() + + +def main() -> None: + opts = get_opts() + + train(train_path=opts.input, + dev_path=opts.dev, + modelpath=opts.model, + max_seq_length=opts.seq, + base_model=opts.base, + num_train_epochs=opts.epoch) + + 
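+# Example invocation (illustrative; paths are placeholders):
+#   python -m bunkai.algorithm.lbd.train -i corpus/train.jsonl --dev corpus/dev.jsonl -m model_dir
+# prepare_config() lays out model_dir/config.json, model_dir/data/, and model_dir/out/
+# (the latter holding labels.txt and bunkai.json, which the predictor reads back).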
+if __name__ == '__main__': + main() diff --git a/bunkai/algorithm/tsunoda_sbd/__init__.py b/bunkai/algorithm/tsunoda_sbd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/algorithm/tsunoda_sbd/annotator/__init__.py b/bunkai/algorithm/tsunoda_sbd/annotator/__init__.py new file mode 100644 index 0000000..77ceace --- /dev/null +++ b/bunkai/algorithm/tsunoda_sbd/annotator/__init__.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +from bunkai.algorithm.tsunoda_sbd.annotator.basic_annotator import BasicRule +from bunkai.algorithm.tsunoda_sbd.annotator.exception_no import ExceptionNo +from bunkai.algorithm.tsunoda_sbd.annotator.exception_numeric import \ + ExceptionNumeric +from bunkai.algorithm.tsunoda_sbd.annotator.exception_particle import \ + ExceptionParticle +from bunkai.algorithm.tsunoda_sbd.annotator.morph_annotator_janome import \ + MorphAnnotatorJanome +from bunkai.algorithm.tsunoda_sbd.annotator.replace_parentheses import \ + ExceptionParentheses + +__all__ = ['BasicRule', 'ExceptionNo', 'ExceptionNumeric', + 'ExceptionParticle', 'MorphAnnotatorJanome', 'ExceptionParentheses'] diff --git a/bunkai/algorithm/tsunoda_sbd/annotator/basic_annotator.py b/bunkai/algorithm/tsunoda_sbd/annotator/basic_annotator.py new file mode 100644 index 0000000..74cd24a --- /dev/null +++ b/bunkai/algorithm/tsunoda_sbd/annotator/basic_annotator.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +import re + +from bunkai.algorithm.tsunoda_sbd.annotator import constant +from bunkai.base.annotation import Annotations, SpanAnnotation +from bunkai.base.annotator import Annotator + +RE_SENT_SPLIT = re.compile("[" + constant.PUNCTUATIONS + "]+|" + constant.SYMBOLS) + + +class BasicRule(Annotator): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + + def annotate(self, + original_text: str, + spans: Annotations) -> Annotations: + reg_points = RE_SENT_SPLIT.finditer(original_text) + + __return = [SpanAnnotation(rule_name=self.rule_name, + start_index=r_obj.regs[0][0], + end_index=r_obj.regs[0][1], + split_string_type="regular", + split_string_value=original_text[r_obj.regs[0][0]:r_obj.regs[0][1]]) + for r_obj in reg_points] + spans.add_annotation_layer(self.rule_name, __return) + return spans diff --git a/bunkai/algorithm/tsunoda_sbd/annotator/constant.py b/bunkai/algorithm/tsunoda_sbd/annotator/constant.py new file mode 100644 index 0000000..92b4d6c --- /dev/null +++ b/bunkai/algorithm/tsunoda_sbd/annotator/constant.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 + +import typing + +# \u2605: ★ +# \u2606: ☆ +# \u266a: ♪ +PUNCTUATIONS: str = "。!?.!?.\u2605\u2606\u266a\\**※" +SYMBOLS: str = r"[((\(]笑[)\))]|[((\(]泣[)\))]|[((\(]涙[)\))]" + +SPANS_PARENTHESES1_REGEXP: str = r'\((?!涙|笑|泣).+?(? bool: + """ + .の前にNoがあり、かつ後ろが数字で合った場合には分割を行わない. + + 例: おすすめ度No.1 / ROOM No.411 . + + """ + if original_text[start_index:end_index] != '.' 
and original_text[start_index:end_index] != '.': + return False + + if RE_NUMBER_WORD.match(original_text[start_index - 2:start_index]) and \ + re.match(r'\d', original_text[end_index]): + return True + return False + + def annotate(self, original_text: str, + spans: Annotations) -> Annotations: + __return_span_ann = [] + for __s in spans.get_final_layer(): + if self.is_exception_no(original_text, __s.start_index, __s.end_index): + continue + else: + __s.rule_name = self.rule_name + __return_span_ann.append(__s) + spans.add_annotation_layer(self.rule_name, __return_span_ann) + return spans diff --git a/bunkai/algorithm/tsunoda_sbd/annotator/exception_numeric.py b/bunkai/algorithm/tsunoda_sbd/annotator/exception_numeric.py new file mode 100644 index 0000000..df3c169 --- /dev/null +++ b/bunkai/algorithm/tsunoda_sbd/annotator/exception_numeric.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +import re + +from bunkai.base.annotation import Annotations +from bunkai.base.annotator import Annotator + + +class ExceptionNumeric(Annotator): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + + @staticmethod + def is_exception_numeric(original_text: str, start_index: int, end_index: int) -> bool: + """ + .の前後が数値であった場合は分割を行わない. + + 例: 和室3.5畳 / 1.5リットル以上のペットボトル. + """ + if original_text[start_index:end_index] != '.' and original_text[start_index:end_index] != '.': + return False + if re.match(r'\d', original_text[start_index - 1]) and re.match(r'\d', original_text[end_index]): + return True + return False + + def annotate(self, original_text: str, + spans: Annotations) -> Annotations: + __return_span_ann = [] + for __s in spans.get_final_layer(): + if self.is_exception_numeric(original_text, __s.start_index, __s.end_index): + continue + else: + __s.rule_name = self.rule_name + __return_span_ann.append(__s) + + spans.add_annotation_layer(self.rule_name, __return_span_ann) + return spans diff --git a/bunkai/algorithm/tsunoda_sbd/annotator/exception_particle.py b/bunkai/algorithm/tsunoda_sbd/annotator/exception_particle.py new file mode 100644 index 0000000..6bb5482 --- /dev/null +++ b/bunkai/algorithm/tsunoda_sbd/annotator/exception_particle.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +from typing import Dict, List, Type + +from bunkai.algorithm.tsunoda_sbd.annotator.basic_annotator import BasicRule +from bunkai.algorithm.tsunoda_sbd.annotator.morph_annotator_janome import \ + MorphAnnotatorJanome +from bunkai.base.annotation import Annotations, SpanAnnotation, TokenResult +from bunkai.base.annotator import Annotator + + +class ExceptionParticle(Annotator): + def __init__(self, morph_annotator_class: Type[MorphAnnotatorJanome]): + super().__init__(rule_name=self.__class__.__name__) + self.morph_annotator_class = morph_annotator_class + + @staticmethod + def is_exception_particle(original_text: str, start_index: int, end_index: int, + index2token_obj: Dict[int, TokenResult]) -> bool: + """ + 形態素解析の結果、基本分割文字列の後ろが助詞だった場合は 分割を行わない. + + 例: 合宿免許? の若者さん達でしょうか / スタッフ? と話し込み . 
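+ (That is, keep the candidate boundary only when the token immediately after it
+ is not tagged as a particle (助詞) by the morphological analyzer.)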
+ """ + __next_end_index = end_index + if __next_end_index not in index2token_obj: + return False + else: + token_obj = index2token_obj[__next_end_index] + if token_obj.tuple_pos[0] == '助詞': + return True + else: + return False + + def __generate(self, anns: List[SpanAnnotation]) -> Dict[int, TokenResult]: + index2tokens = {} + __start_index = 0 + __tokenizer_anns = [ + ann for ann in anns if ann.rule_name == self.morph_annotator_class.__name__] + __processed = [] + for ann in __tokenizer_anns: + t_obj = ann.args['token'] # type: ignore + if t_obj in __processed: + continue + __length = len(t_obj.word_surface) + for __i in range(__start_index, __start_index + __length): + index2tokens[__i] = t_obj + __start_index += __length + __processed.append(t_obj) + else: + return index2tokens + + def annotate(self, original_text: str, + spans: Annotations, + ) -> Annotations: + index2token_obj = self.__generate( + list(spans.get_annotation_layer(self.morph_annotator_class.__name__))) + + __return_span_ann = [] + for __s in spans.name2spans[BasicRule.__name__]: + if self.is_exception_particle(original_text, + __s.start_index, + __s.end_index, + index2token_obj=index2token_obj): + continue + else: + __s.rule_name = self.rule_name + __return_span_ann.append(__s) + + spans.add_annotation_layer(self.rule_name, __return_span_ann) + return spans diff --git a/bunkai/algorithm/tsunoda_sbd/annotator/morph_annotator_janome.py b/bunkai/algorithm/tsunoda_sbd/annotator/morph_annotator_janome.py new file mode 100644 index 0000000..dcf2965 --- /dev/null +++ b/bunkai/algorithm/tsunoda_sbd/annotator/morph_annotator_janome.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +from typing import List + +from janome.tokenizer import Tokenizer + +from bunkai.base.annotation import Annotations, SpanAnnotation, TokenResult +from bunkai.base.annotator import Annotator + + +class MorphAnnotatorJanome(Annotator): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + self.tokenizer = Tokenizer() + + def __generate(self, text: str) -> List[SpanAnnotation]: + tokenizer_result = self.tokenizer.tokenize(text) + span_ann = [] + __start_index = 0 + for t_obj in tokenizer_result: + __pos = t_obj.part_of_speech.split(',') + __length = len(t_obj.surface) + token = TokenResult( + node_obj=t_obj, + tuple_pos=__pos, + word_stem=t_obj.base_form, + word_surface=t_obj.surface) + span_ann.append(SpanAnnotation( + rule_name=self.rule_name, + start_index=__start_index, + end_index=__start_index + __length, + split_string_type='janome', + split_string_value='token', + args={'token': token})) + __start_index += __length + else: + return span_ann + + def annotate(self, original_text: str, spans: Annotations) -> Annotations: + anns = self.__generate(original_text) + spans.add_annotation_layer(self.rule_name, anns + list(spans.flatten())) + return spans diff --git a/bunkai/algorithm/tsunoda_sbd/annotator/replace_parentheses.py b/bunkai/algorithm/tsunoda_sbd/annotator/replace_parentheses.py new file mode 100644 index 0000000..748ae1d --- /dev/null +++ b/bunkai/algorithm/tsunoda_sbd/annotator/replace_parentheses.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +import re +from typing import Dict, List + +from bunkai.algorithm.tsunoda_sbd.annotator import constant +from bunkai.base.annotation import SpanAnnotation +from bunkai.base.annotator import Annotations, Annotator + + +class ExceptionParentheses(Annotator): + def __init__(self): + super().__init__(rule_name=self.__class__.__name__) + self.re_spans_parentheses1 = 
re.compile(constant.SPANS_PARENTHESES1_REGEXP) + self.re_spans_parentheses2 = re.compile(constant.SPANS_PARENTHESES2_REGEXP) + + def replace_parentheses_no1(self, original_text: str, split_points: List[SpanAnnotation]) -> List[SpanAnnotation]: + """ + 括弧内に次の文字列があった場合は、括弧及び括弧内の文字列を一文とする. + + 例: ̃ (近日中には冷房に切り替わる予定です。) ̃ 1 時間飲み放題(カクテル各種! ! )はお勧め. + """ + def unique_obj(input_list: List[SpanAnnotation]) -> List[SpanAnnotation]: + # filter out same index + filtered = [] + __added = [] + for s in input_list: + if s.end_index not in __added: + filtered.append(s) + __added.append(s.end_index) + return filtered + + spans_parentheses = self.re_spans_parentheses1.finditer(original_text) + filtered_split_point = [] + skip_end_index = [] + for parentheses_point in spans_parentheses: + p_start_index = parentheses_point.regs[0][0] + p_end_index = parentheses_point.regs[0][1] + for split_candidate in split_points: + __split_char = original_text[split_candidate.start_index:split_candidate.end_index] + if p_start_index < split_candidate.start_index and split_candidate.end_index < p_end_index: + if __split_char in constant.CHARS_FOR_IGNORE_PARENTHESES1: + # 該当の区切り文字候補は破棄。代わりに()のインデックス情報 + __new_parentheses_point = SpanAnnotation( + rule_name=self.rule_name, + start_index=p_end_index - 1, + end_index=p_end_index, + split_string_type='parentheses-sentence', + split_string_value=original_text[p_start_index:p_end_index]) + filtered_split_point.append(__new_parentheses_point) + skip_end_index.append(split_candidate.end_index) + else: + pass + else: + pass + else: + pass + else: + pass + + for s in split_points: + if s.end_index not in skip_end_index: + filtered_split_point.append(s) + + return unique_obj(filtered_split_point) + + # FIXME: Duplicated span bug + def replace_parentheses_no2(self, original_text: str, split_points: List[SpanAnnotation]) -> List[SpanAnnotation]: + """ + 括弧内に、括弧の扱い(1)の文字列ではない基本分割文字列が現れた場合は、二回以上登場した際に限り分割点を付与する. + + 要するに、括弧内で文境界付与を与えるということ. + + 例: ̃(セルフドリンクサービスはすごく良かったです!種類も豊富。). 
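+ (Inside a parenthesised span, such split characters are honoured only when two
+ or more of them occur within that span; split points outside parentheses are kept.)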
+ """ + spans_parentheses = list(self.re_spans_parentheses2.finditer(original_text)) + filtered_split_point: List[SpanAnnotation] = [] + + target_strings_positions: Dict[int, SpanAnnotation] = {} + for split_candidate in split_points: + __split_char = original_text[split_candidate.start_index:split_candidate.end_index] + if __split_char in constant.CHARS_FOR_IGNORE_PARENTHESES2: + target_strings_positions[split_candidate.end_index] = split_candidate + + # add split points between parentheses and frequency is more than 2 inside parentheses + for parentheses_point in spans_parentheses: + p_start_index = parentheses_point.regs[0][0] + p_end_index = parentheses_point.regs[0][1] + __split_char_in_parentheses = [(end_pos, reg_obj) for end_pos, reg_obj in target_strings_positions.items() + if p_start_index < end_pos < p_end_index] + if len(__split_char_in_parentheses) >= 2: + filtered_split_point += [t[1] + for t in __split_char_in_parentheses] + + # add split points outside of parentheses + t_span_parentheses = [(parentheses_point.regs[0][0], parentheses_point.regs[0][1]) + for parentheses_point in spans_parentheses] + + def is_outside_of_parentheses(t_s): + if s_point.split_string_type == 'parentheses-sentence': + return True + elif s_point.start_index <= t_s[0] and s_point.end_index <= t_s[0]: + return True + elif s_point.start_index >= t_s[1] and s_point.end_index >= t_s[1]: + return True + else: + return False + + for s_point in split_points: + _ = [is_outside_of_parentheses(t_s) for t_s in t_span_parentheses] + if len(_) > 0 and all(_): + filtered_split_point.append(s_point) + + return filtered_split_point + + def annotate(self, original_text: str, + spans: Annotations) -> Annotations: + __s_no1 = self.replace_parentheses_no1( + original_text, spans.get_final_layer()) + if len(re.findall(r'\(.+\)', original_text)) > 0: + __s_no2 = self.replace_parentheses_no2(original_text, __s_no1) + else: + __s_no2 = __s_no1 + + spans.add_annotation_layer(self.rule_name, __s_no2) + + return spans diff --git a/bunkai/algorithm/tsunoda_sbd/tsunoda_sbd.py b/bunkai/algorithm/tsunoda_sbd/tsunoda_sbd.py new file mode 100644 index 0000000..2872d04 --- /dev/null +++ b/bunkai/algorithm/tsunoda_sbd/tsunoda_sbd.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +import typing +from typing import Iterator, List + +from bunkai.algorithm.tsunoda_sbd.annotator import (BasicRule, ExceptionNo, + ExceptionNumeric, + ExceptionParentheses, + ExceptionParticle, + MorphAnnotatorJanome) +from bunkai.base.annotation import Annotations, SpanAnnotation +from bunkai.base.annotator import (AnnotatorPipeline, RuleOrderException, + SentenceBoundaryDisambiguator) + + +class TsunodaPipeline(AnnotatorPipeline): + def check(self) -> bool: + # MorphAnnotator must be before ExceptionParticle + order_morph_annotator = 0 + order_exception_particle = 0 + for __i, a in enumerate(self.pipeline): + if isinstance(a, MorphAnnotatorJanome): + order_morph_annotator = __i + elif isinstance(a, ExceptionParticle): + order_exception_particle = __i + + if order_morph_annotator > order_exception_particle: + raise RuleOrderException(f'MorphAnnotator at {order_morph_annotator} must be' + f' before ExceptionParticle at {order_exception_particle}') + + return True + + +class TsunodaSentenceBoundaryDisambiguation(SentenceBoundaryDisambiguator): + def __init__(self, *, path_model: typing.Any = None): + morph_annotator = MorphAnnotatorJanome() # type: ignore + particle_annotator = ExceptionParticle(MorphAnnotatorJanome) + + self.pipeline = TsunodaPipeline([ + 
BasicRule(), + morph_annotator, + particle_annotator, + ExceptionNumeric(), + ExceptionNo(), + ExceptionParentheses() + ]) + super().__init__() + + def _eos(self, text: str) -> Annotations: + annotations = Annotations() + annotations.add_annotation_layer('first', [SpanAnnotation(rule_name=None, + start_index=0, + end_index=len( + text), + split_string_type=None, split_string_value=None)]) + for rule_obj in self.pipeline: + rule_obj.annotate(text, annotations) + return annotations + + def find_eos(self, text: str) -> List[int]: + annotations = self._eos(text) + end_index = list(sorted( + list(set([s_a.end_index for s_a in annotations.get_final_layer()])))) + return end_index + + def __call__(self, text: str) -> Iterator[str]: + annotations = self._eos(text) + end_index = sorted( + list(set([s_a.end_index for s_a in annotations.get_final_layer()]))) + __start_index = 0 + __end_index = 0 + for e_i in end_index: + part_sentences = text[__start_index:e_i] + __start_index = e_i + __end_index = e_i + yield part_sentences + + if __end_index < len(text): + part_sentences = text[__end_index:] + yield part_sentences diff --git a/bunkai/base/__init__.py b/bunkai/base/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/base/annotation.py b/bunkai/base/annotation.py new file mode 100644 index 0000000..8bfc204 --- /dev/null +++ b/bunkai/base/annotation.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +import dataclasses +import itertools +from typing import Any, Dict, Iterator, List, Optional, Tuple + +import spans +from dataclasses_json import DataClassJsonMixin + + +class TokenResult: + def __init__(self, + node_obj: Any, + tuple_pos: Tuple[str, ...], + word_stem: str, + word_surface: str, + is_feature=True, + is_surface=False, + misc_info=None): + self.node_obj = node_obj + self.word_stem = word_stem + self.word_surface = word_surface + self.is_surface = is_surface + self.is_feature = is_feature + self.misc_info = misc_info + self.tuple_pos = tuple_pos + + def __str__(self): + return self.word_surface + + +@dataclasses.dataclass +class SpanAnnotation: + rule_name: Optional[str] + start_index: int + end_index: int + split_string_type: Optional[str] + split_string_value: Optional[str] + args: Optional[Dict[str, Any]] = None + + def __str__(self): + return f'{self.start_index}-{self.end_index}/{self.rule_name}/{self.split_string_value}' + + def __int__(self) -> int: + return self.end_index + + def __spans(self) -> spans.intrange: + return spans.intrange(self.start_index, self.end_index) + + def get_spans(self) -> spans.intrange: + return self.__spans() + + def update_rule_name(self, new_rule_name: str) -> None: + self.rule_name = new_rule_name + + +@dataclasses.dataclass +class Annotations: + annotator_forward: Optional[str] = None + name2spans: Dict[str, List[SpanAnnotation]] = dataclasses.field(default_factory=dict) + name2order: Dict[str, int] = dataclasses.field(default_factory=dict) + current_order: int = 0 + + def add_annotation_layer(self, annotator_name: str, annotations: List[SpanAnnotation]) -> None: + self.name2spans[annotator_name] = annotations + self.name2order[annotator_name] = self.current_order + self.annotator_forward = annotator_name + self.current_order += 1 + + def add_flatten_annotations(self, annotations: List[SpanAnnotation]): + self.name2spans = {str(r): list(g_obj) for r, g_obj + in itertools.groupby(sorted(annotations, key=lambda a: a.rule_name), # type: ignore + key=lambda a: a.rule_name)} + + def flatten(self) -> Iterator[SpanAnnotation]: + return 
itertools.chain.from_iterable(self.name2spans.values()) + + def get_final_layer(self) -> List[SpanAnnotation]: + if self.annotator_forward is None: + return [] + else: + return self.name2spans[self.annotator_forward] + + def get_annotation_layer(self, layer_name: str) -> Iterator[SpanAnnotation]: + assert layer_name in list(self.name2spans.keys()), f'{layer_name} not in analysis layers.' + span_anns = {str(ann): ann for ann in itertools.chain.from_iterable(self.name2spans.values())} + for ann in span_anns.values(): + if ann.rule_name is not None and ann.rule_name == layer_name: + yield ann + return + + def get_morph_analysis(self, name_annotation_layer: str = 'MorphAnnotatorJanome') -> Iterator[TokenResult]: + """Get Tokens analysis from Janome.""" + assert name_annotation_layer in self.name2spans, f'{name_annotation_layer} not in annotation layer.' + for span_ann in self.name2spans[name_annotation_layer]: + if span_ann.rule_name == name_annotation_layer: + ret = span_ann.args['token'] # type: ignore + assert isinstance(ret, TokenResult) + yield ret + return + + def available_layers(self) -> List[str]: + return list(self.name2spans.keys()) + + +@dataclasses.dataclass +class Tokens(DataClassJsonMixin): + meta: Dict[str, Any] = dataclasses.field(default_factory=dict) + spans: List[str] = dataclasses.field(default_factory=list) + labels: List[str] = dataclasses.field(default_factory=list) + + def pretty(self, label2surface: Dict[str, str]) -> str: + rets: List[str] = [] + for idx, span in enumerate(self.spans): + rets.append(span) + label = self.labels[idx] + olabel = label2surface.get(label) + if olabel: + rets.append(olabel) + return ''.join(rets) diff --git a/bunkai/base/annotator.py b/bunkai/base/annotator.py new file mode 100644 index 0000000..a00e30e --- /dev/null +++ b/bunkai/base/annotator.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +import typing +from abc import ABCMeta, abstractmethod +from typing import Callable, Iterator, List + +from bunkai.base.annotation import Annotations, SpanAnnotation + + +def func_filter_span(spans_wide: typing.List[SpanAnnotation], + spans_narrow: typing.List[SpanAnnotation]) -> typing.List[SpanAnnotation]: + """Compare spans_wide and spans_narrow. 
If there is an overlap, use wider one.""" + __filtered = [] + for b_ann in spans_narrow: + is_skip = False + b_span = b_ann.get_spans() + for f_ann in spans_wide: + if b_span.within(f_ann.get_spans()): + is_skip = True + else: + pass + if is_skip is False: + __filtered.append(b_ann) + return __filtered + + +def func_filter_previous_rule_same_span(spans_current: typing.List[SpanAnnotation], + spans_previous: typing.List[SpanAnnotation]) -> typing.List[SpanAnnotation]: + """If there are conflicting results, use the result of the previous rules.""" + spans_current_map = {(sp.start_index, sp.end_index): sp for sp in spans_current} + spans_previous_map = {(sp.start_index, sp.end_index): sp for sp in spans_previous} + common_key = list(set(spans_current_map.keys()) & set(spans_previous_map.keys())) + filtered = [] + for span_key, sp in spans_current_map.items(): + if span_key in common_key: + continue + else: + filtered.append(sp) + return filtered + spans_previous + + +class RuleOrderException(Exception): + """class if the order of rule is out-of-definition.""" + + +class Annotator(metaclass=ABCMeta): + def __init__(self, rule_name: str): + self.rule_name = rule_name + + def add_forward_rule(self, + annotation_this_layer: List[SpanAnnotation], + spans: Annotations, + func_filtering: Callable = func_filter_previous_rule_same_span) -> Annotations: + filtered = func_filtering(annotation_this_layer, spans.get_final_layer()) + spans.add_annotation_layer(self.rule_name, filtered) + return spans + + @abstractmethod + def annotate(self, original_text: str, + spans: Annotations) -> Annotations: + raise NotImplementedError() + + +class AnnotationFilter(Annotator): + @staticmethod + def unify_span_annotations(span_annotations: List[SpanAnnotation]) -> List[SpanAnnotation]: + span_anns = {str(ann): ann for ann in span_annotations} + return list(span_anns.values()) + + +class AnnotatorPipeline(metaclass=ABCMeta): + def __init__(self, pipeline: List[Annotator]): + self.pipeline = pipeline + self.check() + + @abstractmethod + def check(self) -> bool: + raise NotImplementedError() + + def __iter__(self): + return iter(self.pipeline) + + +class SentenceBoundaryDisambiguator(metaclass=ABCMeta): + def __init__(self, *, path_model: typing.Any = None): + pass + + @abstractmethod + def _eos(self, text: str) -> Annotations: + raise NotImplementedError() + + @abstractmethod + def find_eos(self, text: str) -> List[int]: + raise NotImplementedError() + + @abstractmethod + def __call__(self, text: str) -> Iterator[str]: + raise NotImplementedError() diff --git a/bunkai/cli.py b/bunkai/cli.py new file mode 100644 index 0000000..93bd115 --- /dev/null +++ b/bunkai/cli.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +import argparse +import sys +import typing + +import bunkai.constant +from bunkai.algorithm.bunkai_sbd.bunkai_sbd import \ + BunkaiSentenceBoundaryDisambiguation +from bunkai.algorithm.tsunoda_sbd.tsunoda_sbd import \ + TsunodaSentenceBoundaryDisambiguation + +DEFAULT_ALGORITHM = 'bunkai' +algorithm2class: typing.Dict[str, typing.Type] = { + DEFAULT_ALGORITHM: BunkaiSentenceBoundaryDisambiguation, + 'tsunoda': TsunodaSentenceBoundaryDisambiguation, +} + + +def get_opts() -> argparse.Namespace: + oparser = argparse.ArgumentParser() + oparser.add_argument("--algorithm", "-a", default=DEFAULT_ALGORITHM, + choices=sorted(list(algorithm2class.keys()))) + oparser.add_argument("--input", "-i", type=argparse.FileType("r"), default='-') + oparser.add_argument("--output", "-o", type=argparse.FileType("w"), default='-') + 
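# --model is forwarded to the selected algorithm as path_model; the Tsunoda
+ # pipeline accepts the argument but does not use it.
+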
oparser.add_argument("--model", "-m") + return oparser.parse_args() + + +def run(annotator, _text: str) -> typing.Iterator[str]: + assert '\n' not in _text + assert bunkai.constant.METACHAR_SENTENCE_BOUNDARY not in _text + + text: str = _text.replace(bunkai.constant.METACHAR_LINE_BREAK, '\n') + __annotator_result: typing.List[int] = annotator.find_eos(text) + if len(__annotator_result) == 0 or __annotator_result[-1] != len(text): + __annotator_result.append(len(text)) + + last: int = 0 + for idx, split_point in enumerate(__annotator_result): + yield text[last:split_point].replace('\n', bunkai.constant.METACHAR_LINE_BREAK) + if idx != len(__annotator_result) - 1: + yield bunkai.constant.METACHAR_SENTENCE_BOUNDARY + last = split_point + + +def main() -> None: + opts = get_opts() + cls = algorithm2class[opts.algorithm] + annotator = cls(path_model=opts.model) + warned: bool = False + + with opts.input as inf,\ + opts.output as outf: + for line in inf: + ol: str = line[:-1] + if bunkai.constant.METACHAR_SENTENCE_BOUNDARY in ol: + ol = ol.replace(bunkai.constant.METACHAR_SENTENCE_BOUNDARY, '') + if not warned: + sys.stderr.write( + '\033[91m' + f'[Warning] All {bunkai.constant.METACHAR_SENTENCE_BOUNDARY} will be removed for input\n' + '\033[0m') + warned = True + + for ot in run(annotator, ol): + outf.write(ot) + outf.write('\n') + + +if __name__ == '__main__': + main() diff --git a/bunkai/constant.py b/bunkai/constant.py new file mode 100644 index 0000000..d321540 --- /dev/null +++ b/bunkai/constant.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python3 + +METACHAR_SENTENCE_BOUNDARY: str = '\u2502' +METACHAR_LINE_BREAK: str = '\u2581' diff --git a/bunkai/experiment/__init__.py b/bunkai/experiment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/experiment/convert/__init__.py b/bunkai/experiment/convert/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/experiment/convert/bccwj.py b/bunkai/experiment/convert/bccwj.py new file mode 100644 index 0000000..55e7b74 --- /dev/null +++ b/bunkai/experiment/convert/bccwj.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +import argparse +import typing +import unicodedata +from pathlib import Path +from xml.etree import ElementTree as ET + +import tqdm + +import bunkai.constant + + +def get_sentence(node) -> typing.Iterator[str]: + if node.tag == 'sentence': + yield bunkai.constant.METACHAR_SENTENCE_BOUNDARY + for luw in node: + if luw.tag == 'webBr': + yield bunkai.constant.METACHAR_LINE_BREAK + continue + elif luw.tag.lower() != 'luw': + yield from get_sentence(luw) + continue + + for fragment in luw: + if fragment.tag in ['webBr', 'br']: + yield bunkai.constant.METACHAR_LINE_BREAK + continue + elif fragment.tag.startswith('note'): + continue + elif fragment.tag in ['sampling', 'fraction', 'info']: + continue + elif fragment.tag.lower() != 'suw': + raise NotImplementedError(fragment.tag) + elif fragment.text is None: + for unit in fragment: + if unit.tag == 'sampling': + pass + elif unit.tag == 'correction': + yield unit.get('originalText') + elif unit.tag not in ['ruby', 'quote', 'enclosedCharacter', 'delete', 'subScript', 'superScript']: + raise NotImplementedError(unit.tag) + else: + yield unit.text + continue + yield fragment.text + + +def operate_article(parent) -> typing.Iterator[str]: + for node in parent: + if node.tag in ['rejectedBlock', 'info', 'titleBlock', 'abstract', 'authorsData', 'figureBlock']: + continue + elif node.tag in ['webBr', 'br']: + yield bunkai.constant.METACHAR_LINE_BREAK + elif node.tag 
== 'sentence': + yield from get_sentence(node) + else: + yield from operate_article(node) + + +def operation(data: str) -> typing.Iterator[str]: + root = ET.fromstring(data) + for child in root: + if child.tag != 'article': + continue + for i, t in enumerate(operate_article(child)): + if i == 0: + yield t.lstrip(bunkai.constant.METACHAR_SENTENCE_BOUNDARY) + else: + yield t + yield '\n' + + +def get_opts() -> argparse.Namespace: + oparser = argparse.ArgumentParser() + oparser.add_argument("--input", "-i", type=Path, required=True) + oparser.add_argument("--output", "-o", type=Path, required=True) + oparser.add_argument("--nonfkc", action='store_true') + return oparser.parse_args() + + +def main() -> None: + opts = get_opts() + if opts.input.is_dir(): + opts.output.mkdir(exist_ok=True, parents=True) + targets = [n for n in opts.input.iterdir()] + for fname in tqdm.tqdm(targets, leave=False): + with fname.open() as inf,\ + opts.output.joinpath(f'{fname.stem}.txt').open('w') as outf: + for string in operation(inf.read()): + if not opts.nonfkc: + string = unicodedata.normalize("NFKC", string) + outf.write(string) + else: + with opts.input.open() as inf,\ + opts.output.open('w') as outf: + for string in operation(inf.read()): + if not opts.nonfkc: + string = unicodedata.normalize("NFKC", string) + outf.write(string) + + +if __name__ == '__main__': + main() diff --git a/bunkai/experiment/convert/jalan.py b/bunkai/experiment/convert/jalan.py new file mode 100644 index 0000000..a9e2e80 --- /dev/null +++ b/bunkai/experiment/convert/jalan.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +import argparse +import dataclasses +import json +import typing +import unicodedata + +import bunkai.constant + + +@dataclasses.dataclass +class DoccanoDataObject: + annotation_id: int + text: str + labels: typing.List[typing.Tuple[int, int, str]] + meta: typing.Dict[str, typing.Any] + + +def sbd(dobj: DoccanoDataObject) -> typing.Iterator[str]: + __start_index = 0 + labels_sequence = sorted( + [label_tuple for label_tuple in dobj.labels + if label_tuple[2] == 'SEP' or label_tuple[2].startswith('SEP-')], + key=lambda t: t[0]) + for label_tuple in labels_sequence: + yield dobj.text[__start_index: label_tuple[1]] + __start_index = label_tuple[1] + + if __start_index != len(dobj.text) - 1: + yield dobj.text[__start_index:] + + +def operation(data: dict) -> typing.Iterator[str]: + dobj = DoccanoDataObject(annotation_id=data['id'], + text=data['text'], + labels=data['labels'], + meta=data['meta'], + ) + for text_part in sbd(dobj): + if len(text_part) > 0: + yield text_part + + +def get_opts() -> argparse.Namespace: + oparser = argparse.ArgumentParser() + oparser.add_argument("--input", "-i", type=argparse.FileType("r"), default='-', required=True) + oparser.add_argument("--output", "-o", type=argparse.FileType("w"), default='-') + oparser.add_argument("--nonfkc", action='store_true') + return oparser.parse_args() + + +def main() -> None: + opts = get_opts() + with opts.input as inf,\ + opts.output as outf: + for line in inf: + data = json.loads(line) + for __i, string in enumerate(operation(data)): + if __i > 0: + outf.write(bunkai.constant.METACHAR_SENTENCE_BOUNDARY) + if not opts.nonfkc: + string = unicodedata.normalize("NFKC", string) + outf.write(string) + outf.write('\n') + + +if __name__ == '__main__': + main() diff --git a/bunkai/experiment/evaluate.py b/bunkai/experiment/evaluate.py new file mode 100644 index 0000000..711e7eb --- /dev/null +++ b/bunkai/experiment/evaluate.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 
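+"""Evaluate segmentation output against gold annotations (token-level precision /
+recall / F1 via seqeval), or normalise boundary placement with --trim."""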
+ +import argparse +import json +import re +import typing + +from seqeval.metrics import performance_measure + +from bunkai.constant import METACHAR_LINE_BREAK, METACHAR_SENTENCE_BOUNDARY + +REGEXP_SB_WITH_BLANKS_SPAN: str = f'({METACHAR_SENTENCE_BOUNDARY})([\\s{METACHAR_LINE_BREAK}]*)' +RE_SB_WITH_BLANKS_SPAN = re.compile(REGEXP_SB_WITH_BLANKS_SPAN) +RE_SBS_SPAN = re.compile(f'{METACHAR_SENTENCE_BOUNDARY}+') + + +def get_bo(text: str, lbonly: bool) -> typing.List[str]: + """Get a list of BIO sequence from input text.""" + assert METACHAR_SENTENCE_BOUNDARY + METACHAR_SENTENCE_BOUNDARY not in text + assert not text.startswith(METACHAR_SENTENCE_BOUNDARY) + ret: typing.List[str] = [] + exist_under_bar: bool = False + for char in text: + if char == METACHAR_SENTENCE_BOUNDARY: + if lbonly and exist_under_bar is False: + continue + else: + ret[-1] = 'SEP' + exist_under_bar = False + continue + if lbonly: + if char == METACHAR_LINE_BREAK: + exist_under_bar = True + ret.append('O') + else: + ret.append('O') + return ret + + +def get_score(data: typing.Dict[str, int]) -> typing.Dict[str, float]: + ret: typing.Dict[str, float] = {} + try: + precision = data['TP'] / float(data['TP'] + data['FP']) + except ZeroDivisionError: + precision = float('nan') + recall = data['TP'] / float(data['TP'] + data['FN']) + ret['f1'] = 2 * precision * recall / (recall + precision) + ret['precision'] = precision + ret['recall'] = recall + return ret + + +def evaluate(golds: typing.List[str], systems: typing.List[str], lbonly: bool) -> str: + assert len(golds) == len(systems) + golds_bo = [] + systems_bo = [] + for gold, system in zip(golds, systems): + assert gold.replace(METACHAR_SENTENCE_BOUNDARY, '') == system.replace(METACHAR_SENTENCE_BOUNDARY, ''), \ + f'{gold}\n{system}' + + system_bo = get_bo(system, lbonly) + systems_bo.append(system_bo) + + gold_bo = get_bo(gold, lbonly) + golds_bo.append(gold_bo) + assert len(gold_bo) == len(system_bo), f'gold={gold} \nsystem={system}\n' \ + f'N(gold)={len(gold_bo)} N(systems)={len(system_bo)}' + + measure: typing.Dict[str, int] = performance_measure(golds_bo, systems_bo) + score = get_score(measure) + return f'{json.dumps(measure, sort_keys=True)}\n{json.dumps(score, sort_keys=True)}\n' + + +def trim(val: str) -> str: + old: str = val + new: str = val + while True: + new = RE_SBS_SPAN.sub(METACHAR_SENTENCE_BOUNDARY, RE_SB_WITH_BLANKS_SPAN.sub(r'\2\1', old)) + if old == new: + break + old = new + return new + + +def get_opts() -> argparse.Namespace: + oparser = argparse.ArgumentParser() + oparser.add_argument("--input", "-i", type=argparse.FileType('r'), required=True) + oparser.add_argument("--gold", "-g", type=argparse.FileType('r'), required=False) + oparser.add_argument("--output", "-o", type=argparse.FileType('w'), default='-') + oparser.add_argument("--lb", action="store_true", help='Checks only Line-break marked with "\u2502"') + oparser.add_argument("--trim", action="store_true") + return oparser.parse_args() + + +def main() -> None: + opts = get_opts() + with opts.output as outf: + if opts.trim: + for line in opts.input: + outf.write(trim(line[:-1])) + outf.write('\n') + else: + r = evaluate([l1[:-1] for l1 in opts.gold.readlines()], + [m[:-1] for m in opts.input.readlines()], opts.lb) + outf.write(r) + + +if __name__ == '__main__': + main() diff --git a/bunkai/experiment/statics.py b/bunkai/experiment/statics.py new file mode 100644 index 0000000..7783e1e --- /dev/null +++ b/bunkai/experiment/statics.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +import argparse 
+import collections +import dataclasses +import json +import typing +from pathlib import Path + +import tqdm +from dataclasses_json import DataClassJsonMixin + +import bunkai.constant +from bunkai.algorithm.lbd.corpus import (LABEL_NSEP, LABEL_OTHER, LABEL_SEP, + annotation2spans) + + +@dataclasses.dataclass +class Statics(DataClassJsonMixin): + name: str = '' + num_file: int = 0 + line_break: int = 0 + line_break_without_sb: int = 0 + sentence: int = 0 + sentence_include_line_break: int = 0 + sentence_include_line_break_without_sb: int = 0 + + def add(self, st): + self.num_file += 1 + for f in dataclasses.fields(st): + self.__setattr__(f.name, self.__getattribute__(f.name) + st.__getattribute__(f.name)) + + +def count(path: Path, outf: typing.IO, show: bool = False) -> Statics: + st = Statics() + with path.open() as inf: + for doc in inf: + text = doc[:-1] + st.sentence += text .count(bunkai.constant.METACHAR_SENTENCE_BOUNDARY) + tokens = annotation2spans(text) + if len(tokens.labels) == 0: + continue + if tokens.labels[-1] != LABEL_OTHER: + del tokens.spans[-1] + del tokens.labels[-1] + cnt: int = 0 + line_break_without_sb: int = 0 + for label in tokens.labels: + if label == LABEL_SEP: + cnt += 1 + elif label == LABEL_NSEP: + cnt += 1 + line_break_without_sb += 1 + if cnt > 0: + st.sentence_include_line_break += 1 + st.line_break += cnt + + st.line_break_without_sb += line_break_without_sb + if line_break_without_sb > 0: + st.sentence_include_line_break_without_sb += 1 + if show: + outf.write(f'{path}\t{tokens}\n') + return st + + +def count_char(path: Path) -> typing.Iterator[str]: + with path.open() as inf: + for doc in inf: + text = doc[:-1] + tokens = annotation2spans(text) + if len(tokens.labels) == 0: + continue + if tokens.labels[-1] != LABEL_OTHER: + del tokens.spans[-1] + del tokens.labels[-1] + for tid, label in enumerate(tokens.labels): + if tid < len(tokens.labels) - 1 and label == LABEL_OTHER: + yield tokens.spans[tid][-1] + + +def get_opts() -> argparse.Namespace: + oparser = argparse.ArgumentParser() + oparser.add_argument("--input", "-i", type=Path, required=True) + oparser.add_argument("--output", "-o", type=argparse.FileType('w'), default='-') + oparser.add_argument("--show", action="store_true") + oparser.add_argument("--char", action="store_true") + return oparser.parse_args() + + +def main() -> None: + opts = get_opts() + + targets: typing.List[typing.Tuple[str, Path]] = [('__', opts.input)] + if opts.input.is_dir(): + targets = [(fi.name[:2], fi) for fi in opts.input.iterdir()] + + if opts.char: + genre2chars: typing.Dict[str, typing.DefaultDict[str, int]] = {} + genre2chars['ALL'] = collections.defaultdict(int) + for (genre, fpath) in tqdm.tqdm(targets, leave=False): + chars: typing.Optional[typing.DefaultDict[str, int]] = genre2chars.get(genre) + if chars is None: + chars = collections.defaultdict(int) + genre2chars[genre] = chars + for mychar in count_char(fpath): + chars[mychar] += 1 + genre2chars['ALL'][mychar] += 1 + with opts.output as outf: + for genre, chars in sorted(genre2chars.items()): + outf.write(f'{genre}\t') + outf.write(json.dumps(chars, ensure_ascii=False, sort_keys=True)) + outf.write('\n') + return + + st_all = Statics(name='ALL') + st_detail: typing.DefaultDict[str, Statics] = collections.defaultdict(Statics) + with opts.output as outf: + for (genre, fpath) in tqdm.tqdm(targets, leave=False): + st = count(fpath, outf, opts.show) + st_all.add(st) + st_detail[genre].name = genre + st_detail[genre].add(st) + + if not opts.show: + if len(targets) 
!= 1: + for genre, st in sorted(st_detail.items()): + outf.write(f'{st.to_json()}\n') + outf.write(f'{st_all.to_json()}\n') + + +if __name__ == '__main__': + main() diff --git a/bunkai/third/__init__.py b/bunkai/third/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bunkai/third/run_ner.py b/bunkai/third/run_ner.py new file mode 100644 index 0000000..17e790d --- /dev/null +++ b/bunkai/third/run_ner.py @@ -0,0 +1,329 @@ +# mypy: ignore-errors +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Fine-tuning the library models for named entity recognition on CoNLL-2003. + + +import logging +import os +import sys +import tempfile +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple + +import numpy as np +from seqeval.metrics import f1_score, precision_score, recall_score +from torch import nn +from transformers import (AutoConfig, AutoModelForTokenClassification, + AutoTokenizer, EvalPrediction, HfArgumentParser, + Trainer, TrainingArguments, set_seed) + +from bunkai.algorithm.lbd.custom_tokenizers import JanomeSubwordsTokenizer +from bunkai.third.utils_ner import NerDataset, Split, get_labels + +logger = logging.getLogger(__name__) + + +@dataclass +class ModelArguments: + """Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.""" + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."}) + # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, + # or just modify its tokenizer_config.json. + cache_dir: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + ) + + +@dataclass +class DataTrainingArguments: + """Arguments pertaining to what data we are going to input our model for training and eval.""" + + data_dir: str = field( + metadata={"help": "The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."} + ) + labels: Optional[str] = field( + default=None, + metadata={"help": "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."}, + ) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." 
+ }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if ( + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + raise ValueError( + f"Output directory ({training_args.output_dir}) " + f"already exists and is not empty. Use --overwrite_output_dir to overcome." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + + # Set seed + set_seed(training_args.seed) + + # Prepare CONLL-2003 task + labels = get_labels(data_args.labels) + label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} + num_labels = len(labels) + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + id2label=label_map, + label2id={label: i for i, label in enumerate(labels)}, + cache_dir=model_args.cache_dir, + ) + + if hasattr(model_args, 'is_auto_tokenizer') and model_args.is_auto_tokenizer: + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast) + else: + logger.info("Use JanomeSubwordsTokenizer in Bunkai Project.") + if 'distilbert-base-japanese' in model_args.model_name_or_path: + # if model is distilbert-base-japanese, download the model and save vocab file into your local. 
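+            # (Added explanatory comment.) JanomeSubwordsTokenizer is constructed from a local
+            # vocab.txt path rather than a hub model name, so the lines below first download the
+            # hub tokenizer and save it to a temporary directory to obtain that vocab file.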
+ logger.info(f"Downloading {model_args.model_name_or_path} for vocab configuration.") + __tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast) + tmp_model_path: str = tempfile.mkdtemp() + logger.info(f'Saving vocab file into local {tmp_model_path}...') + __tokenizer.save_pretrained(tmp_model_path) + tokenizer = JanomeSubwordsTokenizer(vocab_file=os.path.join(tmp_model_path, 'vocab.txt')) + else: + tokenizer = JanomeSubwordsTokenizer( + vocab_file=model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path) + + # add new vocab + # __current_vocab_size: int = len(tokenizer) + # tokenizer.add_tokens([METACHAR_LINE_BREAK]) + # assert len(tokenizer) > __current_vocab_size + # tokenizer.vocab[METACHAR_LINE_BREAK] = len(tokenizer) - 1 + # tokenizer.ids_to_tokens[len(tokenizer) - 1] = METACHAR_LINE_BREAK + + model = AutoModelForTokenClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + ) + model.resize_token_embeddings(len(tokenizer)) + if model.base_model_prefix == 'distilbert' or 'distilbert-base-japanese' in model_args.model_name_or_path: # hotfix + is_distil_bert = True + if hasattr(tokenizer.model_input_names, 'token_type_ids'): + tokenizer.model_input_names.remove('token_type_ids') + else: + is_distil_bert = False + + # Get datasets + train_dataset = ( + NerDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + labels=labels, + model_type=config.model_type, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.train, + is_distil_bert=is_distil_bert) + if training_args.do_train + else None + ) + eval_dataset = ( + NerDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + labels=labels, + model_type=config.model_type, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.dev, + is_distil_bert=is_distil_bert + ) + if training_args.do_eval + else None + ) + + def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]: + preds = np.argmax(predictions, axis=2) + + batch_size, seq_len = preds.shape + + out_label_list = [[] for _ in range(batch_size)] + preds_list = [[] for _ in range(batch_size)] + + for i in range(batch_size): + for j in range(seq_len): + if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index: + out_label_list[i].append(label_map[label_ids[i][j]]) + preds_list[i].append(label_map[preds[i][j]]) + + return preds_list, out_label_list + + def compute_metrics(p: EvalPrediction) -> Dict: + preds_list, out_label_list = align_predictions(p.predictions, p.label_ids) + return { + "precision": precision_score(out_label_list, preds_list), + "recall": recall_score(out_label_list, preds_list), + "f1": f1_score(out_label_list, preds_list), + } + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + ) + + # Training + if training_args.do_train: + trainer.train( + resume_from_checkpoint=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None + ) + trainer.save_model() + # For convenience, we also re-save the tokenizer to the same directory, + # so that you can share your model easily on 
huggingface.co/models =) + if trainer.is_world_process_zero(): + tokenizer.save_pretrained(training_args.output_dir) + + # Evaluation + results = {} + if training_args.do_eval: + logger.info("*** Evaluate ***") + + result = trainer.evaluate() + + output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") + if trainer.is_world_process_zero(): + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key, value in result.items(): + logger.info(" %s = %s", key, value) + writer.write("%s = %s\n" % (key, value)) + + results.update(result) + + # Predict + if training_args.do_predict: + test_dataset = NerDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + labels=labels, + model_type=config.model_type, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.test, + ) + + predictions, label_ids, metrics = trainer.predict(test_dataset) + preds_list, _ = align_predictions(predictions, label_ids) + + output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") + if trainer.is_world_process_zero(): + with open(output_test_results_file, "w") as writer: + for key, value in metrics.items(): + logger.info(" %s = %s", key, value) + writer.write("%s = %s\n" % (key, value)) + + # Save predictions + output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt") + if trainer.is_world_process_zero(): + with open(output_test_predictions_file, "w") as writer: + with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f: + example_id = 0 + for line in f: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + writer.write(line) + if not preds_list[example_id]: + example_id += 1 + elif preds_list[example_id]: + output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n" + writer.write(output_line) + else: + logger.warning( + "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0] + ) + + return results + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/bunkai/third/utils_ner.py b/bunkai/third/utils_ner.py new file mode 100644 index 0000000..9c5a15b --- /dev/null +++ b/bunkai/third/utils_ner.py @@ -0,0 +1,417 @@ +# mypy: ignore-errors +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. 
+ + +import logging +import os +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional, Union + +from filelock import FileLock +from transformers import (PreTrainedTokenizer, is_tf_available, + is_torch_available) + +from bunkai.algorithm.lbd.custom_tokenizers import JanomeSubwordsTokenizer + +logger = logging.getLogger(__name__) + + +@dataclass +class InputExample: + """ + A single training/test example for token classification. + + :param guid: Unique id for the example. + :param words: list. The words of the sequence. + :param labels: (Optional) list. The labels for each word of the sequence. This should be + :param specified for train and dev examples, but not for test examples. + :param is_document_first: First fragment of document. + """ + + guid: str + words: List[str] + labels: Optional[List[str]] + is_document_first: bool + + +@dataclass +class InputFeatures: + """ + A single set of features of data. + + Property names are the same names as the corresponding inputs to a model. + """ + + input_ids: List[int] + attention_mask: List[int] + token_type_ids: Optional[List[int]] = None + label_ids: Optional[List[int]] = None + document_id: Optional[str] = None + + +class Split(Enum): + train = "train" + dev = "dev" + test = "test" + + +if is_torch_available(): + import torch + from torch import nn + from torch.utils.data.dataset import Dataset + + class NerDataset(Dataset): + features: List[InputFeatures] + pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index + # Use cross entropy ignore_index as padding label id so that only + # real label ids contribute to the loss later. + + def __init__( + self, + data_dir: str, + tokenizer: JanomeSubwordsTokenizer, + labels: List[str], + model_type: str, + max_seq_length: Optional[int] = None, + overwrite_cache=False, + mode: Split = Split.train, + is_distil_bert: bool = False): + # Load data features from cache or dataset file + cached_features_file = os.path.join( + data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)), + ) + + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. + lock_path = cached_features_file + ".lock" + with FileLock(lock_path): + + if os.path.exists(cached_features_file) and not overwrite_cache: + logger.info(f"Loading features from cached file {cached_features_file}") + self.features = torch.load(cached_features_file) + else: + logger.info(f"Creating features from dataset file at {data_dir}") + examples = read_examples_from_file(data_dir, mode) + # TODO clean up all this to leverage built-in features of tokenizers + self.features = convert_examples_to_features( + examples=examples, + label_list=labels, + max_seq_length=max_seq_length, + tokenizer=tokenizer, + cls_token_at_end=bool(model_type in ["xlnet"]), + # xlnet has a cls token at the end + cls_token=tokenizer.cls_token, + cls_token_segment_id=2 if model_type in ["xlnet"] else 0, + sep_token=tokenizer.sep_token, + sep_token_extra=False, + # roberta uses an extra separator b/w pairs of sentences, + # cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 + pad_on_left=bool(tokenizer.padding_side == "left"), + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, + pad_token_label_id=self.pad_token_label_id, + is_distil_bert=is_distil_bert + ) + logger.info(f"Saving features into cached file {cached_features_file}") + torch.save(self.features, cached_features_file) + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] + + +if is_tf_available(): + import tensorflow as tf + + class TFNerDataset: + features: List[InputFeatures] + pad_token_label_id: int = -100 + # Use cross entropy ignore_index as padding label id so that only + # real label ids contribute to the loss later. + + def __init__( + self, + data_dir: str, + tokenizer: PreTrainedTokenizer, + labels: List[str], + model_type: str, + max_seq_length: Optional[int] = None, + overwrite_cache=False, + mode: Split = Split.train, + ): + examples = read_examples_from_file(data_dir, mode) + # TODO clean up all this to leverage built-in features of tokenizers + self.features = convert_examples_to_features( + examples, + labels, + max_seq_length, + tokenizer, + cls_token_at_end=bool(model_type in ["xlnet"]), + # xlnet has a cls token at the end + cls_token=tokenizer.cls_token, + cls_token_segment_id=2 if model_type in ["xlnet"] else 0, + sep_token=tokenizer.sep_token, + sep_token_extra=False, + # roberta uses an extra separator b/w pairs of sentences, + # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 + pad_on_left=bool(tokenizer.padding_side == "left"), + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, + pad_token_label_id=self.pad_token_label_id, + ) + + def gen(): + for ex in self.features: + if ex.token_type_ids is None: + yield ( + {"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, + ex.label_ids, + ) + else: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, + ex.label_ids, + ) + + if "token_type_ids" not in tokenizer.model_input_names: + self.dataset = tf.data.Dataset.from_generator( + gen, + ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64), + ( + {"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, + tf.TensorShape([None]), + ), + ) + else: + self.dataset = tf.data.Dataset.from_generator( + gen, + ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), + ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + }, + tf.TensorShape([None]), + ), + ) + + def get_dataset(self): + self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features))) + + return self.dataset + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] + + +def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]: + if isinstance(mode, Split): + mode = mode.value + file_path = os.path.join(data_dir, f"{mode}.txt") + guid_index = 1 + examples = [] + with open(file_path, encoding="utf-8") as f: + words = [] + labels = [] + for line in f: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + if words: + examples.append(InputExample( + guid=f'{mode}-{guid_index}', + words=words, + labels=labels, + 
is_document_first=True, + )) + guid_index += 1 + words = [] + labels = [] + else: + splits = line.rsplit(" ", 1) + words.append(splits[0]) + if len(splits) > 1: + labels.append(splits[-1].replace("\n", "")) + else: + # Examples could have no label for mode = "test" + labels.append("O") + if words: + examples.append(InputExample( + guid=f'{mode}-{guid_index}', + words=words, + labels=labels, + is_document_first=True, + )) + return examples + + +def convert_examples_to_features( + examples: List[InputExample], + label_list: List[str], + max_seq_length: int, + tokenizer: JanomeSubwordsTokenizer, + cls_token_at_end=False, + cls_token="[CLS]", + cls_token_segment_id=1, + sep_token="[SEP]", + sep_token_extra=False, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + pad_token_label_id=-100, + sequence_a_segment_id=0, + mask_padding_with_zero=True, + is_distil_bert: bool = False) -> List[InputFeatures]: + """ + Load a data file into a list of `InputFeatures`. + + :param cls_token_at_end: define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + :param cls_token_segment_id: define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) + """ + # TODO clean up all this to leverage built-in features of tokenizers + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10_000 == 0: + logger.info("Writing example %d of %d", ex_index, len(examples)) + + tokens = [] + label_ids = [] + for word, label in zip(example.words, example.labels): + word_tokens = tokenizer.tokenize(word) + # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space. + if len(word_tokens) > 0: + tokens.extend(word_tokens) + # Use the real label id for the first token of the word, and padding ids for the remaining tokens + label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) + + # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. + special_tokens_count = tokenizer.num_special_tokens_to_add() + if len(tokens) > max_seq_length - special_tokens_count: + tokens = tokens[: (max_seq_length - special_tokens_count)] + label_ids = label_ids[: (max_seq_length - special_tokens_count)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
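+        # (Added illustrative comment; the Japanese tokens below are hypothetical, not from the original file.)
+        # Suppose example.words = ['今日', 'は', '晴れです'] with labels ['O', 'O', 'O'] and the tokenizer
+        # splits '晴れです' into ['晴れ', '##です']. At this point:
+        #     tokens    = ['今日', 'は', '晴れ', '##です']
+        #     label_ids = [label_map['O'], label_map['O'], label_map['O'], pad_token_label_id]
+        # Only the first subword of each word keeps a real label; continuation subwords get
+        # pad_token_label_id (-100 by default), which CrossEntropyLoss ignores. The [CLS]/[SEP] tokens
+        # appended below also receive pad_token_label_id, and segment_ids are all 0 for a single
+        # sequence with BERT-style models.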
+ tokens += [sep_token] + label_ids += [pad_token_label_id] + if sep_token_extra: + # roberta uses an extra separator b/w pairs of sentences + tokens += [sep_token] + label_ids += [pad_token_label_id] + segment_ids = [sequence_a_segment_id] * len(tokens) + + if cls_token_at_end: + tokens += [cls_token] + label_ids += [pad_token_label_id] + segment_ids += [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + label_ids = [pad_token_label_id] + label_ids + segment_ids = [cls_token_segment_id] + segment_ids + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + padding_length = max_seq_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + label_ids = ([pad_token_label_id] * padding_length) + label_ids + else: + input_ids += [pad_token] * padding_length + input_mask += [0 if mask_padding_with_zero else 1] * padding_length + segment_ids += [pad_token_segment_id] * padding_length + label_ids += [pad_token_label_id] * padding_length + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s", example.guid) + logger.info("tokens: %s", " ".join([str(x) for x in tokens])) + logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) + + if "token_type_ids" not in tokenizer.model_input_names: + segment_ids = None + if is_distil_bert: + features.append( + InputFeatures(input_ids=input_ids, + attention_mask=input_mask, + label_ids=label_ids, + document_id=example.guid)) + else: + features.append( + InputFeatures( + input_ids=input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + label_ids=label_ids, + document_id=example.guid)) + return features + + +def get_labels(path: str) -> List[str]: + if path: + with open(path, "r") as f: + labels = f.read().splitlines() + if "O" not in labels: + labels = ["O"] + labels + return labels + else: + return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] diff --git a/docs/algorithms.md b/docs/algorithms.md new file mode 100644 index 0000000..b5c3288 --- /dev/null +++ b/docs/algorithms.md @@ -0,0 +1,34 @@ + +# Algorithms + +Bunkai has two algorithms: ``tsunoda`` and ``bunkai``. + +## tsunoda + +An implementation described in [角田孝昭, "顧客レビューテキスト解析に基づく文書作成支援に関する研究", 筑波大学博士論文 (2016)](http://hdl.handle.net/2241/00143820). + +## bunkai + +It exploits [following annotators](../bunkai/algorithm/bunkai_sbd/annotator). + +- ``FaceMarkDetector`` + - detects a character span of face-marks. This is rule-based. The detected spans are excepted from SB candidates. +- ``EmotionExpressionAnnotator`` + - detects a character span of emotional expressions such as ``(笑)``. This is rule-based. The detected spans are excepted from SB candidates. +- ``BasicRule`` + - detects SB candidates based on rules. 
+- ``MorphAnnotatorJanome``
+    - runs a morphological analyzer on the input text.
+- ``EmojiAnnotator``
+    - detects character spans of Emoji characters. This module is rule-based. With the default rule, Emoji in the ``Smileys & Emotion`` and ``Symbols`` categories are SB candidates. See [this page](https://emojipedia.org/categories/) for an overview of Emoji categories.
+- ``IndirectQuoteExceptionAnnotator``
+    - detects spans of indirect quotations that have no explicit quotation marks ``「」``. SB candidates within indirect quotations are treated as exceptions. This process is rule-based and uses morphological information.
+- ``DotExpressionExceptionAnnotator``
+    - detects ``.`` characters between numbers, such as in ``1.2畳``. The detected characters are excluded from SB candidates.
+- ``NumberExceptionAnnotator``
+    - detects ``.`` characters in idiomatic expressions such as ``おすすめ度No.1``. The detected characters are excluded from SB candidates.
+- For line breaks
+    - ``LinebreakForceAnnotator`` (when the ``-m`` option is not given)
+        - detects an SB at every line break
+    - ``LinebreakExceptionAnnotator`` (when the ``-m`` option is given)
+        - classifies whether each line break is an SB or not
diff --git a/docs/training.md b/docs/training.md
new file mode 100644
index 0000000..2a40b0b
--- /dev/null
+++ b/docs/training.md
@@ -0,0 +1,30 @@
+
+# Training of lbd (Linebreak Disambiguator)
+
+## Preparing BCCWJ without newspaper texts
+
+```bash
+unzip ~/bccwj/disk.3/CORE_OT/core_M-XML.zip -d ~/data/3rd/bccwj/ver.1.1/core_nonumtrans_mxml/
+poetry run python3 -m bunkai.experiment.convert.bccwj -i ~/data/3rd/bccwj/ver.1.1/core_nonumtrans_mxml/core_M-XML -o ~/data/bccwj/files/
+
+mkdir -p ~/data/bccwj/lbd
+poetry run python3 -m bunkai.experiment.statics -i ~/data/bccwj/files -o ~/data/bccwj/lbd/stat.bccwj.jsonl
+
+# Generate data
+find ~/data/bccwj/files -type f | grep -v PN | sort | xargs cat | poetry run python3 -m bunkai.algorithm.lbd.corpus -o ~/data/bccwj/lbd/source-without-pn.jsonl
+poetry run python3 -m bunkai.algorithm.lbd.corpus -i ~/data/bccwj/lbd/source-without-pn.jsonl -o ~/data/bccwj/lbd/split --split
+```
+
+### Train with BERT
+
+```bash
+poetry run make lbd
+```
+
+### Train with DistilBERT
+
+```bash
+poetry run make lbd \
+    LBD_MODEL_DIR=~/data/bccwj/model/lbd-distlibert-model \
+    LBD_MODEL_NAME=bandainamco-mirai/distilbert-base-japanese
+```
diff --git a/example/example001.py b/example/example001.py
new file mode 100644
index 0000000..18ad59e
--- /dev/null
+++ b/example/example001.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+import typing
+
+
+def message(func: typing.Callable):
+    def wrapper(*args, **kwargs):
+        print(f'-- {func.__name__ } --')
+        func(*args, **kwargs)
+        print('-- END --')
+    return wrapper
+
+
+@message
+def example_basic_usage(input_text: str, path_newline_model: typing.Optional[str] = None):
+    from bunkai.algorithm.bunkai_sbd.bunkai_sbd import \
+        BunkaiSentenceBoundaryDisambiguation
+    bunkai = BunkaiSentenceBoundaryDisambiguation(path_model=path_newline_model)
+    iter_sentences = bunkai(input_text)
+    for sent in iter_sentences:
+        assert isinstance(sent, str)
+        print(sent)
+
+
+@message
+def example_eos_character_index(input_text: str, path_newline_model: typing.Optional[str] = None):
+    """How to get character index of end-of-sentence."""
+    from bunkai.algorithm.bunkai_sbd.bunkai_sbd import \
+        BunkaiSentenceBoundaryDisambiguation
+    bunkai = BunkaiSentenceBoundaryDisambiguation(path_model=path_newline_model)
+    iter_sentences = bunkai.find_eos(input_text)
+    sent_start_index: int = 0
+    for eos_index in iter_sentences:
+        print(f'sentence 
from:{sent_start_index} until:{eos_index} text={input_text[sent_start_index:eos_index]}') + sent_start_index = eos_index + + +@message +def example_morphological_analysis(input_text: str, path_newline_model: typing.Optional[str] = None): + """How to get morphemes during processes.""" + from bunkai.algorithm.bunkai_sbd.bunkai_sbd import \ + BunkaiSentenceBoundaryDisambiguation + bunkai = BunkaiSentenceBoundaryDisambiguation(path_model=path_newline_model) + # all analysis process is stored in Annotations object + annotation_obj = bunkai._eos(input_text) + tokens = annotation_obj.get_morph_analysis() + for token in tokens: + print(f'{token.word_surface},', end='') + print(end='\n') + + +@message +def example_error_analysis_during_process(input_text: str, path_newline_model: typing.Optional[str] = None): + """How to get objects after each layer.""" + from bunkai.algorithm.bunkai_sbd.bunkai_sbd import \ + BunkaiSentenceBoundaryDisambiguation + bunkai = BunkaiSentenceBoundaryDisambiguation(path_model=path_newline_model) + # all analysis process is stored in Annotations object + annotation_obj = bunkai._eos(input_text) + layers = annotation_obj.available_layers() + print(f'available layer names {layers}') + for l_name in layers: + spans = annotation_obj.get_annotation_layer(l_name) + print(l_name) + for span_ann_obj in spans: + print(f' ({span_ann_obj.start_index}, {span_ann_obj.end_index} {span_ann_obj.split_string_value}), ', end='') + print(end='\n') + + +if __name__ == '__main__': + PATH_NEWLINE_MODEL = None + input_text = '宿を予約しました♪!まだ2ヶ月も先だけど。早すぎかな(笑)楽しみです★' + example_basic_usage(input_text, PATH_NEWLINE_MODEL) + example_eos_character_index(input_text, PATH_NEWLINE_MODEL) + example_morphological_analysis(input_text, PATH_NEWLINE_MODEL) + example_error_analysis_during_process(input_text, PATH_NEWLINE_MODEL) diff --git a/mks/lbd.mk b/mks/lbd.mk new file mode 100644 index 0000000..15b6f88 --- /dev/null +++ b/mks/lbd.mk @@ -0,0 +1,27 @@ + +LBD_DATA_DIR:=~/data/bccwj/lbd/split + +# $LBD_MODEL_DIR:=~/data/bccwj/model/lbd-distlibert-model +# LBD_MODEL_NAME:=bandainamco-mirai/distilbert-base-japanese + +LBD_MODEL_DIR:=~/data/bccwj/model/lbd-bert-model +LBD_MODEL_NAME:=cl-tohoku/bert-base-japanese-whole-word-masking +LBD_TRAIN_OPT:= + +LBD_INPUT_TRAIN:=$(LBD_DATA_DIR)/train.jsonl +LBD_INPUT_TEST:=$(LBD_DATA_DIR)/test.txt +LBD_OUTPUT_MODEL:=$(LBD_MODEL_DIR)/out/pytorch_model.bin +$(LBD_OUTPUT_MODEL): $(LBD_INPUT_TRAIN) + poetry run python3 -m bunkai.algorithm.lbd.train \ + -i $(LBD_INPUT_TRAIN) -m $(LBD_MODEL_DIR)\ + --base $(LBD_MODEL_NAME) $(LBD_TRAIN_OPT) +lbd-train: $(LBD_OUTPUT_MODEL) +$(LBD_MODEL_DIR)/test.prediction: $(LBD_OUTPUT_MODEL) $(LBD_INPUT_TEST) + sed "s/│//g" $(LBD_INPUT_TEST) \ + | poetry run python3 -m bunkai.algorithm.lbd.predict \ + -m $(LBD_MODEL_DIR)/out -b 10 > $@ +$(LBD_MODEL_DIR)/test.score.jsonl: $(LBD_MODEL_DIR)/test.prediction $(LBD_INPUT_TEST) + python3 -m bunkai.experiment.evaluate --lb -i $< -g $(LBD_INPUT_TEST) > $@ +lbd-test: $(LBD_MODEL_DIR)/test.score.jsonl +lbd: lbd-train lbd-test + diff --git a/mks/lint.mk b/mks/lint.mk new file mode 100644 index 0000000..8c695cf --- /dev/null +++ b/mks/lint.mk @@ -0,0 +1,91 @@ + +GREP_EXCLUDE:=grep -v -e '\.eggs' -e '\.git' -e 'pyc$$' -e '\.mypy' -e '\.idea' -e '\./venv' -e 'python_env' -e 'egg-info' -e htmlcov -e 'work_scripts' -e mks +TERMS_CHECK_CMD:=grep -e split -e divide +TERMS_CHECK_CONTENT_OPTION:=-e 文分割 -e 'coding: utf' + +POETRY_NO_ROOT:= --no-root + +dev_setup: + poetry install $(POETRY_NO_ROOT) $(POETRY_OPTION) + 
+setup: + poetry install $(POETRY_OPTION) + +TARGET_DIRS:=./bunkai ./tests ./example + +flake8: + find $(TARGET_DIRS) *.py | grep -v third | grep '\.py$$' | xargs flake8 +autopep8: + find $(TARGET_DIRS) *.py | grep -v third | grep '\.py$$' | xargs autopep8 -d | diff /dev/null - +mypy: + find $(TARGET_DIRS) *.py | grep -v third | grep '\.py$$' | xargs mypy --python-version 3.7 --check-untyped-defs --strict-equality --no-implicit-optional +isort: + find $(TARGET_DIRS) *.py | grep -v third | grep '\.py$$' | xargs isort --diff | diff /dev/null - +pydocstyle: + pydocstyle $(TARGET_DIRS) --ignore=D100,D101,D102,D103,D104,D105,D107,D203,D212 + +jsonlint: + find .*json $(TARGET_DIRS) -type f | grep -v 'mypy_cache' | grep '\.jsonl$$' | sort |xargs cat | python3 -c 'import sys,json; [json.loads(line) for line in sys.stdin]' + find .*json $(TARGET_DIRS) -type f | grep -v 'mypy_cache' | grep '\.json$$' | sort |xargs -n 1 -t python3 -m json.tool > /dev/null + find .*json $(TARGET_DIRS) -type f | grep -v 'mypy_cache' | grep '\.json$$' | sort |xargs -n 1 -t jsonlint + python3 -c "import sys,json;print(json.dumps(json.loads(sys.stdin.read()),indent=4,ensure_ascii=False,sort_keys=True))" < .markdownlint.json | diff -q - .markdownlint.json + +yamllint: + find . -name '*.yml' -type f | xargs yamllint --no-warnings + +terms_check_path: + # check some words are not included in file name + find . | $(GREP_EXCLUDE) | $(TERMS_CHECK_CMD); if [ $$? -eq 1 ]; then echo "pass term check"; else exit 1; fi + +term_check_method: + # check if some terms are wrote in python method names + find . -type f | grep -v -e 'git' -e 'idea' -e 'mypy' -e 'python_env' | grep -e 'py$$' | xargs -L 1 python3 .circleci/show_method_names.py + +term_check_file_content: + # check if 文分割 is written somewhere in a file. + find . -type f | $(GREP_EXCLUDE) | grep -v -e 'Makefile' -e 'show_method_names.py' -e 'example.py' | xargs grep $(TERMS_CHECK_CONTENT_OPTION); last_var=$$? ; if [ $${last_var} -eq 1 ] || [ $${last_var} -eq 123 ]; then echo "pass file content"; else exit 1; fi + +check_firstline: + find . -type f | grep -v -e 'git' -e 'idea' -e 'mypy' -e 'python_env' | grep -e 'py$$' | grep -v '__init__' | grep -v third | xargs python3 .circleci/check_head.py + + +lint: flake8 autopep8 mypy isort yamllint terms_check_path term_check_method term_check_file_content check_firstline pydocstyle + +_run_isort: + isort -rc . + +_coverage: + ulimit -n 1000 && coverage run -m unittest discover tests + +GOLD_SAMPLE:=tests/sample.gold.txt +check_sample: + sed "s/│//g" $(GOLD_SAMPLE) | poetry run bunkai | diff $(GOLD_SAMPLE) - + + +test: _coverage check_sample + +test-coverage: test + coverage report && coverage html + +CC_REPORTER_VERSION:=0.6.3 +setup-cc: + mkdir -p ~/.local/bin-cc + curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-$(CC_REPORTER_VERSION)-linux-amd64 > ~/.local/bin-cc/cc-test-reporter + chmod +x ~/.local/bin-cc/cc-test-reporter + ~/.local/bin-cc/cc-test-reporter before-build + +test-cc: test + coverage xml && \ + ~/.local/bin-cc/cc-test-reporter after-build\ + --coverage-input-type coverage.py\ + --exit-code $$? + +setup_node_module: + npm install markdownlint-cli + +lint_markdown: + find . 
-type d -o -type f -name '*.md' -print \ + | grep -v node_modules \ + | xargs npx markdownlint --config ./.markdownlint.json + + diff --git a/mks/vanilla.mk b/mks/vanilla.mk new file mode 100644 index 0000000..733f28b --- /dev/null +++ b/mks/vanilla.mk @@ -0,0 +1,20 @@ + +BCCWJ_DATA_DIR:=~/data/bccwj/files +BCCWJ_GOLD:=$(BCCWJ_DATA_DIR).without-pn.txt +$(BCCWJ_GOLD): $(BCCWJ_DATA_DIR) + find $< -type f | grep -v PN | sort | xargs cat \ + | poetry run python3 -m bunkai.experiment.evaluate --trim -i /dev/stdin -o $@ +VANILLA_MODEL_DIR:=~/data/bccwj/model-vanilla +VANILLA_TEST:=$(VANILLA_MODEL_DIR)/test.prediction +$(VANILLA_TEST): $(BCCWJ_GOLD) + mkdir -p $(dir $@) \ + && sed "s/│//g" $(BCCWJ_GOLD) | poetry run bunkai > $@ +VANILLA_TEST_TRIM:=$(VANILLA_MODEL_DIR)/test.prediction.trim +$(VANILLA_TEST_TRIM): $(VANILLA_TEST) + poetry run python3 -m bunkai.experiment.evaluate --trim -i $< -o $@ + +VANILLA_TEST_EVAL:=$(VANILLA_MODEL_DIR)/test.score.jsonl +$(VANILLA_TEST_EVAL): $(VANILLA_TEST_TRIM) + python3 -m bunkai.experiment.evaluate -i $< -g $(BCCWJ_GOLD) -o $@ +vanilla-test: $(VANILLA_TEST_EVAL) + diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..9b4c1cb --- /dev/null +++ b/mypy.ini @@ -0,0 +1,3 @@ +[mypy] +ignore_missing_imports = True +allow_redefinition = True diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..60f23b0 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,1092 @@ +[[package]] +name = "autopep8" +version = "1.5.6" +description = "A tool that automatically formats Python code to conform to the PEP 8 style guide" +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +pycodestyle = ">=2.7.0" +toml = "*" + +[[package]] +name = "certifi" +version = "2020.12.5" +description = "Python package for providing Mozilla's CA Bundle." 
+category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "chardet" +version = "4.0.0" +description = "Universal encoding detector for Python 2 and 3" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "click" +version = "7.1.2" +description = "Composable command line interface toolkit" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "coverage" +version = "5.5" +description = "Code coverage measurement for Python" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" + +[package.extras] +toml = ["toml"] + +[[package]] +name = "dataclasses-json" +version = "0.5.2" +description = "Easily serialize dataclasses to and from JSON" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +marshmallow = ">=3.3.0,<4.0.0" +marshmallow-enum = ">=1.5.1,<2.0.0" +stringcase = "1.2.0" +typing-inspect = ">=0.4.0" + +[package.extras] +dev = ["pytest", "ipython", "mypy (>=0.710)", "hypothesis", "portray", "flake8", "simplejson"] + +[[package]] +name = "demjson" +version = "2.2.4" +description = "encoder, decoder, and lint/validator for JSON (JavaScript Object Notation) compliant with RFC 7159" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "emoji" +version = "1.2.0" +description = "Emoji for Python" +category = "main" +optional = false +python-versions = "*" + +[package.extras] +dev = ["pytest", "coverage", "coveralls"] + +[[package]] +name = "emojis" +version = "0.6.0" +description = "Emojis for Python" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "filelock" +version = "3.0.12" +description = "A platform independent file lock." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "flake8" +version = "3.9.1" +description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" + +[package.dependencies] +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} +mccabe = ">=0.6.0,<0.7.0" +pycodestyle = ">=2.7.0,<2.8.0" +pyflakes = ">=2.3.0,<2.4.0" + +[[package]] +name = "idna" +version = "2.10" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "importlib-metadata" +version = "4.0.1" +description = "Read metadata from Python packages" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} +zipp = ">=0.5" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] +testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pep517", "pyfakefs", "flufl.flake8", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"] + +[[package]] +name = "isort" +version = "5.8.0" +description = "A Python utility / library to sort Python imports." 
+category = "dev" +optional = false +python-versions = ">=3.6,<4.0" + +[package.extras] +pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +requirements_deprecated_finder = ["pipreqs", "pip-api"] +colors = ["colorama (>=0.4.3,<0.5.0)"] + +[[package]] +name = "janome" +version = "0.4.1" +description = "Japanese morphological analysis engine." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "joblib" +version = "1.0.1" +description = "Lightweight pipelining with Python functions" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "marshmallow" +version = "3.11.1" +description = "A lightweight library for converting complex datatypes to and from native Python datatypes." +category = "main" +optional = false +python-versions = ">=3.5" + +[package.extras] +dev = ["pytest", "pytz", "simplejson", "mypy (==0.812)", "flake8 (==3.9.0)", "flake8-bugbear (==21.3.2)", "pre-commit (>=2.4,<3.0)", "tox"] +docs = ["sphinx (==3.4.3)", "sphinx-issues (==1.2.0)", "alabaster (==0.7.12)", "sphinx-version-warning (==1.1.2)", "autodocsumm (==0.2.2)"] +lint = ["mypy (==0.812)", "flake8 (==3.9.0)", "flake8-bugbear (==21.3.2)", "pre-commit (>=2.4,<3.0)"] +tests = ["pytest", "pytz", "simplejson"] + +[[package]] +name = "marshmallow-enum" +version = "1.5.1" +description = "Enum field for Marshmallow" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +marshmallow = ">=2.0.0" + +[[package]] +name = "mccabe" +version = "0.6.1" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "mock" +version = "4.0.3" +description = "Rolling backport of unittest.mock for all Pythons" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +build = ["twine", "wheel", "blurb"] +docs = ["sphinx"] +test = ["pytest (<5.4)", "pytest-cov"] + +[[package]] +name = "more-itertools" +version = "8.7.0" +description = "More routines for operating on iterables, beyond itertools" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "mypy" +version = "0.812" +description = "Optional static typing for Python" +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +mypy-extensions = ">=0.4.3,<0.5.0" +typed-ast = ">=1.4.0,<1.5.0" +typing-extensions = ">=3.7.4" + +[package.extras] +dmypy = ["psutil (>=4.0)"] + +[[package]] +name = "mypy-extensions" +version = "0.4.3" +description = "Experimental type system extensions for programs checked with the mypy typechecker." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "numpy" +version = "1.20.2" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "packaging" +version = "20.9" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +pyparsing = ">=2.0.2" + +[[package]] +name = "pathspec" +version = "0.8.1" +description = "Utility library for gitignore style pattern matching of file paths." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "pycodestyle" +version = "2.7.0" +description = "Python style guide checker" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pydocstyle" +version = "6.0.0" +description = "Python docstring style checker" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +snowballstemmer = "*" + +[[package]] +name = "pyflakes" +version = "2.3.1" +description = "passive checker of Python programs" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyparsing" +version = "2.4.7" +description = "Python parsing module" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "pyyaml" +version = "5.4.1" +description = "YAML parser and emitter for Python" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" + +[[package]] +name = "regex" +version = "2021.4.4" +description = "Alternative regular expression module, to replace re." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "requests" +version = "2.25.1" +description = "Python HTTP for Humans." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.dependencies] +certifi = ">=2017.4.17" +chardet = ">=3.0.2,<5" +idna = ">=2.5,<3" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"] +socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] + +[[package]] +name = "sacremoses" +version = "0.0.45" +description = "SacreMoses" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +click = "*" +joblib = "*" +regex = "*" +six = "*" +tqdm = "*" + +[[package]] +name = "scikit-learn" +version = "0.24.1" +description = "A set of python modules for machine learning and data mining" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +joblib = ">=0.11" +numpy = ">=1.13.3" +scipy = ">=0.19.1" +threadpoolctl = ">=2.0.0" + +[package.extras] +benchmark = ["matplotlib (>=2.1.1)", "pandas (>=0.25.0)", "memory-profiler (>=0.57.0)"] +docs = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)", "memory-profiler (>=0.57.0)", "sphinx (>=3.2.0)", "sphinx-gallery (>=0.7.0)", "numpydoc (>=1.0.0)", "Pillow (>=7.1.2)", "sphinx-prompt (>=1.3.0)"] +examples = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)"] +tests = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "flake8 (>=3.8.2)", "mypy (>=0.770)", "pyamg (>=4.0.0)"] + +[[package]] +name = "scipy" +version = "1.6.1" +description = "SciPy: Scientific Library for Python" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +numpy = ">=1.16.5" + +[[package]] +name = "seqeval" +version = "1.2.2" +description = "Testing framework for sequence labeling" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +numpy = ">=1.14.0" +scikit-learn = ">=0.21.3" + +[[package]] +name = "six" +version = "1.15.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = 
false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "snowballstemmer" +version = "2.1.0" +description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "spans" +version = "1.1.0" +description = "Continuous set support for Python" +category = "main" +optional = false +python-versions = "*" + +[package.extras] +dev = ["codecov", "pytest (>=3.0)", "pytest-cov", "twine", "sphinx", "sphinx-rtd-theme"] + +[[package]] +name = "stringcase" +version = "1.2.0" +description = "String case converter." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "threadpoolctl" +version = "2.1.0" +description = "threadpoolctl" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "tokenizers" +version = "0.10.2" +description = "Fast and Customizable Tokenizers" +category = "main" +optional = false +python-versions = "*" + +[package.extras] +testing = ["pytest"] + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "torch" +version = "1.8.1" +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +category = "main" +optional = false +python-versions = ">=3.6.2" + +[package.dependencies] +numpy = "*" +typing-extensions = "*" + +[[package]] +name = "tqdm" +version = "4.60.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +telegram = ["requests"] + +[[package]] +name = "transformers" +version = "4.5.1" +description = "State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch" +category = "main" +optional = false +python-versions = ">=3.6.0" + +[package.dependencies] +filelock = "*" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} +numpy = ">=1.17" +packaging = "*" +regex = "!=2019.12.17" +requests = "*" +sacremoses = "*" +tokenizers = ">=0.10.1,<0.11" +tqdm = ">=4.27" + +[package.extras] +all = ["tensorflow (>=2.3)", "onnxconverter-common", "keras2onnx", "torch (>=1.0)", "jax (>=0.2.8)", "jaxlib (>=0.1.59)", "flax (>=0.3.2)", "sentencepiece (==0.1.91)", "protobuf", "tokenizers (>=0.10.1,<0.11)", "soundfile", "torchaudio", "pillow"] +dev = ["tensorflow (>=2.3)", "onnxconverter-common", "keras2onnx", "torch (>=1.0)", "jax (>=0.2.8)", "jaxlib (>=0.1.59)", "flax (>=0.3.2)", "sentencepiece (==0.1.91)", "protobuf", "tokenizers (>=0.10.1,<0.11)", "soundfile", "torchaudio", "pillow", "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black (>=20.8b1)", "faiss-cpu", "cookiecutter (==1.7.2)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "docutils (==0.16.0)", "recommonmark", "sphinx (==3.2.1)", "sphinx-markdown-tables", "sphinx-rtd-theme (==0.4.3)", "sphinx-copybutton", "sphinxext-opengraph (==0.4.1)", "scikit-learn"] +docs = ["docutils (==0.16.0)", "recommonmark", "sphinx (==3.2.1)", "sphinx-markdown-tables", "sphinx-rtd-theme (==0.4.3)", "sphinx-copybutton", "sphinxext-opengraph (==0.4.1)"] +flax = ["jax (>=0.2.8)", "jaxlib 
(>=0.1.59)", "flax (>=0.3.2)"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)"] +modelcreation = ["cookiecutter (==1.7.2)"] +onnx = ["onnxconverter-common", "keras2onnx", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] +onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] +quality = ["black (>=20.8b1)", "isort (>=5.5.4)", "flake8 (>=3.8.3)"] +retrieval = ["faiss-cpu", "datasets"] +sagemaker = ["sagemaker (>=2.31.0)"] +sentencepiece = ["sentencepiece (==0.1.91)", "protobuf"] +serving = ["pydantic", "uvicorn", "fastapi", "starlette"] +sklearn = ["scikit-learn"] +speech = ["soundfile", "torchaudio"] +testing = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black (>=20.8b1)", "faiss-cpu", "cookiecutter (==1.7.2)"] +tf = ["tensorflow (>=2.3)", "onnxconverter-common", "keras2onnx"] +tf-cpu = ["tensorflow-cpu (>=2.3)", "onnxconverter-common", "keras2onnx"] +tokenizers = ["tokenizers (>=0.10.1,<0.11)"] +torch = ["torch (>=1.0)"] +torchhub = ["filelock", "importlib-metadata", "numpy (>=1.17)", "packaging", "protobuf", "regex (!=2019.12.17)", "requests", "sacremoses", "sentencepiece (==0.1.91)", "torch (>=1.0)", "tokenizers (>=0.10.1,<0.11)", "tqdm (>=4.27)"] +vision = ["pillow"] + +[[package]] +name = "typed-ast" +version = "1.4.3" +description = "a fork of Python 2 and 3 ast modules with type comment support" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "typing-extensions" +version = "3.7.4.3" +description = "Backported and Experimental Type Hints for Python 3.5+" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "typing-inspect" +version = "0.6.0" +description = "Runtime inspection utilities for typing module." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mypy-extensions = ">=0.3.0" +typing-extensions = ">=3.7.4" + +[[package]] +name = "urllib3" +version = "1.26.4" +description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" + +[package.extras] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +brotli = ["brotlipy (>=0.6.0)"] + +[[package]] +name = "yamllint" +version = "1.26.1" +description = "A linter for YAML files." 
+category = "dev" +optional = false +python-versions = ">=3.5.*" + +[package.dependencies] +pathspec = ">=0.5.3" +pyyaml = "*" + +[[package]] +name = "zipp" +version = "3.4.1" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] +testing = ["pytest (>=4.6)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] + +[metadata] +lock-version = "1.1" +python-versions = "^3.7" +content-hash = "7c344a0fac8dab70802ffce64ad2a66005d419ad0b4e79f3d2771fbf09390e95" + +[metadata.files] +autopep8 = [ + {file = "autopep8-1.5.6-py2.py3-none-any.whl", hash = "sha256:f01b06a6808bc31698db907761e5890eb2295e287af53f6693b39ce55454034a"}, + {file = "autopep8-1.5.6.tar.gz", hash = "sha256:5454e6e9a3d02aae38f866eec0d9a7de4ab9f93c10a273fb0340f3d6d09f7514"}, +] +certifi = [ + {file = "certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"}, + {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"}, +] +chardet = [ + {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, + {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, +] +click = [ + {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"}, + {file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"}, +] +coverage = [ + {file = "coverage-5.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:b6d534e4b2ab35c9f93f46229363e17f63c53ad01330df9f2d6bd1187e5eaacf"}, + {file = "coverage-5.5-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:b7895207b4c843c76a25ab8c1e866261bcfe27bfaa20c192de5190121770672b"}, + {file = "coverage-5.5-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:c2723d347ab06e7ddad1a58b2a821218239249a9e4365eaff6649d31180c1669"}, + {file = "coverage-5.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:900fbf7759501bc7807fd6638c947d7a831fc9fdf742dc10f02956ff7220fa90"}, + {file = "coverage-5.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:004d1880bed2d97151facef49f08e255a20ceb6f9432df75f4eef018fdd5a78c"}, + {file = "coverage-5.5-cp27-cp27m-win32.whl", hash = "sha256:06191eb60f8d8a5bc046f3799f8a07a2d7aefb9504b0209aff0b47298333302a"}, + {file = "coverage-5.5-cp27-cp27m-win_amd64.whl", hash = "sha256:7501140f755b725495941b43347ba8a2777407fc7f250d4f5a7d2a1050ba8e82"}, + {file = "coverage-5.5-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:372da284cfd642d8e08ef606917846fa2ee350f64994bebfbd3afb0040436905"}, + {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, + {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, + {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, + {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, + {file = 
"coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, + {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, + {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, + {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, + {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, + {file = "coverage-5.5-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:6c90e11318f0d3c436a42409f2749ee1a115cd8b067d7f14c148f1ce5574d701"}, + {file = "coverage-5.5-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:30c77c1dc9f253283e34c27935fded5015f7d1abe83bc7821680ac444eaf7793"}, + {file = "coverage-5.5-cp35-cp35m-win32.whl", hash = "sha256:9a1ef3b66e38ef8618ce5fdc7bea3d9f45f3624e2a66295eea5e57966c85909e"}, + {file = "coverage-5.5-cp35-cp35m-win_amd64.whl", hash = "sha256:972c85d205b51e30e59525694670de6a8a89691186012535f9d7dbaa230e42c3"}, + {file = "coverage-5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:af0e781009aaf59e25c5a678122391cb0f345ac0ec272c7961dc5455e1c40066"}, + {file = "coverage-5.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:74d881fc777ebb11c63736622b60cb9e4aee5cace591ce274fb69e582a12a61a"}, + {file = "coverage-5.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:92b017ce34b68a7d67bd6d117e6d443a9bf63a2ecf8567bb3d8c6c7bc5014465"}, + {file = "coverage-5.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:d636598c8305e1f90b439dbf4f66437de4a5e3c31fdf47ad29542478c8508bbb"}, + {file = "coverage-5.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:41179b8a845742d1eb60449bdb2992196e211341818565abded11cfa90efb821"}, + {file = "coverage-5.5-cp36-cp36m-win32.whl", hash = "sha256:040af6c32813fa3eae5305d53f18875bedd079960822ef8ec067a66dd8afcd45"}, + {file = "coverage-5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:5fec2d43a2cc6965edc0bb9e83e1e4b557f76f843a77a2496cbe719583ce8184"}, + {file = "coverage-5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:18ba8bbede96a2c3dde7b868de9dcbd55670690af0988713f0603f037848418a"}, + {file = "coverage-5.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2910f4d36a6a9b4214bb7038d537f015346f413a975d57ca6b43bf23d6563b53"}, + {file = "coverage-5.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f0b278ce10936db1a37e6954e15a3730bea96a0997c26d7fee88e6c396c2086d"}, + {file = "coverage-5.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:796c9c3c79747146ebd278dbe1e5c5c05dd6b10cc3bcb8389dfdf844f3ead638"}, + {file = "coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:53194af30d5bad77fcba80e23a1441c71abfb3e01192034f8246e0d8f99528f3"}, + {file = "coverage-5.5-cp37-cp37m-win32.whl", hash = "sha256:184a47bbe0aa6400ed2d41d8e9ed868b8205046518c52464fde713ea06e3a74a"}, + {file = "coverage-5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2949cad1c5208b8298d5686d5a85b66aae46d73eec2c3e08c817dd3513e5848a"}, + {file = "coverage-5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:217658ec7187497e3f3ebd901afdca1af062b42cfe3e0dafea4cced3983739f6"}, + {file = "coverage-5.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1aa846f56c3d49205c952d8318e76ccc2ae23303351d9270ab220004c580cfe2"}, + {file = "coverage-5.5-cp38-cp38-manylinux1_x86_64.whl", hash = 
"sha256:24d4a7de75446be83244eabbff746d66b9240ae020ced65d060815fac3423759"}, + {file = "coverage-5.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d1f8bf7b90ba55699b3a5e44930e93ff0189aa27186e96071fac7dd0d06a1873"}, + {file = "coverage-5.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:970284a88b99673ccb2e4e334cfb38a10aab7cd44f7457564d11898a74b62d0a"}, + {file = "coverage-5.5-cp38-cp38-win32.whl", hash = "sha256:01d84219b5cdbfc8122223b39a954820929497a1cb1422824bb86b07b74594b6"}, + {file = "coverage-5.5-cp38-cp38-win_amd64.whl", hash = "sha256:2e0d881ad471768bf6e6c2bf905d183543f10098e3b3640fc029509530091502"}, + {file = "coverage-5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1f9ce122f83b2305592c11d64f181b87153fc2c2bbd3bb4a3dde8303cfb1a6b"}, + {file = "coverage-5.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:13c4ee887eca0f4c5a247b75398d4114c37882658300e153113dafb1d76de529"}, + {file = "coverage-5.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52596d3d0e8bdf3af43db3e9ba8dcdaac724ba7b5ca3f6358529d56f7a166f8b"}, + {file = "coverage-5.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2cafbbb3af0733db200c9b5f798d18953b1a304d3f86a938367de1567f4b5bff"}, + {file = "coverage-5.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:44d654437b8ddd9eee7d1eaee28b7219bec228520ff809af170488fd2fed3e2b"}, + {file = "coverage-5.5-cp39-cp39-win32.whl", hash = "sha256:d314ed732c25d29775e84a960c3c60808b682c08d86602ec2c3008e1202e3bb6"}, + {file = "coverage-5.5-cp39-cp39-win_amd64.whl", hash = "sha256:13034c4409db851670bc9acd836243aeee299949bd5673e11844befcb0149f03"}, + {file = "coverage-5.5-pp36-none-any.whl", hash = "sha256:f030f8873312a16414c0d8e1a1ddff2d3235655a2174e3648b4fa66b3f2f1079"}, + {file = "coverage-5.5-pp37-none-any.whl", hash = "sha256:2a3859cb82dcbda1cfd3e6f71c27081d18aa251d20a17d87d26d4cd216fb0af4"}, + {file = "coverage-5.5.tar.gz", hash = "sha256:ebe78fe9a0e874362175b02371bdfbee64d8edc42a044253ddf4ee7d3c15212c"}, +] +dataclasses-json = [ + {file = "dataclasses-json-0.5.2.tar.gz", hash = "sha256:56ec931959ede74b5dedf65cf20772e6a79764d20c404794cce0111c88c085ff"}, + {file = "dataclasses_json-0.5.2-py3-none-any.whl", hash = "sha256:b746c48d9d8e884e2a0ffa59c6220a1b21f94d4f9f12c839da0a8a0efd36dc19"}, +] +demjson = [ + {file = "demjson-2.2.4.tar.gz", hash = "sha256:31de2038a0fdd9c4c11f8bf3b13fe77bc2a128307f965c8d5fb4dc6d6f6beb79"}, +] +emoji = [ + {file = "emoji-1.2.0-py3-none-any.whl", hash = "sha256:6b19b65da8d6f30551eead1705539cc0eadcd9e33a6ecbc421a29b87f96287eb"}, + {file = "emoji-1.2.0.tar.gz", hash = "sha256:496f432058567985838c13d67dde84ca081614a8286c0b9cdc7d63dfa89d51a3"}, +] +emojis = [ + {file = "emojis-0.6.0-py3-none-any.whl", hash = "sha256:7da34c8a78ae262fd68cef9e2c78a3c1feb59784489eeea0f54ba1d4b7111c7c"}, + {file = "emojis-0.6.0.tar.gz", hash = "sha256:bf605d1f1a27a81cd37fe82eb65781c904467f569295a541c33710b97e4225ec"}, +] +filelock = [ + {file = "filelock-3.0.12-py3-none-any.whl", hash = "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"}, + {file = "filelock-3.0.12.tar.gz", hash = "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59"}, +] +flake8 = [ + {file = "flake8-3.9.1-py2.py3-none-any.whl", hash = "sha256:3b9f848952dddccf635be78098ca75010f073bfe14d2c6bda867154bea728d2a"}, + {file = "flake8-3.9.1.tar.gz", hash = "sha256:1aa8990be1e689d96c745c5682b687ea49f2e05a443aff1f8251092b0014e378"}, +] +idna = [ + {file = "idna-2.10-py2.py3-none-any.whl", hash = 
"sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"}, + {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"}, +] +importlib-metadata = [ + {file = "importlib_metadata-4.0.1-py3-none-any.whl", hash = "sha256:d7eb1dea6d6a6086f8be21784cc9e3bcfa55872b52309bc5fad53a8ea444465d"}, + {file = "importlib_metadata-4.0.1.tar.gz", hash = "sha256:8c501196e49fb9df5df43833bdb1e4328f64847763ec8a50703148b73784d581"}, +] +isort = [ + {file = "isort-5.8.0-py3-none-any.whl", hash = "sha256:2bb1680aad211e3c9944dbce1d4ba09a989f04e238296c87fe2139faa26d655d"}, + {file = "isort-5.8.0.tar.gz", hash = "sha256:0a943902919f65c5684ac4e0154b1ad4fac6dcaa5d9f3426b732f1c8b5419be6"}, +] +janome = [ + {file = "Janome-0.4.1-py2.py3-none-any.whl", hash = "sha256:a650e2684e80af72f869eff17566f31dd4444f5443c4771dca1ada60cea5c251"}, + {file = "Janome-0.4.1.tar.gz", hash = "sha256:6c2c38d894014d57cb3151265c11146506ead3b3bc290898adc33711711612de"}, +] +joblib = [ + {file = "joblib-1.0.1-py3-none-any.whl", hash = "sha256:feeb1ec69c4d45129954f1b7034954241eedfd6ba39b5e9e4b6883be3332d5e5"}, + {file = "joblib-1.0.1.tar.gz", hash = "sha256:9c17567692206d2f3fb9ecf5e991084254fe631665c450b443761c4186a613f7"}, +] +marshmallow = [ + {file = "marshmallow-3.11.1-py2.py3-none-any.whl", hash = "sha256:0dd42891a5ef288217ed6410917f3c6048f585f8692075a0052c24f9bfff9dfd"}, + {file = "marshmallow-3.11.1.tar.gz", hash = "sha256:16e99cb7f630c0ef4d7d364ed0109ac194268dde123966076ab3dafb9ae3906b"}, +] +marshmallow-enum = [ + {file = "marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58"}, + {file = "marshmallow_enum-1.5.1-py2.py3-none-any.whl", hash = "sha256:57161ab3dbfde4f57adeb12090f39592e992b9c86d206d02f6bd03ebec60f072"}, +] +mccabe = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] +mock = [ + {file = "mock-4.0.3-py3-none-any.whl", hash = "sha256:122fcb64ee37cfad5b3f48d7a7d51875d7031aaf3d8be7c42e2bee25044eee62"}, + {file = "mock-4.0.3.tar.gz", hash = "sha256:7d3fbbde18228f4ff2f1f119a45cdffa458b4c0dee32eb4d2bb2f82554bac7bc"}, +] +more-itertools = [ + {file = "more-itertools-8.7.0.tar.gz", hash = "sha256:c5d6da9ca3ff65220c3bfd2a8db06d698f05d4d2b9be57e1deb2be5a45019713"}, + {file = "more_itertools-8.7.0-py3-none-any.whl", hash = "sha256:5652a9ac72209ed7df8d9c15daf4e1aa0e3d2ccd3c87f8265a0673cd9cbc9ced"}, +] +mypy = [ + {file = "mypy-0.812-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a26f8ec704e5a7423c8824d425086705e381b4f1dfdef6e3a1edab7ba174ec49"}, + {file = "mypy-0.812-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:28fb5479c494b1bab244620685e2eb3c3f988d71fd5d64cc753195e8ed53df7c"}, + {file = "mypy-0.812-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:9743c91088d396c1a5a3c9978354b61b0382b4e3c440ce83cf77994a43e8c521"}, + {file = "mypy-0.812-cp35-cp35m-win_amd64.whl", hash = "sha256:d7da2e1d5f558c37d6e8c1246f1aec1e7349e4913d8fb3cb289a35de573fe2eb"}, + {file = "mypy-0.812-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4eec37370483331d13514c3f55f446fc5248d6373e7029a29ecb7b7494851e7a"}, + {file = "mypy-0.812-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d65cc1df038ef55a99e617431f0553cd77763869eebdf9042403e16089fe746c"}, + {file = "mypy-0.812-cp36-cp36m-manylinux2010_x86_64.whl", hash = 
"sha256:61a3d5b97955422964be6b3baf05ff2ce7f26f52c85dd88db11d5e03e146a3a6"}, + {file = "mypy-0.812-cp36-cp36m-win_amd64.whl", hash = "sha256:25adde9b862f8f9aac9d2d11971f226bd4c8fbaa89fb76bdadb267ef22d10064"}, + {file = "mypy-0.812-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:552a815579aa1e995f39fd05dde6cd378e191b063f031f2acfe73ce9fb7f9e56"}, + {file = "mypy-0.812-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:499c798053cdebcaa916eef8cd733e5584b5909f789de856b482cd7d069bdad8"}, + {file = "mypy-0.812-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:5873888fff1c7cf5b71efbe80e0e73153fe9212fafdf8e44adfe4c20ec9f82d7"}, + {file = "mypy-0.812-cp37-cp37m-win_amd64.whl", hash = "sha256:9f94aac67a2045ec719ffe6111df543bac7874cee01f41928f6969756e030564"}, + {file = "mypy-0.812-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d23e0ea196702d918b60c8288561e722bf437d82cb7ef2edcd98cfa38905d506"}, + {file = "mypy-0.812-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:674e822aa665b9fd75130c6c5f5ed9564a38c6cea6a6432ce47eafb68ee578c5"}, + {file = "mypy-0.812-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:abf7e0c3cf117c44d9285cc6128856106183938c68fd4944763003decdcfeb66"}, + {file = "mypy-0.812-cp38-cp38-win_amd64.whl", hash = "sha256:0d0a87c0e7e3a9becdfbe936c981d32e5ee0ccda3e0f07e1ef2c3d1a817cf73e"}, + {file = "mypy-0.812-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7ce3175801d0ae5fdfa79b4f0cfed08807af4d075b402b7e294e6aa72af9aa2a"}, + {file = "mypy-0.812-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:b09669bcda124e83708f34a94606e01b614fa71931d356c1f1a5297ba11f110a"}, + {file = "mypy-0.812-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:33f159443db0829d16f0a8d83d94df3109bb6dd801975fe86bacb9bf71628e97"}, + {file = "mypy-0.812-cp39-cp39-win_amd64.whl", hash = "sha256:3f2aca7f68580dc2508289c729bd49ee929a436208d2b2b6aab15745a70a57df"}, + {file = "mypy-0.812-py3-none-any.whl", hash = "sha256:2f9b3407c58347a452fc0736861593e105139b905cca7d097e413453a1d650b4"}, + {file = "mypy-0.812.tar.gz", hash = "sha256:cd07039aa5df222037005b08fbbfd69b3ab0b0bd7a07d7906de75ae52c4e3119"}, +] +mypy-extensions = [ + {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, + {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, +] +numpy = [ + {file = "numpy-1.20.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e9459f40244bb02b2f14f6af0cd0732791d72232bbb0dc4bab57ef88e75f6935"}, + {file = "numpy-1.20.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:a8e6859913ec8eeef3dbe9aed3bf475347642d1cdd6217c30f28dee8903528e6"}, + {file = "numpy-1.20.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:9cab23439eb1ebfed1aaec9cd42b7dc50fc96d5cd3147da348d9161f0501ada5"}, + {file = "numpy-1.20.2-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:9c0fab855ae790ca74b27e55240fe4f2a36a364a3f1ebcfd1fb5ac4088f1cec3"}, + {file = "numpy-1.20.2-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:61d5b4cf73622e4d0c6b83408a16631b670fc045afd6540679aa35591a17fe6d"}, + {file = "numpy-1.20.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d15007f857d6995db15195217afdbddfcd203dfaa0ba6878a2f580eaf810ecd6"}, + {file = "numpy-1.20.2-cp37-cp37m-win32.whl", hash = "sha256:d76061ae5cab49b83a8cf3feacefc2053fac672728802ac137dd8c4123397677"}, + {file = "numpy-1.20.2-cp37-cp37m-win_amd64.whl", hash = "sha256:bad70051de2c50b1a6259a6df1daaafe8c480ca98132da98976d8591c412e737"}, + {file = 
"numpy-1.20.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:719656636c48be22c23641859ff2419b27b6bdf844b36a2447cb39caceb00935"}, + {file = "numpy-1.20.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:aa046527c04688af680217fffac61eec2350ef3f3d7320c07fd33f5c6e7b4d5f"}, + {file = "numpy-1.20.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:2428b109306075d89d21135bdd6b785f132a1f5a3260c371cee1fae427e12727"}, + {file = "numpy-1.20.2-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:e8e4fbbb7e7634f263c5b0150a629342cc19b47c5eba8d1cd4363ab3455ab576"}, + {file = "numpy-1.20.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:edb1f041a9146dcf02cd7df7187db46ab524b9af2515f392f337c7cbbf5b52cd"}, + {file = "numpy-1.20.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:c73a7975d77f15f7f68dacfb2bca3d3f479f158313642e8ea9058eea06637931"}, + {file = "numpy-1.20.2-cp38-cp38-win32.whl", hash = "sha256:6c915ee7dba1071554e70a3664a839fbc033e1d6528199d4621eeaaa5487ccd2"}, + {file = "numpy-1.20.2-cp38-cp38-win_amd64.whl", hash = "sha256:471c0571d0895c68da309dacee4e95a0811d0a9f9f532a48dc1bea5f3b7ad2b7"}, + {file = "numpy-1.20.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4703b9e937df83f5b6b7447ca5912b5f5f297aba45f91dbbbc63ff9278c7aa98"}, + {file = "numpy-1.20.2-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:abc81829c4039e7e4c30f7897938fa5d4916a09c2c7eb9b244b7a35ddc9656f4"}, + {file = "numpy-1.20.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:377751954da04d4a6950191b20539066b4e19e3b559d4695399c5e8e3e683bf6"}, + {file = "numpy-1.20.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:6e51e417d9ae2e7848314994e6fc3832c9d426abce9328cf7571eefceb43e6c9"}, + {file = "numpy-1.20.2-cp39-cp39-win32.whl", hash = "sha256:780ae5284cb770ade51d4b4a7dce4faa554eb1d88a56d0e8b9f35fca9b0270ff"}, + {file = "numpy-1.20.2-cp39-cp39-win_amd64.whl", hash = "sha256:924dc3f83de20437de95a73516f36e09918e9c9c18d5eac520062c49191025fb"}, + {file = "numpy-1.20.2-pp37-pypy37_pp73-manylinux2010_x86_64.whl", hash = "sha256:97ce8b8ace7d3b9288d88177e66ee75480fb79b9cf745e91ecfe65d91a856042"}, + {file = "numpy-1.20.2.zip", hash = "sha256:878922bf5ad7550aa044aa9301d417e2d3ae50f0f577de92051d739ac6096cee"}, +] +packaging = [ + {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, + {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, +] +pathspec = [ + {file = "pathspec-0.8.1-py2.py3-none-any.whl", hash = "sha256:aa0cb481c4041bf52ffa7b0d8fa6cd3e88a2ca4879c533c9153882ee2556790d"}, + {file = "pathspec-0.8.1.tar.gz", hash = "sha256:86379d6b86d75816baba717e64b1a3a3469deb93bb76d613c9ce79edc5cb68fd"}, +] +pycodestyle = [ + {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, + {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, +] +pydocstyle = [ + {file = "pydocstyle-6.0.0-py3-none-any.whl", hash = "sha256:d4449cf16d7e6709f63192146706933c7a334af7c0f083904799ccb851c50f6d"}, + {file = "pydocstyle-6.0.0.tar.gz", hash = "sha256:164befb520d851dbcf0e029681b91f4f599c62c5cd8933fd54b1bfbd50e89e1f"}, +] +pyflakes = [ + {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, + {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, +] 
+pyparsing = [ + {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, + {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, +] +pyyaml = [ + {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"}, + {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"}, + {file = "PyYAML-5.4.1-cp27-cp27m-win_amd64.whl", hash = "sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8"}, + {file = "PyYAML-5.4.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185"}, + {file = "PyYAML-5.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253"}, + {file = "PyYAML-5.4.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc"}, + {file = "PyYAML-5.4.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:72a01f726a9c7851ca9bfad6fd09ca4e090a023c00945ea05ba1638c09dc3347"}, + {file = "PyYAML-5.4.1-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:895f61ef02e8fed38159bb70f7e100e00f471eae2bc838cd0f4ebb21e28f8541"}, + {file = "PyYAML-5.4.1-cp36-cp36m-win32.whl", hash = "sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5"}, + {file = "PyYAML-5.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df"}, + {file = "PyYAML-5.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018"}, + {file = "PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63"}, + {file = "PyYAML-5.4.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:cb333c16912324fd5f769fff6bc5de372e9e7a202247b48870bc251ed40239aa"}, + {file = "PyYAML-5.4.1-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:fe69978f3f768926cfa37b867e3843918e012cf83f680806599ddce33c2c68b0"}, + {file = "PyYAML-5.4.1-cp37-cp37m-win32.whl", hash = "sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b"}, + {file = "PyYAML-5.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf"}, + {file = "PyYAML-5.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46"}, + {file = "PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb"}, + {file = "PyYAML-5.4.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:fd7f6999a8070df521b6384004ef42833b9bd62cfee11a09bda1079b4b704247"}, + {file = "PyYAML-5.4.1-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:bfb51918d4ff3d77c1c856a9699f8492c612cde32fd3bcd344af9be34999bfdc"}, + {file = "PyYAML-5.4.1-cp38-cp38-win32.whl", hash = "sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc"}, + {file = "PyYAML-5.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696"}, + {file = "PyYAML-5.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77"}, + {file = "PyYAML-5.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = 
"sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183"}, + {file = "PyYAML-5.4.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:d483ad4e639292c90170eb6f7783ad19490e7a8defb3e46f97dfe4bacae89122"}, + {file = "PyYAML-5.4.1-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:fdc842473cd33f45ff6bce46aea678a54e3d21f1b61a7750ce3c498eedfe25d6"}, + {file = "PyYAML-5.4.1-cp39-cp39-win32.whl", hash = "sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10"}, + {file = "PyYAML-5.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db"}, + {file = "PyYAML-5.4.1.tar.gz", hash = "sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e"}, +] +regex = [ + {file = "regex-2021.4.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:619d71c59a78b84d7f18891fe914446d07edd48dc8328c8e149cbe0929b4e000"}, + {file = "regex-2021.4.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:47bf5bf60cf04d72bf6055ae5927a0bd9016096bf3d742fa50d9bf9f45aa0711"}, + {file = "regex-2021.4.4-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:281d2fd05555079448537fe108d79eb031b403dac622621c78944c235f3fcf11"}, + {file = "regex-2021.4.4-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:bd28bc2e3a772acbb07787c6308e00d9626ff89e3bfcdebe87fa5afbfdedf968"}, + {file = "regex-2021.4.4-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:7c2a1af393fcc09e898beba5dd59196edaa3116191cc7257f9224beaed3e1aa0"}, + {file = "regex-2021.4.4-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c38c71df845e2aabb7fb0b920d11a1b5ac8526005e533a8920aea97efb8ec6a4"}, + {file = "regex-2021.4.4-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:96fcd1888ab4d03adfc9303a7b3c0bd78c5412b2bfbe76db5b56d9eae004907a"}, + {file = "regex-2021.4.4-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:ade17eb5d643b7fead300a1641e9f45401c98eee23763e9ed66a43f92f20b4a7"}, + {file = "regex-2021.4.4-cp36-cp36m-win32.whl", hash = "sha256:e8e5b509d5c2ff12f8418006d5a90e9436766133b564db0abaec92fd27fcee29"}, + {file = "regex-2021.4.4-cp36-cp36m-win_amd64.whl", hash = "sha256:11d773d75fa650cd36f68d7ca936e3c7afaae41b863b8c387a22aaa78d3c5c79"}, + {file = "regex-2021.4.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d3029c340cfbb3ac0a71798100ccc13b97dddf373a4ae56b6a72cf70dfd53bc8"}, + {file = "regex-2021.4.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:18c071c3eb09c30a264879f0d310d37fe5d3a3111662438889ae2eb6fc570c31"}, + {file = "regex-2021.4.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:4c557a7b470908b1712fe27fb1ef20772b78079808c87d20a90d051660b1d69a"}, + {file = "regex-2021.4.4-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:01afaf2ec48e196ba91b37451aa353cb7eda77efe518e481707e0515025f0cd5"}, + {file = "regex-2021.4.4-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:3a9cd17e6e5c7eb328517969e0cb0c3d31fd329298dd0c04af99ebf42e904f82"}, + {file = "regex-2021.4.4-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:90f11ff637fe8798933fb29f5ae1148c978cccb0452005bf4c69e13db951e765"}, + {file = "regex-2021.4.4-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:919859aa909429fb5aa9cf8807f6045592c85ef56fdd30a9a3747e513db2536e"}, + {file = "regex-2021.4.4-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:339456e7d8c06dd36a22e451d58ef72cef293112b559010db3d054d5560ef439"}, + {file = "regex-2021.4.4-cp37-cp37m-win32.whl", hash = "sha256:67bdb9702427ceddc6ef3dc382455e90f785af4c13d495f9626861763ee13f9d"}, + {file = "regex-2021.4.4-cp37-cp37m-win_amd64.whl", hash = 
"sha256:32e65442138b7b76dd8173ffa2cf67356b7bc1768851dded39a7a13bf9223da3"}, + {file = "regex-2021.4.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1e1c20e29358165242928c2de1482fb2cf4ea54a6a6dea2bd7a0e0d8ee321500"}, + {file = "regex-2021.4.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:314d66636c494ed9c148a42731b3834496cc9a2c4251b1661e40936814542b14"}, + {file = "regex-2021.4.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:6d1b01031dedf2503631d0903cb563743f397ccaf6607a5e3b19a3d76fc10480"}, + {file = "regex-2021.4.4-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:741a9647fcf2e45f3a1cf0e24f5e17febf3efe8d4ba1281dcc3aa0459ef424dc"}, + {file = "regex-2021.4.4-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:4c46e22a0933dd783467cf32b3516299fb98cfebd895817d685130cc50cd1093"}, + {file = "regex-2021.4.4-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e512d8ef5ad7b898cdb2d8ee1cb09a8339e4f8be706d27eaa180c2f177248a10"}, + {file = "regex-2021.4.4-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:980d7be47c84979d9136328d882f67ec5e50008681d94ecc8afa8a65ed1f4a6f"}, + {file = "regex-2021.4.4-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:ce15b6d103daff8e9fee13cf7f0add05245a05d866e73926c358e871221eae87"}, + {file = "regex-2021.4.4-cp38-cp38-win32.whl", hash = "sha256:a91aa8619b23b79bcbeb37abe286f2f408d2f2d6f29a17237afda55bb54e7aac"}, + {file = "regex-2021.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:c0502c0fadef0d23b128605d69b58edb2c681c25d44574fc673b0e52dce71ee2"}, + {file = "regex-2021.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:598585c9f0af8374c28edd609eb291b5726d7cbce16be6a8b95aa074d252ee17"}, + {file = "regex-2021.4.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:ee54ff27bf0afaf4c3b3a62bcd016c12c3fdb4ec4f413391a90bd38bc3624605"}, + {file = "regex-2021.4.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7d9884d86dd4dd489e981d94a65cd30d6f07203d90e98f6f657f05170f6324c9"}, + {file = "regex-2021.4.4-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:bf5824bfac591ddb2c1f0a5f4ab72da28994548c708d2191e3b87dd207eb3ad7"}, + {file = "regex-2021.4.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:563085e55b0d4fb8f746f6a335893bda5c2cef43b2f0258fe1020ab1dd874df8"}, + {file = "regex-2021.4.4-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b9c3db21af35e3b3c05764461b262d6f05bbca08a71a7849fd79d47ba7bc33ed"}, + {file = "regex-2021.4.4-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:3916d08be28a1149fb97f7728fca1f7c15d309a9f9682d89d79db75d5e52091c"}, + {file = "regex-2021.4.4-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:fd45ff9293d9274c5008a2054ecef86a9bfe819a67c7be1afb65e69b405b3042"}, + {file = "regex-2021.4.4-cp39-cp39-win32.whl", hash = "sha256:fa4537fb4a98fe8fde99626e4681cc644bdcf2a795038533f9f711513a862ae6"}, + {file = "regex-2021.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:97f29f57d5b84e73fbaf99ab3e26134e6687348e95ef6b48cfd2c06807005a07"}, + {file = "regex-2021.4.4.tar.gz", hash = "sha256:52ba3d3f9b942c49d7e4bc105bb28551c44065f139a65062ab7912bef10c9afb"}, +] +requests = [ + {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, + {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, +] +sacremoses = [ + {file = "sacremoses-0.0.45-py3-none-any.whl", hash = "sha256:fa93db44bc04542553ba6090818b892f603d02aa0d681e6c5c3023baf17e8564"}, + {file = "sacremoses-0.0.45.tar.gz", hash = 
"sha256:58176cc28391830789b763641d0f458819bebe88681dac72b41a19c0aedc07e9"}, +] +scikit-learn = [ + {file = "scikit-learn-0.24.1.tar.gz", hash = "sha256:a0334a1802e64d656022c3bfab56a73fbd6bf4b1298343f3688af2151810bbdf"}, + {file = "scikit_learn-0.24.1-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:9bed8a1ef133c8e2f13966a542cb8125eac7f4b67dcd234197c827ba9c7dd3e0"}, + {file = "scikit_learn-0.24.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:a36e159a0521e13bbe15ca8c8d038b3a1dd4c7dad18d276d76992e03b92cf643"}, + {file = "scikit_learn-0.24.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:c658432d8a20e95398f6bb95ff9731ce9dfa343fdf21eea7ec6a7edfacd4b4d9"}, + {file = "scikit_learn-0.24.1-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:9dfa564ef27e8e674aa1cc74378416d580ac4ede1136c13dd555a87996e13422"}, + {file = "scikit_learn-0.24.1-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:9c6097b6a9b2bafc5e0f31f659e6ab5e131383209c30c9e978c5b8abdac5ed2a"}, + {file = "scikit_learn-0.24.1-cp36-cp36m-win32.whl", hash = "sha256:7b04691eb2f41d2c68dbda8d1bd3cb4ef421bdc43aaa56aeb6c762224552dfb6"}, + {file = "scikit_learn-0.24.1-cp36-cp36m-win_amd64.whl", hash = "sha256:1adf483e91007a87171d7ce58c34b058eb5dab01b5fee6052f15841778a8ecd8"}, + {file = "scikit_learn-0.24.1-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:ddb52d088889f5596bc4d1de981f2eca106b58243b6679e4782f3ba5096fd645"}, + {file = "scikit_learn-0.24.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:a29460499c1e62b7a830bb57ca42e615375a6ab1bcad053cd25b493588348ea8"}, + {file = "scikit_learn-0.24.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:0567a2d29ad08af98653300c623bd8477b448fe66ced7198bef4ed195925f082"}, + {file = "scikit_learn-0.24.1-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:99349d77f54e11f962d608d94dfda08f0c9e5720d97132233ebdf35be2858b2d"}, + {file = "scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:83b21ff053b1ff1c018a2d24db6dd3ea339b1acfbaa4d9c881731f43748d8b3b"}, + {file = "scikit_learn-0.24.1-cp37-cp37m-win32.whl", hash = "sha256:c3deb3b19dd9806acf00cf0d400e84562c227723013c33abefbbc3cf906596e9"}, + {file = "scikit_learn-0.24.1-cp37-cp37m-win_amd64.whl", hash = "sha256:d54dbaadeb1425b7d6a66bf44bee2bb2b899fe3e8850b8e94cfb9c904dcb46d0"}, + {file = "scikit_learn-0.24.1-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:3c4f07f47c04e81b134424d53c3f5e16dfd7f494e44fd7584ba9ce9de2c5e6c1"}, + {file = "scikit_learn-0.24.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:c13ebac42236b1c46397162471ea1c46af68413000e28b9309f8c05722c65a09"}, + {file = "scikit_learn-0.24.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:4ddd2b6f7449a5d539ff754fa92d75da22de261fd8fdcfb3596799fadf255101"}, + {file = "scikit_learn-0.24.1-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:826b92bf45b8ad80444814e5f4ac032156dd481e48d7da33d611f8fe96d5f08b"}, + {file = "scikit_learn-0.24.1-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:259ec35201e82e2db1ae2496f229e63f46d7f1695ae68eef9350b00dc74ba52f"}, + {file = "scikit_learn-0.24.1-cp38-cp38-win32.whl", hash = "sha256:8772b99d683be8f67fcc04789032f1b949022a0e6880ee7b75a7ec97dbbb5d0b"}, + {file = "scikit_learn-0.24.1-cp38-cp38-win_amd64.whl", hash = "sha256:ed9d65594948678827f4ff0e7ae23344e2f2b4cabbca057ccaed3118fdc392ca"}, + {file = "scikit_learn-0.24.1-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:8aa1b3ac46b80eaa552b637eeadbbce3be5931e4b5002b964698e33a1b589e1e"}, + {file = "scikit_learn-0.24.1-cp39-cp39-manylinux1_i686.whl", hash = 
"sha256:c7f4eb77504ac586d8ac1bde1b0c04b504487210f95297235311a0ab7edd7e38"}, + {file = "scikit_learn-0.24.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:087dfede39efb06ab30618f9ab55a0397f29c38d63cd0ab88d12b500b7d65fd7"}, + {file = "scikit_learn-0.24.1-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:895dbf2030aa7337649e36a83a007df3c9811396b4e2fa672a851160f36ce90c"}, + {file = "scikit_learn-0.24.1-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:9a24d1ccec2a34d4cd3f2a1f86409f3f5954cc23d4d2270ba0d03cf018aa4780"}, + {file = "scikit_learn-0.24.1-cp39-cp39-win32.whl", hash = "sha256:fab31f48282ebf54dd69f6663cd2d9800096bad1bb67bbc9c9ac84eb77b41972"}, + {file = "scikit_learn-0.24.1-cp39-cp39-win_amd64.whl", hash = "sha256:4562dcf4793e61c5d0f89836d07bc37521c3a1889da8f651e2c326463c4bd697"}, +] +scipy = [ + {file = "scipy-1.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a15a1f3fc0abff33e792d6049161b7795909b40b97c6cc2934ed54384017ab76"}, + {file = "scipy-1.6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e79570979ccdc3d165456dd62041d9556fb9733b86b4b6d818af7a0afc15f092"}, + {file = "scipy-1.6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a423533c55fec61456dedee7b6ee7dce0bb6bfa395424ea374d25afa262be261"}, + {file = "scipy-1.6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:33d6b7df40d197bdd3049d64e8e680227151673465e5d85723b3b8f6b15a6ced"}, + {file = "scipy-1.6.1-cp37-cp37m-win32.whl", hash = "sha256:6725e3fbb47da428794f243864f2297462e9ee448297c93ed1dcbc44335feb78"}, + {file = "scipy-1.6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:5fa9c6530b1661f1370bcd332a1e62ca7881785cc0f80c0d559b636567fab63c"}, + {file = "scipy-1.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bd50daf727f7c195e26f27467c85ce653d41df4358a25b32434a50d8870fc519"}, + {file = "scipy-1.6.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:f46dd15335e8a320b0fb4685f58b7471702234cba8bb3442b69a3e1dc329c345"}, + {file = "scipy-1.6.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0e5b0ccf63155d90da576edd2768b66fb276446c371b73841e3503be1d63fb5d"}, + {file = "scipy-1.6.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:2481efbb3740977e3c831edfd0bd9867be26387cacf24eb5e366a6a374d3d00d"}, + {file = "scipy-1.6.1-cp38-cp38-win32.whl", hash = "sha256:68cb4c424112cd4be886b4d979c5497fba190714085f46b8ae67a5e4416c32b4"}, + {file = "scipy-1.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:5f331eeed0297232d2e6eea51b54e8278ed8bb10b099f69c44e2558c090d06bf"}, + {file = "scipy-1.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0c8a51d33556bf70367452d4d601d1742c0e806cd0194785914daf19775f0e67"}, + {file = "scipy-1.6.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:83bf7c16245c15bc58ee76c5418e46ea1811edcc2e2b03041b804e46084ab627"}, + {file = "scipy-1.6.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:794e768cc5f779736593046c9714e0f3a5940bc6dcc1dba885ad64cbfb28e9f0"}, + {file = "scipy-1.6.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5da5471aed911fe7e52b86bf9ea32fb55ae93e2f0fac66c32e58897cfb02fa07"}, + {file = "scipy-1.6.1-cp39-cp39-win32.whl", hash = "sha256:8e403a337749ed40af60e537cc4d4c03febddcc56cd26e774c9b1b600a70d3e4"}, + {file = "scipy-1.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:a5193a098ae9f29af283dcf0041f762601faf2e595c0db1da929875b7570353f"}, + {file = "scipy-1.6.1.tar.gz", hash = "sha256:c4fceb864890b6168e79b0e714c585dbe2fd4222768ee90bc1aa0f8218691b11"}, +] +seqeval = [ + {file = "seqeval-1.2.2.tar.gz", hash = "sha256:f28e97c3ab96d6fcd32b648f6438ff2e09cfba87f05939da9b3970713ec56e6f"}, +] +six = [ + 
{file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, + {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, +] +snowballstemmer = [ + {file = "snowballstemmer-2.1.0-py2.py3-none-any.whl", hash = "sha256:b51b447bea85f9968c13b650126a888aabd4cb4463fca868ec596826325dedc2"}, + {file = "snowballstemmer-2.1.0.tar.gz", hash = "sha256:e997baa4f2e9139951b6f4c631bad912dfd3c792467e2f03d7239464af90e914"}, +] +spans = [ + {file = "Spans-1.1.0-py2.py3-none-any.whl", hash = "sha256:ffb95c5a81761d529a42781c933af452f8e6a0ee2365028deacacde62285e5b4"}, + {file = "Spans-1.1.0.tar.gz", hash = "sha256:d6d17fe12adc2b81a3c5edf38a37f118d3bebd13a8a5614369c6d04efde014a0"}, +] +stringcase = [ + {file = "stringcase-1.2.0.tar.gz", hash = "sha256:48a06980661908efe8d9d34eab2b6c13aefa2163b3ced26972902e3bdfd87008"}, +] +threadpoolctl = [ + {file = "threadpoolctl-2.1.0-py3-none-any.whl", hash = "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725"}, + {file = "threadpoolctl-2.1.0.tar.gz", hash = "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b"}, +] +tokenizers = [ + {file = "tokenizers-0.10.2-cp35-cp35m-macosx_10_11_x86_64.whl", hash = "sha256:8b4ae84fb410b5f5abb3a604b3274e2d6994b21f07c379b1c1659561e026bad8"}, + {file = "tokenizers-0.10.2-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:4865d34d4897eed4ca4a758971fb14911cf5022e270b53c028fa9312fe440e2b"}, + {file = "tokenizers-0.10.2-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:be25827c0506d92927dc0ef4d2ce0c4653a351735546f8b22548535c3d2f7a6c"}, + {file = "tokenizers-0.10.2-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:2b592146caff20c283dadf2da99520b1dfde4af8ce964a8adcb4e990923fa423"}, + {file = "tokenizers-0.10.2-cp35-cp35m-manylinux2014_ppc64le.whl", hash = "sha256:cd408266f13856dc648ed2dcc889ac17feffc28da2ebb03f1977b88935e86c9a"}, + {file = "tokenizers-0.10.2-cp35-cp35m-manylinux2014_s390x.whl", hash = "sha256:fb1dae213c8531d6af071dd021c7225be73803a0cbe609aed5074be04118aa6c"}, + {file = "tokenizers-0.10.2-cp35-cp35m-win32.whl", hash = "sha256:f9fe9c5556ccab03c9d42ed299bd8901c95d22373676437bfeb4656c2b5e42bc"}, + {file = "tokenizers-0.10.2-cp35-cp35m-win_amd64.whl", hash = "sha256:419bb33bb3690239b93b76b06eba1eb822aa72f4e63293d2f15c60505f6ee0d0"}, + {file = "tokenizers-0.10.2-cp36-cp36m-macosx_10_11_x86_64.whl", hash = "sha256:474883e8e0be431394e0ccfb70e97c1856e8c5bc80536f7b2faa3b0785d59afd"}, + {file = "tokenizers-0.10.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:bb58ad982f8f72052362a5384145e87559899dcc0b06264e71dd137869037e6e"}, + {file = "tokenizers-0.10.2-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3fb22df976701452db3ba652bd647518a043e58d4209d18273163fbc53252a3b"}, + {file = "tokenizers-0.10.2-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:bac17cceebb2a6947d380e1b7bce8fc33098a979071a1291adc456fb25434924"}, + {file = "tokenizers-0.10.2-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:86077426c615a814f7456569eade33c12c93131d02fdf548994dcedf41bdbbf1"}, + {file = "tokenizers-0.10.2-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:9619846026b16967465e5221206f86bdc58cf65b0f92548d048e97925361121e"}, + {file = "tokenizers-0.10.2-cp36-cp36m-win32.whl", hash = "sha256:9124a1f77e176cb2a2571bae4c3bf8d4c40975c1681e2ba346fdca5d6a3aa843"}, + {file = "tokenizers-0.10.2-cp36-cp36m-win_amd64.whl", hash = 
"sha256:ea54eb0071f13fa7c6c3b88997a843d01c067158b994115759c27827e683fb82"}, + {file = "tokenizers-0.10.2-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:8a575022e066878bede82bb5d5244b17c6ebda15dbb50229d86f9e8267ddd40e"}, + {file = "tokenizers-0.10.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:9e8f32a2ef1902f769da6215ae8beabd632676a1551fb171b5aa6d4c11fd3a02"}, + {file = "tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:05c90ade1b9cc41aaee6056c5e460dc5150f12b602bdc6bfa3758fb965ca7788"}, + {file = "tokenizers-0.10.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:a3de6ecfbd739ee3d59280c0c930c0c5a716df1cf0cdf68beb379066931866bd"}, + {file = "tokenizers-0.10.2-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:1a22bf899728eeb74ee2bb1ba9eff61898ec02e623a690ed28002762d19ab9b4"}, + {file = "tokenizers-0.10.2-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:79119578bcd1d8ec836ddd3dbb305f32084d60e9f67e93a10ca33c67eeaa89fc"}, + {file = "tokenizers-0.10.2-cp37-cp37m-win32.whl", hash = "sha256:c429c25c3dfe1ea9ad6e21a49d648910335ef4188c5e8226e5aa2ba2bd13921c"}, + {file = "tokenizers-0.10.2-cp37-cp37m-win_amd64.whl", hash = "sha256:aadcf38b97114d035e389f5aee4edf59e81666ad65de26c06592d76f184bb66c"}, + {file = "tokenizers-0.10.2-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:7ba26369bc30f9d28d9ff42dcb1b57d9995157a9bb2975b95acda4220195d7aa"}, + {file = "tokenizers-0.10.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:15d9b959fd3b9e9e7c6d6d7d909bca5d7397a170a50d99ac8ce4e2ab590b137a"}, + {file = "tokenizers-0.10.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:f1553e029f326eb74f36d67a38ef77a7f03068a494a0faa4e16d0d832f25b760"}, + {file = "tokenizers-0.10.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f7b62497dce161babdb9197fa4b26e401bac9541b62fe0d0957134fefeb1b01c"}, + {file = "tokenizers-0.10.2-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:77c4c41f2147c930c66014ca43b6935133781ae1923d62e70c797e71b0ee2598"}, + {file = "tokenizers-0.10.2-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:52c2479975fd5025d399493403c7aedce853da20cec04a32a829c1c12c28e2f1"}, + {file = "tokenizers-0.10.2-cp38-cp38-win32.whl", hash = "sha256:bed7c5c2c786a2e9b3265006f15a13d8e04dcdfcf9ba13add0d7194a50346393"}, + {file = "tokenizers-0.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:6229fcc8473fd225e8e09742c354dacacd57dbdc73075e4c9d71f925cd171090"}, + {file = "tokenizers-0.10.2-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:953a4e483524fd37fd66208e21dce85d4829bfe294d8b6224d2f00f61aa9950c"}, + {file = "tokenizers-0.10.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1ab8c467e4fe16bba33022feefcd6322642a58e4c8c123fd692c20e17f339964"}, + {file = "tokenizers-0.10.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:2094eb8e3608858eb4bd29c32c39969ae63ad9749d8aca9b34e82cba852acaf1"}, + {file = "tokenizers-0.10.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:e3e74a9fa40b92a9817fe05ea91bf20075f45ad8cf7c0d3eb738170a27059508"}, + {file = "tokenizers-0.10.2-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:39e24555d5a2d9df87fd75303e1fd9ba3f995ac8aeb543c511d601d26a54726a"}, + {file = "tokenizers-0.10.2-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:a323d93fd5e57060428fecb6d73ab13223822f8ffa1ede282070b47a4bda2cea"}, + {file = "tokenizers-0.10.2-cp39-cp39-win32.whl", hash = "sha256:c0f5bbc2e614468bcb605f2aa4a6dbcdf21629c6ff6ae81f1d9fa9683934ce8e"}, + {file = "tokenizers-0.10.2-cp39-cp39-win_amd64.whl", hash = 
"sha256:03056431783e72df80de68648573f97a70701d17fa22336c6d761b5d4b7be9ff"}, + {file = "tokenizers-0.10.2.tar.gz", hash = "sha256:cf7f1aad957fed36e4a90fc094e3adc03fdd45fbb058c1cde25721e3e66235f8"}, +] +toml = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] +torch = [ + {file = "torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:f23eeb1a48cc39209d986c418ad7e02227eee973da45c0c42d36b1aec72f4940"}, + {file = "torch-1.8.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:4ace9c5bb94d5a7b9582cd089993201658466e9c59ff88bd4e9e08f6f072d1cf"}, + {file = "torch-1.8.1-cp36-cp36m-win_amd64.whl", hash = "sha256:6ffa1e7ae079c7cb828712cb0cdaae5cc4fb87c16a607e6d14526b62c20bcc17"}, + {file = "torch-1.8.1-cp36-none-macosx_10_9_x86_64.whl", hash = "sha256:16f2630d9604c4ee28ea7d6e388e2264cd7bc6031c6ecd796bae3f56b5efa9a3"}, + {file = "torch-1.8.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:95b7bbbacc3f28fe438f418392ceeae146a01adc03b29d44917d55214ac234c9"}, + {file = "torch-1.8.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:55137feb2f5a0dc7aced5bba690dcdb7652054ad3452b09a2bbb59f02a11e9ff"}, + {file = "torch-1.8.1-cp37-cp37m-win_amd64.whl", hash = "sha256:8ad2252bf09833dcf46a536a78544e349b8256a370e03a98627ebfb118d9555b"}, + {file = "torch-1.8.1-cp37-none-macosx_10_9_x86_64.whl", hash = "sha256:1388b30fbd262c1a053d6c9ace73bb0bd8f5871b4892b6f3e02d1d7bc9768563"}, + {file = "torch-1.8.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:e7ad1649adb7dc2a450e70a3e51240b84fa4746c69c8f98989ce0c254f9fba3a"}, + {file = "torch-1.8.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:3e4190c04dfd89c59bad06d5fe451446643a65e6d2607cc989eb1001ee76e12f"}, + {file = "torch-1.8.1-cp38-cp38-win_amd64.whl", hash = "sha256:5c2e9a33d44cdb93ebd739b127ffd7da786bf5f740539539195195b186a05f6c"}, + {file = "torch-1.8.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:c6ede2ae4dcd8214b63e047efabafa92493605205a947574cf358216ca4e440a"}, + {file = "torch-1.8.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:ce7d435426f3dd14f95710d779aa46e9cd5e077d512488e813f7589fdc024f78"}, + {file = "torch-1.8.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a50ea8ed900927fb30cadb63aa7a32fdd59c7d7abe5012348dfbe35a8355c083"}, + {file = "torch-1.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:dac4d10494e74f7e553c92d7263e19ea501742c4825ddd26c4decfa27be95981"}, + {file = "torch-1.8.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:225ee4238c019b28369c71977327deeeb2bd1c6b8557e6fcf631b8866bdc5447"}, +] +tqdm = [ + {file = "tqdm-4.60.0-py2.py3-none-any.whl", hash = "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3"}, + {file = "tqdm-4.60.0.tar.gz", hash = "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae"}, +] +transformers = [ + {file = "transformers-4.5.1-py3-none-any.whl", hash = "sha256:0a57d1cd9301a617c7015d7184228984abdfb1ae2158c29cfb32582219756d23"}, + {file = "transformers-4.5.1.tar.gz", hash = "sha256:3508e3b032cf0f5342c67836de4b121aa5c435c959472a28054ba895ea59cca7"}, +] +typed-ast = [ + {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6"}, + {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075"}, + {file = 
"typed_ast-1.4.3-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528"}, + {file = "typed_ast-1.4.3-cp35-cp35m-win32.whl", hash = "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428"}, + {file = "typed_ast-1.4.3-cp35-cp35m-win_amd64.whl", hash = "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3"}, + {file = "typed_ast-1.4.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f"}, + {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341"}, + {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace"}, + {file = "typed_ast-1.4.3-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f"}, + {file = "typed_ast-1.4.3-cp36-cp36m-win32.whl", hash = "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363"}, + {file = "typed_ast-1.4.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7"}, + {file = "typed_ast-1.4.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266"}, + {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e"}, + {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04"}, + {file = "typed_ast-1.4.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899"}, + {file = "typed_ast-1.4.3-cp37-cp37m-win32.whl", hash = "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c"}, + {file = "typed_ast-1.4.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805"}, + {file = "typed_ast-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a"}, + {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff"}, + {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41"}, + {file = "typed_ast-1.4.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39"}, + {file = "typed_ast-1.4.3-cp38-cp38-win32.whl", hash = "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927"}, + {file = "typed_ast-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40"}, + {file = "typed_ast-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3"}, + {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4"}, + {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0"}, + {file = "typed_ast-1.4.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3"}, + {file = 
"typed_ast-1.4.3-cp39-cp39-win32.whl", hash = "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808"}, + {file = "typed_ast-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c"}, + {file = "typed_ast-1.4.3.tar.gz", hash = "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"}, +] +typing-extensions = [ + {file = "typing_extensions-3.7.4.3-py2-none-any.whl", hash = "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"}, + {file = "typing_extensions-3.7.4.3-py3-none-any.whl", hash = "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918"}, + {file = "typing_extensions-3.7.4.3.tar.gz", hash = "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c"}, +] +typing-inspect = [ + {file = "typing_inspect-0.6.0-py2-none-any.whl", hash = "sha256:de08f50a22955ddec353876df7b2545994d6df08a2f45d54ac8c05e530372ca0"}, + {file = "typing_inspect-0.6.0-py3-none-any.whl", hash = "sha256:3b98390df4d999a28cf5b35d8b333425af5da2ece8a4ea9e98f71e7591347b4f"}, + {file = "typing_inspect-0.6.0.tar.gz", hash = "sha256:8f1b1dd25908dbfd81d3bebc218011531e7ab614ba6e5bf7826d887c834afab7"}, +] +urllib3 = [ + {file = "urllib3-1.26.4-py2.py3-none-any.whl", hash = "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df"}, + {file = "urllib3-1.26.4.tar.gz", hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"}, +] +yamllint = [ + {file = "yamllint-1.26.1.tar.gz", hash = "sha256:87d9462b3ed7e9dfa19caa177f7a77cd9888b3dc4044447d6ae0ab233bcd1324"}, +] +zipp = [ + {file = "zipp-3.4.1-py3-none-any.whl", hash = "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"}, + {file = "zipp-3.4.1.tar.gz", hash = "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7bfb692 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[tool.poetry] +name = "bunkai" +version = "1.0.0" +description = "Sentence boundary disambiguation tool for Japanese" +authors = ["Yuta Hayashibe ", "Kensuke Mitsuzawa "] +maintainers = ["Yuta Hayashibe ", "Kensuke Mitsuzawa "] +license = "Apache-2.0" +readme = "README.md" +homepage = "https://github.com/megagonlabs/bunkai" +repository = "https://github.com/megagonlabs/bunkai" +documentation = "" +keywords = ["Japanese"] + +[tool.poetry.dependencies] +python = "^3.7" +dataclasses-json = "^0.5.2" +janome = "^0.4.1" +seqeval = "^1.2.2" +spans = "^1.1.0" +tqdm = "*" +numpy = "^1.16.0" +torch = "^1.3.0" +transformers = "^4.3.2" +more_itertools = "^8.6.0" +emoji = ">=1.2.0" +emojis = ">=0.6.0" + +[tool.poetry.dev-dependencies] +autopep8 = ">=1.5.4" +coverage = ">=5.3" +demjson = ">=2.2.4" +flake8 = ">=3.8.4" +isort = ">=5.6.4" +mypy = ">=0.790" +yamllint = ">=1.25.0" +mock = ">=4.0.2" +pydocstyle = ">=5.1.1" + +[build-system] +requires = ["poetry"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry.scripts] +bunkai = "bunkai.cli:main" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..20f6276 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 130 diff --git a/tests/bunkai_sbd/__init__.py b/tests/bunkai_sbd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/bunkai_sbd/annotator/__init__.py b/tests/bunkai_sbd/annotator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/bunkai_sbd/annotator/annotation_test_base.py 
b/tests/bunkai_sbd/annotator/annotation_test_base.py new file mode 100644 index 0000000..37ace40 --- /dev/null +++ b/tests/bunkai_sbd/annotator/annotation_test_base.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +import dataclasses +import typing +import unittest + +from bunkai.algorithm.bunkai_sbd.annotator.morph_annotator import \ + MorphAnnotatorJanome +from bunkai.base.annotator import Annotations, Annotator, SpanAnnotation + + +@dataclasses.dataclass +class TestInstance(object): + text: str + n_sentence: int + expected_rules: typing.Optional[typing.List[typing.Optional[str]]] = None + + +class TestAnnotatorBase(unittest.TestCase): + def setUp(self) -> None: + self.morph_annotator = MorphAnnotatorJanome() + + def init_tokenized_layer(self, text: str) -> Annotations: + annotations = Annotations() + annotations.add_annotation_layer('first', [SpanAnnotation(rule_name=None, + start_index=0, + end_index=len( + text), + split_string_type=None, split_string_value=None)]) + self.morph_annotator.annotate(text, annotations) + return annotations + + @staticmethod + def init_layer(text: str) -> Annotations: + annotations = Annotations() + annotations.add_annotation_layer('first', [SpanAnnotation(rule_name=None, + start_index=0, + end_index=len( + text), + split_string_type=None, split_string_value=None)]) + return annotations + + def is_check_test_instance(self, + annotator: Annotator, + test_cases: typing.List[TestInstance], + is_tokenize: bool = False): + for test_case in test_cases: + if is_tokenize: + input_layer = self.init_tokenized_layer(test_case.text) + else: + input_layer = self.init_layer(test_case.text) + annotations = annotator.annotate(original_text=test_case.text, spans=input_layer) + span_annotations = annotations.get_final_layer() + self.assertEqual(set([s.rule_name for s in span_annotations if s.rule_name is not None]), # type: ignore + set(test_case.expected_rules), # type: ignore + msg=f'text={test_case.text}, ' # type: ignore + f'{set([s.rule_name for s in span_annotations])} ' + f'!= {set(test_case.expected_rules)}') + + def test_annotate(self): + pass diff --git a/tests/bunkai_sbd/annotator/test_basic_annotator.py b/tests/bunkai_sbd/annotator/test_basic_annotator.py new file mode 100644 index 0000000..08bdc5f --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_basic_annotator.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +import unittest + +from bunkai.algorithm.bunkai_sbd.annotator.basic_annotator import BasicRule + +from .annotation_test_base import TestAnnotatorBase, TestInstance + + +class TestBasicRule(TestAnnotatorBase): + def test_annotate(self): + test_cases = [ + TestInstance('1文目。2文目!3文目?', 3, expected_rules=[BasicRule.__name__]), + TestInstance('1文目.2文目.3文目。', 3, expected_rules=[BasicRule.__name__]), + ] + annotator = BasicRule() + self.is_check_test_instance(annotator=annotator, test_cases=test_cases) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/annotator/test_emoji_annotator.py b/tests/bunkai_sbd/annotator/test_emoji_annotator.py new file mode 100644 index 0000000..e9e3b37 --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_emoji_annotator.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +import unittest +from collections import namedtuple + +from bunkai.algorithm.bunkai_sbd.annotator.emoji_annotator import \ + EmojiAnnotator +from bunkai.base.annotation import Annotations + +from .annotation_test_base import TestAnnotatorBase, TestInstance + +MorphResult = namedtuple('MorphResult', ('input_text', 'seq_newline_position')) + + +class 
TestMorphAnnotator(TestAnnotatorBase): + def setUp(self) -> None: + self.test_input = [ + MorphResult('うーん🤔🤔🤔どうしよう', [6]), + MorphResult('ビール🍺のみたい。️Frankfurtの🍺はKrombacher', []), + MorphResult('これが文⬆️', [5]), + MorphResult('1文目😄2文目😚3文目😙4文目😄😙おわり。', [4, 8, 12, 17]) + ] + + def test_emoji_detector(self): + emoji_annotator = EmojiAnnotator() + for test_tuple in self.test_input: + ann = Annotations() + result = emoji_annotator.annotate(test_tuple.input_text, spans=ann) + self.assertEqual(set([s.end_index for s in result.get_final_layer()]), set(test_tuple.seq_newline_position)) + + def test_annotate(self): + test_input = [ + TestInstance('うーん🤔🤔🤔どうしよう', n_sentence=2, expected_rules=[EmojiAnnotator.__name__]), + TestInstance('ビール🍺のみたい。️Frankfurtの🍺はKrombacher', n_sentence=2, expected_rules=[]), + TestInstance('これが文⬆️', n_sentence=1, expected_rules=[EmojiAnnotator.__name__]), + TestInstance('1文目😄2文目😚3文目😙4文目😄😙おわり。', n_sentence=5, expected_rules=[EmojiAnnotator.__name__]) + ] + annotator = EmojiAnnotator() + self.is_check_test_instance(annotator=annotator, test_cases=test_input) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/annotator/test_emotion_expression_annotator.py b/tests/bunkai_sbd/annotator/test_emotion_expression_annotator.py new file mode 100644 index 0000000..669bc27 --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_emotion_expression_annotator.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +import unittest + +from bunkai.algorithm.bunkai_sbd.annotator.emotion_expression_annotator import \ + EmotionExpressionAnnotator + +from .annotation_test_base import TestAnnotatorBase, TestInstance + + +class TestEmotionExpressionAnnotator(TestAnnotatorBase): + def test_annotate(self): + test_cases = [ + TestInstance('1文目(笑)2文目(汗)3文目(泣)', 3, expected_rules=[EmotionExpressionAnnotator.__name__]), + TestInstance('1文目☆2文目★3文目。', 3, expected_rules=[EmotionExpressionAnnotator.__name__]), + ] + annotator = EmotionExpressionAnnotator() + self.is_check_test_instance(annotator=annotator, test_cases=test_cases) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/annotator/test_facemark_detector.py b/tests/bunkai_sbd/annotator/test_facemark_detector.py new file mode 100644 index 0000000..097c239 --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_facemark_detector.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +import unittest + +from bunkai.algorithm.bunkai_sbd.annotator.constant import LAYER_NAME_FIRST +from bunkai.algorithm.bunkai_sbd.annotator.facemark_detector import \ + FaceMarkDetector +from bunkai.base.annotation import Annotations, SpanAnnotation + +from .annotation_test_base import TestAnnotatorBase, TestInstance + + +def init_annotation(text: str) -> Annotations: + annotations = Annotations() + annotations.add_annotation_layer(LAYER_NAME_FIRST, [SpanAnnotation(rule_name=LAYER_NAME_FIRST, + start_index=len(text) - 1, + end_index=len(text), + split_string_type=None, + split_string_value=None)]) + return annotations + + +class TestMorphAnnotator(TestAnnotatorBase): + def test_performance_facemark(self): + self.test_patterns = { + "facemark_end": [('いい湯でした(^_^)', 11), ('品揃えは良い(^o^)', 11), ('品揃えは良い(^^)', 10), ('品揃えは良い(^-^)', 11), + ('サービスは良好(●^o^●)', 14), ('食事はよかった(^◇^)', 12), ('食事はよかった(*^_^*)', 14), ('品揃えは良い(*´ー`)', 12), + ('サービスは良好(*´∀`*)', 14), ('食事はよかった(*´ω`*)', 14), ('品揃えは良い(*´艸`*)', 13), + ('食事はよかった(/ω\)', 12), ('サービスは良好(^_^)V', 13), ('食事はよかった(^o^)V', 13), + ('品揃えは良いヽ(=´▽`=)ノ', 15), ('いい湯でしたo(^o^)o', 13), ('子供連れによい\(^_^ )( ^_^)/', 21), + 
('サービスは良好σ(^_^)', 13), ('食事はよかったσ(´∀`)', 13), ('品揃えは良い!(^^)!', 12), ('子供連れによい(^^ゞ', 11), + ('いい湯でした(^Q^)/', 12), ('食事はよかった(^^ゝ', 11), ('食事はよかった(^人^)', 12), ('サービスは良好(^_-)-☆', 14), + ('食事はよかった(^з^)-☆', 14), ('食事はよかった\(◎o◎)/', 14), ('食事はよかった(@_@)', 12), ('品揃えは良い(@_@;)', 12), + ('サービスは良好(# ゚Д゚)', 14), ('サービスは良好( ・_・)', 13), ('いい湯でした(・_・)', 11), ('食事はよかった(・o・)', 12), + ('サービスは良好(。・・。)', 13), ('品揃えは良い(゜_゜)', 11), ('いい湯でした(`_`)ノ゛', 13), ('サービスは良好( ̄ー ̄)b', 13), + ('子供連れによい(^-^;', 12), ('サービスは良好(-_-;)', 13), ('いい湯でした(=_=;)', 12), ('いい湯でした(ノ´Д`)', 12), + ('サービスは良好(-o-;)', 13), ('食事はよかった(^^;)', 12), ('子供連れによい(^_^;)', 13), ('子供連れによい(^o^;)', 13), + ('食事はよかった...(((;^^)', 17), ('いい湯でした(゚ε゚*)', 12), ('食事はよかった♪〜( ̄ε ̄;)', 15), + ('品揃えは良い(~o~)m', 12), ('品揃えは良い(-.-)zzZZ', 15), ('子供連れによい(-_-)zzZZ', 16), + ('食事はよかった(+_+)', 12), ('食事はよかった( ..)ヾ', 13), ('食事はよかった(o-_-o)', 14), + ('サービスは良好ヽ(#゚Д゚)ノ', 15), ('サービスは良好(-_-メ)', 13), ('サービスは良好(ーー゛)', 12), + ('サービスは良好(-’’-)', 13), ('食事はよかった(`´)', 11), ('いい湯でしたヽ(#`Д´#)ノ', 15), ('品揃えは良い( ̄へ ̄井)', 12), + ('サービスは良好(-ε´-。)', 14), ('サービスは良好( ̄д ̄)', 12), ('食事はよかった( ´Д`)', 13), + ('サービスは良好(ノ´□`)ノ', 14), ('いい湯でしたヽ(*´Д`*)ノ', 15), ('サービスは良好┐(´〜`;)┌', 15), + ('子供連れによい(#´Д`#)', 14), ('いい湯でしたヽ( ̄▽ ̄)ノ', 13), ('いい湯でした(*_*)', 11), ('いい湯でした( -_-)', 12), + ('いい湯でした|(-_-)|', 13), ('子供連れによい(;_;)', 12), ('品揃えは良い(T_T)', 11), ('子供連れによい(/_;)', 12), + ('食事はよかった(ノдヽ)', 12), ('いい湯でした(TдT)', 11), ('いい湯でした(ノ´□`)', 12), ('食事はよかったヽ(;´Д`)ノ', 15), + ('子供連れによい(o´_`o)', 14), ('子供連れによい(^∧^)', 12), ('食事はよかったm(_ _)m', 14), ('いい湯でした\(__ )', 12), + ('品揃えは良い(>_<)', 17), ('子供連れによい(>_<)', 18), ('いい湯でした(+д+)', 11), + ('食事はよかった( ̄□ ̄;)', 13), ('子供連れによい(lll ̄□ ̄)', 15), ('子供連れによいp(^^)q', 13), + ('品揃えは良い(-o-)y-~~~', 16), ('いい湯でした(^_^)y-~~~', 16), ('いい湯でした(’A`)y-~', 14), + ('いい湯でしたφ(..)', 11), ('食事はよかったφ(`д´)', 13), ('食事はよかったφ(^o^)', 13), ('子供連れによいVo¥oV', 12), + ('食事はよかった(ΦωΦ)', 12), ('食事はよかったU^ェ^U', 12), ('食事はよかった(* ̄(エ) ̄*)', 16), + ('サービスは良好( ^_^)/□☆□\(^_^ )', 24), ('いい湯でした( ^_^)/ o(^o^)o \(^_^ )', 29), + ('食事はよかった(/\) \(^o^)/', 19), ('いい湯でした(^_^)/~~~', 15)], + "jp_char_after_facemark": [('いい湯でした(^_^)食事はよかった', 11), ('サービスは良好(^o^)子供連れによい', 11), + ('食事はよかった(^^)子供連れによい', 10), ('食事はよかった(^-^)品揃えは良い', 11), + ('サービスは良好(●^o^●)食事はよかった', 13), ('子供連れによい(^◇^)子供連れによい', 11), + ('子供連れによい(*^_^*)いい湯でした', 13), ('品揃えは良い(*´ー`)サービスは良好', 12), + ('子供連れによい(*´∀`*)サービスは良好', 13), ('品揃えは良い(*´ω`*)サービスは良好', 13), + ('品揃えは良い(*´艸`*)サービスは良好', 13), ('品揃えは良い(/ω\)サービスは良好', 11), + ('食事はよかった(^_^)V食事はよかった', 12), ('食事はよかった(^o^)Vサービスは良好', 12), + ('食事はよかったヽ(=´▽`=)ノ子供連れによい', 15), ('子供連れによいo(^o^)o品揃えは良い', 13), + ('子供連れによい\(^_^ )( ^_^)/品揃えは良い', 20), ('子供連れによいσ(^_^)子供連れによい', 12), + ('子供連れによいσ(´∀`)子供連れによい', 12), ('食事はよかった!(^^)!食事はよかった', 12), + ('食事はよかった(^^ゞいい湯でした', 10), ('サービスは良好(^Q^)/いい湯でした', 12), + ('子供連れによい(^^ゝいい湯でした', 10), ('サービスは良好(^人^)食事はよかった', 11), + ('いい湯でした(^_-)-☆いい湯でした', 13), ('品揃えは良い(^з^)-☆子供連れによい', 13), + ('食事はよかった\(◎o◎)/子供連れによい', 13), ('サービスは良好(@_@)食事はよかった', 11), + ('子供連れによい(@_@;)サービスは良好', 12), ('サービスは良好(# ゚Д゚)子供連れによい', 13), + ('食事はよかった( ・_・)品揃えは良い', 12), ('サービスは良好(・_・)子供連れによい', 11), + ('食事はよかった(・o・)子供連れによい', 11), ('サービスは良好(。・・。)サービスは良好', 12), + ('いい湯でした(゜_゜)食事はよかった', 11), ('食事はよかった(`_`)ノ゛いい湯でした', 13), + ('サービスは良好( ̄ー ̄)b子供連れによい', 12), ('品揃えは良い(^-^;子供連れによい', 11), + ('食事はよかった(-_-;)いい湯でした', 12), ('品揃えは良い(=_=;)サービスは良好', 12), + ('品揃えは良い(ノ´Д`)子供連れによい', 12), ('サービスは良好(-o-;)子供連れによい', 12), + ('子供連れによい(^^;)いい湯でした', 11), ('子供連れによい(^_^;)サービスは良好', 12), + ('子供連れによい(^o^;)いい湯でした', 12), ('いい湯でした...(((;^^)いい湯でした', 16), + ('サービスは良好(゚ε゚*)子供連れによい', 12), ('サービスは良好♪〜( 
̄ε ̄;)品揃えは良い', 14), + ('サービスは良好(~o~)m子供連れによい', 12), ('いい湯でした(-.-)zzZZ子供連れによい', 15), + ('品揃えは良い(-_-)zzZZサービスは良好', 15), ('食事はよかった(+_+)サービスは良好', 11), + ('いい湯でした( ..)ヾ食事はよかった', 12), ('品揃えは良い(o-_-o)子供連れによい', 13), + ('いい湯でしたヽ(#゚Д゚)ノサービスは良好', 14), ('品揃えは良い(-_-メ)サービスは良好', 12), + ('食事はよかった(ーー゛)食事はよかった', 11), ('子供連れによい(-’’-)品揃えは良い', 12), + ('食事はよかった(`´)いい湯でした', 10), ('いい湯でしたヽ(#`Д´#)ノサービスは良好', 15), + ('いい湯でした( ̄へ ̄井)食事はよかった', 12), ('サービスは良好(-ε´-。)品揃えは良い', 13), + ('食事はよかった( ̄д ̄)食事はよかった', 11), ('いい湯でした( ´Д`)品揃えは良い', 12), + ('サービスは良好(ノ´□`)ノ品揃えは良い', 13), ('品揃えは良いヽ(*´Д`*)ノ食事はよかった', 15), + ('いい湯でした┐(´〜`;)┌子供連れによい', 14), ('サービスは良好(#´Д`#)食事はよかった', 13), + ('いい湯でしたヽ( ̄▽ ̄)ノ食事はよかった', 13), ('品揃えは良い(*_*)いい湯でした', 11), + ('サービスは良好( -_-)食事はよかった', 12), ('子供連れによい|(-_-)|子供連れによい', 13), + ('サービスは良好(;_;)食事はよかった', 11), ('いい湯でした(T_T)食事はよかった', 11), + ('いい湯でした(/_;)品揃えは良い', 11), ('食事はよかった(ノдヽ)サービスは良好', 11), + ('いい湯でした(TдT)いい湯でした', 11), ('子供連れによい(ノ´□`)いい湯でした', 12), + ('子供連れによいヽ(;´Д`)ノ品揃えは良い', 14), ('食事はよかった(o´_`o)子供連れによい', 13), + ('品揃えは良い(^∧^)子供連れによい', 11), ('子供連れによいm(_ _)m子供連れによい', 13), + ('いい湯でした\(__ )子供連れによい', 12), ('サービスは良好(>_<)子供連れによい', 17), + ('子供連れによい(>_<)品揃えは良い', 17), ('食事はよかった(+д+)サービスは良好', 11), + ('食事はよかった( ̄□ ̄;)子供連れによい', 12), ('子供連れによい(lll ̄□ ̄)食事はよかった', 14), + ('子供連れによいp(^^)q子供連れによい', 12), ('いい湯でした(-o-)y-~~~品揃えは良い', 16), + ('いい湯でした(^_^)y-~~~いい湯でした', 16), ('子供連れによい(’A`)y-~子供連れによい', 14), + ('品揃えは良いφ(..)品揃えは良い', 11), ('食事はよかったφ(`д´)食事はよかった', 12), + ('子供連れによいφ(^o^)食事はよかった', 12), ('食事はよかったVo¥oVサービスは良好', 11), + ('サービスは良好(ΦωΦ)サービスは良好', 11), ('子供連れによいU^ェ^Uいい湯でした', 11), + ('食事はよかった(* ̄(エ) ̄*)いい湯でした', 15), ('いい湯でした( ^_^)/□☆□\(^_^ )食事はよかった', 23), + ('いい湯でした( ^_^)/ o(^o^)o \(^_^ )食事はよかった', 29), ('子供連れによい(/\) \(^o^)/品揃えは良い', 18), + ('品揃えは良い(^_^)/~~~食事はよかった', 15)], + "en_char_after_facemark": [('いい湯でした(^_^)MFG', 11), ('サービスは良好(^o^)USJ', 11), ('いい湯でした(^^)USJ', 10), + ('いい湯でした(^-^)UBS', 11), ('食事はよかった(●^o^●)MFG', 13), ('子供連れによい(^◇^)UFJ', 11), + ('子供連れによい(*^_^*)SGM', 13), ('いい湯でした(*´ー`)SGM', 12), ('品揃えは良い(*´∀`*)DB', 13), + ('品揃えは良い(*´ω`*)UFJ', 13), ('食事はよかった(*´艸`*)USJ', 13), ('品揃えは良い(/ω\)USJ', 11), + ('品揃えは良い(^_^)VSGM', 12), ('子供連れによい(^o^)VSGM', 12), ('サービスは良好ヽ(=´▽`=)ノUFJ', 15), + ('いい湯でしたo(^o^)oMFG', 13), ('品揃えは良い\(^_^ )( ^_^)/DEUTSCH', 20), + ('サービスは良好σ(^_^)DB', 12), ('品揃えは良いσ(´∀`)SSS', 12), ('品揃えは良い!(^^)!DEUTSCH', 12), + ('いい湯でした(^^ゞSSS', 10), ('食事はよかった(^Q^)/DEUTSCH', 12), ('食事はよかった(^^ゝDB', 10), + ('品揃えは良い(^人^)MFG', 11), ('いい湯でした(^_-)-☆USJ', 13), ('サービスは良好(^з^)-☆BOSH', 13), + ('いい湯でした\(◎o◎)/SSS', 13), ('子供連れによい(@_@)MFG', 11), ('品揃えは良い(@_@;)SSS', 12), + ('サービスは良好(# ゚Д゚)SSS', 13), ('子供連れによい( ・_・)MFG', 12), ('サービスは良好(・_・)SGM', 11), + ('サービスは良好(・o・)SSS', 11), ('いい湯でした(。・・。)SGM', 12), ('食事はよかった(゜_゜)DB', 11), + ('品揃えは良い(`_`)ノ゛USJ', 13), ('食事はよかった( ̄ー ̄)bSGM', 12), ('子供連れによい(^-^;USJ', 11), + ('いい湯でした(-_-;)UFJ', 12), ('いい湯でした(=_=;)DEUTSCH', 12), ('品揃えは良い(ノ´Д`)BOSH', 12), + ('サービスは良好(-o-;)USJ', 12), ('いい湯でした(^^;)SGM', 11), ('いい湯でした(^_^;)DEUTSCH', 12), + ('子供連れによい(^o^;)USJ', 12), ('いい湯でした...(((;^^)SSS', 16), ('品揃えは良い(゚ε゚*)UFJ', 12), + ('食事はよかった♪〜( ̄ε ̄;)DB', 14), ('食事はよかった(~o~)mUFJ', 12), ('子供連れによい(-.-)zzZZSGM', 15), + ('いい湯でした(-_-)zzZZMFG', 15), ('サービスは良好(+_+)USJ', 11), ('サービスは良好( ..)ヾUSJ', 12), + ('食事はよかった(o-_-o)SGM', 13), ('子供連れによいヽ(#゚Д゚)ノDEUTSCH', 14), + ('いい湯でした(-_-メ)SSS', 12), ('いい湯でした(ーー゛)DB', 11), ('サービスは良好(-’’-)SSS', 12), + ('サービスは良好(`´)MFG', 10), ('いい湯でしたヽ(#`Д´#)ノUSJ', 15), ('いい湯でした( ̄へ ̄井)USJ', 12), + ('サービスは良好(-ε´-。)UBS', 13), ('子供連れによい( ̄д ̄)SSS', 11), ('いい湯でした( ´Д`)BOSH', 12), + ('食事はよかった(ノ´□`)ノBOSH', 13), 
('子供連れによいヽ(*´Д`*)ノDEUTSCH', 15), + ('食事はよかった┐(´〜`;)┌SSS', 14), ('子供連れによい(#´Д`#)DEUTSCH', 13), + ('いい湯でしたヽ( ̄▽ ̄)ノSGM', 13), ('サービスは良好(*_*)DB', 11), ('サービスは良好( -_-)DB', 12), + ('品揃えは良い|(-_-)|DB', 13), ('サービスは良好(;_;)DEUTSCH', 11), ('いい湯でした(T_T)USJ', 11), + ('いい湯でした(/_;)UFJ', 11), ('品揃えは良い(ノдヽ)BOSH', 11), ('品揃えは良い(TдT)UFJ', 11), + ('食事はよかった(ノ´□`)BOSH', 12), ('いい湯でしたヽ(;´Д`)ノSSS', 14), ('食事はよかった(o´_`o)UFJ', 13), + ('サービスは良好(^∧^)SSS', 11), ('子供連れによいm(_ _)mSSS', 13), ('品揃えは良い\(__ )SSS', 12), + ('子供連れによい(>_<)USJ', 17), ('食事はよかった(>_<)MFG', 17), + ('子供連れによい(+д+)UFJ', 11), ('子供連れによい( ̄□ ̄;)SGM', 12), ('子供連れによい(lll ̄□ ̄)BOSH', 14), + ('品揃えは良いp(^^)qUBS', 12), ('品揃えは良い(-o-)y-~~~BOSH', 16), + ('品揃えは良い(^_^)y-~~~DEUTSCH', 16), ('品揃えは良い(’A`)y-~UFJ', 14), + ('子供連れによいφ(..)MFG', 11), ('サービスは良好φ(`д´)SSS', 12), ('食事はよかったφ(^o^)SSS', 12), + ('食事はよかったVo¥oVUSJ', 11), ('子供連れによい(ΦωΦ)SGM', 11), ('品揃えは良いU^ェ^USGM', 11), + ('品揃えは良い(* ̄(エ) ̄*)UBS', 15), ('サービスは良好( ^_^)/□☆□\(^_^ )BOSH', 23), + ('子供連れによい( ^_^)/ o(^o^)o \(^_^ )UFJ', 29), ('サービスは良好(/\) \(^o^)/USJ', 18), + ('サービスは良好(^_^)/~~~DEUTSCH', 15)] + } + + facemark_detector = FaceMarkDetector() + for pattern_name, tests in self.test_patterns.items(): + fp = tp = fn = 0 + for test_obj in tests: + ann = init_annotation(test_obj[0]) + ann = facemark_detector.annotate(original_text=test_obj[0], + spans=ann) + sb_positions = list(ann.get_annotation_layer(FaceMarkDetector.__name__)) + if len(sb_positions) > 1: + fp += 1 + elif len(sb_positions) == 1 and sb_positions[0].end_index == test_obj[1]: + tp += 1 + elif len(sb_positions) == 1 and sb_positions[0].end_index != test_obj[1]: + fp += 1 + elif len(sb_positions) == 0: + fn += 1 + else: + raise Exception() + p = tp / (tp + fp) + r = tp / (tp + fn) + print(f'{pattern_name} precision:{p} = {tp} / {tp + fp} recall:{r} ={tp} / {tp + fn}') + + def test_annotate(self): + test_instances = [ + TestInstance(text='宿を予約しました\(^o^)/まだ2ヶ月も先だけど。', + n_sentence=2, expected_rules=[FaceMarkDetector.__name__]) + ] + self.is_check_test_instance(annotator=FaceMarkDetector(), test_cases=test_instances) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/annotator/test_indirect_quote_exception_annotator.py b/tests/bunkai_sbd/annotator/test_indirect_quote_exception_annotator.py new file mode 100644 index 0000000..0199838 --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_indirect_quote_exception_annotator.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +import unittest +from collections import namedtuple + +from bunkai.algorithm.bunkai_sbd.annotator.basic_annotator import BasicRule +from bunkai.algorithm.bunkai_sbd.annotator.constant import LAYER_NAME_FIRST +from bunkai.algorithm.bunkai_sbd.annotator.emoji_annotator import \ + EmojiAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.emotion_expression_annotator import \ + EmotionExpressionAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.facemark_detector import \ + FaceMarkDetector +from bunkai.algorithm.bunkai_sbd.annotator.indirect_quote_exception_annotator import \ + IndirectQuoteExceptionAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.linebreak_annotator import \ + LinebreakAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.morph_annotator import \ + MorphAnnotatorJanome +from bunkai.base.annotation import Annotations, SpanAnnotation + +MorphResult = namedtuple('MorphResult', ('input_text', 'seq_linebreak_position')) + + +class TestMorphAnnotator(unittest.TestCase): + def setUp(self) -> None: + self.morph_annotator = MorphAnnotatorJanome() + 
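# The fixture text has a line break inside an indirect quote ('いいの?\nって、'); test_if_linebreak_annotator_false below checks that it does not become a sentence boundary. +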
self.test_input = [ + MorphResult('この値段で、こんな夕飯いいの?\nって、くらいおいしかった!', [29]) + ] + + def init_tokenized_layer(self, text: str) -> Annotations: + annotations = Annotations() + annotations.add_annotation_layer(LAYER_NAME_FIRST, [SpanAnnotation(rule_name='first', + start_index=len(text) - 1, + end_index=len(text), + split_string_type=None, + split_string_value=None)]) + pipeline = [ + BasicRule(), + FaceMarkDetector(), + EmotionExpressionAnnotator(), + EmojiAnnotator() + ] + for annotator in pipeline: + annotator.annotate(text, annotations) + self.morph_annotator.annotate(text, annotations) + return annotations + + def test_if_linebreak_annotator_false(self): + # LinebreakAnnotatorの予測が間違っていた場合のテストケース + text = 'この値段で、こんな夕飯いいの?\nって、くらいおいしかった!' + default_annotation = self.init_tokenized_layer(text) + # LinebreakAnnotatorの疑似出力を作る + pseudo_out_linebreak = [SpanAnnotation( + rule_name=LinebreakAnnotator.__name__, + start_index=15, + end_index=16, + split_string_type=None, + split_string_value=None, + args=None)] + list(default_annotation.get_annotation_layer('first')) + default_annotation.add_annotation_layer(annotations=pseudo_out_linebreak, + annotator_name=LinebreakAnnotator.__name__) + indirect_exception = IndirectQuoteExceptionAnnotator() + result_annotation = indirect_exception.annotate(original_text=text, spans=default_annotation) + self.assertEqual(len(result_annotation.get_final_layer()), 1) + + def test_indirect_quote_exception_annotator(self): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/annotator/test_linebreak_exception_annotator.py b/tests/bunkai_sbd/annotator/test_linebreak_exception_annotator.py new file mode 100644 index 0000000..7f47836 --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_linebreak_exception_annotator.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +import unittest +from collections import namedtuple +from unittest.mock import MagicMock, patch + +from bunkai.algorithm.bunkai_sbd.annotator import MorphAnnotatorJanome +from bunkai.algorithm.bunkai_sbd.annotator.linebreak_annotator import \ + LinebreakAnnotator +from bunkai.base.annotation import Annotations, SpanAnnotation + +from .annotation_test_base import TestAnnotatorBase + +LinebreakTestCase = namedtuple('LinebreakTestCase', ('text', 'char_positions', 'return_value')) + + +class TestLinebreakExceptionAnnotator(TestAnnotatorBase): + def setUp(self) -> None: + self.morph_annotator = MorphAnnotatorJanome() + self.test_sentences = [ + LinebreakTestCase( + 'ペンションの内装もご自分達でリノベーションされたとの事でしたが、とても綺麗で色使いもお洒落☆\n' + 'お風呂も大きく、のんびり出来ました☆\n' + 'また朝ご飯のキッシュがとても美味しくて2人でペロリと頂いてしまいました♪', + [(46, 47), (65, 66)], + [[27, 38, ]] + ), + LinebreakTestCase('お部屋に露天風呂と足湯が付いていて、\n' + 'とっても素敵でした。これからもまた旅行にいくときには、\nぜひ利用したいです。', + [], [[]]) + ] + + def init_tokenized_layer(self, text: str) -> Annotations: + annotations = Annotations() + annotations.add_annotation_layer('first', [SpanAnnotation(rule_name=None, + start_index=0, + end_index=len( + text), + split_string_type=None, split_string_value=None)]) + self.morph_annotator.annotate(text, annotations) + return annotations + + def test_run(self): + path_model = '' + + for test_case in self.test_sentences: + predictor_mock = MagicMock() + predictor_mock.return_value = test_case.return_value + + predictor_init = MagicMock() + predictor_init.return_value = None + + with patch('bunkai.algorithm.lbd.predict.Predictor.__init__', predictor_init): + with patch('bunkai.algorithm.lbd.predict.Predictor.predict', predictor_mock): + splitter_obj = 
LinebreakAnnotator(path_model=path_model) + tokenized_layer = self.init_tokenized_layer(test_case.text) + span_result = splitter_obj.annotate(test_case.text, tokenized_layer) + assert 'LinebreakAnnotator' in span_result.name2spans + results_exception_annotator = [a for a in span_result.get_annotation_layer('LinebreakAnnotator') + if a.rule_name == 'LinebreakAnnotator'] + assert len(results_exception_annotator) == len(test_case.char_positions) + __ = set([(spans.start_index, spans.end_index) for spans in results_exception_annotator]) + assert __ == set(test_case.char_positions) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/annotator/test_morph_annotator.py b/tests/bunkai_sbd/annotator/test_morph_annotator.py new file mode 100644 index 0000000..31b626b --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_morph_annotator.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import unittest +from collections import namedtuple + +from bunkai.algorithm.bunkai_sbd.annotator.morph_annotator import \ + MorphAnnotatorJanome + +from .annotation_test_base import TestAnnotatorBase + +MorphResult = namedtuple('MorphResult', ('input_text', 'seq_newline_position')) + + +class TestMorphAnnotator(TestAnnotatorBase): + def test_tokenize(self): + test_input = [ + MorphResult('宿を予約しました♪!まだ2ヶ月も先だけど。早すぎかな(笑)楽しみです★\n' + '2文書目の先頭行です。▁改行はU+2581で表現します。', [25]) + ] + morph_annotator = MorphAnnotatorJanome() + for case_tuple in test_input: + seq_tokens = list(morph_annotator.tokenizer.tokenize(case_tuple.input_text)) + for position_newline in case_tuple.seq_newline_position: + target_morph = seq_tokens[position_newline].surface + self.assertEqual(target_morph, '\n') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/annotator/test_number_exception_annotator.py b/tests/bunkai_sbd/annotator/test_number_exception_annotator.py new file mode 100644 index 0000000..dfedc83 --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_number_exception_annotator.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +import unittest +from collections import namedtuple + +from bunkai.algorithm.bunkai_sbd.annotator.number_exception_annotator import \ + NumberExceptionAnnotator + +from .annotation_test_base import TestAnnotatorBase, TestInstance + +NewlineTestCase = namedtuple('NewlineTestCase', ('text', 'start_index', 'end_index', 'ans')) + + +class TestNumberExceptionAnnotator(TestAnnotatorBase): + def test_is_exception_no(self): + annotator = NumberExceptionAnnotator() + + self.test_sentences = [ + NewlineTestCase('No.1の商品', 2, 3, True), + NewlineTestCase('ROOM No.411を予約しました。', 7, 8, True) + ] + for test_obj in self.test_sentences: + out_annotator = annotator.is_exception_no(test_obj.text, test_obj.start_index, test_obj.end_index) + self.assertEqual(out_annotator, test_obj.ans) + + def test_annotate(self): + test_cases = [ + TestInstance('No.1の商品', 1, expected_rules=[]), + TestInstance('ROOM No.411を予約しました。', 1, expected_rules=[]), + ] + annotator = NumberExceptionAnnotator() + self.is_check_test_instance(annotator=annotator, test_cases=test_cases) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/annotator/test_numeric_expression_exception_annotator.py b/tests/bunkai_sbd/annotator/test_numeric_expression_exception_annotator.py new file mode 100644 index 0000000..e9d10ee --- /dev/null +++ b/tests/bunkai_sbd/annotator/test_numeric_expression_exception_annotator.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +import unittest +from collections import namedtuple + +from 
bunkai.algorithm.bunkai_sbd.annotator.dot_exception_annotator import \ + DotExceptionAnnotator + +from .annotation_test_base import TestAnnotatorBase, TestInstance + +NewlineTestCase = namedtuple('NewlineTestCase', ('text', 'start_index', 'ans')) + + +class TestDotExceptionAnnotator(TestAnnotatorBase): + def test_is_exception_numeric(self): + annotator = DotExceptionAnnotator() + + self.test_sentences = [ + NewlineTestCase('和室3.5畳', 3, True), + NewlineTestCase('1.5リットル以上のペットボトル', 1, True), + NewlineTestCase('四.五畳以上の大きさ', 1, True), + NewlineTestCase('四五畳以上の大きさ', 1, False), + NewlineTestCase('15メートル以上の大きさ', 1, False), + ] + for test_obj in self.test_sentences: + out_annotator = annotator.is_exception_numeric(test_obj.text, test_obj.start_index) + self.assertEqual(out_annotator, test_obj.ans) + + def test_annotate(self): + annotator = DotExceptionAnnotator() + test_sentences = [ + TestInstance('和室3.5畳', 1, expected_rules=[]), + TestInstance('1.5リットル以上のペットボトル', 1, expected_rules=[]), + TestInstance('四.五畳以上の大きさ', 1, expected_rules=[]), + TestInstance('四五畳以上の大きさ', 1, expected_rules=[]), + TestInstance('15メートル以上の大きさ', 1, expected_rules=[]), + ] + self.is_check_test_instance(annotator=annotator, test_cases=test_sentences) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/bunkai_sbd/test_bukai_sbd.py b/tests/bunkai_sbd/test_bukai_sbd.py new file mode 100644 index 0000000..caecb9e --- /dev/null +++ b/tests/bunkai_sbd/test_bukai_sbd.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +import dataclasses +import typing +import unittest + +from bunkai.algorithm.bunkai_sbd.annotator.basic_annotator import BasicRule +from bunkai.algorithm.bunkai_sbd.annotator.constant import LAYER_NAME_FIRST +from bunkai.algorithm.bunkai_sbd.annotator.emoji_annotator import \ + EmojiAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.emotion_expression_annotator import \ + EmotionExpressionAnnotator +from bunkai.algorithm.bunkai_sbd.annotator.facemark_detector import \ + FaceMarkDetector +from bunkai.algorithm.bunkai_sbd.annotator.linebreak_force_annotator import \ + LinebreakForceAnnotator +from bunkai.algorithm.bunkai_sbd.bunkai_sbd import \ + BunkaiSentenceBoundaryDisambiguation +from bunkai.base.annotation import Annotations, SpanAnnotation + + +@dataclasses.dataclass +class TestInstance(object): + text: str + n_sentence: int + expected_rules: typing.Optional[typing.List[str]] = None + + +def monkeypatch_function(self, original_text: str, spans: Annotations): + # tokenizerを更新する。すでにTokenize済みの結果を利用する。 + # self.linebreak_detector.reset_tokenizer(word_tokenizer_type='pre_tokenize', sentence2tokens=sentence2tokens) + result: typing.List[int] = [] + + new_spans = [s for s in spans.get_final_layer()] + for predicted_index in result: + char_index_start = new_spans[predicted_index].start_index + char_index_end = new_spans[predicted_index].end_index + ann = SpanAnnotation( + rule_name='LinebreakExceptionAnnotator', + start_index=char_index_start, + end_index=char_index_end, + split_string_type='linebreak', + split_string_value=original_text[char_index_start: char_index_end]) + new_spans.append(ann) + spans.add_annotation_layer(self.rule_name, new_spans) + + return spans + + +class TestBunkaiSbd(unittest.TestCase): + def setUp(self) -> None: + self.test_sentences = [ + TestInstance("まずは一文目(^!^)つぎに二文目(^^)これ、テスト文なんですけど(笑)本当?にこんなテキストでいいのかな☆" + "10秒で考えて書いたよ." 
+ "(セルフドリンクサービスはすごく良かったです!種類も豊富。)" + "おすすめ度No.1の和室3.5畳はあります。" + "おしまい♪" + "読点で文を分割するという場合もあります、しかし現在は対応していません。" + "(^ ^) (^ ^) 先月泊まりましたが とてもよかったです。" + "(近日中には冷房に切り替わる予定です。\nいいですね(泣))", 11), + TestInstance("30代の夫婦2組(男2.女2)です。", 2), + TestInstance("この値段で、こんな夕飯いいの?\nって、くらいおいしかった!", 1), + ] + + def test_various_inputs(self): + """Inputのバリエーションをテストする。Bertモデルを利用しない。""" + test_cases = [ + TestInstance("これが1文目です。。。。そして、これが2文目…3文目。", 3, + expected_rules=[EmotionExpressionAnnotator.__name__, BasicRule.__name__, LAYER_NAME_FIRST]), + TestInstance("宿を予約しました♪!まだ2ヶ月も先だけど。早すぎかな(笑)楽しみです★", 4, + expected_rules=[EmotionExpressionAnnotator.__name__, BasicRule.__name__, LAYER_NAME_FIRST]), + TestInstance("宿を予約しました😄まだ2ヶ月も先だけど😄早すぎかな(笑)楽しみです★", 4, + expected_rules=[EmotionExpressionAnnotator.__name__, EmojiAnnotator.__name__, + LAYER_NAME_FIRST]), + TestInstance("宿を予約しました😄😄😄まだ2ヶ月も先だけど😄😄😄早すぎかな(笑)楽しみです★", 4, + expected_rules=[EmotionExpressionAnnotator.__name__, EmojiAnnotator.__name__, + LAYER_NAME_FIRST]), + TestInstance("宿を予約しました\(^o^)/まだ2ヶ月も先だけど。早すぎかな(笑)楽しみです★", 4, + expected_rules=[EmotionExpressionAnnotator.__name__, BasicRule.__name__, + FaceMarkDetector.__name__, LAYER_NAME_FIRST]), + TestInstance("この値段で、こんな夕飯いいの?\nって、くらいおいしかった!", 2, + expected_rules=[LAYER_NAME_FIRST, LinebreakForceAnnotator.__name__]), + TestInstance('これは入力の入力サンプルです(^o^)絵文字の文末記号も認識します😀引用文も大丈夫?と思いませんか?引用文の過剰分割を防げるんです👍', + 4, expected_rules=[BasicRule.__name__, + FaceMarkDetector.__name__, + EmojiAnnotator.__name__, + LAYER_NAME_FIRST]), + TestInstance('本商品はおすすめ度No.1です。', 1, expected_rules=[LAYER_NAME_FIRST]), + TestInstance('本商品はおすすめ度No.1です!という売り文句の新商品が出ている。しかし、この商品は本当に信用できるのだろうか?私はとても懐疑的である。', + 3, expected_rules=[BasicRule.__name__, LAYER_NAME_FIRST]) + ] + splitter_obj = BunkaiSentenceBoundaryDisambiguation(path_model=None) + for test_case in test_cases: + self.assertEqual(len(list(splitter_obj(test_case.text))), test_case.n_sentence, + msg=f'Input={test_case.text} Expect N(sent)={test_case.n_sentence} ' + f'Result={list(splitter_obj(test_case.text))}') + annotations = splitter_obj._eos(test_case.text) + span_annotations = annotations.get_final_layer() + self.assertEqual(set([s.rule_name for s in span_annotations]), # type: ignore + set(test_case.expected_rules), # type: ignore + msg=f'text={test_case.text}, ' # type: ignore + f'{set([s.rule_name for s in span_annotations])} ' + f'!= {test_case.expected_rules}') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/experiment/__init__.py b/tests/experiment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/experiment/test_evaluate.py b/tests/experiment/test_evaluate.py new file mode 100644 index 0000000..7711d5d --- /dev/null +++ b/tests/experiment/test_evaluate.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +import dataclasses +import unittest + +from bunkai.constant import METACHAR_LINE_BREAK, METACHAR_SENTENCE_BOUNDARY +from bunkai.experiment.evaluate import trim + + +@dataclasses.dataclass +class TestInstance(object): + input: str + output: str + + +class TestTrim(unittest.TestCase): + def test_trim(self): + SB = METACHAR_SENTENCE_BOUNDARY + LB = METACHAR_LINE_BREAK + test_cases = [ + TestInstance(f'yy {SB}zz', f'yy {SB}zz'), + TestInstance(f'yy {LB}{SB}zz', f'yy {LB}{SB}zz'), + TestInstance(f'yy {LB}{SB} zz', f'yy {LB} {SB}zz'), + TestInstance(f'yy {LB} {SB} zz', f'yy {LB} {SB}zz'), + + TestInstance(f'yy{SB} zz', f'yy {SB}zz'), + TestInstance(f'yy{SB} zz', f'yy {SB}zz'), + + TestInstance('yy zz', 'yy zz'), + TestInstance('yy zz', 'yy zz'), + + 
TestInstance(f'yy {LB}zz', f'yy {LB}zz'), + TestInstance(f'yy {LB} zz', f'yy {LB} zz'), + + TestInstance(f'y{SB}{LB}{SB}zz', f'y{LB}{SB}zz'), + ] + for tc in test_cases: + self.assertEqual(trim(tc.input), tc.output) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/lbd/__init__.py b/tests/lbd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/lbd/test_predict.py b/tests/lbd/test_predict.py new file mode 100644 index 0000000..0534b13 --- /dev/null +++ b/tests/lbd/test_predict.py @@ -0,0 +1,728 @@ +#!/usr/bin/env python3 +import dataclasses +import pathlib +import typing +import unittest +from unittest.mock import MagicMock, Mock, patch + +import numpy +import torch + +import bunkai +from bunkai.algorithm.bunkai_sbd.annotator import MorphAnnotatorJanome +from bunkai.base.annotation import Annotations, SpanAnnotation +from bunkai.constant import METACHAR_LINE_BREAK +from bunkai.third.utils_ner import InputFeatures + + +@dataclasses.dataclass +class ReturnObject(object): + predictions: typing.Union[numpy.ndarray, typing.Tuple[numpy.ndarray, typing.Any]] + label_ids: numpy.ndarray + metrics: typing.Dict[str, float] + labels: typing.List[str] + label_map: typing.Dict[int, str] + tokenizer: typing.List[typing.List[str]] + subwords_lengths: typing.List[typing.List[int]] + dataset_content: typing.List[InputFeatures] + subword_tokens: typing.List[str] + model_type: str + + def to_prediction_tuple(self): + return self.predictions, self.label_ids, self.metrics + + +@dataclasses.dataclass +class NewlineTestCase(object): + text: str + return_value: ReturnObject + + +@dataclasses.dataclass +class DummyBcObject(object): + max_seq_length = 352 + + +class DummyJanomeSubwordsTokenizer(object): + subword_tokenizer_type = "wordpiece" + do_subword_tokenize: bool = True + never_split = None + unk_token = "[UNK]" + sep_token = "[SEP]" + pad_token = "[PAD]" + cls_token = "[CLS]" + mask_token = "[MASK]" + + +def func_generate_dummy_bert_prediction() -> ReturnObject: + input_ids = numpy.array([[-100, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, -100, -100, 0, 0, -100, -100, -100, -100, -100, -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100]]) + + bert_prediction_dummy = ReturnObject( + predictions=numpy.array([[ + [6.2508187, -4.232818, -2.757059], [6.1650567, -3.9524226, -2.8288684], + [5.8380218, -3.6578689, -2.6578376], [6.572264, -3.9870892, -2.8709047], + [6.4894476, -3.9268737, -2.9671426], [6.216006, -3.7496064, -2.699235], + [6.154592, -3.768798, -2.9045649], [6.713662, -4.110723, -3.033975], + [6.4546986, -4.302385, -2.8338246], [6.654306, -4.3293185, -2.6386347], + [6.471306, -4.2099767, -2.675298], [6.5739822, -4.2123647, -2.610855], + [6.56117, -4.2072744, -2.7977717], [5.5703177, -3.9187171, -2.5862396], + [-1.884768, 4.4782224, -1.8186623], [5.8886337, -3.8386352, -2.851794], + [6.1382294, -4.0328712, -2.8980045], [5.56992, -3.6112833, -2.725328], + [6.1413136, -4.0054746, -3.0763247], [5.949703, -3.7203593, -2.748416], + [6.490921, -4.10034, -2.882184], [6.5604143, -4.187022, -2.7890666], + [6.594881, -4.082993, -2.8291895], [6.5816226, -4.3966985, -2.8815534], + [6.4179454, -4.2702456, -2.9540753], [5.451252, -3.8440175, -2.5752163], + [5.04419, -3.9542036, -2.1646724], [3.4635344, -1.968563, -2.199682], + [4.8015337, -2.961069, -2.4906867], [5.642599, -3.646186, -2.6289954], + [5.2767353, -3.545886, -2.642362], [5.0331793, -3.3589022, -2.6159847], + [5.4294004, -3.643956, -2.6506023], [4.5621023, -3.094194, -2.4944196], + [4.1612453, -2.6053162, -2.4269052], [5.314385, -3.575482, -2.6072056], + [5.7357044, -3.812284, -2.5930557], [5.507862, -3.702612, -2.5583594], + [4.6684365, -3.2131882, -2.445125], [3.5338802, -2.0267015, -2.207005], + [5.7340727, -3.7640634, -2.584625], [5.6197987, -3.716104, -2.5262532], + [5.47165, -3.8265584, -2.683898], [5.622836, -3.8941417, -2.4897096], + [5.7662735, -3.5816152, -2.5766578], [6.1038303, -3.778536, -2.5087128], + [5.8932824, -3.5206861, -2.5480444], [5.9496975, -3.6596575, -2.7018757], + [5.80085, -3.8926811, -2.7254941], [5.2340307, -3.5927713, -2.7278686], + [5.1017323, -3.3140123, -2.598034], [5.8979063, -3.754191, -2.4886804], + [6.0222745, -3.7422712, -2.5321903], [6.018483, -3.754891, -2.6665978], + [5.704626, -3.542376, -2.654388], [4.6625023, -2.9835098, -2.541861], + [5.851006, -3.9127822, -2.657012], [5.7435713, -3.8231273, -2.593646], + [5.4913816, -3.65688, -2.581519], [5.5846696, -3.8045363, -2.5105321], + [5.7596455, -3.856001, -2.5696197], [5.2555804, -3.6392345, -2.5121176], + [3.876574, -2.5438776, -2.222461], [3.7917795, -2.0205114, -2.2313485], + [5.4242163, -3.5941799, -2.5328736], [5.375439, -3.5381823, -2.5158625], + [5.4592915, -3.702542, -2.6168754], [5.560192, -3.833506, -2.513876], + [5.6088414, -3.6553125, -2.5693743], [5.9053307, -3.8074007, -2.5359588], + [5.9091473, -3.8741407, -2.577005], [5.8823833, -3.9749475, -2.7565668], + [5.6488376, -3.8580503, -2.6887958], [5.2936573, -3.6095896, -2.6221497], + [4.391178, -2.9584074, -2.444511], [3.6910276, -2.122798, -2.2480416], + [4.749332, -3.0014434, -2.4438312], [5.4725676, -3.572462, -2.5331836], + [5.383877, -3.459734, -2.5013256], [4.7639284, -3.1252341, -2.3972554], + 
[3.0694826, -1.5774779, -2.004921], [4.904317, -3.1489391, -2.5000887], + [5.3118916, -3.5204654, -2.4224253], [5.180787, -3.576025, -2.5785341], + [5.114481, -3.4627273, -2.5771754], [5.3889327, -3.5888848, -2.5031228], + [5.6680975, -3.6847317, -2.5290437], [5.138223, -3.5371237, -2.4942274], + [4.500623, -3.0357556, -2.3804736], [4.391722, -2.5864615, -2.312683], + [5.499447, -3.6149004, -2.4913578], [5.5458784, -3.731234, -2.5449672], + [5.3306437, -3.7202172, -2.6340237], [5.555006, -3.902042, -2.5700445], + [5.5402975, -3.5568285, -2.5195646], [5.923624, -3.8101199, -2.5412364], + [5.966515, -3.813748, -2.5135388], [5.8475056, -3.5183058, -2.591], + [6.0184994, -3.9749384, -2.7445848], [5.477597, -3.7229378, -2.662457], + [5.360407, -3.6949844, -2.6515026], [5.664912, -3.8135028, -2.5601358], + [5.4726477, -3.3598118, -2.4900548], [5.716467, -3.51123, -2.4441009], + [5.9715557, -3.444706, -2.381342], [5.684905, -3.407822, -2.4511132], + [5.652356, -3.5395107, -2.566182], [6.086296, -4.0143557, -2.757162], + [5.93272, -3.913489, -2.8516464], [5.513011, -3.4940288, -2.4365187], + [5.847448, -3.7215633, -2.3947792], [5.8946314, -3.8241282, -2.54148], + [5.750376, -3.7425175, -2.5400023], [4.2657037, -2.7419717, -2.2960308], + [2.803505, -1.1151873, -2.0724204], [5.069028, -3.2909331, -2.334376], + [5.1803617, -3.453929, -2.5205479], [5.4511433, -3.702659, -2.574565], + [5.3047314, -3.4712682, -2.4875286], [5.6509314, -3.7447715, -2.5951982], + [4.6699295, -3.1814604, -2.428188], [3.7801378, -2.4215903, -2.232263], + [5.882105, -3.9000041, -2.7211075], [5.4890738, -3.6392746, -2.5046637], + [5.5276957, -3.803844, -2.6643615], [5.5274057, -3.8214302, -2.472117], + [5.697824, -3.462097, -2.537269], [5.957882, -3.4408717, -2.3871222], + [5.7239294, -3.427033, -2.4772236], [5.7963157, -3.564792, -2.6691947], + [6.039237, -3.963026, -2.7902536], [5.920341, -4.0087867, -2.9042587], + [5.5651474, -3.5472107, -2.4891918], [5.8184776, -3.666483, -2.4670477], + [5.9809966, -3.7833667, -2.6582336], [5.647328, -3.433617, -2.5941267], + [5.5610843, -3.4891434, -2.6713898], [5.927872, -3.8813362, -2.7214065], + [5.785965, -3.7625728, -2.7681732], [5.5795455, -3.5510294, -2.4824677], + [5.658676, -3.582807, -2.3650568], [5.5469546, -3.691554, -2.5050597], + [4.9479814, -3.3915858, -2.4030986], [2.3329203, -1.131766, -1.8485751], + [4.9787626, -3.2024384, -2.483747], [4.911786, -3.1755145, -2.3659158], + [5.1218967, -3.4676068, -2.5503993], [5.3810396, -3.6829956, -2.4702733], + [5.547564, -3.4910274, -2.545243], [5.7789097, -3.722993, -2.4507918], + [5.970089, -3.8823137, -2.4802265], [5.9492865, -4.123867, -2.6572356], + [5.6568136, -3.8770761, -2.729714], [4.3428807, -2.8855278, -2.3944554], + [3.3280978, -1.8396173, -2.1001978], [4.402819, -2.6880207, -2.3907475], + [5.33469, -3.4014668, -2.4767218], [5.344347, -3.4569607, -2.5946865], + [5.16568, -3.5411572, -2.5953364], [4.793007, -3.216085, -2.473689], + [4.2960186, -2.7347312, -2.3489754], [3.805702, -2.1488824, -2.2282677], + [4.8418455, -3.0454865, -2.429937], [5.323112, -3.394518, -2.4346614], + [5.0180244, -3.275909, -2.4122648], [2.9661903, -1.4233906, -1.8672131], + [4.931182, -3.1389503, -2.4102225], [4.9693556, -3.3392224, -2.3979883], + [4.7797327, -3.1475272, -2.4123082], [4.9508667, -3.3479035, -2.4246244], + [5.3461943, -3.5120308, -2.403717], [4.7816477, -3.1765049, -2.3775845], + [3.5539727, -2.0844865, -2.1079192], [4.6387863, -2.7948396, -2.327755], + [5.2454753, -3.5064278, -2.4758189], [5.34525, -3.6765518, -2.5685573], + 
[5.3302755, -3.6907237, -2.3332164], [5.5514193, -3.3656187, -2.4497824], + [5.9280806, -3.4156218, -2.3316245], [5.6186748, -3.4095483, -2.409141], + [5.5813913, -3.4803317, -2.5405798], [6.018386, -4.0123005, -2.7944303], + [5.791168, -3.8674788, -2.84371], [5.5173597, -3.5319824, -2.486122], + [5.699461, -3.6407382, -2.38972], [5.5982647, -3.7332854, -2.5377173], + [5.2531133, -3.6009998, -2.52218], [3.1787467, -1.9656836, -2.093604], + [4.40965, -2.5246685, -2.265173], [4.986265, -3.2378364, -2.3466463], + [5.221637, -3.5330796, -2.599038], [5.4330263, -3.6381497, -2.5219502], + [5.4239407, -3.537006, -2.494265], [5.7460465, -3.6373107, -2.4902682], + [5.8602858, -3.7709167, -2.4536014], [6.177119, -4.1630287, -2.6641548], + [6.007184, -4.1391425, -2.7746332], [5.234189, -3.6090078, -2.6703088], + [4.458186, -2.7931972, -2.487601], [5.560937, -3.7051048, -2.4895868], + [5.6427107, -3.4239001, -2.5180545], [5.985802, -3.4383836, -2.3521647], + [5.705908, -3.4290183, -2.4250975], [5.633841, -3.4536667, -2.5365653], + [6.068409, -4.0221148, -2.7998438], [5.9640284, -3.9706354, -2.9187305], + [5.5351458, -3.5053978, -2.4735668], [5.819968, -3.7009068, -2.4335914], + [5.7983685, -3.7754993, -2.5713015], [5.6510377, -3.7154012, -2.5723357], + [3.1107492, -1.7932931, -2.038206], [4.6108003, -2.809266, -2.3758132], + [5.129378, -3.334814, -2.3553529], [5.27525, -3.511778, -2.542399], + [5.553991, -3.7560308, -2.5243082], [5.502201, -3.4396672, -2.5307422], + [5.932541, -3.7612128, -2.6955643], [5.511362, -3.666332, -2.6930692], + [5.7192364, -3.4919772, -2.5201082], [5.9494834, -3.9199066, -2.8064528], + [5.989796, -4.028501, -2.9356284], [5.6844234, -3.7073665, -2.6559854], + [5.7420044, -3.6841903, -2.4219987], [6.019269, -3.851921, -2.5968761], + [6.012556, -3.7651906, -2.7302198], [5.22493, -3.475544, -2.7187457], + [3.7751057, -2.2496824, -2.357636], [5.0919046, -3.3189614, -2.3565404], + [5.065481, -3.4329944, -2.5660634], [5.425741, -3.6887774, -2.519134], + [5.4049864, -3.6632752, -2.5491867], [5.7636952, -3.6867075, -2.525423], + [5.069643, -3.6292334, -2.5354824], [4.145218, -2.7678344, -2.3718739], + [5.6274185, -3.743704, -2.6409373], [5.338847, -3.5374503, -2.5367274], + [5.486684, -3.7471037, -2.6338878], [5.5501976, -3.8444057, -2.492587], + [5.680416, -3.5932336, -2.565421], [5.806082, -3.7560775, -2.4588132], + [6.016405, -3.9916434, -2.5753489], [5.8762383, -4.095117, -2.6915278], + [5.441005, -3.7025, -2.6978788], [3.8021955, -2.3089309, -2.2744963], + [3.295628, -1.7485684, -2.1460679], [3.8095417, -2.1189125, -2.2619543], + [5.4892044, -3.5187688, -2.5719445], [5.4801655, -3.616661, -2.614462], + [4.9412575, -3.1965845, -2.5532193], [5.1379337, -3.4174705, -2.5731788], + [4.9215374, -3.290681, -2.5169702], [4.978571, -3.3317158, -2.5741048], + [5.4809895, -3.7299604, -2.5985618], [5.4321156, -3.5919714, -2.5068498], + [4.9712186, -3.3998384, -2.4638414], [3.131559, -1.8154464, -2.0161948], + [4.20892, -2.32469, -2.24137], [5.3603983, -3.5648031, -2.6062012], + [4.770118, -3.3103986, -2.570477], [4.701253, -3.1281867, -2.5695343], + [5.692337, -3.640839, -2.4616065], [5.45103, -3.6451797, -2.5184312], + [5.107309, -3.4940763, -2.478231], [2.6783373, -1.4480876, -1.8975571], + [5.1807585, -3.3610148, -2.513155], [5.0343246, -3.2504637, -2.3884437], + [5.273611, -3.5527153, -2.5969102], [5.4621787, -3.6608934, -2.5238476], + [5.4608717, -3.5109582, -2.4960275], [5.82248, -3.7629507, -2.517388], + [5.7347484, -3.8859196, -2.5711641], [6.112766, -4.150079, -2.6801968], + 
[5.705344, -3.8973122, -2.7257922], [4.6984296, -3.2733493, -2.5083308], + [4.3192806, -2.8144884, -2.431129], [3.3805945, -1.8914232, -2.134581], + [5.3352375, -3.2841783, -2.4582448], [5.7989273, -3.6409295, -2.5358305], + [5.648154, -3.3823075, -2.5451796], [5.525385, -3.6856186, -2.5623384], + [5.047093, -3.3972507, -2.5808408], [4.6922903, -3.1120577, -2.520347], + [4.454125, -2.7146144, -2.4370828], [5.8160934, -3.7015703, -2.4822454], + [5.842537, -3.7719693, -2.5903497], [5.584236, -3.6224103, -2.578228], + [4.6949444, -3.0575066, -2.4430642], [3.7849636, -2.0093102, -2.2221537], + [5.34052, -3.4516811, -2.3648925], [5.4426284, -3.7016723, -2.6079993], + [5.575471, -3.8038383, -2.6187627], [5.374604, -3.4713042, -2.472371], + [5.573716, -3.7265248, -2.549485], [4.758337, -3.2580361, -2.4734373], + [3.182425, -1.8679427, -2.1192076], [5.5819745, -3.679071, -2.5723608], + [5.2793016, -3.4560566, -2.421187], [5.5433426, -3.7827773, -2.6599064], + [5.5916505, -3.8679109, -2.5255156], [5.497374, -3.498955, -2.525767], + [5.8535748, -3.5889792, -2.5111132], [5.98721, -3.4372945, -2.3722787], + [5.8036013, -3.5048394, -2.530361], [5.953143, -3.8283353, -2.7150617], + [6.1119184, -4.0932183, -2.832982], [5.959545, -4.0726967, -2.8936415], + [5.6262956, -3.5740123, -2.5001867], [5.802416, -3.625824, -2.6191945], + [6.1741114, -3.677545, -2.5701354], [6.019506, -3.5576932, -2.555736], + [5.670724, -3.441112, -2.5689387], [5.865163, -3.8021712, -2.7791395], + [6.131137, -4.057909, -2.8261504], [5.7952104, -3.6973677, -2.644189], + [5.7676497, -3.6922674, -2.5299058], [5.883764, -3.8301828, -2.5025594], + [5.574148, -3.6859841, -2.5446942], [4.5494995, -3.0782628, -2.4074235], + [2.3880472, -1.2455968, -1.9546468], [5.3021474, -3.4626734, -2.5357323]] + ], dtype=numpy.float32), + label_ids=input_ids, + metrics={'eval_loss': 3.2364680767059326}, + labels=['O', 'LB_SEP', 'LB_NS'], + label_map={0: 'O', 1: 'LB_SEP', 2: 'LB_NS'}, + tokenizer=[ + ['ラウンジ', 'も', '気軽', 'に', '利用', 'でき', '、', '申し分', 'ない', 'です', '。', + '▁', 'ホテル', '内', 'の', '部屋', 'も', 'ゆったり', 'でき', 'まし', 'た', '。']], + subwords_lengths=[[1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]], + subword_tokens=['[CLS]', 'ラウンジ', 'も', '気', '##軽', 'に', '利用', 'でき', '、', '申し', '##分', 'ない', 'です', '。', '▁', + 'ホテル', '内', 'の', '部屋', 'も', 'ゆ', '##ったり', 'でき', 'まし', 'た', '。', '[SEP]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 
'[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]'], + dataset_content=[InputFeatures(input_ids=[2, 25018, 28, 704, 29505, 7, 666, 203, 6, + 4482, 28593, 80, 2992, 8, 1, 3228, 186, 5, + 3250, 28, 1223, 21087, 203, 3913, 10, 8, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + label_ids=list(input_ids.tolist()), + attention_mask=[], + token_type_ids=[], + document_id='0-1')], + model_type='bert' + ) + return bert_prediction_dummy + + +def func_generate_dummy_distil_bert_prediction() -> ReturnObject: + input_ids = numpy.array([[-100, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, -100, -100, 0, 0, -100, -100, -100, -100, -100, -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, + -100, + -100]]) + dummy_object = ReturnObject( + predictions=numpy.array([[ + [6.2508187, -4.232818, -2.757059], [6.1650567, -3.9524226, -2.8288684], + [5.8380218, -3.6578689, -2.6578376], [6.572264, -3.9870892, -2.8709047], + [6.4894476, -3.9268737, -2.9671426], [6.216006, -3.7496064, -2.699235], + 
[6.154592, -3.768798, -2.9045649], [6.713662, -4.110723, -3.033975], + [6.4546986, -4.302385, -2.8338246], [6.654306, -4.3293185, -2.6386347], + [6.471306, -4.2099767, -2.675298], [6.5739822, -4.2123647, -2.610855], + [6.56117, -4.2072744, -2.7977717], [5.5703177, -3.9187171, -2.5862396], + [-1.884768, 4.4782224, -1.8186623], [5.8886337, -3.8386352, -2.851794], + [6.1382294, -4.0328712, -2.8980045], [5.56992, -3.6112833, -2.725328], + [6.1413136, -4.0054746, -3.0763247], [5.949703, -3.7203593, -2.748416], + [6.490921, -4.10034, -2.882184], [6.5604143, -4.187022, -2.7890666], + [6.594881, -4.082993, -2.8291895], [6.5816226, -4.3966985, -2.8815534], + [6.4179454, -4.2702456, -2.9540753], [5.451252, -3.8440175, -2.5752163], + [5.04419, -3.9542036, -2.1646724], [3.4635344, -1.968563, -2.199682], + [4.8015337, -2.961069, -2.4906867], [5.642599, -3.646186, -2.6289954], + [5.2767353, -3.545886, -2.642362], [5.0331793, -3.3589022, -2.6159847], + [5.4294004, -3.643956, -2.6506023], [4.5621023, -3.094194, -2.4944196], + [4.1612453, -2.6053162, -2.4269052], [5.314385, -3.575482, -2.6072056], + [5.7357044, -3.812284, -2.5930557], [5.507862, -3.702612, -2.5583594], + [4.6684365, -3.2131882, -2.445125], [3.5338802, -2.0267015, -2.207005], + [5.7340727, -3.7640634, -2.584625], [5.6197987, -3.716104, -2.5262532], + [5.47165, -3.8265584, -2.683898], [5.622836, -3.8941417, -2.4897096], + [5.7662735, -3.5816152, -2.5766578], [6.1038303, -3.778536, -2.5087128], + [5.8932824, -3.5206861, -2.5480444], [5.9496975, -3.6596575, -2.7018757], + [5.80085, -3.8926811, -2.7254941], [5.2340307, -3.5927713, -2.7278686], + [5.1017323, -3.3140123, -2.598034], [5.8979063, -3.754191, -2.4886804], + [6.0222745, -3.7422712, -2.5321903], [6.018483, -3.754891, -2.6665978], + [5.704626, -3.542376, -2.654388], [4.6625023, -2.9835098, -2.541861], + [5.851006, -3.9127822, -2.657012], [5.7435713, -3.8231273, -2.593646], + [5.4913816, -3.65688, -2.581519], [5.5846696, -3.8045363, -2.5105321], + [5.7596455, -3.856001, -2.5696197], [5.2555804, -3.6392345, -2.5121176], + [3.876574, -2.5438776, -2.222461], [3.7917795, -2.0205114, -2.2313485], + [5.4242163, -3.5941799, -2.5328736], [5.375439, -3.5381823, -2.5158625], + [5.4592915, -3.702542, -2.6168754], [5.560192, -3.833506, -2.513876], + [5.6088414, -3.6553125, -2.5693743], [5.9053307, -3.8074007, -2.5359588], + [5.9091473, -3.8741407, -2.577005], [5.8823833, -3.9749475, -2.7565668], + [5.6488376, -3.8580503, -2.6887958], [5.2936573, -3.6095896, -2.6221497], + [4.391178, -2.9584074, -2.444511], [3.6910276, -2.122798, -2.2480416], + [4.749332, -3.0014434, -2.4438312], [5.4725676, -3.572462, -2.5331836], + [5.383877, -3.459734, -2.5013256], [4.7639284, -3.1252341, -2.3972554], + [3.0694826, -1.5774779, -2.004921], [4.904317, -3.1489391, -2.5000887], + [5.3118916, -3.5204654, -2.4224253], [5.180787, -3.576025, -2.5785341], + [5.114481, -3.4627273, -2.5771754], [5.3889327, -3.5888848, -2.5031228], + [5.6680975, -3.6847317, -2.5290437], [5.138223, -3.5371237, -2.4942274], + [4.500623, -3.0357556, -2.3804736], [4.391722, -2.5864615, -2.312683], + [5.499447, -3.6149004, -2.4913578], [5.5458784, -3.731234, -2.5449672], + [5.3306437, -3.7202172, -2.6340237], [5.555006, -3.902042, -2.5700445], + [5.5402975, -3.5568285, -2.5195646], [5.923624, -3.8101199, -2.5412364], + [5.966515, -3.813748, -2.5135388], [5.8475056, -3.5183058, -2.591], + [6.0184994, -3.9749384, -2.7445848], [5.477597, -3.7229378, -2.662457], + [5.360407, -3.6949844, -2.6515026], [5.664912, -3.8135028, -2.5601358], + [5.4726477, 
-3.3598118, -2.4900548], [5.716467, -3.51123, -2.4441009], + [5.9715557, -3.444706, -2.381342], [5.684905, -3.407822, -2.4511132], + [5.652356, -3.5395107, -2.566182], [6.086296, -4.0143557, -2.757162], + [5.93272, -3.913489, -2.8516464], [5.513011, -3.4940288, -2.4365187], + [5.847448, -3.7215633, -2.3947792], [5.8946314, -3.8241282, -2.54148], + [5.750376, -3.7425175, -2.5400023], [4.2657037, -2.7419717, -2.2960308], + [2.803505, -1.1151873, -2.0724204], [5.069028, -3.2909331, -2.334376], + [5.1803617, -3.453929, -2.5205479], [5.4511433, -3.702659, -2.574565], + [5.3047314, -3.4712682, -2.4875286], [5.6509314, -3.7447715, -2.5951982], + [4.6699295, -3.1814604, -2.428188], [3.7801378, -2.4215903, -2.232263], + [5.882105, -3.9000041, -2.7211075], [5.4890738, -3.6392746, -2.5046637], + [5.5276957, -3.803844, -2.6643615], [5.5274057, -3.8214302, -2.472117], + [5.697824, -3.462097, -2.537269], [5.957882, -3.4408717, -2.3871222], + [5.7239294, -3.427033, -2.4772236], [5.7963157, -3.564792, -2.6691947], + [6.039237, -3.963026, -2.7902536], [5.920341, -4.0087867, -2.9042587], + [5.5651474, -3.5472107, -2.4891918], [5.8184776, -3.666483, -2.4670477], + [5.9809966, -3.7833667, -2.6582336], [5.647328, -3.433617, -2.5941267], + [5.5610843, -3.4891434, -2.6713898], [5.927872, -3.8813362, -2.7214065], + [5.785965, -3.7625728, -2.7681732], [5.5795455, -3.5510294, -2.4824677], + [5.658676, -3.582807, -2.3650568], [5.5469546, -3.691554, -2.5050597], + [4.9479814, -3.3915858, -2.4030986], [2.3329203, -1.131766, -1.8485751], + [4.9787626, -3.2024384, -2.483747], [4.911786, -3.1755145, -2.3659158], + [5.1218967, -3.4676068, -2.5503993], [5.3810396, -3.6829956, -2.4702733], + [5.547564, -3.4910274, -2.545243], [5.7789097, -3.722993, -2.4507918], + [5.970089, -3.8823137, -2.4802265], [5.9492865, -4.123867, -2.6572356], + [5.6568136, -3.8770761, -2.729714], [4.3428807, -2.8855278, -2.3944554], + [3.3280978, -1.8396173, -2.1001978], [4.402819, -2.6880207, -2.3907475], + [5.33469, -3.4014668, -2.4767218], [5.344347, -3.4569607, -2.5946865], + [5.16568, -3.5411572, -2.5953364], [4.793007, -3.216085, -2.473689], + [4.2960186, -2.7347312, -2.3489754], [3.805702, -2.1488824, -2.2282677], + [4.8418455, -3.0454865, -2.429937], [5.323112, -3.394518, -2.4346614], + [5.0180244, -3.275909, -2.4122648], [2.9661903, -1.4233906, -1.8672131], + [4.931182, -3.1389503, -2.4102225], [4.9693556, -3.3392224, -2.3979883], + [4.7797327, -3.1475272, -2.4123082], [4.9508667, -3.3479035, -2.4246244], + [5.3461943, -3.5120308, -2.403717], [4.7816477, -3.1765049, -2.3775845], + [3.5539727, -2.0844865, -2.1079192], [4.6387863, -2.7948396, -2.327755], + [5.2454753, -3.5064278, -2.4758189], [5.34525, -3.6765518, -2.5685573], + [5.3302755, -3.6907237, -2.3332164], [5.5514193, -3.3656187, -2.4497824], + [5.9280806, -3.4156218, -2.3316245], [5.6186748, -3.4095483, -2.409141], + [5.5813913, -3.4803317, -2.5405798], [6.018386, -4.0123005, -2.7944303], + [5.791168, -3.8674788, -2.84371], [5.5173597, -3.5319824, -2.486122], + [5.699461, -3.6407382, -2.38972], [5.5982647, -3.7332854, -2.5377173], + [5.2531133, -3.6009998, -2.52218], [3.1787467, -1.9656836, -2.093604], + [4.40965, -2.5246685, -2.265173], [4.986265, -3.2378364, -2.3466463], + [5.221637, -3.5330796, -2.599038], [5.4330263, -3.6381497, -2.5219502], + [5.4239407, -3.537006, -2.494265], [5.7460465, -3.6373107, -2.4902682], + [5.8602858, -3.7709167, -2.4536014], [6.177119, -4.1630287, -2.6641548], + [6.007184, -4.1391425, -2.7746332], [5.234189, -3.6090078, -2.6703088], + [4.458186, 
-2.7931972, -2.487601], [5.560937, -3.7051048, -2.4895868], + [5.6427107, -3.4239001, -2.5180545], [5.985802, -3.4383836, -2.3521647], + [5.705908, -3.4290183, -2.4250975], [5.633841, -3.4536667, -2.5365653], + [6.068409, -4.0221148, -2.7998438], [5.9640284, -3.9706354, -2.9187305], + [5.5351458, -3.5053978, -2.4735668], [5.819968, -3.7009068, -2.4335914], + [5.7983685, -3.7754993, -2.5713015], [5.6510377, -3.7154012, -2.5723357], + [3.1107492, -1.7932931, -2.038206], [4.6108003, -2.809266, -2.3758132], + [5.129378, -3.334814, -2.3553529], [5.27525, -3.511778, -2.542399], + [5.553991, -3.7560308, -2.5243082], [5.502201, -3.4396672, -2.5307422], + [5.932541, -3.7612128, -2.6955643], [5.511362, -3.666332, -2.6930692], + [5.7192364, -3.4919772, -2.5201082], [5.9494834, -3.9199066, -2.8064528], + [5.989796, -4.028501, -2.9356284], [5.6844234, -3.7073665, -2.6559854], + [5.7420044, -3.6841903, -2.4219987], [6.019269, -3.851921, -2.5968761], + [6.012556, -3.7651906, -2.7302198], [5.22493, -3.475544, -2.7187457], + [3.7751057, -2.2496824, -2.357636], [5.0919046, -3.3189614, -2.3565404], + [5.065481, -3.4329944, -2.5660634], [5.425741, -3.6887774, -2.519134], + [5.4049864, -3.6632752, -2.5491867], [5.7636952, -3.6867075, -2.525423], + [5.069643, -3.6292334, -2.5354824], [4.145218, -2.7678344, -2.3718739], + [5.6274185, -3.743704, -2.6409373], [5.338847, -3.5374503, -2.5367274], + [5.486684, -3.7471037, -2.6338878], [5.5501976, -3.8444057, -2.492587], + [5.680416, -3.5932336, -2.565421], [5.806082, -3.7560775, -2.4588132], + [6.016405, -3.9916434, -2.5753489], [5.8762383, -4.095117, -2.6915278], + [5.441005, -3.7025, -2.6978788], [3.8021955, -2.3089309, -2.2744963], + [3.295628, -1.7485684, -2.1460679], [3.8095417, -2.1189125, -2.2619543], + [5.4892044, -3.5187688, -2.5719445], [5.4801655, -3.616661, -2.614462], + [4.9412575, -3.1965845, -2.5532193], [5.1379337, -3.4174705, -2.5731788], + [4.9215374, -3.290681, -2.5169702], [4.978571, -3.3317158, -2.5741048], + [5.4809895, -3.7299604, -2.5985618], [5.4321156, -3.5919714, -2.5068498], + [4.9712186, -3.3998384, -2.4638414], [3.131559, -1.8154464, -2.0161948], + [4.20892, -2.32469, -2.24137], [5.3603983, -3.5648031, -2.6062012], + [4.770118, -3.3103986, -2.570477], [4.701253, -3.1281867, -2.5695343], + [5.692337, -3.640839, -2.4616065], [5.45103, -3.6451797, -2.5184312], + [5.107309, -3.4940763, -2.478231], [2.6783373, -1.4480876, -1.8975571], + [5.1807585, -3.3610148, -2.513155], [5.0343246, -3.2504637, -2.3884437], + [5.273611, -3.5527153, -2.5969102], [5.4621787, -3.6608934, -2.5238476], + [5.4608717, -3.5109582, -2.4960275], [5.82248, -3.7629507, -2.517388], + [5.7347484, -3.8859196, -2.5711641], [6.112766, -4.150079, -2.6801968], + [5.705344, -3.8973122, -2.7257922], [4.6984296, -3.2733493, -2.5083308], + [4.3192806, -2.8144884, -2.431129], [3.3805945, -1.8914232, -2.134581], + [5.3352375, -3.2841783, -2.4582448], [5.7989273, -3.6409295, -2.5358305], + [5.648154, -3.3823075, -2.5451796], [5.525385, -3.6856186, -2.5623384], + [5.047093, -3.3972507, -2.5808408], [4.6922903, -3.1120577, -2.520347], + [4.454125, -2.7146144, -2.4370828], [5.8160934, -3.7015703, -2.4822454], + [5.842537, -3.7719693, -2.5903497], [5.584236, -3.6224103, -2.578228], + [4.6949444, -3.0575066, -2.4430642], [3.7849636, -2.0093102, -2.2221537], + [5.34052, -3.4516811, -2.3648925], [5.4426284, -3.7016723, -2.6079993], + [5.575471, -3.8038383, -2.6187627], [5.374604, -3.4713042, -2.472371], + [5.573716, -3.7265248, -2.549485], [4.758337, -3.2580361, -2.4734373], + [3.182425, 
-1.8679427, -2.1192076], [5.5819745, -3.679071, -2.5723608], + [5.2793016, -3.4560566, -2.421187], [5.5433426, -3.7827773, -2.6599064], + [5.5916505, -3.8679109, -2.5255156], [5.497374, -3.498955, -2.525767], + [5.8535748, -3.5889792, -2.5111132], [5.98721, -3.4372945, -2.3722787], + [5.8036013, -3.5048394, -2.530361], [5.953143, -3.8283353, -2.7150617], + [6.1119184, -4.0932183, -2.832982], [5.959545, -4.0726967, -2.8936415], + [5.6262956, -3.5740123, -2.5001867], [5.802416, -3.625824, -2.6191945], + [6.1741114, -3.677545, -2.5701354], [6.019506, -3.5576932, -2.555736], + [5.670724, -3.441112, -2.5689387], [5.865163, -3.8021712, -2.7791395], + [6.131137, -4.057909, -2.8261504], [5.7952104, -3.6973677, -2.644189], + [5.7676497, -3.6922674, -2.5299058], [5.883764, -3.8301828, -2.5025594], + [5.574148, -3.6859841, -2.5446942], [4.5494995, -3.0782628, -2.4074235], + [2.3880472, -1.2455968, -1.9546468], [5.3021474, -3.4626734, -2.5357323]] + ], dtype=numpy.float32), + label_ids=input_ids, + metrics={'eval_loss': 3.2364680767059326}, + labels=['O', 'LB_SEP', 'LB_NS'], + label_map={0: 'O', 1: 'LB_SEP', 2: 'LB_NS'}, + tokenizer=[['ラウンジ', 'も', '気軽', 'に', '利用', 'でき', '、', '申し分', 'ない', 'です', '。', + '▁', 'ホテル', '内', 'の', '部屋', 'も', 'ゆったり', 'でき', 'まし', 'た', '。']], + subwords_lengths=[[1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ]], + subword_tokens=['[CLS]', 'ラウンジ', 'も', '気', '##軽', 'に', '利用', 'でき', '、', '申し', '##分', 'ない', 'です', '。', '▁', + 'ホテル', '内', 'の', '部屋', 'も', 'ゆ', '##ったり', 'でき', 'まし', 'た', '。', '[SEP]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', + '[PAD]'], + model_type='distil_bert', + dataset_content=[ + InputFeatures( + input_ids=[2, 4703, 693, 12, 5, 73, 1172, 28674, 10766, 7, 666, 15, 3913, 10, 8, 32000, 7052, + 9, 8494, 832, 14, 16815, 6, 8567, 18178, 8342, 10, 2992, 8, 
+                32000, 3, ] + [0] * 289,
+            token_type_ids=None,
+            label_ids=list(input_ids.tolist()),
+            attention_mask=[1] * 31 + [0] * 289,
+            document_id='0-1',
+        )
+    ])
+
+    return dummy_object
+
+
+class TestPredictor(unittest.TestCase):
+    def setUp(self) -> None:
+        self.morph_annotator = MorphAnnotatorJanome()
+        self.test_dataset_bert = [
+            NewlineTestCase(text='ラウンジも気軽に利用でき、申し分ないです。▁ホテル内の部屋もゆったりできました。',
+                            return_value=func_generate_dummy_bert_prediction())]
+        self.test_dataset_distil_bert = [
+            NewlineTestCase(text='ラウンジも気軽に利用でき、申し分ないです。▁ホテル内の部屋もゆったりできました。',
+                            return_value=func_generate_dummy_distil_bert_prediction())]
+
+    def init_tokenized_layer(self, text: str) -> Annotations:
+        annotations = Annotations()
+        annotations.add_annotation_layer('first', [SpanAnnotation(rule_name=None,
+                                                                   start_index=0,
+                                                                   end_index=len(text),
+                                                                   split_string_type=None, split_string_value=None)])
+        res = self.morph_annotator.annotate(text, annotations)
+        return res
+
+    @staticmethod
+    def reformat_data_structure(tokenized_layer: Annotations) -> typing.List[str]:
+        tokens = []
+        for s in tokenized_layer.get_annotation_layer('MorphAnnotatorJanome'):
+            if s.args is None:
+                continue
+            if str(s.args['token']) == '\n' or str(s.args['token']) == '▁':
+                tokens.append(METACHAR_LINE_BREAK)
+            else:
+                tokens.append(str(s.args['token']))
+
+        return tokens
+
+    @staticmethod
+    def check_all_prediction_point(original_sentence: typing.List[str],
+                                   index_prediction: typing.List[int]):
+        """Check that every predicted position is a line break."""
+        for t_index in index_prediction:
+            assert METACHAR_LINE_BREAK in original_sentence[t_index], \
+                f'The predicted point {t_index} is not a line break.'
+
+    def test_run_predict(self):
+        path_model = ''
+        predictor_init = MagicMock()
+        predictor_init.return_value = None
+
+        for test_case in self.test_dataset_distil_bert:
+            transformer_predictor_mock = MagicMock()
+            transformer_predictor_mock.return_value = test_case.return_value.to_prediction_tuple()
+            bunkai_predictor_mock_split_long_text = Mock(
+                return_value=(test_case.return_value.tokenizer, test_case.return_value.subwords_lengths))
+
+            from typing import List
+
+            from bunkai.algorithm.lbd.custom_tokenizers import \
+                JanomeSubwordsTokenizer
+            from bunkai.third.utils_ner import InputExample
+
+            # note: this function must be defined here because it refers to test_case objects.
+            def func_dummy_convert_examples_to_features(examples: List[InputExample],
+                                                        label_list: List[str],
+                                                        max_seq_length: int,
+                                                        tokenizer: JanomeSubwordsTokenizer,
+                                                        cls_token_at_end=False,
+                                                        cls_token="[CLS]",
+                                                        cls_token_segment_id=1,
+                                                        sep_token="[SEP]",
+                                                        sep_token_extra=False,
+                                                        pad_on_left=False,
+                                                        pad_token=0,
+                                                        pad_token_segment_id=0,
+                                                        pad_token_label_id=-100,
+                                                        sequence_a_segment_id=0,
+                                                        mask_padding_with_zero=True,
+                                                        is_distil_bert: bool = False):
+                return test_case.return_value.dataset_content
+
+            # note: this function must be defined here because it refers to test_case objects.
+ def convert_ids_to_tokens(ids: typing.Union[int, List[int]], skip_special_tokens: bool = False): + """dummy method to generate subword-token-sequence from a sequence of token-id""" + return test_case.return_value.subword_tokens + + with patch('bunkai.algorithm.lbd.train.convert_examples_to_features', + side_effect=func_dummy_convert_examples_to_features): + with patch('bunkai.algorithm.lbd.predict.Predictor.__init__', predictor_init): + with patch('bunkai.algorithm.lbd.predict.Predictor._split_long_text', bunkai_predictor_mock_split_long_text): + predictor = bunkai.algorithm.lbd.predict.Predictor(modelpath=pathlib.Path(path_model)) + predictor.labels = test_case.return_value.labels + predictor.label_map = test_case.return_value.label_map + + predictor.bc = DummyBcObject() # type: ignore + predictor.tokenizer = MagicMock() + predictor.tokenizer.side_effect = DummyJanomeSubwordsTokenizer() + predictor.tokenizer.convert_ids_to_tokens.side_effect = convert_ids_to_tokens + predictor.device = torch.device('cpu') + + ret = type("Ret", (object,), { + "logits": type("Ret2", (object,), { + "to": lambda x: type("Ret4", (object,), { + "detach": type("Ret3", (object,), { + "numpy": lambda x: test_case.return_value.predictions, + }) + }) + }) + }) + + class DummyModelDistilBert: + base_model_prefix = 'distilbert' + + def __call__(self, input_ids, attention_mask): + return ret + + class DummyModelBert: + base_model_prefix = 'bert' + + def __call__(self, input_ids, attention_mask, token_type_ids): + return ret + + if test_case.return_value.model_type == 'bert': + predictor.model = DummyModelBert() # type: ignore + elif test_case.return_value.model_type == 'distil_bert': + predictor.model = DummyModelDistilBert() # type: ignore + else: + raise Exception('unexpected case.') + + tokenized_layer = self.init_tokenized_layer(test_case.text) + tokens = self.reformat_data_structure(tokenized_layer) + # check when return type is TokenIndex + res = list(predictor.predict([tokens])) + self.check_all_prediction_point(tokens, res[0]) # type: ignore + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/resources/annotated.jsonl b/tests/resources/annotated.jsonl new file mode 100644 index 0000000..1909672 --- /dev/null +++ b/tests/resources/annotated.jsonl @@ -0,0 +1,14 @@ +{"id": 4668, "text": "さすがにそれはおかしいんじゃないの?って思います。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[24, 25, "SEP"]]} +{"id": 4669, "text": "その広告を見た時に、これはもう行くしかない!!!・・・と予約しました。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[34, 35, "SEP"]]} +{"id": 4670, "text": "用途としては、セミナー目的や部活の合宿なんじゃないかな?と。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[29, 30, "SEP"]]} +{"id": 4671, "text": "お電話番号は何ですか?と質問されました。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[19, 20, "SEP"]]} +{"id": 4672, "text": "予約時に「何時ごろにいらっしゃいますか?」と聞いてほしかった。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[30, 31, "SEP"]]} +{"id": 4673, "text": "もっときれいにしてくれたらいいのに。。。。と思いました。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[27, 28, "SEP"]]} +{"id": 4674, "text": "さすが高級料理!と思いました。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[14, 15, "SEP"]]} +{"id": 4675, "text": "あれこんなに注文したかな?ってくらい多かったです。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[24, 25, "SEP"]]} +{"id": 4676, "text": "どうしたらこんなに汚せるかな?っと言うぐらい汚かったので困りました。", "meta": {"case": 
"FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[33, 34, "SEP"]]} +{"id": 4677, "text": "これが「ハイグレード」というものだな。と、思いました。", "meta": {"case": "FP-WITHIN-QUOTE"}, "annotation_approver": null, "labels": [[26, 27, "SEP"]]} +{"id": 4678, "text": "いつも良い環境で仕事が▁できるので、とってもいいです。", "meta": {"case": "FP-NEWLINE"}, "annotation_approver": null, "labels": [[26, 27, "SEP"]]} +{"id": 4679, "text": "これは見た目のために改行されるケースの▁サンプル文です。", "meta": {"case": "FP-NEWLINE"}, "annotation_approver": null, "labels": [[27, 28, "SEP"]]} +{"id": 4680, "text": "このように、入力フォームの見た目を優先するために、▁文以外の場所で改行されるケースがよくあります。", "meta": {"case": "FP-NEWLINE"}, "annotation_approver": null, "labels": [[48, 49, "SEP"]]} +{"id": 4681, "text": "一方でこのケースは改行が文末記号になっている例です▁句点はないけれど、文が終了しています", "meta": {"case": "SEP-ONLY-NEWLINE"}, "annotation_approver": null, "labels": [[25, 26, "SEP-ONLY-NEWLINE"], [43, 44, "SEP-ONLY-NEWLINE"]]} diff --git a/tests/run_test.py b/tests/run_test.py new file mode 100644 index 0000000..e476e99 --- /dev/null +++ b/tests/run_test.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 +import subprocess + + +def test(): + subprocess.run( + ['python', '-u', '-m', 'unittest', 'discover'] + ) diff --git a/tests/sample.gold.txt b/tests/sample.gold.txt new file mode 100644 index 0000000..cd8686c --- /dev/null +++ b/tests/sample.gold.txt @@ -0,0 +1,29 @@ +ある非政府組織(NGO)を紹介します. +あるNGO(非政府組織)を紹介します。 +私も組織?に属していました。 +これは入力の入力サンプルです(^o^)│2文目です(;_;)│3文目まで書きました(*_*) + +文1.│文2. +3.5mの板と2.1mの紐.│これを使います. +メールアドレスはtest.name@example.comです! +ではtest.pyを動かしてみましょう +ではtest1.pyを動かしてみましょう + +改行は▁│デフォルトでは常に文末とします▁│良いですか? + +それは.│何ですか +それは..│何ですか +それは...│何ですか +それは.。│何ですか +それは..。│何ですか +それは...。│何ですか + +Dr.│山田の登場です. +おすすめ度No.1 +ROOM No.411. + +東北 へ遊びに 行きたいです。 +泊まった(^_^)│大浴場があった +泊まった(^_^)│v 大浴場があった +このホテルは☆│5です。 +止まった(#^.│^#)│そして diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..c5ba3d2 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +import unittest +from collections import namedtuple + +import bunkai.constant +from bunkai import cli as cli_module +from bunkai.algorithm.bunkai_sbd.bunkai_sbd import \ + BunkaiSentenceBoundaryDisambiguation + +NewlineTestCase = namedtuple('NewlineTestCase', ('text', 'n_sentences', 'return_value')) + + +class TestCli(unittest.TestCase): + def setUp(self) -> None: + self.seq_test_case = [ + NewlineTestCase('とても喜んでいました(*́ω‘*)♪USJで遊んだよ。', 2, []), + NewlineTestCase('お部屋ってありますか?うれしいです。もしなければ、どちらでも', 3, []), + NewlineTestCase('11時前に着きましたお部屋の準備ができているとの事で早くにチェックインでき足腰が悪いので大変あり難かったです、' + '部屋も広く掃除も行き届いていました、' + 'お風呂も色々あり特に薬草風呂が良かった露天風呂はあまり開放感はありません街中なのでしかたがないです、', 1, []), + NewlineTestCase('(^ ^) (^ ^) 先月泊まりましたが とてもよかったです', 1, []) + ] + + def test_cli(self): + model = BunkaiSentenceBoundaryDisambiguation(path_model=None) + for test_case in self.seq_test_case: + output = ''.join([o for o in + cli_module.run(model, test_case.text.replace('\n', bunkai.constant.METACHAR_LINE_BREAK))]) + outsents = output.split(bunkai.constant.METACHAR_SENTENCE_BOUNDARY) + self.assertEqual(len(outsents), test_case.n_sentences, msg=f'false sentence split={output}') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tsunoda_sbd/__init__.py b/tests/tsunoda_sbd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tsunoda_sbd/rules/__init__.py b/tests/tsunoda_sbd/rules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tsunoda_sbd/rules/basic_annotator.py b/tests/tsunoda_sbd/rules/basic_annotator.py new file mode 100644 index 
0000000..cf0ddd4
--- /dev/null
+++ b/tests/tsunoda_sbd/rules/basic_annotator.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+import unittest
+
+from bunkai.algorithm.tsunoda_sbd.annotator.basic_annotator import BasicRule
+from bunkai.base.annotation import SpanAnnotation
+
+
+class TsukubaSplitter_BasicRule(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # procedures before tests are started. This code block is executed only once
+        pass
+
+    @classmethod
+    def tearDownClass(cls):
+        # procedures after tests are finished. This code block is executed only once
+        pass
+
+    def setUp(self):
+        # procedures before every test is started. This code block is executed every time
+        pass
+
+    def tearDown(self):
+        # procedures after every test is finished. This code block is executed every time
+        pass
+
+    def test_okay(self):
+        rule_obj = BasicRule(path_mecab_config='/opt/mecab/bin/')
+        test_sentence_1_ok = "これ、テスト文なんですけど(笑)" \
+                             "本当?にこんなテキストでいいのかな☆" \
+                             "10秒で考えて書いたよ." \
+                             "おすすめ度No.1の和室3.5畳はあります。おしまい♪(近日中には冷房に切り替わる予定です。いいですね(泣))"
+        __input = SpanAnnotation(rule_name=None,
+                                 start_index=0,
+                                 end_index=len(test_sentence_1_ok),
+                                 split_string_type=None,
+                                 split_string_value=None)
+        rule_obj.annotate(test_sentence_1_ok, [__input])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/tsunoda_sbd/test_tsunoda_sbd.py b/tests/tsunoda_sbd/test_tsunoda_sbd.py
new file mode 100644
index 0000000..46ff91b
--- /dev/null
+++ b/tests/tsunoda_sbd/test_tsunoda_sbd.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+import unittest
+
+from bunkai.algorithm.tsunoda_sbd.tsunoda_sbd import \
+    TsunodaSentenceBoundaryDisambiguation
+from bunkai.base.annotation import Annotations
+
+
+class TestTsukubaSplitter(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # procedures before tests are started. This code block is executed only once
+        pass
+
+    @classmethod
+    def tearDownClass(cls):
+        # procedures after tests are finished. This code block is executed only once
+        pass
+
+    def setUp(self):
+        # procedures before every test is started. This code block is executed every time
+        pass
+
+    def tearDown(self):
+        # procedures after every test is finished. This code block is executed every time
+        pass
+
+    def test_okay(self):
+        splitter_obj = TsunodaSentenceBoundaryDisambiguation()
+
+        test_sentence_1_ok = "これ、テスト文なんですけど(笑)" \
+                             "本当?にこんなテキストでいいのかな☆" \
+                             "10秒で考えて書いたよ." \
+                             "(セルフドリンクサービスはすごく良かったです!種類も豊富。)" \
+                             "おすすめ度No.1の和室3.5畳はあります。" \
+                             "おしまい♪" \
+                             "(近日中には冷房に切り替わる予定です。いいですね(泣))"
+        self.assertTrue(isinstance(splitter_obj._eos(
+            test_sentence_1_ok), Annotations))
+        self.assertEqual(len(splitter_obj._eos(
+            test_sentence_1_ok).get_final_layer()), 7)
+        self.assertEqual(len(splitter_obj.find_eos(test_sentence_1_ok)), 7)
+        self.assertEqual(
+            len(list(splitter_obj(test_sentence_1_ok))), 7)
+
+
+if __name__ == '__main__':
+    unittest.main()